diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-24 18:48:56 -0500 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-24 19:25:25 -0500 |
commit | c9e6325ed9844070dca03ac1c5fec946533cb315 (patch) | |
tree | 58f9bdb9550f5849aea131b02384c8fe22565e12 /libavcodec/x86/vp9itxfm.asm | |
parent | 97474d527f9a17340c29018548502c178ed7d966 (diff) | |
download | ffmpeg-c9e6325ed9844070dca03ac1c5fec946533cb315.tar.gz |
vp9/x86: use explicit register for relative stack references.
Before this patch, we explicitly modified rsp to step through the
on-stack scratch buffer. That isn't necessarily universally acceptable,
since the space under the stack pointer might be modified by things
like signal handlers. Therefore, leave rsp untouched and use an explicit
register (tmpq) to hold the current position relative to the bottom of
the stack (i.e. rsp). This will also clear out the valgrind errors about
the use of uninitialized data that started occurring after the
idct16x16/ssse3 optimizations were first merged.
Diffstat (limited to 'libavcodec/x86/vp9itxfm.asm')
-rw-r--r-- | libavcodec/x86/vp9itxfm.asm | 440 |
1 files changed, 218 insertions, 222 deletions
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm index b142b8f778..8087c2e336 100644 --- a/libavcodec/x86/vp9itxfm.asm +++ b/libavcodec/x86/vp9itxfm.asm @@ -772,40 +772,40 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx %endmacro %macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc - VP9_IDCT16_1D_START %1, %3, 32, rsp+32 + VP9_IDCT16_1D_START %1, %3, 32, tmpq+32 %if %2 == 1 ; backup a different register - mova [rsp+16], m15 - mova m7, [rsp+32] + mova [tmpq+16], m15 + mova m7, [tmpq+32] SUMSUB_BA w, 6, 9, 15 ; t6, t9 SUMSUB_BA w, 7, 8, 15 ; t7, t8 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 - mova [rsp+ 0], m0 - mova [rsp+ 32], m1 - mova [rsp+ 64], m2 - mova [rsp+ 96], m3 - mova [rsp+128], m4 - mova [rsp+160], m5 - mova [rsp+192], m6 - mova [rsp+224], m7 - - mova m15, [rsp+16] + mova [tmpq+ 0], m0 + mova [tmpq+ 32], m1 + mova [tmpq+ 64], m2 + mova [tmpq+ 96], m3 + mova [tmpq+128], m4 + mova [tmpq+160], m5 + mova [tmpq+192], m6 + mova [tmpq+224], m7 + + mova m15, [tmpq+16] TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 - mova [rsp+ 16], m8 - mova [rsp+ 48], m9 - mova [rsp+ 80], m10 - mova [rsp+112], m11 - mova [rsp+144], m12 - mova [rsp+176], m13 - mova [rsp+208], m14 - mova [rsp+240], m15 + mova [tmpq+ 16], m8 + mova [tmpq+ 48], m9 + mova [tmpq+ 80], m10 + mova [tmpq+112], m11 + mova [tmpq+144], m12 + mova [tmpq+176], m13 + mova [tmpq+208], m14 + mova [tmpq+240], m15 %else ; %2 == 2 ; backup more registers - mova [rsp+64], m8 - mova [rsp+96], m9 + mova [tmpq+64], m8 + mova [tmpq+96], m9 pxor m7, m7 pmulhrsw m0, [pw_512] @@ -823,9 +823,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx ; restore from cache SWAP 0, 7 ; move zero from m7 to m0 - mova m7, [rsp+32] - mova m8, [rsp+64] - mova m9, [rsp+96] + mova m7, [tmpq+32] + mova m8, [tmpq+64] + mova m9, [tmpq+96] SUMSUB_BA w, 6, 9, 1 ; t6, t9 SUMSUB_BA w, 7, 8, 1 ; t7, t8 @@ -871,7 +871,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx %macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 INIT_XMM %1 -cglobal 
vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob +cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob ; 2x2=eob=3, 4x4=eob=10 cmp eobd, 38 jg .idctfull @@ -894,19 +894,19 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 RET + DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp .idct8x8: - DEFINE_ARGS dst, stride, block, cnt, dst_bak + mov tmpq, rsp VP9_IDCT16_1D blockq, 1, 8 mov cntd, 2 mov dst_bakq, dstq .loop2_8x8: - VP9_IDCT16_1D rsp, 2, 8 + VP9_IDCT16_1D tmpq, 2, 8 lea dstq, [dst_bakq+8] - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_8x8 - sub rsp, 32 ; at the end of the loop, m0 should still be zero ; use that to zero out block coefficients @@ -914,26 +914,25 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob RET .idctfull: - DEFINE_ARGS dst, stride, block, cnt, dst_bak mov cntd, 2 + mov tmpq, rsp .loop1_full: VP9_IDCT16_1D blockq, 1 add blockq, 16 - add rsp, 256 + add tmpq, 256 dec cntd jg .loop1_full sub blockq, 32 - sub rsp, 512 mov cntd, 2 + mov tmpq, rsp mov dst_bakq, dstq .loop2_full: - VP9_IDCT16_1D rsp, 2 + VP9_IDCT16_1D tmpq, 2 lea dstq, [dst_bakq+8] - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_full - sub rsp, 32 ; at the end of the loop, m0 should still be zero ; use that to zero out block coefficients @@ -970,7 +969,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w] VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w] - mova [rsp+ 0*%%str], m9 ; make some scratch space (t0:m9->r0) + mova [tmpq+ 0*%%str], m9 ; make some scratch space (t0:m9->r0) mova m4, [%1+ 4*32] ; in4 mova m5, [%1+11*32] ; in11 mova m12, [%1+ 3*32] ; in3 @@ -981,10 +980,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w] VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w] - mova [rsp+ 2*%%str], m8 ; t1:m9->r2 - 
mova [rsp+ 3*%%str], m2 ; t2:m2->r3 - mova [rsp+ 4*%%str], m3 ; t3:m3->r4 - mova [rsp+ 5*%%str], m13 ; t4:m13->r5 + mova [tmpq+ 2*%%str], m8 ; t1:m9->r2 + mova [tmpq+ 3*%%str], m2 ; t2:m2->r3 + mova [tmpq+ 4*%%str], m3 ; t3:m3->r4 + mova [tmpq+ 5*%%str], m13 ; t4:m13->r5 mova m2, [%1+ 6*32] ; in6 mova m3, [%1+ 9*32] ; in9 mova m8, [%1+ 1*32] ; in1 @@ -1030,16 +1029,16 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14 - mova m6, [rsp+ 0*%%str] - mova m7, [rsp+ 2*%%str] - mova m13, [rsp+ 3*%%str] - mova m14, [rsp+ 4*%%str] - mova m15, [rsp+ 5*%%str] - mova [rsp+ 8*%%str], m5 - mova [rsp+ 9*%%str], m4 - mova [rsp+10*%%str], m0 - mova [rsp+11*%%str], m10 - mova [rsp+12*%%str], m2 + mova m6, [tmpq+ 0*%%str] + mova m7, [tmpq+ 2*%%str] + mova m13, [tmpq+ 3*%%str] + mova m14, [tmpq+ 4*%%str] + mova m15, [tmpq+ 5*%%str] + mova [tmpq+ 8*%%str], m5 + mova [tmpq+ 9*%%str], m4 + mova [tmpq+10*%%str], m0 + mova [tmpq+11*%%str], m10 + mova [tmpq+12*%%str], m2 ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7 ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14 @@ -1069,32 +1068,32 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14 %if %2 == 1 - mova m0, [rsp+ 8*%%str] + mova m0, [tmpq+ 8*%%str] TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2 - mova [rsp+ 0*16], m9 - mova [rsp+ 2*16], m3 - mova [rsp+ 4*16], m11 - mova [rsp+ 6*16], m14 - mova m9, [rsp+ 9*%%str] - mova m3, [rsp+10*%%str] - mova m11, [rsp+11*%%str] - mova m14, [rsp+12*%%str] - mova [rsp+ 8*16], m7 - mova [rsp+10*16], m1 - mova [rsp+12*16], m0 - mova [rsp+14*16], m12 + mova [tmpq+ 0*16], m9 + mova [tmpq+ 2*16], m3 + mova [tmpq+ 4*16], m11 + mova [tmpq+ 6*16], m14 + mova m9, [tmpq+ 9*%%str] + mova m3, [tmpq+10*%%str] + mova m11, [tmpq+11*%%str] + mova m14, [tmpq+12*%%str] + mova [tmpq+ 8*16], m7 + mova [tmpq+10*16], m1 + mova [tmpq+12*16], m0 + mova 
[tmpq+14*16], m12 TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2 - mova [rsp+ 1*16], m15 - mova [rsp+ 3*16], m9 - mova [rsp+ 5*16], m3 - mova [rsp+ 7*16], m6 - mova [rsp+ 9*16], m13 - mova [rsp+11*16], m11 - mova [rsp+13*16], m14 - mova [rsp+15*16], m8 + mova [tmpq+ 1*16], m15 + mova [tmpq+ 3*16], m9 + mova [tmpq+ 5*16], m3 + mova [tmpq+ 7*16], m6 + mova [tmpq+ 9*16], m13 + mova [tmpq+11*16], m11 + mova [tmpq+13*16], m14 + mova [tmpq+15*16], m8 %else - mova m5, [rsp+ 8*%%str] + mova m5, [tmpq+ 8*%%str] pxor m0, m0 pmulhrsw m9, [pw_512] @@ -1114,10 +1113,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx VP9_STORE_2X 5, 12, 2, 4, 0 lea dstq, [dstq+strideq*2] - mova m9, [rsp+ 9*%%str] - mova m3, [rsp+10*%%str] - mova m11, [rsp+11*%%str] - mova m14, [rsp+12*%%str] + mova m9, [tmpq+ 9*%%str] + mova m3, [tmpq+10*%%str] + mova m11, [tmpq+11*%%str] + mova m14, [tmpq+12*%%str] pmulhrsw m15, [pw_512] pmulhrsw m9, [pw_512] @@ -1139,29 +1138,26 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx %macro IADST16_FN 5 INIT_XMM %5 -cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob - ; potential eob checks go here - - DEFINE_ARGS dst, stride, block, cnt, dst_bak +cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp mov cntd, 2 + mov tmpq, rsp .loop1_full: VP9_%2_1D blockq, 1 add blockq, 16 - add rsp, 256 + add tmpq, 256 dec cntd jg .loop1_full sub blockq, 32 - sub rsp, 512 mov cntd, 2 + mov tmpq, rsp mov dst_bakq, dstq .loop2_full: - VP9_%4_1D rsp, 2 + VP9_%4_1D tmpq, 2 lea dstq, [dst_bakq+8] - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_full - sub rsp, 32 ; at the end of the loop, m0 should still be zero ; use that to zero out block coefficients @@ -1183,11 +1179,11 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc %assign %%str 16*%2*%2 ; first do t0-15, this can be done identical to idct16x16 - VP9_IDCT16_1D_START %1, %3/2, 64*2, rsp+ 4*%%str + VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str ; backup a different 
register - mova [rsp+30*%%str], m15 ; t15 - mova m7, [rsp+ 4*%%str] + mova [tmpq+30*%%str], m15 ; t15 + mova m7, [tmpq+ 4*%%str] SUMSUB_BA w, 6, 9, 15 ; t6, t9 SUMSUB_BA w, 7, 8, 15 ; t7, t8 @@ -1195,21 +1191,21 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ; store everything on stack to make space available for t16-31 ; we store interleaved with the output of the second half (t16-31) ; so we don't need to allocate extra stack space - mova [rsp+ 0*%%str], m0 ; t0 - mova [rsp+ 4*%%str], m1 ; t1 - mova [rsp+ 8*%%str], m2 ; t2 - mova [rsp+12*%%str], m3 ; t3 - mova [rsp+16*%%str], m4 ; t4 - mova [rsp+20*%%str], m5 ; t5 - mova [rsp+24*%%str], m6 ; t6 - mova [rsp+28*%%str], m7 ; t7 - mova [rsp+ 2*%%str], m8 ; t8 - mova [rsp+ 6*%%str], m9 ; t9 - mova [rsp+10*%%str], m10 ; t10 - mova [rsp+14*%%str], m11 ; t11 - mova [rsp+18*%%str], m12 ; t12 - mova [rsp+22*%%str], m13 ; t13 - mova [rsp+26*%%str], m14 ; t14 + mova [tmpq+ 0*%%str], m0 ; t0 + mova [tmpq+ 4*%%str], m1 ; t1 + mova [tmpq+ 8*%%str], m2 ; t2 + mova [tmpq+12*%%str], m3 ; t3 + mova [tmpq+16*%%str], m4 ; t4 + mova [tmpq+20*%%str], m5 ; t5 + mova [tmpq+24*%%str], m6 ; t6 + mova [tmpq+28*%%str], m7 ; t7 + mova [tmpq+ 2*%%str], m8 ; t8 + mova [tmpq+ 6*%%str], m9 ; t9 + mova [tmpq+10*%%str], m10 ; t10 + mova [tmpq+14*%%str], m11 ; t11 + mova [tmpq+18*%%str], m12 ; t12 + mova [tmpq+22*%%str], m13 ; t13 + mova [tmpq+26*%%str], m14 ; t14 ; then, secondly, do t16-31 %if %3 <= 8 @@ -1235,8 +1231,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ; from 1 stage forward SUMSUB_BA w, 8, 4, 1 ; temporary storage - mova [rsp+17*%%str], m8 ; t16 - mova [rsp+21*%%str], m4 ; t19 + mova [tmpq+17*%%str], m8 ; t16 + mova [tmpq+21*%%str], m4 ; t19 VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 @@ -1289,8 +1285,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ; from 2 stages forward SUMSUB_BA w, 8, 4, 2 ; temporary storage 
- mova [rsp+17*%%str], m8 ; t16 - mova [rsp+21*%%str], m4 ; t19 + mova [tmpq+17*%%str], m8 ; t16 + mova [tmpq+21*%%str], m4 ; t19 %if %3 <= 16 pmulhrsw m3, m12, [pw_13160x2] pmulhrsw m12, [pw_9760x2] @@ -1336,7 +1332,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 - mova m8, [rsp+17*%%str] ; t16 + mova m8, [tmpq+17*%%str] ; t16 ; from 2 stages forward SUMSUB_BA w, 0, 8, 4 SUMSUB_BA w, 15, 7, 4 @@ -1345,10 +1341,10 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx pmulhrsw m7, [pw_11585x2] pmulhrsw m8, [pw_11585x2] ; store t16/t23 - mova [rsp+ 1*%%str], m0 ; t16 - mova [rsp+29*%%str], m7 ; t23 + mova [tmpq+ 1*%%str], m0 ; t16 + mova [tmpq+29*%%str], m7 ; t23 - mova m4, [rsp+21*%%str] ; t19 + mova m4, [tmpq+21*%%str] ; t19 VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29 VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28 VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27 @@ -1384,27 +1380,27 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for ; final sumsub of pass 2 - mova [rsp+ 5*%%str], m1 ; t17 - mova [rsp+ 9*%%str], m2 ; t18 - mova [rsp+13*%%str], m3 ; t19 + mova [tmpq+ 5*%%str], m1 ; t17 + mova [tmpq+ 9*%%str], m2 ; t18 + mova [tmpq+13*%%str], m3 ; t19 ; then do final pass to sumsub+store the two halves %if %2 == 1 - mova [rsp+17*%%str], m4 ; t20 - mova [rsp+21*%%str], m5 ; t21 - mova [rsp+25*%%str], m6 ; t22 - - mova m0, [rsp+ 0*%%str] ; t0 - mova m1, [rsp+ 4*%%str] ; t1 - mova m2, [rsp+ 8*%%str] ; t2 - mova m3, [rsp+12*%%str] ; t3 - mova m4, [rsp+16*%%str] ; t4 - mova m5, [rsp+20*%%str] ; t5 - mova m6, [rsp+24*%%str] ; t6 + mova [tmpq+17*%%str], m4 ; t20 + mova [tmpq+21*%%str], m5 ; t21 + mova [tmpq+25*%%str], m6 ; t22 
+ + mova m0, [tmpq+ 0*%%str] ; t0 + mova m1, [tmpq+ 4*%%str] ; t1 + mova m2, [tmpq+ 8*%%str] ; t2 + mova m3, [tmpq+12*%%str] ; t3 + mova m4, [tmpq+16*%%str] ; t4 + mova m5, [tmpq+20*%%str] ; t5 + mova m6, [tmpq+24*%%str] ; t6 SUMSUB_BA w, 15, 0, 7 - mova [rsp+ 3*%%str], m0 ; t15 - mova m7, [rsp+28*%%str] ; t7 + mova [tmpq+ 3*%%str], m0 ; t15 + mova m7, [tmpq+28*%%str] ; t7 SUMSUB_BA w, 14, 1, 0 SUMSUB_BA w, 13, 2, 0 SUMSUB_BA w, 12, 3, 0 @@ -1414,45 +1410,45 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx SUMSUB_BA w, 8, 7, 0 TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0 - mova [rsp+ 0*%%str], m15 - mova [rsp+ 4*%%str], m14 - mova [rsp+ 8*%%str], m13 - mova [rsp+12*%%str], m12 - mova [rsp+16*%%str], m11 - mova [rsp+20*%%str], m10 - mova [rsp+24*%%str], m9 - mova [rsp+28*%%str], m8 - - mova m0, [rsp+ 3*%%str] ; t15 + mova [tmpq+ 0*%%str], m15 + mova [tmpq+ 4*%%str], m14 + mova [tmpq+ 8*%%str], m13 + mova [tmpq+12*%%str], m12 + mova [tmpq+16*%%str], m11 + mova [tmpq+20*%%str], m10 + mova [tmpq+24*%%str], m9 + mova [tmpq+28*%%str], m8 + + mova m0, [tmpq+ 3*%%str] ; t15 TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8 - mova [rsp+ 3*%%str], m7 - mova [rsp+ 7*%%str], m6 - mova [rsp+11*%%str], m5 - mova [rsp+15*%%str], m4 - mova [rsp+19*%%str], m3 - mova [rsp+23*%%str], m2 - mova [rsp+27*%%str], m1 - mova [rsp+31*%%str], m0 - - mova m15, [rsp+ 2*%%str] ; t8 - mova m14, [rsp+ 6*%%str] ; t9 - mova m13, [rsp+10*%%str] ; t10 - mova m12, [rsp+14*%%str] ; t11 - mova m11, [rsp+18*%%str] ; t12 - mova m10, [rsp+22*%%str] ; t13 - mova m9, [rsp+26*%%str] ; t14 - mova m8, [rsp+30*%%str] ; t15 - mova m7, [rsp+ 1*%%str] ; t16 - mova m6, [rsp+ 5*%%str] ; t17 - mova m5, [rsp+ 9*%%str] ; t18 - mova m4, [rsp+13*%%str] ; t19 - mova m3, [rsp+17*%%str] ; t20 - mova m2, [rsp+21*%%str] ; t21 - mova m1, [rsp+25*%%str] ; t22 + mova [tmpq+ 3*%%str], m7 + mova [tmpq+ 7*%%str], m6 + mova [tmpq+11*%%str], m5 + mova [tmpq+15*%%str], m4 + mova [tmpq+19*%%str], m3 + mova [tmpq+23*%%str], m2 + mova 
[tmpq+27*%%str], m1 + mova [tmpq+31*%%str], m0 + + mova m15, [tmpq+ 2*%%str] ; t8 + mova m14, [tmpq+ 6*%%str] ; t9 + mova m13, [tmpq+10*%%str] ; t10 + mova m12, [tmpq+14*%%str] ; t11 + mova m11, [tmpq+18*%%str] ; t12 + mova m10, [tmpq+22*%%str] ; t13 + mova m9, [tmpq+26*%%str] ; t14 + mova m8, [tmpq+30*%%str] ; t15 + mova m7, [tmpq+ 1*%%str] ; t16 + mova m6, [tmpq+ 5*%%str] ; t17 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+21*%%str] ; t21 + mova m1, [tmpq+25*%%str] ; t22 SUMSUB_BA w, 7, 8, 0 - mova [rsp+ 2*%%str], m8 - mova m0, [rsp+29*%%str] ; t23 + mova [tmpq+ 2*%%str], m8 + mova m0, [tmpq+29*%%str] ; t23 SUMSUB_BA w, 6, 9, 8 SUMSUB_BA w, 5, 10, 8 SUMSUB_BA w, 4, 11, 8 @@ -1462,29 +1458,29 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx SUMSUB_BA w, 0, 15, 8 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - mova [rsp+ 1*%%str], m0 - mova [rsp+ 5*%%str], m1 - mova [rsp+ 9*%%str], m2 - mova [rsp+13*%%str], m3 - mova [rsp+17*%%str], m4 - mova [rsp+21*%%str], m5 - mova [rsp+25*%%str], m6 - mova [rsp+29*%%str], m7 - - mova m8, [rsp+ 2*%%str] + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m8, [tmpq+ 2*%%str] TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 - mova [rsp+ 2*%%str], m8 - mova [rsp+ 6*%%str], m9 - mova [rsp+10*%%str], m10 - mova [rsp+14*%%str], m11 - mova [rsp+18*%%str], m12 - mova [rsp+22*%%str], m13 - mova [rsp+26*%%str], m14 - mova [rsp+30*%%str], m15 + mova [tmpq+ 2*%%str], m8 + mova [tmpq+ 6*%%str], m9 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + mova [tmpq+18*%%str], m12 + mova [tmpq+22*%%str], m13 + mova [tmpq+26*%%str], m14 + mova [tmpq+30*%%str], m15 %else - ; t0-7 is in [rsp+{0,4,8,12,16,20,24,28}*%%str] - ; t8-15 is in [rsp+{2,6,10,14,18,22,26,30}*%%str] - ; t16-19 and t23 is in 
[rsp+{1,5,9,13,29}*%%str] + ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] + ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str] + ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] ; t20-22 is in m4-6 ; t24-31 is in m8-15 pxor m7, m7 @@ -1507,55 +1503,55 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx %endmacro ; store t0-1 and t30-31 - mova m0, [rsp+ 0*%%str] - mova m1, [rsp+ 4*%%str] + mova m0, [tmpq+ 0*%%str] + mova m1, [tmpq+ 4*%%str] %%STORE_2X2 0, 1, 14, 15, 2, 3, 7 ; store t2-3 and t28-29 - mova m0, [rsp+ 8*%%str] - mova m1, [rsp+12*%%str] + mova m0, [tmpq+ 8*%%str] + mova m1, [tmpq+12*%%str] %%STORE_2X2 0, 1, 12, 13, 2, 3, 7 ; store t4-5 and t26-27 - mova m0, [rsp+16*%%str] - mova m1, [rsp+20*%%str] + mova m0, [tmpq+16*%%str] + mova m1, [tmpq+20*%%str] %%STORE_2X2 0, 1, 10, 11, 2, 3, 7 ; store t6-7 and t24-25 - mova m0, [rsp+24*%%str] - mova m1, [rsp+28*%%str] + mova m0, [tmpq+24*%%str] + mova m1, [tmpq+28*%%str] %%STORE_2X2 0, 1, 8, 9, 2, 3, 7 ; store t8-9 and t22-23 - mova m0, [rsp+ 2*%%str] - mova m1, [rsp+ 6*%%str] - mova m8, [rsp+29*%%str] + mova m0, [tmpq+ 2*%%str] + mova m1, [tmpq+ 6*%%str] + mova m8, [tmpq+29*%%str] %%STORE_2X2 0, 1, 6, 8, 2, 3, 7 ; store t10-11 and t20-21 - mova m0, [rsp+10*%%str] - mova m1, [rsp+14*%%str] + mova m0, [tmpq+10*%%str] + mova m1, [tmpq+14*%%str] %%STORE_2X2 0, 1, 4, 5, 2, 3, 7 ; store t12-13 and t18-19 - mova m0, [rsp+18*%%str] - mova m1, [rsp+22*%%str] - mova m5, [rsp+13*%%str] - mova m4, [rsp+ 9*%%str] + mova m0, [tmpq+18*%%str] + mova m1, [tmpq+22*%%str] + mova m5, [tmpq+13*%%str] + mova m4, [tmpq+ 9*%%str] %%STORE_2X2 0, 1, 4, 5, 2, 3, 7 ; store t14-17 - mova m0, [rsp+26*%%str] - mova m1, [rsp+30*%%str] - mova m5, [rsp+ 5*%%str] - mova m4, [rsp+ 1*%%str] + mova m0, [tmpq+26*%%str] + mova m1, [tmpq+30*%%str] + mova m5, [tmpq+ 5*%%str] + mova m4, [tmpq+ 1*%%str] %%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0 %endif %endmacro %macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 INIT_XMM %1 -cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, 
dst, stride, block, eob +cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob cmp eobd, 135 jg .idctfull cmp eobd, 34 @@ -1580,8 +1576,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize RET - DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2 + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp .idct8x8: + mov tmpq, rsp VP9_IDCT32_1D blockq, 1, 8 mov stride30q, strideq ; stride @@ -1592,12 +1589,11 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob .loop2_8x8: mov dstq, dst_bakq lea dst_endq, [dst_bakq+stride30q] - VP9_IDCT32_1D rsp, 2, 8 + VP9_IDCT32_1D tmpq, 2, 8 add dst_bakq, 8 - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_8x8 - sub rsp, 64 ; at the end of the loop, m7 should still be zero ; use that to zero out block coefficients @@ -1606,29 +1602,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob .idct16x16: mov cntd, 2 + mov tmpq, rsp .loop1_16x16: VP9_IDCT32_1D blockq, 1, 16 add blockq, 16 - add rsp, 512 + add tmpq, 512 dec cntd jg .loop1_16x16 sub blockq, 32 - sub rsp, 1024 mov stride30q, strideq ; stride lea stride2q, [strideq*2] ; stride*2 shl stride30q, 5 ; stride*32 mov cntd, 4 + mov tmpq, rsp sub stride30q, stride2q ; stride*30 .loop2_16x16: mov dstq, dst_bakq lea dst_endq, [dst_bakq+stride30q] - VP9_IDCT32_1D rsp, 2, 16 + VP9_IDCT32_1D tmpq, 2, 16 add dst_bakq, 8 - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_16x16 - sub rsp, 64 ; at the end of the loop, m7 should still be zero ; use that to zero out block coefficients @@ -1637,29 +1633,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob .idctfull: mov cntd, 4 + mov tmpq, rsp .loop1_full: VP9_IDCT32_1D blockq, 1 add blockq, 16 - add rsp, 512 + add tmpq, 512 dec cntd jg .loop1_full sub blockq, 64 - sub rsp, 2048 mov stride30q, strideq ; stride lea stride2q, [strideq*2] ; stride*2 shl 
stride30q, 5 ; stride*32 mov cntd, 4 + mov tmpq, rsp sub stride30q, stride2q ; stride*30 .loop2_full: mov dstq, dst_bakq lea dst_endq, [dst_bakq+stride30q] - VP9_IDCT32_1D rsp, 2 + VP9_IDCT32_1D tmpq, 2 add dst_bakq, 8 - add rsp, 16 + add tmpq, 16 dec cntd jg .loop2_full - sub rsp, 64 ; at the end of the loop, m7 should still be zero ; use that to zero out block coefficients |