diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-05 11:18:47 -0500 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-07 20:43:35 -0500 |
commit | 04a187fb2ae2c14645c34e0e678f797af3ebb7a9 (patch) | |
tree | 55561e9dad1666d8b87896bc2f3c283cbdb3b096 | |
parent | 37b001d14d3426c02f032ff3a3f90e3f02a639cd (diff) | |
download | ffmpeg-04a187fb2ae2c14645c34e0e678f797af3ebb7a9.tar.gz |
vp9/x86: idct_32x32_add_ssse3 sub-8x8-idct.
Runtime of the full 32x32 idct goes from 2446 to 2441 cycles (intra) or
from 1425 to 1306 cycles (inter). Overall runtime is not significantly
affected.
-rw-r--r-- | libavcodec/x86/vp9itxfm.asm | 109 |
1 file changed, 107 insertions, 2 deletions
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm index b137df83d4..c5d52f612c 100644 --- a/libavcodec/x86/vp9itxfm.asm +++ b/libavcodec/x86/vp9itxfm.asm @@ -74,13 +74,22 @@ SECTION .text psrad m%2, 14 %endmacro -%macro VP9_UNPACK_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1, tmp2 +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 punpckhwd m%6, m%2, m%1 VP9_MULSUB_2W_2X %7, %6, %6, %5, [pw_m%3_%4], [pw_%4_%3] punpcklwd m%2, m%1 VP9_MULSUB_2W_2X %1, %2, %2, %5, [pw_m%3_%4], [pw_%4_%3] packssdw m%1, m%7 packssdw m%2, m%6 +%else + punpckhwd m%8, m%4, m%3 + VP9_MULSUB_2W_2X %9, %8, %8, %7, [pw_m%5_%6], [pw_%6_%5] + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_2X %1, %2, %2, %7, [pw_m%5_%6], [pw_%6_%5] + packssdw m%1, m%9 + packssdw m%2, m%8 +%endif %endmacro %macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst @@ -381,6 +390,32 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob ; SUMSUB_BA w, 6, 9, 15 ; t6, t9 ; SUMSUB_BA w, 7, 8, 15 ; t7, t8 %macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch +%if %2 <= 4 + mova m3, [%1+ 1*%3] ; IN(1) + mova m12, [%1+ 2*%3] ; IN(2) + mova m0, [%1+ 3*%3] ; IN(3) + + pmulhrsw m15, m12, [pw_16069x2] ; t6-7 + pmulhrsw m12, [pw_3196x2] ; t4-5 + pmulhrsw m4, m3, [pw_16305x2] ; t14-15 + pmulhrsw m3, [pw_1606x2] ; t8-9 + pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 + pmulhrsw m0, [pw_15679x2] ; t12-13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + paddw m14, m15, m12 + psubw m13, m15, m12 + pmulhrsw m13, [pw_11585x2] ; t5 + pmulhrsw m14, [pw_11585x2] ; t6 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 
+%else mova m5, [%1+ 1*%3] ; IN(1) mova m14, [%1+ 2*%3] ; IN(2) mova m6, [%1+ 3*%3] ; IN(3) @@ -442,6 +477,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob pmulhrsw m14, [pw_11585x2] ; t6 VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14 VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13 +%endif ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 @@ -468,6 +504,17 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 ; from load/start +%if %2 <= 4 + mova m11, [%1+ 0*%3] ; IN(0) + pmulhrsw m11, [pw_11585x2] ; t0-t3 + + psubw m8, m11, m15 + paddw m15, m11 + psubw m9, m11, m14 + paddw m14, m11 + psubw m10, m11, m13 + paddw m13, m11 +%else mova m10, [%1+ 0*%3] ; IN(0) %if %2 <= 8 pmulhrsw m10, [pw_11585x2] ; t0 and t1 @@ -490,6 +537,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob SUMSUB_BA w, 15, 8, 7 ; t0, t7 SUMSUB_BA w, 14, 9, 7 ; t1, t6 SUMSUB_BA w, 13, 10, 7 ; t2, t5 +%endif SUMSUB_BA w, 12, 11, 7 ; t3, t4 SUMSUB_BA w, 0, 15, 7 ; t0, t15 @@ -716,6 +764,37 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob mova [rsp+26*%%str], m14 ; t14 ; then, secondly, do t16-31 +%if %3 <= 8 + mova m4, [%1+ 1*64] + mova m3, [%1+ 3*64] + mova m0, [%1+ 5*64] + mova m7, [%1+ 7*64] + + pmulhrsw m11, m4, [pw_16364x2] ;t31 + pmulhrsw m4, [pw_804x2] ;t16 + pmulhrsw m8, m7, [pw_m5520x2] ;t19 + pmulhrsw m7, [pw_15426x2] ;t28 + pmulhrsw m15, m0, [pw_15893x2] ;t27 + pmulhrsw m0, [pw_3981x2] ;t20 + pmulhrsw m12, m3, [pw_m2404x2] ;t23 + pmulhrsw m3, [pw_16207x2] ;t24 + + ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23, + ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31 + + VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30 + VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29 + ; from 1 stage 
forward + SUMSUB_BA w, 8, 4, 1 + ; temporary storage + mova [rsp+17*%%str], m8 ; t16 + mova [rsp+21*%%str], m4 ; t19 + VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 + VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 +%else mova m10, [%1+ 1*64] mova m13, [%1+ 3*64] mova m14, [%1+ 5*64] @@ -793,6 +872,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29 VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 +%endif ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 @@ -1029,8 +1109,10 @@ INIT_XMM ssse3 cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob cmp eobd, 135 jg .idctfull - cmp eobd, 1 + cmp eobd, 34 jg .idct16x16 + cmp eobd, 1 + jg .idct8x8 ; dc-only case movd m0, [blockq] @@ -1050,6 +1132,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob RET DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2 +.idct8x8: + VP9_IDCT32_1D blockq, 1, 8 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_8x8: + mov dstq, dst_bakq + lea dst_endq, [dst_bakq+stride30q] + VP9_IDCT32_1D rsp, 2, 8 + add dst_bakq, 8 + add rsp, 16 + dec cntd + jg .loop2_8x8 + sub rsp, 64 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 8, m7 + RET + .idct16x16: mov cntd, 2 .loop1_16x16: |