diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-20 15:30:22 -0500 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2014-01-24 19:25:25 -0500 |
commit | d43efa68bd53eecc9cea6e1081b6fb163e45665c (patch) | |
tree | ea15479d05ffbba87f9a24dc709c0ada7532f71f | |
parent | baf47020cd2309e57f73883b703b5e6c36c7f4df (diff) | |
download | ffmpeg-d43efa68bd53eecc9cea6e1081b6fb163e45665c.tar.gz |
vp9/x86: 4x4 iadst SIMD (ssse3) variants.
Cycle measurements for intra itxfm_4x4_add on ped1080p.webm:
idct_idct: 66 -> 67 cycles (noise measurement)
idct_iadst: 199 -> 79 cycles
iadst_idct: 165 -> 70 cycles
iadst_iadst: 183 -> 82 cycles
-rw-r--r-- | libavcodec/x86/vp9dsp_init.c | 5 | ||||
-rw-r--r-- | libavcodec/x86/vp9itxfm.asm | 69 |
2 files changed, 73 insertions, 1 deletions
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index a61b589b4e..612da8d2b7 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -166,7 +166,7 @@ itxfm_func(iadst, idct, size, opt); \
 itxfm_func(idct, iadst, size, opt); \
 itxfm_func(iadst, iadst, size, opt)
 
-itxfm_func(idct, idct, 4, ssse3);
+itxfm_funcs(4, ssse3);
 itxfm_funcs(8, ssse3);
 itxfm_funcs(8, avx);
 itxfm_funcs(16, ssse3);
@@ -250,6 +250,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_subpel3(0, put, ssse3);
         init_subpel3(1, avg, ssse3);
         dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
         if (ARCH_X86_64) {
             dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
             dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index f814a613e4..8fd838dc9b 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -58,6 +58,13 @@ VP9_IDCT_COEFFS 8423, 14053
 VP9_IDCT_COEFFS 13160, 9760
 VP9_IDCT_COEFFS 2404, 16207
 
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+
 pd_8192: times 4 dd 8192
 pw_2048: times 8 dw 2048
 pw_1024: times 8 dw 1024
@@ -239,6 +246,68 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
     VP9_IDCT4_WRITEOUT
     RET
 
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IADST4_1D 0
+    movq2dq xmm0, m0
+    movq2dq xmm1, m1
+    movq2dq xmm2, m2
+    movq2dq xmm3, m3
+    paddw m3, m0
+    punpcklwd xmm0, xmm1
+    punpcklwd xmm2, xmm3
+    pmaddwd xmm1, xmm0, [pw_5283_13377]
+    pmaddwd xmm4, xmm0, [pw_9929_13377]
+    pmaddwd xmm0, [pw_15212_m13377]
+    pmaddwd xmm3, xmm2, [pw_15212_9929]
+    pmaddwd xmm2, [pw_m5283_m15212]
+    psubw m3, m2
+    paddd xmm0, xmm2
+    paddd xmm3, [pd_8192]
+    paddd xmm2, [pd_8192]
+    paddd xmm1, xmm3
+    paddd xmm0, xmm3
+    paddd xmm4, xmm2
+    psrad xmm1, 14
+    psrad xmm0, 14
+    psrad xmm4, 14
+    pmulhrsw m3, [pw_13377x2] ; out2
+    packssdw xmm0, xmm0
+    packssdw xmm1, xmm1
+    packssdw xmm4, xmm4
+    movdq2q m0, xmm0 ; out3
+    movdq2q m1, xmm1 ; out0
+    movdq2q m2, xmm4 ; out1
+    SWAP 0, 1, 2, 3
+%endmacro
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
+    mova m0, [blockq+ 0]
+    mova m1, [blockq+ 8]
+    mova m2, [blockq+16]
+    mova m3, [blockq+24]
+    mova m6, [pw_11585x2]
+    mova m7, [pd_8192] ; rounding
+    VP9_%2_1D
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    VP9_%4_1D
+    pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+    mova [blockq+ 0], m4
+    mova [blockq+ 8], m4
+    mova [blockq+16], m4
+    mova [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
 %if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more)
 
 ;-------------------------------------------------------------------------------------------