diff options
author | Timothy Gu <timothygu99@gmail.com> | 2016-01-31 11:42:24 -0800 |
---|---|---|
committer | Timothy Gu <timothygu99@gmail.com> | 2016-02-01 17:01:11 -0800 |
commit | 838abfc1d711b42beaf401153b36ef80922b85b8 (patch) | |
tree | 14aa4ca3f18a1a65ea8d91ef4640175f10951a9f | |
parent | b62825a480517eed151bfb105323c1549b325d00 (diff) | |
download | ffmpeg-838abfc1d711b42beaf401153b36ef80922b85b8.tar.gz |
x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format
-rw-r--r-- | libavcodec/x86/vc1dsp.asm | 98 | ||||
-rw-r--r-- | libavcodec/x86/vc1dsp_init.c | 13 | ||||
-rw-r--r-- | libavcodec/x86/vc1dsp_mmx.c | 207 |
3 files changed, 111 insertions, 207 deletions
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm index 6415a836c0..91a1991851 100644 --- a/libavcodec/x86/vc1dsp.asm +++ b/libavcodec/x86/vc1dsp.asm @@ -395,3 +395,101 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride jnz .loop REP_RET %endif ; HAVE_MMX_INLINE + +%macro INV_TRANS_INIT 0 + movsxdifnidn linesizeq, linesized + movd m0, blockd + SPLATW m0, m0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + + DEFINE_ARGS dest, linesize, linesize3 + lea linesize3q, [linesizeq*3] +%endmacro + +%macro INV_TRANS_PROCESS 1 + mov%1 m2, [destq+linesizeq*0] + mov%1 m3, [destq+linesizeq*1] + mov%1 m4, [destq+linesizeq*2] + mov%1 m5, [destq+linesize3q] + paddusb m2, m0 + paddusb m3, m0 + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + psubusb m4, m1 + psubusb m5, m1 + mov%1 [linesizeq*0+destq], m2 + mov%1 [linesizeq*1+destq], m3 + mov%1 [linesizeq*2+destq], m4 + mov%1 [linesize3q +destq], m5 +%endmacro + +; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block) +INIT_MMX mmxext +cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block + movsx r3d, WORD [blockq] + mov blockd, r3d ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+4] ; 17 * dc + 4 + sar blockd, 3 ; >> 3 + mov r3d, blockd ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+64] ; 17 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS h + RET + +INIT_MMX mmxext +cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block + movsx r3d, WORD [blockq] + mov blockd, r3d ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+4] ; 17 * dc + 4 + sar blockd, 3 ; >> 3 + shl blockd, 2 ; 4 * dc + lea blockd, [blockq*3+64] ; 12 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS h + lea destq, [destq+linesizeq*4] + INV_TRANS_PROCESS h + RET + +INIT_MMX mmxext +cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block + movsx blockd, WORD [blockq] ; dc + lea blockd, [blockq*3+1] ; 3 * dc + 1 + sar blockd, 1 ; >> 1 + mov r3d, blockd ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+64] ; 17 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS a + RET + +INIT_MMX mmxext +cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block + movsx blockd, WORD [blockq] ; dc + lea blockd, [blockq*3+1] ; 3 * dc + 1 + sar blockd, 1 ; >> 1 + lea blockd, [blockq*3+16] ; 3 * dc + 16 + sar blockd, 5 ; >> 5 + + INV_TRANS_INIT + + INV_TRANS_PROCESS a + lea destq, [destq+linesizeq*4] + INV_TRANS_PROCESS a + RET diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 1747305d1f..c8943fa2f7 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -92,6 +92,14 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); +void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize, + int16_t *block); +void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize, + int16_t *block); +void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize, + int16_t *block); +void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, + int16_t *block); av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) @@ -130,6 +138,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext; dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext; + + dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext; + dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext; + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index c268cc6a8e..ff13d9b119 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -481,208 +481,6 @@ DECLARE_FUNCTION(3, 1) DECLARE_FUNCTION(3, 2) DECLARE_FUNCTION(3, 3) -static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = (17 * dc + 4) >> 3; - dc = (17 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = (17 * dc + 4) >> 3; - dc = (12 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); - dest += 4*linesize; - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = ( 3 * dc + 1) >> 1; - dc = (17 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = (3 * dc + 1) >> 1; - dc = (3 * dc + 16) >> 5; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); - dest += 4*linesize; - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - #define FN_ASSIGN(OP, X, Y, INSN) \ dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \ dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN @@ -729,10 +527,5 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) FN_ASSIGN(avg_, 3, 1, _mmxext); FN_ASSIGN(avg_, 3, 2, _mmxext); FN_ASSIGN(avg_, 3, 3, _mmxext); - - dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; - dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; - dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; - dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; } #endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */ |