diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-02-18 21:03:02 -0800 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2013-04-10 11:03:06 +0300 |
commit | 62844c3fd66940c7747e9b2bb7804e265319f43f (patch) | |
tree | b0e5a05644457aa5d7598d1fefa1a41f83550753 /libavcodec/x86/h264_idct.asm | |
parent | e8cafd2773bc56455c8816593cbd9368f2d69a80 (diff) | |
download | ffmpeg-62844c3fd66940c7747e9b2bb7804e265319f43f.tar.gz |
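h264: Integrate clear_blocks calls with IDCT

The x86 IDCT routines now write zeros back to the coefficient block they
have just read, folding the work of clear_blocks into the IDCT itself.

The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700
to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
(in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
tested (cathedral), i.e. almost 30 cycles per macroblock faster
(28 and 26 cycles, respectively).

Signed-off-by: Martin Storsjö <martin@martin.st>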
Diffstat (limited to 'libavcodec/x86/h264_idct.asm')
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 108 |
1 file changed, 83 insertions, 25 deletions
```diff
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index de0de24428..3ced8af40e 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -70,6 +70,10 @@ SECTION .text
  paddw m0, m6
  IDCT4_1D w, 0, 1, 2, 3, 4, 5
  pxor m7, m7
+ movq [%2+ 0], m7
+ movq [%2+ 8], m7
+ movq [%2+16], m7
+ movq [%2+24], m7
  STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
  lea %1, [%1+%3*2]
@@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0
 %endmacro
 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_MMX_END 3
+%macro IDCT8_ADD_MMX_END 3-4
  IDCT8_1D_FULL %2
  mova [%2 ], m5
  mova [%2+16], m6
  mova [%2+32], m7
  pxor m7, m7
+%if %0 == 4
+ movq [%4+ 0], m7
+ movq [%4+ 8], m7
+ movq [%4+ 16], m7
+ movq [%4+ 24], m7
+ movq [%4+ 32], m7
+ movq [%4+ 40], m7
+ movq [%4+ 48], m7
+ movq [%4+ 56], m7
+ movq [%4+ 64], m7
+ movq [%4+ 72], m7
+ movq [%4+ 80], m7
+ movq [%4+ 88], m7
+ movq [%4+ 96], m7
+ movq [%4+104], m7
+ movq [%4+112], m7
+ movq [%4+120], m7
+%endif
  STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
  lea %1, [%1+%3*2]
  STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
@@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0
  IDCT8_ADD_MMX_START r1 , rsp
  IDCT8_ADD_MMX_START r1+8, rsp+64
  lea r3, [r0+4]
- IDCT8_ADD_MMX_END r0 , rsp, r2
+ IDCT8_ADD_MMX_END r0 , rsp, r2, r1
  IDCT8_ADD_MMX_END r3 , rsp+8, r2
  ADD rsp, pad
@@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0
  SWAP 0, 8
  SWAP 1, 9
 %endif
+ mova [%2+ 0], m7
+ mova [%2+ 16], m7
+ mova [%2+ 32], m7
+ mova [%2+ 48], m7
+ mova [%2+ 64], m7
+ mova [%2+ 80], m7
+ mova [%2+ 96], m7
+ mova [%2+112], m7
  lea %1, [%1+%3*4]
  STORE_DIFF m4, m6, m7, [%1 ]
  STORE_DIFF m5, m6, m7, [%1+%3 ]
@@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10
  IDCT8_ADD_SSE r0, r1, r2, r3
  RET
-%macro DC_ADD_MMXEXT_INIT 2-3
-%if %0 == 2
- movsx %1, word [%1]
+%macro DC_ADD_MMXEXT_INIT 2
  add %1, 32
  sar %1, 6
  movd m0, %1d
  lea %1, [%2*3]
-%else
- add %3, 32
- sar %3, 6
- movd m0, %3d
- lea %3, [%2*3]
-%endif
  pshufw m0, m0, 0
  pxor m1, m1
  psubw m1, m0
@@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10
 INIT_MMX mmxext
 ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_dc_add_8, 3, 3, 0
- DC_ADD_MMXEXT_INIT r1, r2
- DC_ADD_MMXEXT_OP movh, r0, r2, r1
+%if ARCH_X86_64
+cglobal h264_idct_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov dword [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP movh, r0, r2, r3
  RET
 ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_dc_add_8, 3, 3, 0
- DC_ADD_MMXEXT_INIT r1, r2
- DC_ADD_MMXEXT_OP mova, r0, r2, r1
+cglobal h264_idct8_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov dword [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
  lea r0, [r0+r2*4]
- DC_ADD_MMXEXT_OP mova, r0, r2, r1
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
+ RET
+%else
+cglobal h264_idct_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov dword [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP movh, r0, r1, r2
  RET
+; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov dword [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ lea r0, [r0+r1*4]
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ RET
+%endif
+
 INIT_MMX mmx
 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
 ; int16_t *block, int stride, const uint8_t nnzc[6*8])
@@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride,
  add word [r2], 32
  IDCT8_ADD_MMX_START r2 , rsp
  IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
  mov r6d, dword [r1+r5*4]
  lea r6, [r0+r6+4]
  IDCT8_ADD_MMX_END r6 , rsp+8, r3
@@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
  movsx r6, word [r2]
  test r6, r6
  jz .no_dc
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
@@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
  movsx r6, word [r2]
  test r6, r6
  jz .skipblock
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
@@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
  movsx r6, word [r2]
  test r6, r6
  jz .no_dc
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
@@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
  add word [r2], 32
  IDCT8_ADD_MMX_START r2 , rsp
  IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
  mov r6d, dword [r1+r5*4]
  lea r6, [r0+r6+4]
  IDCT8_ADD_MMX_END r6 , rsp+8, r3
@@ -547,7 +597,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid
  test r6, r6
  jz .no_dc
 INIT_MMX cpuname
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
@@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane:
  movsx r6, word [r2]
  test r6, r6
  jz .skipblock
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
 %if ARCH_X86_64
  mov r0d, dword [r1+r5*4]
  add r0, [dst2q]
@@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
  movd m0, [r2 ] ; 0 0 X D
+ mov word [r2+ 0], 0
  punpcklwd m0, [r2+32] ; x X d D
+ mov word [r2+32], 0
  paddsw m0, [pw_32]
  psraw m0, 6
  punpcklwd m0, m0 ; d d D D
@@ -723,6 +777,10 @@ h264_add8x4_idct_sse2:
  paddw m0, [pw_32]
  IDCT4_1D w,0,1,2,3,4,5
  pxor m7, m7
+ mova [r2+ 0], m7
+ mova [r2+16], m7
+ mova [r2+32], m7
+ mova [r2+48], m7
  STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
  lea r0, [r0+r3*2]
  STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
```