diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-02-18 21:03:02 -0800 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2013-04-10 11:03:06 +0300 |
commit | 62844c3fd66940c7747e9b2bb7804e265319f43f (patch) | |
tree | b0e5a05644457aa5d7598d1fefa1a41f83550753 /libavcodec/x86/h264_idct_10bit.asm | |
parent | e8cafd2773bc56455c8816593cbd9368f2d69a80 (diff) | |
download | ffmpeg-62844c3fd66940c7747e9b2bb7804e265319f43f.tar.gz |
h264: Integrate clear_blocks calls with IDCT
The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700
to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
(in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
tested (cathedral), i.e. almost 30 cycles per mb faster.
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/x86/h264_idct_10bit.asm')
-rw-r--r-- | libavcodec/x86/h264_idct_10bit.asm | 53 |
1 files changed, 48 insertions, 5 deletions
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 51965f0f9f..4e51d2b5d0 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -66,6 +66,10 @@ SECTION .text paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [%2+ 0], m5 + mova [%2+16], m5 + mova [%2+32], m5 + mova [%2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, %1, %3 lea %1, [%1+%3*2] STORE_DIFFx2 m2, m3, m4, m5, %1, %3 @@ -98,6 +102,10 @@ add4x4_idct %+ SUFFIX: paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [r2+ 0], m5 + mova [r2+16], m5 + mova [r2+32], m5 + mova [r2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, r5, r3 lea r5, [r5+r3*2] STORE_DIFFx2 m2, m3, m4, m5, r5, r3 @@ -181,6 +189,7 @@ IDCT_ADD16_10 INIT_MMX mmxext cglobal h264_idct_dc_add_10,3,3 movd m0, [r1] + mov dword [r1], 0 paddd m0, [pd_32] psrad m0, 6 lea r1, [r2*3] @@ -193,11 +202,11 @@ cglobal h264_idct_dc_add_10,3,3 ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) ;----------------------------------------------------------------------------- %macro IDCT8_DC_ADD 0 -cglobal h264_idct8_dc_add_10,3,3,7 - mov r1d, [r1] - add r1, 32 - sar r1, 6 - movd m0, r1d +cglobal h264_idct8_dc_add_10,3,4,7 + movd m0, [r1] + mov dword[r1], 0 + paddd m0, [pd_32] + psrad m0, 6 lea r1, [r2*3] SPLATW m0, m0, 0 mova m6, [pw_pixel_max] @@ -247,6 +256,8 @@ idct_dc_add %+ SUFFIX: add r5, r0 movq m0, [r2+ 0] movhps m0, [r2+64] + mov dword [r2+ 0], 0 + mov dword [r2+64], 0 paddd m0, [pd_32] psrad m0, 6 pshufhw m0, m0, 0 @@ -461,6 +472,22 @@ h264_idct8_add1_10 %+ SUFFIX: packssdw m8, m0 paddsw m8, [r0] pxor m0, m0 + mova [r1+ 0], m0 + mova [r1+ 16], m0 + mova [r1+ 32], m0 + mova [r1+ 48], m0 + mova [r1+ 64], m0 + mova [r1+ 80], m0 + mova [r1+ 96], m0 + mova [r1+112], m0 + mova [r1+128], m0 + mova [r1+144], m0 + mova [r1+160], m0 + mova [r1+176], m0 + mova [r1+192], m0 + mova [r1+208], m0 + mova [r1+224], m0 + mova [r1+240], m0 CLIPW m8, m0, [pw_pixel_max] mova [r0], m8 mova m8, [pw_pixel_max] @@ -480,6 +507,22 @@ h264_idct8_add1_10 %+ SUFFIX: lea r3, [r0+8] IDCT8_ADD_SSE_END r0, rsp, r2 IDCT8_ADD_SSE_END r3, rsp+16, r2 + mova [r1+ 0], m7 + mova [r1+ 16], m7 + mova [r1+ 32], m7 + mova [r1+ 48], m7 + mova [r1+ 64], m7 + mova [r1+ 80], m7 + mova [r1+ 96], m7 + mova [r1+112], m7 + mova [r1+128], m7 + mova [r1+144], m7 + mova [r1+160], m7 + mova [r1+176], m7 + mova [r1+192], m7 + mova [r1+208], m7 + mova [r1+224], m7 + mova [r1+240], m7 %endif ; ARCH_X86_64 add rsp, pad |