about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2010-09-24 14:05:45 +0000
committer: Ronald S. Bultje <rsbultje@gmail.com> 2010-09-24 14:05:45 +0000
commit: 4bca677494c7d39753cfbc356e1b8c3988024d16 (patch)
tree: c03e2276886ca2bfa89f2f47b564ca5f8c3a16ca
parent: d801f1c8482151cd9f504469965793bd00852556 (diff)
download: ffmpeg-4bca677494c7d39753cfbc356e1b8c3988024d16.tar.gz
Unroll loop in h264_idct_add8_sse2().
This means we can also inline scan8[] directly in the code and remove the loop setup. 20% faster in function, 0.8% overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list.
Originally committed as revision 25171 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/x86/h264_idct.asm 49
1 files changed, 20 insertions, 29 deletions
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 3311ab5595..7af6952141 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -804,62 +804,53 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
jl .next2blocks
REP_RET
-h264_idct_add8_sse2_plane:
-.next2blocks
- movzx r0, byte [scan8+r5]
- movzx r0, word [r4+r0]
+%macro add8_sse2_cycle 2
+ movzx r0, word [r4+%2]
test r0, r0
- jz .try_dc
+ jz .try%1dc
%ifdef ARCH_X86_64
- mov r0d, dword [r1+r5*4]
+ mov r0d, dword [r1+%1*8+64]
add r0, [r10]
%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, r0m
mov r0, [r0]
- add r0, dword [r1+r5*4]
+ add r0, dword [r1+%1*8+64]
%endif
call x264_add8x4_idct_sse2
- add r5, 2
- add r2, 64
- test r5, 3
- jnz .next2blocks
- rep ret
-.try_dc
+ jmp .cycle%1end
+.try%1dc
movsx r0, word [r2 ]
or r0w, word [r2+32]
- jz .skip2blocks
+ jz .cycle%1end
%ifdef ARCH_X86_64
- mov r0d, dword [r1+r5*4]
+ mov r0d, dword [r1+%1*8+64]
add r0, [r10]
%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, r0m
mov r0, [r0]
- add r0, dword [r1+r5*4]
+ add r0, dword [r1+%1*8+64]
%endif
call h264_idct_dc_add8_mmx2
-.skip2blocks
- add r5, 2
+.cycle%1end
+%if %1 < 3
add r2, 64
- test r5, 3
- jnz .next2blocks
- rep ret
+%endif
+%endmacro
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
- mov r5, 16
add r2, 512
-%ifdef PIC
- lea r11, [scan8_mem]
-%endif
%ifdef ARCH_X86_64
mov r10, r0
%endif
- call h264_idct_add8_sse2_plane
+ add8_sse2_cycle 0, 0x09
+ add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
add r10, gprsize
%else
add r0mp, gprsize
%endif
- call h264_idct_add8_sse2_plane
+ add8_sse2_cycle 2, 0x21
+ add8_sse2_cycle 3, 0x29
RET