diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2010-09-29 13:35:24 +0000 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2010-09-29 13:35:24 +0000 |
commit | 2c3135f6d3faf764f5df364db00da1b2d4dcb097 (patch) | |
tree | 6eb7d5ef843e450efca99a00f9fd42815b571a76 /libavcodec | |
parent | 4b81511cab1d53f7f189bcb09aac4303b20a4ce8 (diff) | |
download | ffmpeg-2c3135f6d3faf764f5df364db00da1b2d4dcb097.tar.gz |
Unroll the inner bidir loop in h264_loop_filter_strength_mmx2(), which gets rid
of the d_idx variable and therefore allows for future optimizations. This commit
by itself makes no speed difference.
Originally committed as revision 25253 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c | 24 |
1 file changed, 19 insertions, 5 deletions
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 4df3f12154..ed0dbc6b0d 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -86,7 +86,7 @@ static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS "pshufw $0x4E, %%mm2, %%mm3 \n" "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } - "1: \n" + "por %%mm1, %%mm0 \n" "movq (%2,%0,4), %%mm1 \n" "movq 8(%2,%0,4), %%mm2 \n" @@ -103,10 +103,24 @@ static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit "psubusb %%mm5, %%mm3 \n" "packsswb %%mm3, %%mm1 \n" - "add $40, %0 \n" - "cmp $40, %0 \n" - "jl 1b \n" - "sub $80, %0 \n" + + "por %%mm1, %%mm0 \n" + "movq 160(%2,%0,4), %%mm1 \n" + "movq 168(%2,%0,4), %%mm2 \n" + "movq %%mm1, %%mm3 \n" + "movq %%mm2, %%mm4 \n" + "psubw (%2), %%mm1 \n" + "psubw 8(%2), %%mm2 \n" + "psubw 160(%2), %%mm3 \n" + "psubw 168(%2), %%mm4 \n" + "packsswb %%mm2, %%mm1 \n" + "packsswb %%mm4, %%mm3 \n" + "paddb %%mm6, %%mm1 \n" + "paddb %%mm6, %%mm3 \n" + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit + "psubusb %%mm5, %%mm3 \n" + "packsswb %%mm3, %%mm1 \n" + "pshufw $0x4E, %%mm1, %%mm1 \n" "por %%mm1, %%mm0 \n" "pshufw $0x4E, %%mm0, %%mm1 \n" |