aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/vp8dsp-init.c
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2010-07-31 23:13:15 +0000
committerRonald S. Bultje <rsbultje@gmail.com>2010-07-31 23:13:15 +0000
commit6341838f3ca69c7850aa11b067165ef544cead95 (patch)
tree7914c26ff9b26b9f544024e56b5032254d27f2b9 /libavcodec/x86/vp8dsp-init.c
parentace7f813cd4b2bc092bd827f7e8257368781e9bb (diff)
downloadffmpeg-6341838f3ca69c7850aa11b067165ef544cead95.tar.gz
Use word-writing instead of dword-writing (with two cached but otherwise
unchanged bytes) in the horizontal simple loopfilter. This makes the filter quite a bit faster in itself (~30 cycles less on Core1), probably mostly because we don't need a complex 4x4 transpose, but only a simple byte interleave. Also allows using pextrw on SSE4, which speeds up even more (e.g. 25% faster on Core i7). Originally committed as revision 24638 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/vp8dsp-init.c')
-rw-r--r--libavcodec/x86/vp8dsp-init.c4
1 files changed, 3 insertions, 1 deletions
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 4f40bdd11f..4bf49364e7 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -346,7 +346,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
@@ -358,6 +357,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
if (mm_flags & FF_MM_SSE2) {
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -390,6 +391,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
if (mm_flags & FF_MM_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}