VP8: move zeroing of luma DC block into the WHT

Lets us do the zeroing in asm instead of C. Also makes it consistent with the way the regular iDCT code does it. Originally committed as revision 24668 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-08-02 20:18:09 +0000
committer: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-08-02 20:18:09 +0000
commit: 827d43bb9d439254eff5fa44b9809d263fbdc3cf (patch)
tree: 09174c0b8f38461476b8e02bcc16cbd95ee43107 /libavcodec/x86
parent: 42907c6ab5e41ca29db3454851e15bb0bce3601d (diff)
download: ffmpeg-827d43bb9d439254eff5fa44b9809d263fbdc3cf.tar.gz
2 files changed, 20 insertions, 2 deletions
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 4bf49364e7..aceec6a346 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str
 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
 
@@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 
     if (mm_flags & FF_MM_SSE) {
         c->vp8_idct_add                         = ff_vp8_idct_add_sse;
+        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
         c->put_vp8_epel_pixels_tab[0][0][0]     =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4f430d80c8..6999e87b63 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse
     SWAP %1, %4, %3
 %endmacro
 
-INIT_MMX
-cglobal vp8_luma_dc_wht_mmx, 2,3
+%macro VP8_DC_WHT 1
+cglobal vp8_luma_dc_wht_%1, 2,3
     movq          m0, [r1]
     movq          m1, [r1+8]
     movq          m2, [r1+16]
     movq          m3, [r1+24]
+%ifidn %1, sse
+    xorps      xmm0, xmm0
+    movaps  [r1+ 0], xmm0
+    movaps  [r1+16], xmm0
+%else
+    pxor         m4, m4
+    movq    [r1+ 0], m4
+    movq    [r1+ 8], m4
+    movq    [r1+16], m4
+    movq    [r1+24], m4
+%endif
     HADAMARD4_1D  0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4
     paddw         m0, [pw_3]
@@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
     SCATTER_WHT   0, 1, 0
     SCATTER_WHT   2, 3, 2
     RET
+%endmacro
+
+INIT_MMX
+VP8_DC_WHT mmx
+VP8_DC_WHT sse
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
author	Jason Garrett-Glaser <darkshikari@gmail.com>	2010-08-02 20:18:09 +0000
committer	Jason Garrett-Glaser <darkshikari@gmail.com>	2010-08-02 20:18:09 +0000
commit	827d43bb9d439254eff5fa44b9809d263fbdc3cf (patch)
tree	09174c0b8f38461476b8e02bcc16cbd95ee43107 /libavcodec/x86
parent	42907c6ab5e41ca29db3454851e15bb0bce3601d (diff)
download	ffmpeg-827d43bb9d439254eff5fa44b9809d263fbdc3cf.tar.gz