summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jason Garrett-Glaser <[email protected]> 2010-07-21 20:51:01 +0000
committer Jason Garrett-Glaser <[email protected]> 2010-07-21 20:51:01 +0000
commit b8b231b5dc104192c0b766798a04cdf6e748472f (patch)
tree 1631715ed493ab17b3f687251a0930644dac2be3
parent a4e6fa86ce7b3e4c7795fb725fa1de1b8dff15b4 (diff)
Make mmx VP8 WHT faster
Avoid pextrw, since it's slow on many older CPUs. As a result, the WHT function no longer requires mmxext either.

Originally committed as revision 24397 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/x86/vp8dsp-init.c 4
-rw-r--r-- libavcodec/x86/vp8dsp.asm 39
2 files changed, 24 insertions, 19 deletions
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 00f4bfbd15..a7b1ce0de7 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -220,7 +220,7 @@ HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
-extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
@@ -315,6 +315,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
if (mm_flags & FF_MM_MMX) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
c->put_vp8_epel_pixels_tab[1][0][0] =
@@ -337,7 +338,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & FF_MM_MMX2) {
- c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_MC_FUNC(2, 4, mmxext);
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2ff415266d..1d9d8c5b6c 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1034,15 +1034,25 @@ cglobal vp8_idct_add_mmx, 3, 3
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
-%macro SCATTER_WHT 1
- pextrw r1d, m0, %1
- pextrw r2d, m1, %1
- mov [r0+2*16*0], r1w
- mov [r0+2*16*1], r2w
- pextrw r1d, m2, %1
- pextrw r2d, m3, %1
- mov [r0+2*16*2], r1w
- mov [r0+2*16*3], r2w
+%macro SCATTER_WHT 3
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(0+%3)], r1w
+ mov [r0+2*16*(1+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ psrlq m%1, 32
+ psrlq m%2, 32
+ mov [r0+2*16*(4+%3)], r1w
+ mov [r0+2*16*(5+%3)], r2w
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(8+%3)], r1w
+ mov [r0+2*16*(9+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ mov [r0+2*16*(12+%3)], r1w
+ mov [r0+2*16*(13+%3)], r2w
%endmacro
%macro HADAMARD4_1D 4
@@ -1052,7 +1062,7 @@ cglobal vp8_idct_add_mmx, 3, 3
%endmacro
INIT_MMX
-cglobal vp8_luma_dc_wht_mmxext, 2,3
+cglobal vp8_luma_dc_wht_mmx, 2,3
movq m0, [r1]
movq m1, [r1+8]
movq m2, [r1+16]
@@ -1065,13 +1075,8 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
psraw m1, 3
psraw m2, 3
psraw m3, 3
- SCATTER_WHT 0
- add r0, 2*16*4
- SCATTER_WHT 1
- add r0, 2*16*4
- SCATTER_WHT 2
- add r0, 2*16*4
- SCATTER_WHT 3
+ SCATTER_WHT 0, 1, 0
+ SCATTER_WHT 2, 3, 2
RET
;-----------------------------------------------------------------------------