aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
committerJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
commit8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree259b69473524a9db002d79b90009817f84af4113 /libavcodec/x86
parentef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
downloadffmpeg-8a467b2d44d20c1a0b731dce9edeff772732a558.tar.gz
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/vp8dsp-init.c5
-rw-r--r--libavcodec/x86/vp8dsp.asm181
2 files changed, 132 insertions, 54 deletions
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 6cf1704594..5da70824fc 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
@@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
#if HAVE_YASM
if (mm_flags & FF_MM_MMX) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
@@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
}
if (mm_flags & FF_MM_SSE2) {
+ c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2;
+
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 0cf4771abd..305bd71d08 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2
REP_RET
;-----------------------------------------------------------------------------
-; IDCT functions:
-;
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
+%macro ADD_DC 4
+ %4 m2, [r0+%3]
+ %4 m3, [r0+r2+%3]
+ %4 m4, [r1+%3]
+ %4 m5, [r1+r2+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [r0+%3], m2
+ %4 [r0+r2+%3], m3
+ %4 [r1+%3], m4
+ %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
; load data
- movd mm0, [r1]
+ movd m0, [r1]
; calculate DC
- paddw mm0, [pw_4]
- pxor mm1, mm1
- psraw mm0, 3
- movd [r1], mm1
- psubw mm1, mm0
- packuswb mm0, mm0
- packuswb mm1, mm1
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- punpcklwd mm0, mm0
- punpcklwd mm1, mm1
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [r1], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
; add DC
- lea r1, [r0+r2*2]
- movd mm2, [r0]
- movd mm3, [r0+r2]
- movd mm4, [r1]
- movd mm5, [r1+r2]
- paddusb mm2, mm0
- paddusb mm3, mm0
- paddusb mm4, mm0
- paddusb mm5, mm0
- psubusb mm2, mm1
- psubusb mm3, mm1
- psubusb mm4, mm1
- psubusb mm5, mm1
- movd [r0], mm2
- movd [r0+r2], mm3
- movd [r1], mm4
- movd [r1+r2], mm5
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, movh
RET
+INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
; load data
- movd xmm0, [r1]
- pxor xmm1, xmm1
+ movd m0, [r1]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1], m1
+ lea r1, [r0+r2*2]
+ movd m2, [r0]
+ movd m3, [r0+r2]
+ movd m4, [r1]
+ movd m5, [r1+r2]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r2], m2, 1
+ pextrd [r1], m2, 2
+ pextrd [r1+r2], m2, 3
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
; calculate DC
- paddw xmm0, [pw_4]
- movd [r1], xmm1
- lea r1, [r0+r2*2]
- movd xmm2, [r0]
- movd xmm3, [r0+r2]
- movd xmm4, [r1]
- movd xmm5, [r1+r2]
- psraw xmm0, 3
- pshuflw xmm0, xmm0, 0
- punpcklqdq xmm0, xmm0
- punpckldq xmm2, xmm3
- punpckldq xmm4, xmm5
- punpcklbw xmm2, xmm1
- punpcklbw xmm4, xmm1
- paddw xmm2, xmm0
- paddw xmm4, xmm0
- packuswb xmm2, xmm4
- movd [r0], xmm2
- pextrd [r0+r2], xmm2, 1
- pextrd [r1], xmm2, 2
- pextrd [r1+r2], xmm2, 3
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4_sse2, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m1
+ movd [r1+32*1], m1
+ movd [r1+32*2], m1
+ movd [r1+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, mova
RET
;-----------------------------------------------------------------------------