author     Jason Garrett-Glaser <darkshikari@gmail.com>   2010-07-23 02:58:27 +0000
committer  Jason Garrett-Glaser <darkshikari@gmail.com>   2010-07-23 02:58:27 +0000
commit     8a467b2d44d20c1a0b731dce9edeff772732a558
tree       259b69473524a9db002d79b90009817f84af4113
parent     ef38842f0bc97ce5b158f51f3e65aae4164fc6a5
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations.
Add a 4-at-a-time idct_dc function (MMX and SSE2), since rows of 4 DC-only DCT
blocks are common.
TODO: tie this more directly into the MB mode, since the DC-level transform is
only used for non-splitmv blocks?
Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
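
The "shortcuts" above amount to testing a whole row of the non-zero-coefficient cache at once: the four per-block counts are read as one aligned 32-bit word, a zero word means the row can be skipped entirely, and a word with no byte above 1 means every block in the row is DC-only, so the new vp8_idct_dc_add4 path handles all four blocks in one call. A minimal C sketch of that dispatch follows; the helper name idct_row and the memcpy load are illustrative stand-ins for the decoder's idct_mb() and AV_RN32A(), not code from this patch.

    #include <stdint.h>
    #include <string.h>

    /* Function-pointer types mirroring the VP8DSPContext entries touched
     * by this patch (DCTELEM is int16_t here). */
    typedef void (*idct_add_fn)(uint8_t *dst, int16_t block[16], int stride);
    typedef void (*idct_dc_add4_fn)(uint8_t *dst, int16_t block[4][16], int stride);

    /* One row of four 4x4 luma blocks: skip empty rows, batch DC-only rows,
     * fall back to per-block dispatch for anything else. */
    static void idct_row(uint8_t *y_dst, int16_t block[4][16],
                         const uint8_t nnz_row[4], int stride,
                         idct_add_fn idct_add, idct_add_fn idct_dc_add,
                         idct_dc_add4_fn idct_dc_add4)
    {
        uint32_t nnz32;
        int x;

        memcpy(&nnz32, nnz_row, 4);       /* stand-in for AV_RN32A() */

        if (!nnz32)
            return;                       /* no coefficients in this row */

        if (!(nnz32 & ~0x01010101u)) {    /* every count is 0 or 1: DC-only row */
            idct_dc_add4(y_dst, block, stride);
            return;
        }

        for (x = 0; x < 4; x++) {         /* mixed row: per-block IDCTs */
            if (!nnz_row[x])
                continue;
            if (nnz_row[x] == 1)
                idct_dc_add(y_dst + 4 * x, block[x], stride);
            else
                idct_add(y_dst + 4 * x, block[x], stride);
        }
    }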
 libavcodec/vp8.c             |  64
 libavcodec/vp8dsp.c          |  26
 libavcodec/vp8dsp.h          |   1
 libavcodec/x86/vp8dsp-init.c |   5
 libavcodec/x86/vp8dsp.asm    | 181
 5 files changed, 190 insertions(+), 87 deletions(-)
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 18b69b8de1..92de1bc605 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
     }
 }
 
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
-                    VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
 {
-    int x, y, nnz;
+    int x, y, ch, nnz;
 
-    if (mb->mode != MODE_I4x4)
+    if (mb->mode != MODE_I4x4) {
+        uint8_t *y_dst = dst[0];
         for (y = 0; y < 4; y++) {
-            for (x = 0; x < 4; x++) {
-                nnz = s->non_zero_count_cache[y][x];
-                if (nnz) {
-                    if (nnz == 1)
-                        s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
-                    else
-                        s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+            uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+            if (nnz) {
+                if (nnz&~0x01010101) {
+                    for (x = 0; x < 4; x++) {
+                        nnz = s->non_zero_count_cache[y][x];
+                        if (nnz) {
+                            if (nnz == 1)
+                                s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+                            else
+                                s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+                        }
+                    }
+                } else {
+                    s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
                 }
             }
             y_dst += 4*s->linesize;
         }
+    }
 
-    for (y = 0; y < 2; y++) {
-        for (x = 0; x < 2; x++) {
-            nnz = s->non_zero_count_cache[4][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-            }
-
-            nnz = s->non_zero_count_cache[5][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+    for (ch = 0; ch < 2; ch++) {
+        if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+            uint8_t *ch_dst = dst[1+ch];
+            for (y = 0; y < 2; y++) {
+                for (x = 0; x < 2; x++) {
+                    nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+                    if (nnz) {
+                        if (nnz == 1)
+                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                        else
+                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                    }
+                }
+                ch_dst += 4*s->uvlinesize;
             }
         }
-        u_dst += 4*s->uvlinesize;
-        v_dst += 4*s->uvlinesize;
     }
 }
 
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
 
             if (!mb->skip) {
-                idct_mb(s, dst[0], dst[1], dst[2], mb);
+                idct_mb(s, dst, mb);
             } else {
                 AV_ZERO64(s->left_nnz);
                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 3e6463d598..64b09d52ee 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
     }
 }
 
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+    int i, j;
+    for (j = 0; j < 4; j++) {
+        uint8_t *pix = dst+j*4;
+        int dc = (block[j][0] + 4) >> 3;
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+        block[j][0] = 0;
+        if (!dc)
+            continue;
+        for (i = 0; i < 4; i++) {
+            pix[0] = cm[pix[0]];
+            pix[1] = cm[pix[1]];
+            pix[2] = cm[pix[2]];
+            pix[3] = cm[pix[3]];
+            pix += stride;
+        }
+    }
+}
+
 // because I like only having two parameters to pass functions...
 #define LOAD_PIXELS\
@@ -460,9 +479,10 @@ VP8_BILINEAR(4)
 
 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
 {
-    dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
-    dsp->vp8_idct_add = vp8_idct_add_c;
-    dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+    dsp->vp8_luma_dc_wht  = vp8_luma_dc_wht_c;
+    dsp->vp8_idct_add     = vp8_idct_add_c;
+    dsp->vp8_idct_dc_add  = vp8_idct_dc_add_c;
+    dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
 
     dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
     dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 64a3bfbc57..82ea684eab 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -33,6 +33,7 @@ typedef struct VP8DSPContext {
     void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
     void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
     void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
+    void (*vp8_idct_dc_add4)(uint8_t *dst, DCTELEM block[4][16], int stride);
 
     // loop filter applied to edges between macroblocks
     void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 6cf1704594..5da70824fc 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16)
 
 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
@@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;
         c->vp8_idct_add = ff_vp8_idct_add_mmx;
         c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
         c->put_vp8_epel_pixels_tab[0][0][0] =
@@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
     }
 
     if (mm_flags & FF_MM_SSE2) {
+        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2;
+
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 0cf4771abd..305bd71d08 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2
     REP_RET
 
 ;-----------------------------------------------------------------------------
-; IDCT functions:
-;
 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
 ;-----------------------------------------------------------------------------
 
+%macro ADD_DC 4
+    %4        m2, [r0+%3]
+    %4        m3, [r0+r2+%3]
+    %4        m4, [r1+%3]
+    %4        m5, [r1+r2+%3]
+    paddusb   m2, %1
+    paddusb   m3, %1
+    paddusb   m4, %1
+    paddusb   m5, %1
+    psubusb   m2, %2
+    psubusb   m3, %2
+    psubusb   m4, %2
+    psubusb   m5, %2
+    %4 [r0+%3],    m2
+    %4 [r0+r2+%3], m3
+    %4 [r1+%3],    m4
+    %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
 cglobal vp8_idct_dc_add_mmx, 3, 3
     ; load data
-    movd       mm0, [r1]
+    movd       m0, [r1]
 
     ; calculate DC
-    paddw      mm0, [pw_4]
-    pxor       mm1, mm1
-    psraw      mm0, 3
-    movd       [r1], mm1
-    psubw      mm1, mm0
-    packuswb   mm0, mm0
-    packuswb   mm1, mm1
-    punpcklbw  mm0, mm0
-    punpcklbw  mm1, mm1
-    punpcklwd  mm0, mm0
-    punpcklwd  mm1, mm1
+    paddw      m0, [pw_4]
+    pxor       m1, m1
+    psraw      m0, 3
+    movd       [r1], m1
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklwd  m0, m0
+    punpcklwd  m1, m1
 
     ; add DC
-    lea        r1, [r0+r2*2]
-    movd       mm2, [r0]
-    movd       mm3, [r0+r2]
-    movd       mm4, [r1]
-    movd       mm5, [r1+r2]
-    paddusb    mm2, mm0
-    paddusb    mm3, mm0
-    paddusb    mm4, mm0
-    paddusb    mm5, mm0
-    psubusb    mm2, mm1
-    psubusb    mm3, mm1
-    psubusb    mm4, mm1
-    psubusb    mm5, mm1
-    movd       [r0], mm2
-    movd       [r0+r2], mm3
-    movd       [r1], mm4
-    movd       [r1+r2], mm5
+    lea        r1, [r0+r2*2]
+    ADD_DC     m0, m1, 0, movh
     RET
 
+INIT_XMM
 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
     ; load data
-    movd       xmm0, [r1]
-    pxor       xmm1, xmm1
+    movd       m0, [r1]
+    pxor       m1, m1
+
+    ; calculate DC
+    paddw      m0, [pw_4]
+    movd       [r1], m1
+    lea        r1, [r0+r2*2]
+    movd       m2, [r0]
+    movd       m3, [r0+r2]
+    movd       m4, [r1]
+    movd       m5, [r1+r2]
+    psraw      m0, 3
+    pshuflw    m0, m0, 0
+    punpcklqdq m0, m0
+    punpckldq  m2, m3
+    punpckldq  m4, m5
+    punpcklbw  m2, m1
+    punpcklbw  m4, m1
+    paddw      m2, m0
+    paddw      m4, m0
+    packuswb   m2, m4
+    movd       [r0], m2
+    pextrd     [r0+r2], m2, 1
+    pextrd     [r1], m2, 2
+    pextrd     [r1+r2], m2, 3
+    RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4_mmx, 3, 3
+    ; load data
+    movd       m0, [r1+32*0] ; A
+    movd       m1, [r1+32*2] ; C
+    punpcklwd  m0, [r1+32*1] ; A B
+    punpcklwd  m1, [r1+32*3] ; C D
+    punpckldq  m0, m1        ; A B C D
+    pxor       m6, m6
 
     ; calculate DC
-    paddw      xmm0, [pw_4]
-    movd       [r1], xmm1
-    lea        r1, [r0+r2*2]
-    movd       xmm2, [r0]
-    movd       xmm3, [r0+r2]
-    movd       xmm4, [r1]
-    movd       xmm5, [r1+r2]
-    psraw      xmm0, 3
-    pshuflw    xmm0, xmm0, 0
-    punpcklqdq xmm0, xmm0
-    punpckldq  xmm2, xmm3
-    punpckldq  xmm4, xmm5
-    punpcklbw  xmm2, xmm1
-    punpcklbw  xmm4, xmm1
-    paddw      xmm2, xmm0
-    paddw      xmm4, xmm0
-    packuswb   xmm2, xmm4
-    movd       [r0], xmm2
-    pextrd     [r0+r2], xmm2, 1
-    pextrd     [r1], xmm2, 2
-    pextrd     [r1+r2], xmm2, 3
+    paddw      m0, [pw_4]
+    movd       [r1+32*0], m6
+    movd       [r1+32*1], m6
+    movd       [r1+32*2], m6
+    movd       [r1+32*3], m6
+    psraw      m0, 3
+    psubw      m6, m0
+    packuswb   m0, m0
+    packuswb   m6, m6
+    punpcklbw  m0, m0 ; AABBCCDD
+    punpcklbw  m6, m6 ; AABBCCDD
+    movq       m1, m0
+    movq       m7, m6
+    punpcklbw  m0, m0 ; AAAABBBB
+    punpckhbw  m1, m1 ; CCCCDDDD
+    punpcklbw  m6, m6 ; AAAABBBB
+    punpckhbw  m7, m7 ; CCCCDDDD
+
+    ; add DC
+    lea        r1, [r0+r2*2]
+    ADD_DC     m0, m6, 0, mova
+    ADD_DC     m1, m7, 8, mova
+    RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4_sse2, 3, 3
+    ; load data
+    movd       m0, [r1+32*0] ; A
+    movd       m1, [r1+32*2] ; C
+    punpcklwd  m0, [r1+32*1] ; A B
+    punpcklwd  m1, [r1+32*3] ; C D
+    punpckldq  m0, m1        ; A B C D
+    pxor       m1, m1
+
+    ; calculate DC
+    paddw      m0, [pw_4]
+    movd       [r1+32*0], m1
+    movd       [r1+32*1], m1
+    movd       [r1+32*2], m1
+    movd       [r1+32*3], m1
+    psraw      m0, 3
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+
+    ; add DC
+    lea        r1, [r0+r2*2]
+    ADD_DC     m0, m1, 0, mova
     RET
 
 ;-----------------------------------------------------------------------------
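
One non-obvious detail in the new ADD_DC macro: paddusb/psubusb saturate as unsigned bytes, so the broadcast DC is split into a clamped positive operand (%1, built from the DC itself) and a clamped negated operand (%2, built from 0 minus the DC); each pixel then takes a saturating add followed by a saturating subtract, and for any given byte lane at most one of the two operands is non-zero. A rough per-byte C model, with illustrative helper names that are not part of the patch:

    #include <stdint.h>

    /* Unsigned byte saturation, as performed per lane by paddusb/psubusb. */
    static uint8_t sat_add_u8(unsigned a, unsigned b) { unsigned v = a + b; return v > 255 ? 255 : (uint8_t)v; }
    static uint8_t sat_sub_u8(unsigned a, unsigned b) { return a > b ? (uint8_t)(a - b) : 0; }

    /* Per-byte model of ADD_DC: 'pos' corresponds to the packuswb-clamped DC
     * operand (%1) and 'neg' to the clamped negated-DC operand (%2). */
    static uint8_t add_dc_byte(uint8_t pix, int dc)
    {
        unsigned pos = dc > 0 ? (dc >  255 ? 255 : (unsigned)dc)  : 0;
        unsigned neg = dc < 0 ? (dc < -255 ? 255 : (unsigned)-dc) : 0;
        return sat_sub_u8(sat_add_u8(pix, pos), neg);
    }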