diff options
author | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
---|---|---|
committer | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
commit | eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch) | |
tree | 23225d7976eefaf0292342e6ee8b4ac946efcb8e | |
parent | f32f7d8b24d1228df447be85046b9346292d936e (diff) | |
download | ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz |
vp3: DC-only IDCT
2-4% faster overall decode
Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- | libavcodec/arm/dsputil_init_neon.c | 2 | ||||
-rw-r--r-- | libavcodec/arm/vp3dsp_neon.S | 44 | ||||
-rw-r--r-- | libavcodec/dsputil.c | 1 | ||||
-rw-r--r-- | libavcodec/dsputil.h | 2 | ||||
-rw-r--r-- | libavcodec/vp3.c | 8 | ||||
-rw-r--r-- | libavcodec/vp3dsp.c | 19 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 3 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp_mmx.c | 41 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp_mmx.h | 1 |
9 files changed, 118 insertions, 3 deletions
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 1f2169ead5..0e44160392 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_neon(DCTELEM *data);
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
 
 void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
     }
 
     c->vector_fmul = ff_vector_fmul_neon;
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 6deae4725e..ade19984c2 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
         vst1.64 {d7}, [r2,:64], r1
         bx lr
 endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1
+        ldrsh r2, [r2]
+        movw r3, #46341
+        mul r2, r3, r2
+        smulwt r2, r3, r2
+        mov r3, r0
+        vdup.16 q15, r2
+        vrshr.s16 q15, q15, #4
+
+        vld1.8 {d0}, [r0,:64], r1
+        vld1.8 {d1}, [r0,:64], r1
+        vld1.8 {d2}, [r0,:64], r1
+        vaddw.u8 q8, q15, d0
+        vld1.8 {d3}, [r0,:64], r1
+        vaddw.u8 q9, q15, d1
+        vld1.8 {d4}, [r0,:64], r1
+        vaddw.u8 q10, q15, d2
+        vld1.8 {d5}, [r0,:64], r1
+        vaddw.u8 q11, q15, d3
+        vld1.8 {d6}, [r0,:64], r1
+        vaddw.u8 q12, q15, d4
+        vld1.8 {d7}, [r0,:64], r1
+        vaddw.u8 q13, q15, d5
+        vqmovun.s16 d0, q8
+        vaddw.u8 q14, q15, d6
+        vqmovun.s16 d1, q9
+        vaddw.u8 q15, q15, d7
+        vqmovun.s16 d2, q10
+        vst1.8 {d0}, [r3,:64], r1
+        vqmovun.s16 d3, q11
+        vst1.8 {d1}, [r3,:64], r1
+        vqmovun.s16 d4, q12
+        vst1.8 {d2}, [r3,:64], r1
+        vqmovun.s16 d5, q13
+        vst1.8 {d3}, [r3,:64], r1
+        vqmovun.s16 d6, q14
+        vst1.8 {d4}, [r3,:64], r1
+        vqmovun.s16 d7, q15
+        vst1.8 {d5}, [r3,:64], r1
+        vst1.8 {d6}, [r3,:64], r1
+        vst1.8 {d7}, [r3,:64], r1
+        bx lr
+endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index bbfdb6ae8d..bbff06df78 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4467,6 +4467,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
+        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
     }
     if (CONFIG_VP6_DECODER) {
         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d1816e66ba..2c361b9f76 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -86,6 +86,7 @@ extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
 void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
 
 void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
@@ -373,6 +374,7 @@ typedef struct DSPContext {
     void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
 
+    void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
     void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
     void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
 
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index e46e6a437a..2e72fba0fc 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1395,8 +1395,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
 
                 /* transform if this block was coded */
                 if (s->all_fragments[i].coding_method != MODE_COPY) {
-                    int intra = s->all_fragments[i].coding_method == MODE_INTRA;
-
                     if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
                         (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
                         motion_source= golden_plane;
@@ -1456,11 +1454,11 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                     }
 
                     s->dsp.clear_block(block);
-                    vp3_dequant(s, s->all_fragments + i, plane, !intra, block);
 
                     /* invert DCT and place (or add) in final output */
 
                     if (s->all_fragments[i].coding_method == MODE_INTRA) {
+                        vp3_dequant(s, s->all_fragments + i, plane, 0, block);
                         if(s->avctx->idct_algo!=FF_IDCT_VP3)
                             block[0] += 128<<3;
                         s->dsp.idct_put(
@@ -1468,10 +1466,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                             stride,
                             block);
                     } else {
+                        if (vp3_dequant(s, s->all_fragments + i, plane, 1, block)) {
                         s->dsp.idct_add(
                             output_plane + first_pixel,
                             stride,
                             block);
+                        } else {
+                            s->dsp.vp3_idct_dc_add(output_plane + first_pixel, stride, block);
+                        }
                     }
                 } else {
 
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 87b64de385..049758e671 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -223,6 +223,25 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*
     idct(dest, line_size, block, 2);
 }
 
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+    int i, dc = block[0];
+    dc = (46341*dc)>>16;
+    dc = (46341*dc + (8<<16))>>20;
+
+    for(i = 0; i < 8; i++){
+        dest[0] = cm[dest[0]+dc];
+        dest[1] = cm[dest[1]+dc];
+        dest[2] = cm[dest[2]+dc];
+        dest[3] = cm[dest[3]+dc];
+        dest[4] = cm[dest[4]+dc];
+        dest[5] = cm[dest[5]+dc];
+        dest[6] = cm[dest[6]+dc];
+        dest[7] = cm[dest[7]+dc];
+        dest += line_size;
+    }
+}
+
 void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values)
 {
     unsigned char *end;
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 31387ebb30..cc2f881303 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
             }
         }
+        if (CONFIG_VP3_DECODER) {
+            c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+        }
 
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
         c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
index fead8e8cef..309dd4aa5d 100644
--- a/libavcodec/x86/vp3dsp_mmx.c
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{
+    int dc = block[0];
+    dc = (46341*dc)>>16;
+    dc = (46341*dc + (8<<16))>>20;
+
+    __asm__ volatile(
+        "movd %3, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor %%mm1, %%mm1 \n\t"
+        "psubw %%mm0, %%mm1 \n\t"
+        "packuswb %%mm0, %%mm0 \n\t"
+        "packuswb %%mm1, %%mm1 \n\t"
+
+#define DC_ADD \
+        "movq (%0), %%mm2 \n\t" \
+        "movq (%0,%1), %%mm3 \n\t" \
+        "paddusb %%mm0, %%mm2 \n\t" \
+        "movq (%0,%1,2), %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm3 \n\t" \
+        "movq (%0,%2), %%mm5 \n\t" \
+        "paddusb %%mm0, %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm5 \n\t" \
+        "psubusb %%mm1, %%mm2 \n\t" \
+        "psubusb %%mm1, %%mm3 \n\t" \
+        "movq %%mm2, (%0) \n\t" \
+        "psubusb %%mm1, %%mm4 \n\t" \
+        "movq %%mm3, (%0,%1) \n\t" \
+        "psubusb %%mm1, %%mm5 \n\t" \
+        "movq %%mm4, (%0,%1,2) \n\t" \
+        "movq %%mm5, (%0,%2) \n\t"
+
+        DC_ADD
+        "lea (%0,%1,4), %0 \n\t"
+        DC_ADD
+
+        : "+r"(dest)
+        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
+    );
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
index e565a33023..e0ebf0b0f4 100644
--- a/libavcodec/x86/vp3dsp_mmx.h
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -28,6 +28,7 @@
 void ff_vp3_idct_mmx(int16_t *data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
 
 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);