diff options
author | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
---|---|---|
committer | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
commit | eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch) | |
tree | 23225d7976eefaf0292342e6ee8b4ac946efcb8e /libavcodec/arm | |
parent | f32f7d8b24d1228df447be85046b9346292d936e (diff) | |
download | ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz |
vp3: DC-only IDCT
2-4% faster overall decode
Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- | libavcodec/arm/dsputil_init_neon.c | 2 | ||||
-rw-r--r-- | libavcodec/arm/vp3dsp_neon.S | 44 |
2 files changed, 46 insertions, 0 deletions
```diff
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 1f2169ead5..0e44160392 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_neon(DCTELEM *data);
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
 
 void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
     }
 
     c->vector_fmul = ff_vector_fmul_neon;
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 6deae4725e..ade19984c2 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
         vst1.64         {d7}, [r2,:64], r1
         bx              lr
 endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1
+        ldrsh           r2, [r2]
+        movw            r3, #46341
+        mul             r2, r3, r2
+        smulwt          r2, r3, r2
+        mov             r3, r0
+        vdup.16         q15, r2
+        vrshr.s16       q15, q15, #4
+
+        vld1.8          {d0}, [r0,:64], r1
+        vld1.8          {d1}, [r0,:64], r1
+        vld1.8          {d2}, [r0,:64], r1
+        vaddw.u8        q8, q15, d0
+        vld1.8          {d3}, [r0,:64], r1
+        vaddw.u8        q9, q15, d1
+        vld1.8          {d4}, [r0,:64], r1
+        vaddw.u8        q10, q15, d2
+        vld1.8          {d5}, [r0,:64], r1
+        vaddw.u8        q11, q15, d3
+        vld1.8          {d6}, [r0,:64], r1
+        vaddw.u8        q12, q15, d4
+        vld1.8          {d7}, [r0,:64], r1
+        vaddw.u8        q13, q15, d5
+        vqmovun.s16     d0, q8
+        vaddw.u8        q14, q15, d6
+        vqmovun.s16     d1, q9
+        vaddw.u8        q15, q15, d7
+        vqmovun.s16     d2, q10
+        vst1.8          {d0}, [r3,:64], r1
+        vqmovun.s16     d3, q11
+        vst1.8          {d1}, [r3,:64], r1
+        vqmovun.s16     d4, q12
+        vst1.8          {d2}, [r3,:64], r1
+        vqmovun.s16     d5, q13
+        vst1.8          {d3}, [r3,:64], r1
+        vqmovun.s16     d6, q14
+        vst1.8          {d4}, [r3,:64], r1
+        vqmovun.s16     d7, q15
+        vst1.8          {d5}, [r3,:64], r1
+        vst1.8          {d6}, [r3,:64], r1
+        vst1.8          {d7}, [r3,:64], r1
+        bx              lr
+endfunc
```