diff options
author | Janne Grunau <janne-libav@jannau.net> | 2012-01-12 17:21:48 +0100 |
---|---|---|
committer | Janne Grunau <janne-libav@jannau.net> | 2012-01-12 18:33:55 +0100 |
commit | e1e369049e3d2f88eed6ed38eb3dd704681c7f1a (patch) | |
tree | ecaa96dfa9a4220e504cc93d30494a3a45cb8b79 | |
parent | 136ee32da3c728fb4e3490393efb947cc7c4e898 (diff) | |
download | ffmpeg-e1e369049e3d2f88eed6ed38eb3dd704681c7f1a.tar.gz |
rv34: NEON optimised dc only inverse transform
30-50% faster than the C implementation, 0.5% overall speedup on
bourne.rmvb.
-rw-r--r-- | libavcodec/arm/rv34dsp_init_neon.c | 9 | ||||
-rw-r--r-- | libavcodec/arm/rv34dsp_neon.S | 29 |
2 files changed, 36 insertions, 2 deletions
diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 9a09fde7a9..16bda46658 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -26,8 +26,13 @@
 void ff_rv34_inv_transform_neon(DCTELEM *block);
 void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
+void ff_rv34_inv_transform_dc_neon(DCTELEM *block);
+void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
+
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
 {
-    c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
-    c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
+    c->rv34_inv_transform_tab[0]    = ff_rv34_inv_transform_neon;
+    c->rv34_inv_transform_tab[1]    = ff_rv34_inv_transform_noround_neon;
+    c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon;
+    c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon;
 }
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index f700f5c321..e776af0330 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -107,3 +107,32 @@ function ff_rv34_inv_transform_noround_neon, export=1
         vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
         bx              lr
 endfunc
+
+/* void rv34_inv_transform_dc_c(DCTELEM *block) */
+function ff_rv34_inv_transform_dc_neon, export=1
+        vld1.16         d28[], [r0:16]            @ block[0]
+        vmov.i16        d4,  #169
+        mov             r1,  #16
+        vmull.s16       q3,  d28, d4
+        vrshrn.s32      d0,  q3,  #10
+        vst1.16         {d0}, [r0:64], r1
+        vst1.16         {d0}, [r0:64], r1
+        vst1.16         {d0}, [r0:64], r1
+        vst1.16         {d0}, [r0:64], r1
+        bx              lr
+endfunc
+
+/* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
+function ff_rv34_inv_transform_noround_dc_neon, export=1
+        vld1.16         d28[], [r0:16]            @ block[0]
+        vmov.i16        d4,  #251
+        vorr.s16        d4,  #256                 @ 13^2 * 3
+        mov             r1,  #16
+        vmull.s16       q3,  d28, d4
+        vshrn.s32       d0,  q3,  #11
+        vst1.64         {d0}, [r0:64], r1
+        vst1.64         {d0}, [r0:64], r1
+        vst1.64         {d0}, [r0:64], r1
+        vst1.64         {d0}, [r0:64], r1
+        bx              lr
+endfunc