diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-01-01 15:28:47 +0100 |
---|---|---|
committer | Kostya Shishkov <kostya.shishkov@gmail.com> | 2012-01-04 10:30:01 +0100 |
commit | 98f24ecd6cfc9c57a555aae6bfcd3d9a4ce9503d (patch) | |
tree | a4e562412fcd7dacf25afdbd3c2f8cfa5737c911 /libavcodec/arm | |
parent | 0749720b6cdce68e4908dc59f1b4e1399852372b (diff) | |
download | ffmpeg-98f24ecd6cfc9c57a555aae6bfcd3d9a4ce9503d.tar.gz |
rv34: joint coefficient decoding and dequantization
Perform dequantization while decoding coefficients instead of performing it
on the entire coefficients buffer.
Since quantized coefficients are very sparse, this usually causes a small
speedup. Speedup of around 1% on Panda board compared to the removed here
neon code. Global speedup is probably around 3%.
Signed-off-by: Kostya Shishkov <kostya.shishkov@gmail.com>
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- | libavcodec/arm/rv34dsp_init_neon.c | 3 | ||||
-rw-r--r-- | libavcodec/arm/rv34dsp_neon.S | 24 |
2 files changed, 0 insertions, 27 deletions
diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c index acf2a7dcd3..9a09fde7a9 100644 --- a/libavcodec/arm/rv34dsp_init_neon.c +++ b/libavcodec/arm/rv34dsp_init_neon.c @@ -25,12 +25,9 @@ void ff_rv34_inv_transform_neon(DCTELEM *block); void ff_rv34_inv_transform_noround_neon(DCTELEM *block); -void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q); void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; - - c->rv34_dequant4x4 = ff_rv34_dequant4x4_neon; } diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S index 423b537fb9..f700f5c321 100644 --- a/libavcodec/arm/rv34dsp_neon.S +++ b/libavcodec/arm/rv34dsp_neon.S @@ -107,27 +107,3 @@ function ff_rv34_inv_transform_noround_neon, export=1 vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1 bx lr endfunc - -function ff_rv34_dequant4x4_neon, export=1 - mov r3, r0 - mov r12, #16 - vdup.16 q0, r2 - vmov.16 d0[0], r1 - vld1.16 {d2}, [r0,:64], r12 - vld1.16 {d4}, [r0,:64], r12 - vld1.16 {d6}, [r0,:64], r12 - vld1.16 {d16}, [r0,:64], r12 - vmull.s16 q1, d2, d0 - vmull.s16 q2, d4, d1 - vmull.s16 q3, d6, d1 - vmull.s16 q8, d16, d1 - vqrshrn.s32 d2, q1, #4 - vqrshrn.s32 d4, q2, #4 - vqrshrn.s32 d6, q3, #4 - vqrshrn.s32 d16, q8, #4 - vst1.16 {d2}, [r3,:64], r12 - vst1.16 {d4}, [r3,:64], r12 - vst1.16 {d6}, [r3,:64], r12 - vst1.16 {d16}, [r3,:64], r12 - bx lr -endfunc |