aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm
diff options
context:
space:
mode:
authorJanne Grunau <janne-libav@jannau.net>2012-01-12 17:21:48 +0100
committerJanne Grunau <janne-libav@jannau.net>2012-01-12 18:33:55 +0100
commite1e369049e3d2f88eed6ed38eb3dd704681c7f1a (patch)
treeecaa96dfa9a4220e504cc93d30494a3a45cb8b79 /libavcodec/arm
parent136ee32da3c728fb4e3490393efb947cc7c4e898 (diff)
downloadffmpeg-e1e369049e3d2f88eed6ed38eb3dd704681c7f1a.tar.gz
rv34: NEON optimised dc only inverse transform
30-50% faster than the C implementation, 0.5% overall speedup on bourne.rmvb.
Diffstat (limited to 'libavcodec/arm')
-rw-r--r--libavcodec/arm/rv34dsp_init_neon.c9
-rw-r--r--libavcodec/arm/rv34dsp_neon.S29
2 files changed, 36 insertions, 2 deletions
diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 9a09fde7a9..16bda46658 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -26,8 +26,13 @@
void ff_rv34_inv_transform_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
+void ff_rv34_inv_transform_dc_neon(DCTELEM *block); /* DC-only variant, rounding; defined in rv34dsp_neon.S */
+void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block); /* DC-only variant, no rounding; defined in rv34dsp_neon.S */
+
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
- c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
- c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
+ c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; /* same assignment as before; only realigned */
+ c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; /* index 1 holds the "noround" variant */
+ c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon; /* new: DC-only path, rounding shift by 10 */
+ c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon; /* new: DC-only path, truncating shift by 11 */
}
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index f700f5c321..e776af0330 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -107,3 +107,32 @@ function ff_rv34_inv_transform_noround_neon, export=1
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
bx lr
endfunc
+
+/* NEON counterpart of void rv34_inv_transform_dc_c(DCTELEM *block): r0 = block; every stored value is (169 * block[0] + 512) >> 10 */
+function ff_rv34_inv_transform_dc_neon, export=1
+ vld1.16 d28[], [r0:16] @ load block[0] and replicate it into all four 16-bit lanes of d28
+ vmov.i16 d4, #169 @ 169 = 13^2 in every 16-bit lane
+ mov r1, #16 @ byte step added to r0 after each store below (presumably one coefficient row)
+ vmull.s16 q3, d28, d4 @ signed widening multiply: four 32-bit copies of 169 * block[0]
+ vrshrn.s32 d0, q3, #10 @ rounding shift right by 10 (adds 1 << 9 first), narrowed to 16 bits
+ vst1.16 {d0}, [r0:64], r1 @ write the four identical results at r0 (8-byte aligned), then r0 += r1
+ vst1.16 {d0}, [r0:64], r1
+ vst1.16 {d0}, [r0:64], r1
+ vst1.16 {d0}, [r0:64], r1 @ fourth store: 4 x 4 values written in total
+ bx lr
+endfunc
+
+/* NEON counterpart of void rv34_inv_transform_dc_noround_c(DCTELEM *block): r0 = block; every stored value is (507 * block[0]) >> 11 */
+function ff_rv34_inv_transform_noround_dc_neon, export=1
+ vld1.16 d28[], [r0:16] @ load block[0] and replicate it into all four 16-bit lanes of d28
+ vmov.i16 d4, #251 @ low bits of 507; a vmov.i16 immediate holds only one non-zero byte, so the constant is built in two steps
+ vorr.s16 d4, #256 @ 251 | 256 = 507 = 13^2 * 3 in every lane
+ mov r1, #16 @ byte step added to r0 after each store below (presumably one coefficient row)
+ vmull.s16 q3, d28, d4 @ signed widening multiply: four 32-bit copies of 507 * block[0]
+ vshrn.s32 d0, q3, #11 @ truncating shift right by 11 (no rounding term), narrowed to 16 bits
+ vst1.64 {d0}, [r0:64], r1 @ write the four identical results at r0 (8-byte aligned), then r0 += r1
+ vst1.64 {d0}, [r0:64], r1
+ vst1.64 {d0}, [r0:64], r1
+ vst1.64 {d0}, [r0:64], r1 @ fourth store: 4 x 4 values written in total
+ bx lr
+endfunc