aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMåns Rullgård <mans@mansr.com>2009-08-14 01:02:06 +0000
committerMåns Rullgård <mans@mansr.com>2009-08-14 01:02:06 +0000
commite814015d69578cb0d45407a8070a77fd542df6bf (patch)
treed09bfa8012e316e541a074c72f96849ee122ce6b
parent737cbcde0873998f51cc7f640f6ea91d9e3fffd3 (diff)
downloadffmpeg-e814015d69578cb0d45407a8070a77fd542df6bf.tar.gz
ARM: NEON optimised vorbis_inverse_coupling
12% faster Vorbis decoding on Cortex-A8. Originally committed as revision 19637 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--libavcodec/arm/dsputil_neon.c5
-rw-r--r--libavcodec/arm/dsputil_neon_s.S64
2 files changed, 69 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index d7ee435c93..67315297df 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -161,6 +161,8 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
+
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@@ -272,4 +274,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
}
+
+ if (CONFIG_VORBIS_DECODER)
+ c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
}
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
index c816f08089..71d09c6512 100644
--- a/libavcodec/arm/dsputil_neon_s.S
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -19,6 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
#include "asm.S"
preserve8
@@ -795,3 +796,66 @@ NOVFP ldr lr, [sp, #16]
vst1.64 {d22,d23},[ip,:128], r5
pop {r4,r5,pc}
.endfunc
+
+#if CONFIG_VORBIS_DECODER
+function ff_vorbis_inverse_coupling_neon, export=1
+ vmov.i32 q10, #1<<31
+ subs r2, r2, #4
+ tst r2, #4
+ mov r3, r0
+ mov r12, r1
+ beq 3f
+
+ vld1.32 {d24-d25},[r1,:128]!
+ vld1.32 {d22-d23},[r0,:128]!
+ vcle.s32 q8, q12, #0
+ vand q9, q11, q10
+ veor q12, q12, q9
+ vand q2, q12, q8
+ vbic q3, q12, q8
+ vadd.f32 q12, q11, q2
+ vsub.f32 q11, q11, q3
+1: vld1.32 {d2-d3}, [r1,:128]!
+ vld1.32 {d0-d1}, [r0,:128]!
+ vcle.s32 q8, q1, #0
+ vand q9, q0, q10
+ veor q1, q1, q9
+ vst1.32 {d24-d25},[r3, :128]!
+ vst1.32 {d22-d23},[r12,:128]!
+ vand q2, q1, q8
+ vbic q3, q1, q8
+ vadd.f32 q1, q0, q2
+ vsub.f32 q0, q0, q3
+ subs r2, r2, #8
+ ble 2f
+ vld1.32 {d24-d25},[r1,:128]!
+ vld1.32 {d22-d23},[r0,:128]!
+ vcle.s32 q8, q12, #0
+ vand q9, q11, q10
+ veor q12, q12, q9
+ vst1.32 {d2-d3}, [r3, :128]!
+ vst1.32 {d0-d1}, [r12,:128]!
+ vand q2, q12, q8
+ vbic q3, q12, q8
+ vadd.f32 q12, q11, q2
+ vsub.f32 q11, q11, q3
+ b 1b
+
+2: vst1.32 {d2-d3}, [r3, :128]!
+ vst1.32 {d0-d1}, [r12,:128]!
+ bxlt lr
+
+3: vld1.32 {d2-d3}, [r1,:128]
+ vld1.32 {d0-d1}, [r0,:128]
+ vcle.s32 q8, q1, #0
+ vand q9, q0, q10
+ veor q1, q1, q9
+ vand q2, q1, q8
+ vbic q3, q1, q8
+ vadd.f32 q1, q0, q2
+ vsub.f32 q0, q0, q3
+ vst1.32 {d2-d3}, [r0,:128]!
+ vst1.32 {d0-d1}, [r1,:128]!
+ bx lr
+ .endfunc
+#endif