diff options
author | Måns Rullgård <mans@mansr.com> | 2009-09-22 00:48:48 +0000 |
---|---|---|
committer | Måns Rullgård <mans@mansr.com> | 2009-09-22 00:48:48 +0000 |
commit | 1dee3e97c6971ac78404e5dd43b28c8fb7edbb39 (patch) | |
tree | 27722e6d0d5366f7d78124a2dca1f98b37672541 | |
parent | 42d3fbb3f454cbc45a15213b0728cd49c189b59a (diff) | |
download | ffmpeg-1dee3e97c6971ac78404e5dd43b28c8fb7edbb39.tar.gz |
ARM: NEON optimisations for some dsputil functions
NEON versions of the following functions are added:

- vector_fmul_scalar
- vector_fmul_sv_scalar
- sv_fmul_scalar
- butterflies_float

Originally committed as revision 19957 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- | libavcodec/arm/dsputil_neon.c | 19 |
-rw-r--r-- | libavcodec/arm/dsputil_neon_s.S | 152 |
2 files changed, 171 insertions, 0 deletions
```diff
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index 67315297df..03e17466b6 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -157,6 +157,17 @@ void ff_vector_fmul_neon(float *dst, const float *src, int len);
 void ff_vector_fmul_window_neon(float *dst, const float *src0,
                                 const float *src1, const float *win,
                                 float add_bias, int len);
+void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
+                                int len);
+void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
+                                     const float **vp, float mul, int len);
+void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src,
+                                     const float **vp, float mul, int len);
+void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul,
+                              int len);
+void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
+                              int len);
+void ff_butterflies_float_neon(float *v1, float *v2, int len);
 void ff_float_to_int16_neon(int16_t *, const float *, long);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
@@ -269,6 +280,14 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 
     c->vector_fmul = ff_vector_fmul_neon;
     c->vector_fmul_window = ff_vector_fmul_window_neon;
+    c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
+    c->butterflies_float = ff_butterflies_float_neon;
+
+    c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon;
+    c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon;
+
+    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
+    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
 
     if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
         c->float_to_int16 = ff_float_to_int16_neon;
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
index 2fd751a8f8..8764cedc4e 100644
--- a/libavcodec/arm/dsputil_neon_s.S
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -858,3 +858,155 @@ function ff_vorbis_inverse_coupling_neon, export=1
         bx              lr
         .endfunc
 #endif
+
+function ff_vector_fmul_scalar_neon, export=1
+VFP     len .req r2
+NOVFP   len .req r3
+VFP     vdup.32         q8, d0[0]
+NOVFP   vdup.32         q8, r2
+        bics            r12, len, #15
+        beq             3f
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q1},[r1,:128]!
+1:      vmul.f32        q0, q0, q8
+        vld1.32         {q2},[r1,:128]!
+        vmul.f32        q1, q1, q8
+        vld1.32         {q3},[r1,:128]!
+        vmul.f32        q2, q2, q8
+        vst1.32         {q0},[r0,:128]!
+        vmul.f32        q3, q3, q8
+        vst1.32         {q1},[r0,:128]!
+        subs            r12, r12, #16
+        beq             2f
+        vld1.32         {q0},[r1,:128]!
+        vst1.32         {q2},[r0,:128]!
+        vld1.32         {q1},[r1,:128]!
+        vst1.32         {q3},[r0,:128]!
+        b               1b
+2:      vst1.32         {q2},[r0,:128]!
+        vst1.32         {q3},[r0,:128]!
+        ands            len, len, #15
+        bxeq            lr
+3:      vld1.32         {q0},[r1,:128]!
+        vmul.f32        q0, q0, q8
+        vst1.32         {q0},[r0,:128]!
+        subs            len, len, #4
+        bgt             3b
+        bx              lr
+        .unreq          len
+        .endfunc
+
+function ff_vector_fmul_sv_scalar_2_neon, export=1
+VFP     vdup.32         d16, d0[0]
+NOVFP   vdup.32         d16, r3
+NOVFP   ldr             r3, [sp]
+        vld1.32         {d0},[r1,:64]!
+        vld1.32         {d1},[r1,:64]!
+1:      subs            r3, r3, #4
+        vmul.f32        d4, d0, d16
+        vmul.f32        d5, d1, d16
+        ldr             r12, [r2], #4
+        vld1.32         {d2},[r12,:64]
+        ldr             r12, [r2], #4
+        vld1.32         {d3},[r12,:64]
+        vmul.f32        d4, d4, d2
+        vmul.f32        d5, d5, d3
+        beq             2f
+        vld1.32         {d0},[r1,:64]!
+        vld1.32         {d1},[r1,:64]!
+        vst1.32         {d4},[r0,:64]!
+        vst1.32         {d5},[r0,:64]!
+        b               1b
+2:      vst1.32         {d4},[r0,:64]!
+        vst1.32         {d5},[r0,:64]!
+        bx              lr
+        .endfunc
+
+function ff_vector_fmul_sv_scalar_4_neon, export=1
+VFP     vdup.32         q10, d0[0]
+NOVFP   vdup.32         q10, r3
+NOVFP   ldr             r3, [sp]
+        push            {lr}
+        bics            lr, r3, #7
+        beq             3f
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q2},[r1,:128]!
+1:      ldr             r12, [r2], #4
+        vld1.32         {q1},[r12,:128]
+        ldr             r12, [r2], #4
+        vld1.32         {q3},[r12,:128]
+        vmul.f32        q8, q0, q10
+        vmul.f32        q8, q8, q1
+        vmul.f32        q9, q2, q10
+        vmul.f32        q9, q9, q3
+        subs            lr, lr, #8
+        beq             2f
+        vld1.32         {q0},[r1,:128]!
+        vld1.32         {q2},[r1,:128]!
+        vst1.32         {q8},[r0,:128]!
+        vst1.32         {q9},[r0,:128]!
+        b               1b
+2:      vst1.32         {q8},[r0,:128]!
+        vst1.32         {q9},[r0,:128]!
+        ands            r3, r3, #7
+        popeq           {pc}
+3:      vld1.32         {q0},[r1,:128]!
+        ldr             r12, [r2], #4
+        vld1.32         {q1},[r12,:128]
+        vmul.f32        q0, q0, q10
+        vmul.f32        q0, q0, q1
+        vst1.32         {q0},[r0,:128]!
+        subs            r3, r3, #4
+        bgt             3b
+        pop             {pc}
+        .endfunc
+
+function ff_sv_fmul_scalar_2_neon, export=1
+VFP     len .req r2
+NOVFP   len .req r3
+VFP     vdup.32         q8, d0[0]
+NOVFP   vdup.32         q8, r2
+        ldr             r12, [r1], #4
+        vld1.32         {d0},[r12,:64]
+        ldr             r12, [r1], #4
+        vld1.32         {d1},[r12,:64]
+1:      vmul.f32        q1, q0, q8
+        subs            len, len, #4
+        beq             2f
+        ldr             r12, [r1], #4
+        vld1.32         {d0},[r12,:64]
+        ldr             r12, [r1], #4
+        vld1.32         {d1},[r12,:64]
+        vst1.32         {q1},[r0,:128]!
+        b               1b
+2:      vst1.32         {q1},[r0,:128]!
+        bx              lr
+        .unreq          len
+        .endfunc
+
+function ff_sv_fmul_scalar_4_neon, export=1
+VFP     len .req r2
+NOVFP   len .req r3
+VFP     vdup.32         q8, d0[0]
+NOVFP   vdup.32         q8, r2
+1:      ldr             r12, [r1], #4
+        vld1.32         {q0},[r12,:128]
+        vmul.f32        q0, q0, q8
+        vst1.32         {q0},[r0,:128]!
+        subs            len, len, #4
+        bgt             1b
+        bx              lr
+        .unreq          len
+        .endfunc
+
+function ff_butterflies_float_neon, export=1
+1:      vld1.32         {q0},[r0,:128]
+        vld1.32         {q1},[r1,:128]
+        vsub.f32        q2, q0, q1
+        vadd.f32        q1, q0, q1
+        vst1.32         {q2},[r1,:128]!
+        vst1.32         {q1},[r0,:128]!
+        subs            r2, r2, #4
+        bgt             1b
+        bx              lr
+        .endfunc
```