diff options
author | Måns Rullgård <mans@mansr.com> | 2009-09-22 00:48:48 +0000 |
---|---|---|
committer | Måns Rullgård <mans@mansr.com> | 2009-09-22 00:48:48 +0000 |
commit | 1dee3e97c6971ac78404e5dd43b28c8fb7edbb39 (patch) | |
tree | 27722e6d0d5366f7d78124a2dca1f98b37672541 /libavcodec/arm/dsputil_neon_s.S | |
parent | 42d3fbb3f454cbc45a15213b0728cd49c189b59a (diff) | |
download | ffmpeg-1dee3e97c6971ac78404e5dd43b28c8fb7edbb39.tar.gz |
ARM: NEON optimisations for some dsputil functions
NEON versions of the following functions are added:
vector_fmul_scalar
vector_fmul_sv_scalar
sv_fmul_scalar
butterflies_float
Originally committed as revision 19957 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/arm/dsputil_neon_s.S')
-rw-r--r-- | libavcodec/arm/dsputil_neon_s.S | 152 |
1 files changed, 152 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S index 2fd751a8f8..8764cedc4e 100644 --- a/libavcodec/arm/dsputil_neon_s.S +++ b/libavcodec/arm/dsputil_neon_s.S @@ -858,3 +858,155 @@ function ff_vorbis_inverse_coupling_neon, export=1 bx lr .endfunc #endif + +function ff_vector_fmul_scalar_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + bics r12, len, #15 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q1},[r1,:128]! +1: vmul.f32 q0, q0, q8 + vld1.32 {q2},[r1,:128]! + vmul.f32 q1, q1, q8 + vld1.32 {q3},[r1,:128]! + vmul.f32 q2, q2, q8 + vst1.32 {q0},[r0,:128]! + vmul.f32 q3, q3, q8 + vst1.32 {q1},[r0,:128]! + subs r12, r12, #16 + beq 2f + vld1.32 {q0},[r1,:128]! + vst1.32 {q2},[r0,:128]! + vld1.32 {q1},[r1,:128]! + vst1.32 {q3},[r0,:128]! + b 1b +2: vst1.32 {q2},[r0,:128]! + vst1.32 {q3},[r0,:128]! + ands len, len, #15 + bxeq lr +3: vld1.32 {q0},[r1,:128]! + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! + subs len, len, #4 + bgt 3b + bx lr + .unreq len + .endfunc + +function ff_vector_fmul_sv_scalar_2_neon, export=1 +VFP vdup.32 d16, d0[0] +NOVFP vdup.32 d16, r3 +NOVFP ldr r3, [sp] + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! +1: subs r3, r3, #4 + vmul.f32 d4, d0, d16 + vmul.f32 d5, d1, d16 + ldr r12, [r2], #4 + vld1.32 {d2},[r12,:64] + ldr r12, [r2], #4 + vld1.32 {d3},[r12,:64] + vmul.f32 d4, d4, d2 + vmul.f32 d5, d5, d3 + beq 2f + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! + vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! + b 1b +2: vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! + bx lr + .endfunc + +function ff_vector_fmul_sv_scalar_4_neon, export=1 +VFP vdup.32 q10, d0[0] +NOVFP vdup.32 q10, r3 +NOVFP ldr r3, [sp] + push {lr} + bics lr, r3, #7 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! +1: ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + ldr r12, [r2], #4 + vld1.32 {q3},[r12,:128] + vmul.f32 q8, q0, q10 + vmul.f32 q8, q8, q1 + vmul.f32 q9, q2, q10 + vmul.f32 q9, q9, q3 + subs lr, lr, #8 + beq 2f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! + vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + b 1b +2: vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + ands r3, r3, #7 + popeq {pc} +3: vld1.32 {q0},[r1,:128]! + ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + vmul.f32 q0, q0, q10 + vmul.f32 q0, q0, q1 + vst1.32 {q0},[r0,:128]! + subs r3, r3, #4 + bgt 3b + pop {pc} + .endfunc + +function ff_sv_fmul_scalar_2_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] +1: vmul.f32 q1, q0, q8 + subs len, len, #4 + beq 2f + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] + vst1.32 {q1},[r0,:128]! + b 1b +2: vst1.32 {q1},[r0,:128]! + bx lr + .unreq len + .endfunc + +function ff_sv_fmul_scalar_4_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 +1: ldr r12, [r1], #4 + vld1.32 {q0},[r12,:128] + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! + subs len, len, #4 + bgt 1b + bx lr + .unreq len + .endfunc + +function ff_butterflies_float_neon, export=1 +1: vld1.32 {q0},[r0,:128] + vld1.32 {q1},[r1,:128] + vsub.f32 q2, q0, q1 + vadd.f32 q1, q0, q1 + vst1.32 {q2},[r1,:128]! + vst1.32 {q1},[r0,:128]! + subs r2, r2, #4 + bgt 1b + bx lr + .endfunc |