diff options
author | Ben Avison <bavison@riscosopen.org> | 2014-07-11 00:12:34 +0100 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2014-07-18 01:34:38 +0300 |
commit | 5a272190a04666f0fe41be767396b30712638c21 (patch) | |
tree | 1bff50bd3d7926346d90e319e80bb526b128ee89 /libavutil/arm/float_dsp_vfp.S | |
parent | 5edad2c4a1f46bcc56be755af86ab355c2f1b37f (diff) | |
download | ffmpeg-5a272190a04666f0fe41be767396b30712638c21.tar.gz |
armv6: Accelerate butterflies_float
I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the
same sample AAC stream:
Before After
Mean StdDev Mean StdDev Confidence Change
Audio decode 1542.8 43.7 1470.5 41.5 100.0% +4.9%
butterflies_float 130.0 11.9 70.2 12.1 100.0% +85.2%
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavutil/arm/float_dsp_vfp.S')
-rw-r--r-- | libavutil/arm/float_dsp_vfp.S | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index c25588f978..9f920aae70 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP implementation of 'butterflies_float_c' function + * Assume that len is a positive non-zero number + */ +@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len) +function ff_butterflies_float_vfp, export=1 +BASE1 .req a1 +BASE2 .req a2 +LEN .req a3 +OLDFPSCR .req a4 + + vpush {s16-s31} + fmrx OLDFPSCR, FPSCR + + tst LEN, #7 + beq 4f @ common case: len is a multiple of 8 + + ldr ip, =0x03000000 @ RunFast mode, scalar mode + fmxr FPSCR, ip + + tst LEN, #1 + beq 1f + vldmia BASE1!, {s0} + vldmia BASE2!, {s8} + vadd.f s16, s0, s8 + vsub.f s24, s0, s8 + vstr s16, [BASE1, #0-4*1] + vstr s24, [BASE2, #0-4*1] +1: + tst LEN, #2 + beq 2f + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vadd.f s16, s0, s8 + vadd.f s17, s1, s9 + vsub.f s24, s0, s8 + vsub.f s25, s1, s9 + vstr d8, [BASE1, #0-8*1] @ s16,s17 + vstr d12, [BASE2, #0-8*1] @ s24,s25 +2: + tst LEN, #4 + beq 3f + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vldmia BASE1!, {s2-s3} + vldmia BASE2!, {s10-s11} + vadd.f s16, s0, s8 + vadd.f s17, s1, s9 + vsub.f s24, s0, s8 + vsub.f s25, s1, s9 + vadd.f s18, s2, s10 + vadd.f s19, s3, s11 + vsub.f s26, s2, s10 + vsub.f s27, s3, s11 + vstr d8, [BASE1, #0-16*1] @ s16,s17 + vstr d12, [BASE2, #0-16*1] @ s24,s25 + vstr d9, [BASE1, #8-16*1] @ s18,s19 + vstr d13, [BASE2, #8-16*1] @ s26,s27 +3: + bics LEN, LEN, #7 + beq 7f +4: + ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + fmxr FPSCR, ip + + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vldmia BASE1!, {s2-s3} + vldmia BASE2!, {s10-s11} + vadd.f s16, s0, s8 + vldmia BASE1!, {s4-s5} + vldmia BASE2!, {s12-s13} + vldmia BASE1!, {s6-s7} + vldmia BASE2!, {s14-s15} + vsub.f s24, s0, s8 + vadd.f s20, s4, s12 + subs LEN, LEN, #8 + beq 6f +5: vldmia BASE1!, {s0-s3} + vldmia BASE2!, {s8-s11} + vsub.f s28, s4, s12 + vstr d8, [BASE1, #0-16*3] @ s16,s17 + vstr d9, [BASE1, #8-16*3] @ s18,s19 + vstr d12, [BASE2, #0-16*3] @ s24,s25 + vstr d13, [BASE2, #8-16*3] @ s26,s27 + vadd.f s16, s0, s8 + vldmia BASE1!, {s4-s7} + vldmia BASE2!, {s12-s15} + vsub.f s24, s0, s8 + vstr d10, [BASE1, #0-16*3] @ s20,s21 + vstr d11, [BASE1, #8-16*3] @ s22,s23 + vstr d14, [BASE2, #0-16*3] @ s28,s29 + vstr d15, [BASE2, #8-16*3] @ s30,s31 + vadd.f s20, s4, s12 + subs LEN, LEN, #8 + bne 5b +6: vsub.f s28, s4, s12 + vstr d8, [BASE1, #0-16*2] @ s16,s17 + vstr d9, [BASE1, #8-16*2] @ s18,s19 + vstr d12, [BASE2, #0-16*2] @ s24,s25 + vstr d13, [BASE2, #8-16*2] @ s26,s27 + vstr d10, [BASE1, #0-16*1] @ s20,s21 + vstr d11, [BASE1, #8-16*1] @ s22,s23 + vstr d14, [BASE2, #0-16*1] @ s28,s29 + vstr d15, [BASE2, #8-16*1] @ s30,s31 +7: + fmxr FPSCR, OLDFPSCR + vpop {s16-s31} + bx lr + + .unreq BASE1 + .unreq BASE2 + .unreq LEN + .unreq OLDFPSCR +endfunc |