aboutsummaryrefslogtreecommitdiffstats
path: root/libavutil/arm/float_dsp_vfp.S
diff options
context:
space:
mode:
author Ben Avison <bavison@riscosopen.org> 2014-07-11 00:12:34 +0100
committer Martin Storsjö <martin@martin.st> 2014-07-18 01:34:38 +0300
commit 5a272190a04666f0fe41be767396b30712638c21 (patch)
tree 1bff50bd3d7926346d90e319e80bb526b128ee89 /libavutil/arm/float_dsp_vfp.S
parent 5edad2c4a1f46bcc56be755af86ab355c2f1b37f (diff)
download ffmpeg-5a272190a04666f0fe41be767396b30712638c21.tar.gz
armv6: Accelerate butterflies_float
I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the
same sample AAC stream:

                    Before          After
                    Mean    StdDev  Mean    StdDev  Confidence  Change
Audio decode        1542.8  43.7    1470.5  41.5    100.0%      +4.9%
butterflies_float   130.0   11.9    70.2    12.1    100.0%      +85.2%

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavutil/arm/float_dsp_vfp.S')
-rw-r--r-- libavutil/arm/float_dsp_vfp.S 116
1 files changed, 116 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index c25588f978..9f920aae70 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
+
+/**
+ * ARM VFP implementation of 'butterflies_float_c' function: for each i in [0, len), v1[i] is replaced by v1[i] + v2[i] and v2[i] by v1[i] - v2[i], both computed from the values loaded before either store. Leftover 1/2/4 floats (len & 7) are handled first in scalar mode, then blocks of 8 floats in short-vector mode.
+ * Assume that len is a positive non-zero number (len == 0 would pass the multiple-of-8 test below and enter label 4, which always processes at least 8 floats)
+ */
+@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
+function ff_butterflies_float_vfp, export=1
+BASE1 .req a1 @ v1 pointer; post-incremented by every vldmia, so stores address backwards from it
+BASE2 .req a2 @ v2 pointer; advanced in lockstep with BASE1
+LEN .req a3 @ len; from label 3 onwards it holds the remaining count, a multiple of 8
+OLDFPSCR .req a4 @ FPSCR value on entry, written back at label 7
+
+ vpush {s16-s31} @ s16-s31 receive the results below; popped again before returning
+ fmrx OLDFPSCR, FPSCR @ remember the caller's FPSCR before the mode is changed
+
+ tst LEN, #7
+ beq 4f @ common case: len is a multiple of 8, so skip the scalar leftovers entirely
+
+ ldr ip, =0x03000000 @ RunFast mode, scalar mode
+ fmxr FPSCR, ip
+
+ tst LEN, #1 @ bit 0 of len set: one leftover float
+ beq 1f
+ vldmia BASE1!, {s0} @ s0 = *v1, pointer moves on by 4 bytes
+ vldmia BASE2!, {s8} @ s8 = *v2, pointer moves on by 4 bytes
+ vadd.f s16, s0, s8 @ sum
+ vsub.f s24, s0, s8 @ difference
+ vstr s16, [BASE1, #0-4*1] @ sum goes back to the v1 slot just read (pointer is already 4 bytes past it)
+ vstr s24, [BASE2, #0-4*1] @ difference goes back to the v2 slot just read
+1:
+ tst LEN, #2 @ bit 1 of len set: two leftover floats
+ beq 2f
+ vldmia BASE1!, {s0-s1}
+ vldmia BASE2!, {s8-s9}
+ vadd.f s16, s0, s8 @ scalar mode: one instruction per element
+ vadd.f s17, s1, s9
+ vsub.f s24, s0, s8
+ vsub.f s25, s1, s9
+ vstr d8, [BASE1, #0-8*1] @ s16,s17 -> both sums with one 64-bit store, 8 bytes behind the advanced pointer
+ vstr d12, [BASE2, #0-8*1] @ s24,s25 -> both differences
+2:
+ tst LEN, #4 @ bit 2 of len set: four leftover floats
+ beq 3f
+ vldmia BASE1!, {s0-s1}
+ vldmia BASE2!, {s8-s9}
+ vldmia BASE1!, {s2-s3}
+ vldmia BASE2!, {s10-s11}
+ vadd.f s16, s0, s8 @ still scalar mode, so all four sums and four differences are spelled out
+ vadd.f s17, s1, s9
+ vsub.f s24, s0, s8
+ vsub.f s25, s1, s9
+ vadd.f s18, s2, s10
+ vadd.f s19, s3, s11
+ vsub.f s26, s2, s10
+ vsub.f s27, s3, s11
+ vstr d8, [BASE1, #0-16*1] @ s16,s17 -> floats 0-1 (pointer is 16 bytes past the group start)
+ vstr d12, [BASE2, #0-16*1] @ s24,s25
+ vstr d9, [BASE1, #8-16*1] @ s18,s19 -> floats 2-3
+ vstr d13, [BASE2, #8-16*1] @ s26,s27
+3:
+ bics LEN, LEN, #7 @ drop the leftover bits just handled; sets Z if no whole block of 8 remains
+ beq 7f @ nothing left: go restore FPSCR and return
+4:
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, ip
+
+ vldmia BASE1!, {s0-s1} @ pipeline prologue: load the first block of 8 floats from each array
+ vldmia BASE2!, {s8-s9}
+ vldmia BASE1!, {s2-s3}
+ vldmia BASE2!, {s10-s11}
+ vadd.f s16, s0, s8 @ vector length 4: s16-s19 = s0-s3 + s8-s11 (stored later as d8/d9)
+ vldmia BASE1!, {s4-s5}
+ vldmia BASE2!, {s12-s13}
+ vldmia BASE1!, {s6-s7}
+ vldmia BASE2!, {s14-s15}
+ vsub.f s24, s0, s8 @ s24-s27 = s0-s3 - s8-s11 (stored later as d12/d13)
+ vadd.f s20, s4, s12 @ s20-s23 = s4-s7 + s12-s15 (stored later as d10/d11); the matching vsub is deferred
+ subs LEN, LEN, #8
+ beq 6f @ that was the only block: skip the loop and just drain the pipeline
+5: vldmia BASE1!, {s0-s3} @ loop: begin loading the next block while the previous one is still unsaved
+ vldmia BASE2!, {s8-s11}
+ vsub.f s28, s4, s12 @ last op of the previous block: s28-s31 = s4-s7 - s12-s15; must run before s4-s7/s12-s15 are reloaded below
+ vstr d8, [BASE1, #0-16*3] @ s16,s17 -> previous block, floats 0-1 (pointer is now 48 bytes past that block's start)
+ vstr d9, [BASE1, #8-16*3] @ s18,s19 -> previous block, floats 2-3
+ vstr d12, [BASE2, #0-16*3] @ s24,s25
+ vstr d13, [BASE2, #8-16*3] @ s26,s27
+ vadd.f s16, s0, s8 @ s16-s19 were stored above and are free again: sums for the new block's first half
+ vldmia BASE1!, {s4-s7} @ second half of the new block; pointer is now 64 bytes past the previous block's start
+ vldmia BASE2!, {s12-s15}
+ vsub.f s24, s0, s8 @ differences for the new block's first half
+ vstr d10, [BASE1, #0-16*3] @ s20,s21 -> previous block, floats 4-5 (same -48 offset, but the pointer has moved on 16 bytes)
+ vstr d11, [BASE1, #8-16*3] @ s22,s23 -> previous block, floats 6-7
+ vstr d14, [BASE2, #0-16*3] @ s28,s29
+ vstr d15, [BASE2, #8-16*3] @ s30,s31
+ vadd.f s20, s4, s12 @ sums for the new block's second half
+ subs LEN, LEN, #8
+ bne 5b @ keep going while whole blocks remain
+6: vsub.f s28, s4, s12 @ pipeline drain: the one operation still pending for the final block
+ vstr d8, [BASE1, #0-16*2] @ s16,s17 -> final block, floats 0-1 (pointer is 32 bytes past its start)
+ vstr d9, [BASE1, #8-16*2] @ s18,s19
+ vstr d12, [BASE2, #0-16*2] @ s24,s25
+ vstr d13, [BASE2, #8-16*2] @ s26,s27
+ vstr d10, [BASE1, #0-16*1] @ s20,s21 -> final block, floats 4-5
+ vstr d11, [BASE1, #8-16*1] @ s22,s23
+ vstr d14, [BASE2, #0-16*1] @ s28,s29
+ vstr d15, [BASE2, #8-16*1] @ s30,s31
+7:
+ fmxr FPSCR, OLDFPSCR @ put back the FPSCR value saved on entry
+ vpop {s16-s31} @ undo the vpush at the top
+ bx lr
+
+ .unreq BASE1 @ release the register aliases defined at the top of the function
+ .unreq BASE2
+ .unreq LEN
+ .unreq OLDFPSCR
+endfunc