about | summary | refs | log | tree | commit | diff | stats
path: root/libavutil/aarch64/float_dsp_neon.S
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2023-10-17 14:16:24 +0300
committer: Martin Storsjö <martin@martin.st> 2023-10-21 23:25:18 +0300
commit: 184103b3105f02f1189fa0047af4269e027dfbd6 (patch)
tree: 3e50ad549ed68292f91594c4e6fb26551de90369 /libavutil/aarch64/float_dsp_neon.S
parent: 393d1ee541b143633bfba2ff0e821d734fd511c2 (diff)
download: ffmpeg-184103b3105f02f1189fa0047af4269e027dfbd6.tar.gz
aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavutil/aarch64/float_dsp_neon.S')
-rw-r--r-- libavutil/aarch64/float_dsp_neon.S 200
1 files changed, 100 insertions, 100 deletions
diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index 02d790c0cc..35e2715b87 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -25,16 +25,16 @@
function ff_vector_fmul_neon, export=1
1: subs w3, w3, #16
- ld1 {v0.4S, v1.4S}, [x1], #32
- ld1 {v2.4S, v3.4S}, [x1], #32
- ld1 {v4.4S, v5.4S}, [x2], #32
- ld1 {v6.4S, v7.4S}, [x2], #32
- fmul v16.4S, v0.4S, v4.4S
- fmul v17.4S, v1.4S, v5.4S
- fmul v18.4S, v2.4S, v6.4S
- fmul v19.4S, v3.4S, v7.4S
- st1 {v16.4S, v17.4S}, [x0], #32
- st1 {v18.4S, v19.4S}, [x0], #32
+ ld1 {v0.4s, v1.4s}, [x1], #32
+ ld1 {v2.4s, v3.4s}, [x1], #32
+ ld1 {v4.4s, v5.4s}, [x2], #32
+ ld1 {v6.4s, v7.4s}, [x2], #32
+ fmul v16.4s, v0.4s, v4.4s
+ fmul v17.4s, v1.4s, v5.4s
+ fmul v18.4s, v2.4s, v6.4s
+ fmul v19.4s, v3.4s, v7.4s
+ st1 {v16.4s, v17.4s}, [x0], #32
+ st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
@@ -42,16 +42,16 @@ endfunc
function ff_vector_fmac_scalar_neon, export=1
mov x3, #-32
1: subs w2, w2, #16
- ld1 {v16.4S, v17.4S}, [x0], #32
- ld1 {v18.4S, v19.4S}, [x0], x3
- ld1 {v4.4S, v5.4S}, [x1], #32
- ld1 {v6.4S, v7.4S}, [x1], #32
- fmla v16.4S, v4.4S, v0.S[0]
- fmla v17.4S, v5.4S, v0.S[0]
- fmla v18.4S, v6.4S, v0.S[0]
- fmla v19.4S, v7.4S, v0.S[0]
- st1 {v16.4S, v17.4S}, [x0], #32
- st1 {v18.4S, v19.4S}, [x0], #32
+ ld1 {v16.4s, v17.4s}, [x0], #32
+ ld1 {v18.4s, v19.4s}, [x0], x3
+ ld1 {v4.4s, v5.4s}, [x1], #32
+ ld1 {v6.4s, v7.4s}, [x1], #32
+ fmla v16.4s, v4.4s, v0.s[0]
+ fmla v17.4s, v5.4s, v0.s[0]
+ fmla v18.4s, v6.4s, v0.s[0]
+ fmla v19.4s, v7.4s, v0.s[0]
+ st1 {v16.4s, v17.4s}, [x0], #32
+ st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
@@ -59,43 +59,43 @@ endfunc
function ff_vector_fmul_scalar_neon, export=1
mov w4, #15
bics w3, w2, w4
- dup v16.4S, v0.S[0]
+ dup v16.4s, v0.s[0]
b.eq 3f
- ld1 {v0.4S, v1.4S}, [x1], #32
+ ld1 {v0.4s, v1.4s}, [x1], #32
1: subs w3, w3, #16
- fmul v0.4S, v0.4S, v16.4S
- ld1 {v2.4S, v3.4S}, [x1], #32
- fmul v1.4S, v1.4S, v16.4S
- fmul v2.4S, v2.4S, v16.4S
- st1 {v0.4S, v1.4S}, [x0], #32
- fmul v3.4S, v3.4S, v16.4S
+ fmul v0.4s, v0.4s, v16.4s
+ ld1 {v2.4s, v3.4s}, [x1], #32
+ fmul v1.4s, v1.4s, v16.4s
+ fmul v2.4s, v2.4s, v16.4s
+ st1 {v0.4s, v1.4s}, [x0], #32
+ fmul v3.4s, v3.4s, v16.4s
b.eq 2f
- ld1 {v0.4S, v1.4S}, [x1], #32
- st1 {v2.4S, v3.4S}, [x0], #32
+ ld1 {v0.4s, v1.4s}, [x1], #32
+ st1 {v2.4s, v3.4s}, [x0], #32
b 1b
2: ands w2, w2, #15
- st1 {v2.4S, v3.4S}, [x0], #32
+ st1 {v2.4s, v3.4s}, [x0], #32
b.eq 4f
-3: ld1 {v0.4S}, [x1], #16
- fmul v0.4S, v0.4S, v16.4S
- st1 {v0.4S}, [x0], #16
+3: ld1 {v0.4s}, [x1], #16
+ fmul v0.4s, v0.4s, v16.4s
+ st1 {v0.4s}, [x0], #16
subs w2, w2, #4
b.gt 3b
4: ret
endfunc
function ff_vector_dmul_scalar_neon, export=1
- dup v16.2D, v0.D[0]
- ld1 {v0.2D, v1.2D}, [x1], #32
+ dup v16.2d, v0.d[0]
+ ld1 {v0.2d, v1.2d}, [x1], #32
1: subs w2, w2, #8
- fmul v0.2D, v0.2D, v16.2D
- ld1 {v2.2D, v3.2D}, [x1], #32
- fmul v1.2D, v1.2D, v16.2D
- fmul v2.2D, v2.2D, v16.2D
- st1 {v0.2D, v1.2D}, [x0], #32
- fmul v3.2D, v3.2D, v16.2D
- ld1 {v0.2D, v1.2D}, [x1], #32
- st1 {v2.2D, v3.2D}, [x0], #32
+ fmul v0.2d, v0.2d, v16.2d
+ ld1 {v2.2d, v3.2d}, [x1], #32
+ fmul v1.2d, v1.2d, v16.2d
+ fmul v2.2d, v2.2d, v16.2d
+ st1 {v0.2d, v1.2d}, [x0], #32
+ fmul v3.2d, v3.2d, v16.2d
+ ld1 {v0.2d, v1.2d}, [x1], #32
+ st1 {v2.2d, v3.2d}, [x0], #32
b.gt 1b
ret
endfunc
@@ -108,49 +108,49 @@ function ff_vector_fmul_window_neon, export=1
add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
mov x7, #-16
- ld1 {v0.4S}, [x1], #16 // s0
- ld1 {v2.4S}, [x3], #16 // wi
- ld1 {v1.4S}, [x2], x7 // s1
-1: ld1 {v3.4S}, [x6], x7 // wj
+ ld1 {v0.4s}, [x1], #16 // s0
+ ld1 {v2.4s}, [x3], #16 // wi
+ ld1 {v1.4s}, [x2], x7 // s1
+1: ld1 {v3.4s}, [x6], x7 // wj
subs x4, x4, #4
- fmul v17.4S, v0.4S, v2.4S // s0 * wi
- rev64 v4.4S, v1.4S
- rev64 v5.4S, v3.4S
- rev64 v17.4S, v17.4S
- ext v4.16B, v4.16B, v4.16B, #8 // s1_r
- ext v5.16B, v5.16B, v5.16B, #8 // wj_r
- ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
- fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
- fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
+ fmul v17.4s, v0.4s, v2.4s // s0 * wi
+ rev64 v4.4s, v1.4s
+ rev64 v5.4s, v3.4s
+ rev64 v17.4s, v17.4s
+ ext v4.16b, v4.16b, v4.16b, #8 // s1_r
+ ext v5.16b, v5.16b, v5.16b, #8 // wj_r
+ ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
+ fmul v16.4s, v0.4s, v5.4s // s0 * wj_r
+ fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj
b.eq 2f
- ld1 {v0.4S}, [x1], #16
- fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
- st1 {v17.4S}, [x5], x7
- ld1 {v2.4S}, [x3], #16
- ld1 {v1.4S}, [x2], x7
- st1 {v16.4S}, [x0], #16
+ ld1 {v0.4s}, [x1], #16
+ fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
+ st1 {v17.4s}, [x5], x7
+ ld1 {v2.4s}, [x3], #16
+ ld1 {v1.4s}, [x2], x7
+ st1 {v16.4s}, [x0], #16
b 1b
2:
- fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
- st1 {v17.4S}, [x5], x7
- st1 {v16.4S}, [x0], #16
+ fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
+ st1 {v17.4s}, [x5], x7
+ st1 {v16.4s}, [x0], #16
ret
endfunc
function ff_vector_fmul_add_neon, export=1
- ld1 {v0.4S, v1.4S}, [x1], #32
- ld1 {v2.4S, v3.4S}, [x2], #32
- ld1 {v4.4S, v5.4S}, [x3], #32
+ ld1 {v0.4s, v1.4s}, [x1], #32
+ ld1 {v2.4s, v3.4s}, [x2], #32
+ ld1 {v4.4s, v5.4s}, [x3], #32
1: subs w4, w4, #8
- fmla v4.4S, v0.4S, v2.4S
- fmla v5.4S, v1.4S, v3.4S
+ fmla v4.4s, v0.4s, v2.4s
+ fmla v5.4s, v1.4s, v3.4s
b.eq 2f
- ld1 {v0.4S, v1.4S}, [x1], #32
- ld1 {v2.4S, v3.4S}, [x2], #32
- st1 {v4.4S, v5.4S}, [x0], #32
- ld1 {v4.4S, v5.4S}, [x3], #32
+ ld1 {v0.4s, v1.4s}, [x1], #32
+ ld1 {v2.4s, v3.4s}, [x2], #32
+ st1 {v4.4s, v5.4s}, [x0], #32
+ ld1 {v4.4s, v5.4s}, [x3], #32
b 1b
-2: st1 {v4.4S, v5.4S}, [x0], #32
+2: st1 {v4.4s, v5.4s}, [x0], #32
ret
endfunc
@@ -159,44 +159,44 @@ function ff_vector_fmul_reverse_neon, export=1
add x2, x2, x3, lsl #2
sub x2, x2, #32
mov x4, #-32
- ld1 {v2.4S, v3.4S}, [x2], x4
- ld1 {v0.4S, v1.4S}, [x1], #32
+ ld1 {v2.4s, v3.4s}, [x2], x4
+ ld1 {v0.4s, v1.4s}, [x1], #32
1: subs x3, x3, #8
- rev64 v3.4S, v3.4S
- rev64 v2.4S, v2.4S
- ext v3.16B, v3.16B, v3.16B, #8
- ext v2.16B, v2.16B, v2.16B, #8
- fmul v16.4S, v0.4S, v3.4S
- fmul v17.4S, v1.4S, v2.4S
+ rev64 v3.4s, v3.4s
+ rev64 v2.4s, v2.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v2.16b, v2.16b, v2.16b, #8
+ fmul v16.4s, v0.4s, v3.4s
+ fmul v17.4s, v1.4s, v2.4s
b.eq 2f
- ld1 {v2.4S, v3.4S}, [x2], x4
- ld1 {v0.4S, v1.4S}, [x1], #32
- st1 {v16.4S, v17.4S}, [x0], #32
+ ld1 {v2.4s, v3.4s}, [x2], x4
+ ld1 {v0.4s, v1.4s}, [x1], #32
+ st1 {v16.4s, v17.4s}, [x0], #32
b 1b
-2: st1 {v16.4S, v17.4S}, [x0], #32
+2: st1 {v16.4s, v17.4s}, [x0], #32
ret
endfunc
function ff_butterflies_float_neon, export=1
-1: ld1 {v0.4S}, [x0]
- ld1 {v1.4S}, [x1]
+1: ld1 {v0.4s}, [x0]
+ ld1 {v1.4s}, [x1]
subs w2, w2, #4
- fsub v2.4S, v0.4S, v1.4S
- fadd v3.4S, v0.4S, v1.4S
- st1 {v2.4S}, [x1], #16
- st1 {v3.4S}, [x0], #16
+ fsub v2.4s, v0.4s, v1.4s
+ fadd v3.4s, v0.4s, v1.4s
+ st1 {v2.4s}, [x1], #16
+ st1 {v3.4s}, [x0], #16
b.gt 1b
ret
endfunc
function ff_scalarproduct_float_neon, export=1
- movi v2.4S, #0
-1: ld1 {v0.4S}, [x0], #16
- ld1 {v1.4S}, [x1], #16
+ movi v2.4s, #0
+1: ld1 {v0.4s}, [x0], #16
+ ld1 {v1.4s}, [x1], #16
subs w2, w2, #4
- fmla v2.4S, v0.4S, v1.4S
+ fmla v2.4s, v0.4s, v1.4s
b.gt 1b
- faddp v0.4S, v2.4S, v2.4S
- faddp s0, v0.2S
+ faddp v0.4s, v2.4s, v2.4s
+ faddp s0, v0.2s
ret
endfunc