about summary refs log tree commit diff stats
path: root/libavcodec/aarch64
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2023-10-17 14:16:24 +0300
committer Martin Storsjö <martin@martin.st> 2023-10-21 23:25:18 +0300
commit 184103b3105f02f1189fa0047af4269e027dfbd6 (patch)
tree 3e50ad549ed68292f91594c4e6fb26551de90369 /libavcodec/aarch64
parent 393d1ee541b143633bfba2ff0e821d734fd511c2 (diff)
download ffmpeg-184103b3105f02f1189fa0047af4269e027dfbd6.tar.gz
aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r-- libavcodec/aarch64/aacpsdsp_neon.S 194
-rw-r--r-- libavcodec/aarch64/h264cmc_neon.S 406
-rw-r--r-- libavcodec/aarch64/h264dsp_neon.S 594
-rw-r--r-- libavcodec/aarch64/h264idct_neon.S 390
-rw-r--r-- libavcodec/aarch64/h264qpel_neon.S 556
-rw-r--r-- libavcodec/aarch64/hpeldsp_neon.S 362
-rw-r--r-- libavcodec/aarch64/me_cmp_neon.S 2
-rw-r--r-- libavcodec/aarch64/neon.S 246
-rw-r--r-- libavcodec/aarch64/sbrdsp_neon.S 294
-rw-r--r-- libavcodec/aarch64/simple_idct_neon.S 386
10 files changed, 1715 insertions, 1715 deletions
diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
index ff4e6e244a..686c62eb2e 100644
--- a/libavcodec/aarch64/aacpsdsp_neon.S
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -19,82 +19,82 @@
#include "libavutil/aarch64/asm.S"
function ff_ps_add_squares_neon, export=1
-1: ld1 {v0.4S,v1.4S}, [x1], #32
- fmul v0.4S, v0.4S, v0.4S
- fmul v1.4S, v1.4S, v1.4S
- faddp v2.4S, v0.4S, v1.4S
- ld1 {v3.4S}, [x0]
- fadd v3.4S, v3.4S, v2.4S
- st1 {v3.4S}, [x0], #16
+1: ld1 {v0.4s,v1.4s}, [x1], #32
+ fmul v0.4s, v0.4s, v0.4s
+ fmul v1.4s, v1.4s, v1.4s
+ faddp v2.4s, v0.4s, v1.4s
+ ld1 {v3.4s}, [x0]
+ fadd v3.4s, v3.4s, v2.4s
+ st1 {v3.4s}, [x0], #16
subs w2, w2, #4
b.gt 1b
ret
endfunc
function ff_ps_mul_pair_single_neon, export=1
-1: ld1 {v0.4S,v1.4S}, [x1], #32
- ld1 {v2.4S}, [x2], #16
- zip1 v3.4S, v2.4S, v2.4S
- zip2 v4.4S, v2.4S, v2.4S
- fmul v0.4S, v0.4S, v3.4S
- fmul v1.4S, v1.4S, v4.4S
- st1 {v0.4S,v1.4S}, [x0], #32
+1: ld1 {v0.4s,v1.4s}, [x1], #32
+ ld1 {v2.4s}, [x2], #16
+ zip1 v3.4s, v2.4s, v2.4s
+ zip2 v4.4s, v2.4s, v2.4s
+ fmul v0.4s, v0.4s, v3.4s
+ fmul v1.4s, v1.4s, v4.4s
+ st1 {v0.4s,v1.4s}, [x0], #32
subs w3, w3, #4
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_neon, export=1
- ld1 {v0.4S}, [x2]
- ld1 {v1.4S}, [x3]
- zip1 v4.4S, v0.4S, v0.4S
- zip2 v5.4S, v0.4S, v0.4S
- zip1 v6.4S, v1.4S, v1.4S
- zip2 v7.4S, v1.4S, v1.4S
-1: ld1 {v2.2S}, [x0]
- ld1 {v3.2S}, [x1]
- fadd v4.4S, v4.4S, v6.4S
- fadd v5.4S, v5.4S, v7.4S
- mov v2.D[1], v2.D[0]
- mov v3.D[1], v3.D[0]
- fmul v2.4S, v2.4S, v4.4S
- fmla v2.4S, v3.4S, v5.4S
- st1 {v2.D}[0], [x0], #8
- st1 {v2.D}[1], [x1], #8
+ ld1 {v0.4s}, [x2]
+ ld1 {v1.4s}, [x3]
+ zip1 v4.4s, v0.4s, v0.4s
+ zip2 v5.4s, v0.4s, v0.4s
+ zip1 v6.4s, v1.4s, v1.4s
+ zip2 v7.4s, v1.4s, v1.4s
+1: ld1 {v2.2s}, [x0]
+ ld1 {v3.2s}, [x1]
+ fadd v4.4s, v4.4s, v6.4s
+ fadd v5.4s, v5.4s, v7.4s
+ mov v2.d[1], v2.d[0]
+ mov v3.d[1], v3.d[0]
+ fmul v2.4s, v2.4s, v4.4s
+ fmla v2.4s, v3.4s, v5.4s
+ st1 {v2.d}[0], [x0], #8
+ st1 {v2.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
- ld1 {v0.4S,v1.4S}, [x2]
- ld1 {v6.4S,v7.4S}, [x3]
- fneg v2.4S, v1.4S
- fneg v3.4S, v7.4S
- zip1 v16.4S, v0.4S, v0.4S
- zip2 v17.4S, v0.4S, v0.4S
- zip1 v18.4S, v2.4S, v1.4S
- zip2 v19.4S, v2.4S, v1.4S
- zip1 v20.4S, v6.4S, v6.4S
- zip2 v21.4S, v6.4S, v6.4S
- zip1 v22.4S, v3.4S, v7.4S
- zip2 v23.4S, v3.4S, v7.4S
-1: ld1 {v2.2S}, [x0]
- ld1 {v3.2S}, [x1]
- fadd v16.4S, v16.4S, v20.4S
- fadd v17.4S, v17.4S, v21.4S
- mov v2.D[1], v2.D[0]
- mov v3.D[1], v3.D[0]
- fmul v4.4S, v2.4S, v16.4S
- fmla v4.4S, v3.4S, v17.4S
- fadd v18.4S, v18.4S, v22.4S
- fadd v19.4S, v19.4S, v23.4S
- ext v2.16B, v2.16B, v2.16B, #4
- ext v3.16B, v3.16B, v3.16B, #4
- fmla v4.4S, v2.4S, v18.4S
- fmla v4.4S, v3.4S, v19.4S
- st1 {v4.D}[0], [x0], #8
- st1 {v4.D}[1], [x1], #8
+ ld1 {v0.4s,v1.4s}, [x2]
+ ld1 {v6.4s,v7.4s}, [x3]
+ fneg v2.4s, v1.4s
+ fneg v3.4s, v7.4s
+ zip1 v16.4s, v0.4s, v0.4s
+ zip2 v17.4s, v0.4s, v0.4s
+ zip1 v18.4s, v2.4s, v1.4s
+ zip2 v19.4s, v2.4s, v1.4s
+ zip1 v20.4s, v6.4s, v6.4s
+ zip2 v21.4s, v6.4s, v6.4s
+ zip1 v22.4s, v3.4s, v7.4s
+ zip2 v23.4s, v3.4s, v7.4s
+1: ld1 {v2.2s}, [x0]
+ ld1 {v3.2s}, [x1]
+ fadd v16.4s, v16.4s, v20.4s
+ fadd v17.4s, v17.4s, v21.4s
+ mov v2.d[1], v2.d[0]
+ mov v3.d[1], v3.d[0]
+ fmul v4.4s, v2.4s, v16.4s
+ fmla v4.4s, v3.4s, v17.4s
+ fadd v18.4s, v18.4s, v22.4s
+ fadd v19.4s, v19.4s, v23.4s
+ ext v2.16b, v2.16b, v2.16b, #4
+ ext v3.16b, v3.16b, v3.16b, #4
+ fmla v4.4s, v2.4s, v18.4s
+ fmla v4.4s, v3.4s, v19.4s
+ st1 {v4.d}[0], [x0], #8
+ st1 {v4.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
@@ -102,46 +102,46 @@ endfunc
function ff_ps_hybrid_analysis_neon, export=1
lsl x3, x3, #3
- ld2 {v0.4S,v1.4S}, [x1], #32
- ld2 {v2.2S,v3.2S}, [x1], #16
- ld1 {v24.2S}, [x1], #8
- ld2 {v4.2S,v5.2S}, [x1], #16
- ld2 {v6.4S,v7.4S}, [x1]
- rev64 v6.4S, v6.4S
- rev64 v7.4S, v7.4S
- ext v6.16B, v6.16B, v6.16B, #8
- ext v7.16B, v7.16B, v7.16B, #8
- rev64 v4.2S, v4.2S
- rev64 v5.2S, v5.2S
- mov v2.D[1], v3.D[0]
- mov v4.D[1], v5.D[0]
- mov v5.D[1], v2.D[0]
- mov v3.D[1], v4.D[0]
- fadd v16.4S, v0.4S, v6.4S
- fadd v17.4S, v1.4S, v7.4S
- fsub v18.4S, v1.4S, v7.4S
- fsub v19.4S, v0.4S, v6.4S
- fadd v22.4S, v2.4S, v4.4S
- fsub v23.4S, v5.4S, v3.4S
- trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5}
- trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7}
-1: ld2 {v2.4S,v3.4S}, [x2], #32
- ld2 {v4.2S,v5.2S}, [x2], #16
- ld1 {v6.2S}, [x2], #8
+ ld2 {v0.4s,v1.4s}, [x1], #32
+ ld2 {v2.2s,v3.2s}, [x1], #16
+ ld1 {v24.2s}, [x1], #8
+ ld2 {v4.2s,v5.2s}, [x1], #16
+ ld2 {v6.4s,v7.4s}, [x1]
+ rev64 v6.4s, v6.4s
+ rev64 v7.4s, v7.4s
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #8
+ rev64 v4.2s, v4.2s
+ rev64 v5.2s, v5.2s
+ mov v2.d[1], v3.d[0]
+ mov v4.d[1], v5.d[0]
+ mov v5.d[1], v2.d[0]
+ mov v3.d[1], v4.d[0]
+ fadd v16.4s, v0.4s, v6.4s
+ fadd v17.4s, v1.4s, v7.4s
+ fsub v18.4s, v1.4s, v7.4s
+ fsub v19.4s, v0.4s, v6.4s
+ fadd v22.4s, v2.4s, v4.4s
+ fsub v23.4s, v5.4s, v3.4s
+ trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
+ trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
+1: ld2 {v2.4s,v3.4s}, [x2], #32
+ ld2 {v4.2s,v5.2s}, [x2], #16
+ ld1 {v6.2s}, [x2], #8
add x2, x2, #8
- mov v4.D[1], v5.D[0]
- mov v6.S[1], v6.S[0]
- fmul v6.2S, v6.2S, v24.2S
- fmul v0.4S, v2.4S, v16.4S
- fmul v1.4S, v2.4S, v17.4S
- fmls v0.4S, v3.4S, v18.4S
- fmla v1.4S, v3.4S, v19.4S
- fmla v0.4S, v4.4S, v20.4S
- fmla v1.4S, v4.4S, v21.4S
- faddp v0.4S, v0.4S, v1.4S
- faddp v0.4S, v0.4S, v0.4S
- fadd v0.2S, v0.2S, v6.2S
- st1 {v0.2S}, [x0], x3
+ mov v4.d[1], v5.d[0]
+ mov v6.s[1], v6.s[0]
+ fmul v6.2s, v6.2s, v24.2s
+ fmul v0.4s, v2.4s, v16.4s
+ fmul v1.4s, v2.4s, v17.4s
+ fmls v0.4s, v3.4s, v18.4s
+ fmla v1.4s, v3.4s, v19.4s
+ fmla v0.4s, v4.4s, v20.4s
+ fmla v1.4s, v4.4s, v21.4s
+ faddp v0.4s, v0.4s, v1.4s
+ faddp v0.4s, v0.4s, v0.4s
+ fadd v0.2s, v0.2s, v6.2s
+ st1 {v0.2s}, [x0], x3
subs w4, w4, #1
b.gt 1b
ret
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index 88ccd727d0..5b959b87d3 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -39,10 +39,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
- ld1r {v22.8H}, [x6]
+ ld1r {v22.8h}, [x6]
.endif
.ifc \codec,vc1
- movi v22.8H, #28
+ movi v22.8h, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
@@ -55,139 +55,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
add w4, w4, #64
b.eq 2f
- dup v0.8B, w4
- dup v1.8B, w12
- ld1 {v4.8B, v5.8B}, [x1], x2
- dup v2.8B, w6
- dup v3.8B, w7
- ext v5.8B, v4.8B, v5.8B, #1
-1: ld1 {v6.8B, v7.8B}, [x1], x2
- umull v16.8H, v4.8B, v0.8B
- umlal v16.8H, v5.8B, v1.8B
- ext v7.8B, v6.8B, v7.8B, #1
- ld1 {v4.8B, v5.8B}, [x1], x2
- umlal v16.8H, v6.8B, v2.8B
+ dup v0.8b, w4
+ dup v1.8b, w12
+ ld1 {v4.8b, v5.8b}, [x1], x2
+ dup v2.8b, w6
+ dup v3.8b, w7
+ ext v5.8b, v4.8b, v5.8b, #1
+1: ld1 {v6.8b, v7.8b}, [x1], x2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v5.8b, v1.8b
+ ext v7.8b, v6.8b, v7.8b, #1
+ ld1 {v4.8b, v5.8b}, [x1], x2
+ umlal v16.8h, v6.8b, v2.8b
prfm pldl1strm, [x1]
- ext v5.8B, v4.8B, v5.8B, #1
- umlal v16.8H, v7.8B, v3.8B
- umull v17.8H, v6.8B, v0.8B
+ ext v5.8b, v4.8b, v5.8b, #1
+ umlal v16.8h, v7.8b, v3.8b
+ umull v17.8h, v6.8b, v0.8b
subs w3, w3, #2
- umlal v17.8H, v7.8B, v1.8B
- umlal v17.8H, v4.8B, v2.8B
- umlal v17.8H, v5.8B, v3.8B
+ umlal v17.8h, v7.8b, v1.8b
+ umlal v17.8h, v4.8b, v2.8b
+ umlal v17.8h, v5.8b, v3.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
- rshrn v16.8B, v16.8H, #6
- rshrn v17.8B, v17.8H, #6
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
.else
- add v16.8H, v16.8H, v22.8H
- add v17.8H, v17.8H, v22.8H
- shrn v16.8B, v16.8H, #6
- shrn v17.8B, v17.8H, #6
+ add v16.8h, v16.8h, v22.8h
+ add v17.8h, v17.8h, v22.8h
+ shrn v16.8b, v16.8h, #6
+ shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.8B}, [x8], x2
- ld1 {v21.8B}, [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
- urhadd v17.8B, v17.8B, v21.8B
+ ld1 {v20.8b}, [x8], x2
+ ld1 {v21.8b}, [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
+ urhadd v17.8b, v17.8b, v21.8b
.endif
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
- dup v0.8B, w4
+ dup v0.8b, w4
b.eq 5f
tst w6, w6
- dup v1.8B, w12
+ dup v1.8b, w12
b.eq 4f
- ld1 {v4.8B}, [x1], x2
-3: ld1 {v6.8B}, [x1], x2
- umull v16.8H, v4.8B, v0.8B
- umlal v16.8H, v6.8B, v1.8B
- ld1 {v4.8B}, [x1], x2
- umull v17.8H, v6.8B, v0.8B
- umlal v17.8H, v4.8B, v1.8B
+ ld1 {v4.8b}, [x1], x2
+3: ld1 {v6.8b}, [x1], x2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v6.8b, v1.8b
+ ld1 {v4.8b}, [x1], x2
+ umull v17.8h, v6.8b, v0.8b
+ umlal v17.8h, v4.8b, v1.8b
prfm pldl1strm, [x1]
.ifc \codec,h264
- rshrn v16.8B, v16.8H, #6
- rshrn v17.8B, v17.8H, #6
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
.else
- add v16.8H, v16.8H, v22.8H
- add v17.8H, v17.8H, v22.8H
- shrn v16.8B, v16.8H, #6
- shrn v17.8B, v17.8H, #6
+ add v16.8h, v16.8h, v22.8h
+ add v17.8h, v17.8h, v22.8h
+ shrn v16.8b, v16.8h, #6
+ shrn v17.8b, v17.8h, #6
.endif
prfm pldl1strm, [x1, x2]
.ifc \type,avg
- ld1 {v20.8B}, [x8], x2
- ld1 {v21.8B}, [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
- urhadd v17.8B, v17.8B, v21.8B
+ ld1 {v20.8b}, [x8], x2
+ ld1 {v21.8b}, [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
+ urhadd v17.8b, v17.8b, v21.8b
.endif
subs w3, w3, #2
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
b.gt 3b
ret
-4: ld1 {v4.8B, v5.8B}, [x1], x2
- ld1 {v6.8B, v7.8B}, [x1], x2
- ext v5.8B, v4.8B, v5.8B, #1
- ext v7.8B, v6.8B, v7.8B, #1
+4: ld1 {v4.8b, v5.8b}, [x1], x2
+ ld1 {v6.8b, v7.8b}, [x1], x2
+ ext v5.8b, v4.8b, v5.8b, #1
+ ext v7.8b, v6.8b, v7.8b, #1
prfm pldl1strm, [x1]
subs w3, w3, #2
- umull v16.8H, v4.8B, v0.8B
- umlal v16.8H, v5.8B, v1.8B
- umull v17.8H, v6.8B, v0.8B
- umlal v17.8H, v7.8B, v1.8B
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v5.8b, v1.8b
+ umull v17.8h, v6.8b, v0.8b
+ umlal v17.8h, v7.8b, v1.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
- rshrn v16.8B, v16.8H, #6
- rshrn v17.8B, v17.8H, #6
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
.else
- add v16.8H, v16.8H, v22.8H
- add v17.8H, v17.8H, v22.8H
- shrn v16.8B, v16.8H, #6
- shrn v17.8B, v17.8H, #6
+ add v16.8h, v16.8h, v22.8h
+ add v17.8h, v17.8h, v22.8h
+ shrn v16.8b, v16.8h, #6
+ shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.8B}, [x8], x2
- ld1 {v21.8B}, [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
- urhadd v17.8B, v17.8B, v21.8B
+ ld1 {v20.8b}, [x8], x2
+ ld1 {v21.8b}, [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
+ urhadd v17.8b, v17.8b, v21.8b
.endif
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
b.gt 4b
ret
-5: ld1 {v4.8B}, [x1], x2
- ld1 {v5.8B}, [x1], x2
+5: ld1 {v4.8b}, [x1], x2
+ ld1 {v5.8b}, [x1], x2
prfm pldl1strm, [x1]
subs w3, w3, #2
- umull v16.8H, v4.8B, v0.8B
- umull v17.8H, v5.8B, v0.8B
+ umull v16.8h, v4.8b, v0.8b
+ umull v17.8h, v5.8b, v0.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
- rshrn v16.8B, v16.8H, #6
- rshrn v17.8B, v17.8H, #6
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
.else
- add v16.8H, v16.8H, v22.8H
- add v17.8H, v17.8H, v22.8H
- shrn v16.8B, v16.8H, #6
- shrn v17.8B, v17.8H, #6
+ add v16.8h, v16.8h, v22.8h
+ add v17.8h, v17.8h, v22.8h
+ shrn v16.8b, v16.8h, #6
+ shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.8B}, [x8], x2
- ld1 {v21.8B}, [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
- urhadd v17.8B, v17.8B, v21.8B
+ ld1 {v20.8b}, [x8], x2
+ ld1 {v21.8b}, [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
+ urhadd v17.8b, v17.8b, v21.8b
.endif
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
b.gt 5b
ret
endfunc
@@ -209,10 +209,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
- ld1r {v22.8H}, [x6]
+ ld1r {v22.8h}, [x6]
.endif
.ifc \codec,vc1
- movi v22.8H, #28
+ movi v22.8h, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
@@ -225,133 +225,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
add w4, w4, #64
b.eq 2f
- dup v24.8B, w4
- dup v25.8B, w12
- ld1 {v4.8B}, [x1], x2
- dup v26.8B, w6
- dup v27.8B, w7
- ext v5.8B, v4.8B, v5.8B, #1
- trn1 v0.2S, v24.2S, v25.2S
- trn1 v2.2S, v26.2S, v27.2S
- trn1 v4.2S, v4.2S, v5.2S
-1: ld1 {v6.8B}, [x1], x2
- ext v7.8B, v6.8B, v7.8B, #1
- trn1 v6.2S, v6.2S, v7.2S
- umull v18.8H, v4.8B, v0.8B
- umlal v18.8H, v6.8B, v2.8B
- ld1 {v4.8B}, [x1], x2
- ext v5.8B, v4.8B, v5.8B, #1
- trn1 v4.2S, v4.2S, v5.2S
+ dup v24.8b, w4
+ dup v25.8b, w12
+ ld1 {v4.8b}, [x1], x2
+ dup v26.8b, w6
+ dup v27.8b, w7
+ ext v5.8b, v4.8b, v5.8b, #1
+ trn1 v0.2s, v24.2s, v25.2s
+ trn1 v2.2s, v26.2s, v27.2s
+ trn1 v4.2s, v4.2s, v5.2s
+1: ld1 {v6.8b}, [x1], x2
+ ext v7.8b, v6.8b, v7.8b, #1
+ trn1 v6.2s, v6.2s, v7.2s
+ umull v18.8h, v4.8b, v0.8b
+ umlal v18.8h, v6.8b, v2.8b
+ ld1 {v4.8b}, [x1], x2
+ ext v5.8b, v4.8b, v5.8b, #1
+ trn1 v4.2s, v4.2s, v5.2s
prfm pldl1strm, [x1]
- umull v19.8H, v6.8B, v0.8B
- umlal v19.8H, v4.8B, v2.8B
- trn1 v30.2D, v18.2D, v19.2D
- trn2 v31.2D, v18.2D, v19.2D
- add v18.8H, v30.8H, v31.8H
+ umull v19.8h, v6.8b, v0.8b
+ umlal v19.8h, v4.8b, v2.8b
+ trn1 v30.2d, v18.2d, v19.2d
+ trn2 v31.2d, v18.2d, v19.2d
+ add v18.8h, v30.8h, v31.8h
.ifc \codec,h264
- rshrn v16.8B, v18.8H, #6
+ rshrn v16.8b, v18.8h, #6
.else
- add v18.8H, v18.8H, v22.8H
- shrn v16.8B, v18.8H, #6
+ add v18.8h, v18.8h, v22.8h
+ shrn v16.8b, v18.8h, #6
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
.ifc \type,avg
- ld1 {v20.S}[0], [x8], x2
- ld1 {v20.S}[1], [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
+ ld1 {v20.s}[0], [x8], x2
+ ld1 {v20.s}[1], [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
.endif
- st1 {v16.S}[0], [x0], x2
- st1 {v16.S}[1], [x0], x2
+ st1 {v16.s}[0], [x0], x2
+ st1 {v16.s}[1], [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
- dup v30.8B, w4
+ dup v30.8b, w4
b.eq 5f
tst w6, w6
- dup v31.8B, w12
- trn1 v0.2S, v30.2S, v31.2S
- trn2 v1.2S, v30.2S, v31.2S
+ dup v31.8b, w12
+ trn1 v0.2s, v30.2s, v31.2s
+ trn2 v1.2s, v30.2s, v31.2s
b.eq 4f
- ext v1.8B, v0.8B, v1.8B, #4
- ld1 {v4.S}[0], [x1], x2
-3: ld1 {v4.S}[1], [x1], x2
- umull v18.8H, v4.8B, v0.8B
- ld1 {v4.S}[0], [x1], x2
- umull v19.8H, v4.8B, v1.8B
- trn1 v30.2D, v18.2D, v19.2D
- trn2 v31.2D, v18.2D, v19.2D
- add v18.8H, v30.8H, v31.8H
+ ext v1.8b, v0.8b, v1.8b, #4
+ ld1 {v4.s}[0], [x1], x2
+3: ld1 {v4.s}[1], [x1], x2
+ umull v18.8h, v4.8b, v0.8b
+ ld1 {v4.s}[0], [x1], x2
+ umull v19.8h, v4.8b, v1.8b
+ trn1 v30.2d, v18.2d, v19.2d
+ trn2 v31.2d, v18.2d, v19.2d
+ add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1]
.ifc \codec,h264
- rshrn v16.8B, v18.8H, #6
+ rshrn v16.8b, v18.8h, #6
.else
- add v18.8H, v18.8H, v22.8H
- shrn v16.8B, v18.8H, #6
+ add v18.8h, v18.8h, v22.8h
+ shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.S}[0], [x8], x2
- ld1 {v20.S}[1], [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
+ ld1 {v20.s}[0], [x8], x2
+ ld1 {v20.s}[1], [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
- st1 {v16.S}[0], [x0], x2
- st1 {v16.S}[1], [x0], x2
+ st1 {v16.s}[0], [x0], x2
+ st1 {v16.s}[1], [x0], x2
b.gt 3b
ret
-4: ld1 {v4.8B}, [x1], x2
- ld1 {v6.8B}, [x1], x2
- ext v5.8B, v4.8B, v5.8B, #1
- ext v7.8B, v6.8B, v7.8B, #1
- trn1 v4.2S, v4.2S, v5.2S
- trn1 v6.2S, v6.2S, v7.2S
- umull v18.8H, v4.8B, v0.8B
- umull v19.8H, v6.8B, v0.8B
+4: ld1 {v4.8b}, [x1], x2
+ ld1 {v6.8b}, [x1], x2
+ ext v5.8b, v4.8b, v5.8b, #1
+ ext v7.8b, v6.8b, v7.8b, #1
+ trn1 v4.2s, v4.2s, v5.2s
+ trn1 v6.2s, v6.2s, v7.2s
+ umull v18.8h, v4.8b, v0.8b
+ umull v19.8h, v6.8b, v0.8b
subs w3, w3, #2
- trn1 v30.2D, v18.2D, v19.2D
- trn2 v31.2D, v18.2D, v19.2D
- add v18.8H, v30.8H, v31.8H
+ trn1 v30.2d, v18.2d, v19.2d
+ trn2 v31.2d, v18.2d, v19.2d
+ add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1]
.ifc \codec,h264
- rshrn v16.8B, v18.8H, #6
+ rshrn v16.8b, v18.8h, #6
.else
- add v18.8H, v18.8H, v22.8H
- shrn v16.8B, v18.8H, #6
+ add v18.8h, v18.8h, v22.8h
+ shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.S}[0], [x8], x2
- ld1 {v20.S}[1], [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
+ ld1 {v20.s}[0], [x8], x2
+ ld1 {v20.s}[1], [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
.endif
prfm pldl1strm, [x1]
- st1 {v16.S}[0], [x0], x2
- st1 {v16.S}[1], [x0], x2
+ st1 {v16.s}[0], [x0], x2
+ st1 {v16.s}[1], [x0], x2
b.gt 4b
ret
-5: ld1 {v4.S}[0], [x1], x2
- ld1 {v4.S}[1], [x1], x2
- umull v18.8H, v4.8B, v30.8B
+5: ld1 {v4.s}[0], [x1], x2
+ ld1 {v4.s}[1], [x1], x2
+ umull v18.8h, v4.8b, v30.8b
subs w3, w3, #2
prfm pldl1strm, [x1]
.ifc \codec,h264
- rshrn v16.8B, v18.8H, #6
+ rshrn v16.8b, v18.8h, #6
.else
- add v18.8H, v18.8H, v22.8H
- shrn v16.8B, v18.8H, #6
+ add v18.8h, v18.8h, v22.8h
+ shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
- ld1 {v20.S}[0], [x8], x2
- ld1 {v20.S}[1], [x8], x2
- urhadd v16.8B, v16.8B, v20.8B
+ ld1 {v20.s}[0], [x8], x2
+ ld1 {v20.s}[1], [x8], x2
+ urhadd v16.8b, v16.8b, v20.8b
.endif
prfm pldl1strm, [x1]
- st1 {v16.S}[0], [x0], x2
- st1 {v16.S}[1], [x0], x2
+ st1 {v16.s}[0], [x0], x2
+ st1 {v16.s}[1], [x0], x2
b.gt 5b
ret
endfunc
@@ -372,51 +372,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
sub w4, w7, w13
sub w4, w4, w14
add w4, w4, #64
- dup v0.8B, w4
- dup v2.8B, w12
- dup v1.8B, w6
- dup v3.8B, w7
- trn1 v0.4H, v0.4H, v2.4H
- trn1 v1.4H, v1.4H, v3.4H
+ dup v0.8b, w4
+ dup v2.8b, w12
+ dup v1.8b, w6
+ dup v3.8b, w7
+ trn1 v0.4h, v0.4h, v2.4h
+ trn1 v1.4h, v1.4h, v3.4h
1:
- ld1 {v4.S}[0], [x1], x2
- ld1 {v4.S}[1], [x1], x2
- rev64 v5.2S, v4.2S
- ld1 {v5.S}[1], [x1]
- ext v6.8B, v4.8B, v5.8B, #1
- ext v7.8B, v5.8B, v4.8B, #1
- trn1 v4.4H, v4.4H, v6.4H
- trn1 v5.4H, v5.4H, v7.4H
- umull v16.8H, v4.8B, v0.8B
- umlal v16.8H, v5.8B, v1.8B
+ ld1 {v4.s}[0], [x1], x2
+ ld1 {v4.s}[1], [x1], x2
+ rev64 v5.2s, v4.2s
+ ld1 {v5.s}[1], [x1]
+ ext v6.8b, v4.8b, v5.8b, #1
+ ext v7.8b, v5.8b, v4.8b, #1
+ trn1 v4.4h, v4.4h, v6.4h
+ trn1 v5.4h, v5.4h, v7.4h
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v5.8b, v1.8b
.ifc \type,avg
- ld1 {v18.H}[0], [x0], x2
- ld1 {v18.H}[2], [x0]
+ ld1 {v18.h}[0], [x0], x2
+ ld1 {v18.h}[2], [x0]
sub x0, x0, x2
.endif
- rev64 v17.4S, v16.4S
- add v16.8H, v16.8H, v17.8H
- rshrn v16.8B, v16.8H, #6
+ rev64 v17.4s, v16.4s
+ add v16.8h, v16.8h, v17.8h
+ rshrn v16.8b, v16.8h, #6
.ifc \type,avg
- urhadd v16.8B, v16.8B, v18.8B
+ urhadd v16.8b, v16.8b, v18.8b
.endif
- st1 {v16.H}[0], [x0], x2
- st1 {v16.H}[2], [x0], x2
+ st1 {v16.h}[0], [x0], x2
+ st1 {v16.h}[2], [x0], x2
subs w3, w3, #2
b.gt 1b
ret
2:
- ld1 {v16.H}[0], [x1], x2
- ld1 {v16.H}[1], [x1], x2
+ ld1 {v16.h}[0], [x1], x2
+ ld1 {v16.h}[1], [x1], x2
.ifc \type,avg
- ld1 {v18.H}[0], [x0], x2
- ld1 {v18.H}[1], [x0]
+ ld1 {v18.h}[0], [x0], x2
+ ld1 {v18.h}[1], [x0]
sub x0, x0, x2
- urhadd v16.8B, v16.8B, v18.8B
+ urhadd v16.8b, v16.8b, v18.8b
.endif
- st1 {v16.H}[0], [x0], x2
- st1 {v16.H}[1], [x0], x2
+ st1 {v16.h}[0], [x0], x2
+ st1 {v16.h}[1], [x0], x2
subs w3, w3, #2
b.gt 2b
ret
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index ea221e6862..71c2ddfd0c 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -27,7 +27,7 @@
cmp w2, #0
ldr w6, [x4]
ccmp w3, #0, #0, ne
- mov v24.S[0], w6
+ mov v24.s[0], w6
and w8, w6, w6, lsl #16
b.eq 1f
ands w8, w8, w8, lsl #8
@@ -38,95 +38,95 @@
.endm
.macro h264_loop_filter_luma
- dup v22.16B, w2 // alpha
- uxtl v24.8H, v24.8B
- uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
- uxtl v24.4S, v24.4H
- uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
- sli v24.8H, v24.8H, #8
- uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
- sli v24.4S, v24.4S, #16
- cmhi v21.16B, v22.16B, v21.16B // < alpha
- dup v22.16B, w3 // beta
- cmlt v23.16B, v24.16B, #0
- cmhi v28.16B, v22.16B, v28.16B // < beta
- cmhi v30.16B, v22.16B, v30.16B // < beta
- bic v21.16B, v21.16B, v23.16B
- uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
- and v21.16B, v21.16B, v28.16B
- uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
- and v21.16B, v21.16B, v30.16B // < beta
+ dup v22.16b, w2 // alpha
+ uxtl v24.8h, v24.8b
+ uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
+ uxtl v24.4s, v24.4h
+ uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+ sli v24.8h, v24.8h, #8
+ uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+ sli v24.4s, v24.4s, #16
+ cmhi v21.16b, v22.16b, v21.16b // < alpha
+ dup v22.16b, w3 // beta
+ cmlt v23.16b, v24.16b, #0
+ cmhi v28.16b, v22.16b, v28.16b // < beta
+ cmhi v30.16b, v22.16b, v30.16b // < beta
+ bic v21.16b, v21.16b, v23.16b
+ uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
+ and v21.16b, v21.16b, v28.16b
+ uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
+ and v21.16b, v21.16b, v30.16b // < beta
shrn v30.8b, v21.8h, #4
mov x7, v30.d[0]
- cmhi v17.16B, v22.16B, v17.16B // < beta
- cmhi v19.16B, v22.16B, v19.16B // < beta
+ cmhi v17.16b, v22.16b, v17.16b // < beta
+ cmhi v19.16b, v22.16b, v19.16b // < beta
cbz x7, 9f
- and v17.16B, v17.16B, v21.16B
- and v19.16B, v19.16B, v21.16B
- and v24.16B, v24.16B, v21.16B
- urhadd v28.16B, v16.16B, v0.16B
- sub v21.16B, v24.16B, v17.16B
- uqadd v23.16B, v18.16B, v24.16B
- uhadd v20.16B, v20.16B, v28.16B
- sub v21.16B, v21.16B, v19.16B
- uhadd v28.16B, v4.16B, v28.16B
- umin v23.16B, v23.16B, v20.16B
- uqsub v22.16B, v18.16B, v24.16B
- uqadd v4.16B, v2.16B, v24.16B
- umax v23.16B, v23.16B, v22.16B
- uqsub v22.16B, v2.16B, v24.16B
- umin v28.16B, v4.16B, v28.16B
- uxtl v4.8H, v0.8B
- umax v28.16B, v28.16B, v22.16B
- uxtl2 v20.8H, v0.16B
- usubw v4.8H, v4.8H, v16.8B
- usubw2 v20.8H, v20.8H, v16.16B
- shl v4.8H, v4.8H, #2
- shl v20.8H, v20.8H, #2
- uaddw v4.8H, v4.8H, v18.8B
- uaddw2 v20.8H, v20.8H, v18.16B
- usubw v4.8H, v4.8H, v2.8B
- usubw2 v20.8H, v20.8H, v2.16B
- rshrn v4.8B, v4.8H, #3
- rshrn2 v4.16B, v20.8H, #3
- bsl v17.16B, v23.16B, v18.16B
- bsl v19.16B, v28.16B, v2.16B
- neg v23.16B, v21.16B
- uxtl v28.8H, v16.8B
- smin v4.16B, v4.16B, v21.16B
- uxtl2 v21.8H, v16.16B
- smax v4.16B, v4.16B, v23.16B
- uxtl v22.8H, v0.8B
- uxtl2 v24.8H, v0.16B
- saddw v28.8H, v28.8H, v4.8B
- saddw2 v21.8H, v21.8H, v4.16B
- ssubw v22.8H, v22.8H, v4.8B
- ssubw2 v24.8H, v24.8H, v4.16B
- sqxtun v16.8B, v28.8H
- sqxtun2 v16.16B, v21.8H
- sqxtun v0.8B, v22.8H
- sqxtun2 v0.16B, v24.8H
+ and v17.16b, v17.16b, v21.16b
+ and v19.16b, v19.16b, v21.16b
+ and v24.16b, v24.16b, v21.16b
+ urhadd v28.16b, v16.16b, v0.16b
+ sub v21.16b, v24.16b, v17.16b
+ uqadd v23.16b, v18.16b, v24.16b
+ uhadd v20.16b, v20.16b, v28.16b
+ sub v21.16b, v21.16b, v19.16b
+ uhadd v28.16b, v4.16b, v28.16b
+ umin v23.16b, v23.16b, v20.16b
+ uqsub v22.16b, v18.16b, v24.16b
+ uqadd v4.16b, v2.16b, v24.16b
+ umax v23.16b, v23.16b, v22.16b
+ uqsub v22.16b, v2.16b, v24.16b
+ umin v28.16b, v4.16b, v28.16b
+ uxtl v4.8h, v0.8b
+ umax v28.16b, v28.16b, v22.16b
+ uxtl2 v20.8h, v0.16b
+ usubw v4.8h, v4.8h, v16.8b
+ usubw2 v20.8h, v20.8h, v16.16b
+ shl v4.8h, v4.8h, #2
+ shl v20.8h, v20.8h, #2
+ uaddw v4.8h, v4.8h, v18.8b
+ uaddw2 v20.8h, v20.8h, v18.16b
+ usubw v4.8h, v4.8h, v2.8b
+ usubw2 v20.8h, v20.8h, v2.16b
+ rshrn v4.8b, v4.8h, #3
+ rshrn2 v4.16b, v20.8h, #3
+ bsl v17.16b, v23.16b, v18.16b
+ bsl v19.16b, v28.16b, v2.16b
+ neg v23.16b, v21.16b
+ uxtl v28.8h, v16.8b
+ smin v4.16b, v4.16b, v21.16b
+ uxtl2 v21.8h, v16.16b
+ smax v4.16b, v4.16b, v23.16b
+ uxtl v22.8h, v0.8b
+ uxtl2 v24.8h, v0.16b
+ saddw v28.8h, v28.8h, v4.8b
+ saddw2 v21.8h, v21.8h, v4.16b
+ ssubw v22.8h, v22.8h, v4.8b
+ ssubw2 v24.8h, v24.8h, v4.16b
+ sqxtun v16.8b, v28.8h
+ sqxtun2 v16.16b, v21.8h
+ sqxtun v0.8b, v22.8h
+ sqxtun2 v0.16b, v24.8h
.endm
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
- ld1 {v0.16B}, [x0], x1
- ld1 {v2.16B}, [x0], x1
- ld1 {v4.16B}, [x0], x1
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v2.16b}, [x0], x1
+ ld1 {v4.16b}, [x0], x1
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
- ld1 {v20.16B}, [x0], x1
- ld1 {v18.16B}, [x0], x1
- ld1 {v16.16B}, [x0], x1
+ ld1 {v20.16b}, [x0], x1
+ ld1 {v18.16b}, [x0], x1
+ ld1 {v16.16b}, [x0], x1
h264_loop_filter_luma
sub x0, x0, x1, lsl #1
- st1 {v17.16B}, [x0], x1
- st1 {v16.16B}, [x0], x1
- st1 {v0.16B}, [x0], x1
- st1 {v19.16B}, [x0]
+ st1 {v17.16b}, [x0], x1
+ st1 {v16.16b}, [x0], x1
+ st1 {v0.16b}, [x0], x1
+ st1 {v19.16b}, [x0]
9:
ret
endfunc
@@ -135,22 +135,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
- ld1 {v6.8B}, [x0], x1
- ld1 {v20.8B}, [x0], x1
- ld1 {v18.8B}, [x0], x1
- ld1 {v16.8B}, [x0], x1
- ld1 {v0.8B}, [x0], x1
- ld1 {v2.8B}, [x0], x1
- ld1 {v4.8B}, [x0], x1
- ld1 {v26.8B}, [x0], x1
- ld1 {v6.D}[1], [x0], x1
- ld1 {v20.D}[1], [x0], x1
- ld1 {v18.D}[1], [x0], x1
- ld1 {v16.D}[1], [x0], x1
- ld1 {v0.D}[1], [x0], x1
- ld1 {v2.D}[1], [x0], x1
- ld1 {v4.D}[1], [x0], x1
- ld1 {v26.D}[1], [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v20.8b}, [x0], x1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x0], x1
+ ld1 {v16.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v26.d}[1], [x0], x1
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
@@ -160,22 +160,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
sub x0, x0, x1, lsl #4
add x0, x0, #2
- st1 {v17.S}[0], [x0], x1
- st1 {v16.S}[0], [x0], x1
- st1 {v0.S}[0], [x0], x1
- st1 {v19.S}[0], [x0], x1
- st1 {v17.S}[1], [x0], x1
- st1 {v16.S}[1], [x0], x1
- st1 {v0.S}[1], [x0], x1
- st1 {v19.S}[1], [x0], x1
- st1 {v17.S}[2], [x0], x1
- st1 {v16.S}[2], [x0], x1
- st1 {v0.S}[2], [x0], x1
- st1 {v19.S}[2], [x0], x1
- st1 {v17.S}[3], [x0], x1
- st1 {v16.S}[3], [x0], x1
- st1 {v0.S}[3], [x0], x1
- st1 {v19.S}[3], [x0], x1
+ st1 {v17.s}[0], [x0], x1
+ st1 {v16.s}[0], [x0], x1
+ st1 {v0.s}[0], [x0], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v17.s}[1], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ st1 {v0.s}[1], [x0], x1
+ st1 {v19.s}[1], [x0], x1
+ st1 {v17.s}[2], [x0], x1
+ st1 {v16.s}[2], [x0], x1
+ st1 {v0.s}[2], [x0], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v17.s}[3], [x0], x1
+ st1 {v16.s}[3], [x0], x1
+ st1 {v0.s}[3], [x0], x1
+ st1 {v19.s}[3], [x0], x1
9:
ret
endfunc
@@ -377,52 +377,52 @@ function ff_h264_h_loop_filter_luma_intra_neon, export=1
endfunc
.macro h264_loop_filter_chroma
- dup v22.8B, w2 // alpha
- dup v23.8B, w3 // beta
- uxtl v24.8H, v24.8B
- uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
- uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
- uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
- cmhi v26.8B, v22.8B, v26.8B // < alpha
- cmhi v28.8B, v23.8B, v28.8B // < beta
- cmhi v30.8B, v23.8B, v30.8B // < beta
- uxtl v4.8H, v0.8B
- and v26.8B, v26.8B, v28.8B
- usubw v4.8H, v4.8H, v16.8B
- and v26.8B, v26.8B, v30.8B
- shl v4.8H, v4.8H, #2
+ dup v22.8b, w2 // alpha
+ dup v23.8b, w3 // beta
+ uxtl v24.8h, v24.8b
+ uabd v26.8b, v16.8b, v0.8b // abs(p0 - q0)
+ uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
+ uabd v30.8b, v2.8b, v0.8b // abs(q1 - q0)
+ cmhi v26.8b, v22.8b, v26.8b // < alpha
+ cmhi v28.8b, v23.8b, v28.8b // < beta
+ cmhi v30.8b, v23.8b, v30.8b // < beta
+ uxtl v4.8h, v0.8b
+ and v26.8b, v26.8b, v28.8b
+ usubw v4.8h, v4.8h, v16.8b
+ and v26.8b, v26.8b, v30.8b
+ shl v4.8h, v4.8h, #2
mov x8, v26.d[0]
- sli v24.8H, v24.8H, #8
- uaddw v4.8H, v4.8H, v18.8B
+ sli v24.8h, v24.8h, #8
+ uaddw v4.8h, v4.8h, v18.8b
cbz x8, 9f
- usubw v4.8H, v4.8H, v2.8B
- rshrn v4.8B, v4.8H, #3
- smin v4.8B, v4.8B, v24.8B
- neg v25.8B, v24.8B
- smax v4.8B, v4.8B, v25.8B
- uxtl v22.8H, v0.8B
- and v4.8B, v4.8B, v26.8B
- uxtl v28.8H, v16.8B
- saddw v28.8H, v28.8H, v4.8B
- ssubw v22.8H, v22.8H, v4.8B
- sqxtun v16.8B, v28.8H
- sqxtun v0.8B, v22.8H
+ usubw v4.8h, v4.8h, v2.8b
+ rshrn v4.8b, v4.8h, #3
+ smin v4.8b, v4.8b, v24.8b
+ neg v25.8b, v24.8b
+ smax v4.8b, v4.8b, v25.8b
+ uxtl v22.8h, v0.8b
+ and v4.8b, v4.8b, v26.8b
+ uxtl v28.8h, v16.8b
+ saddw v28.8h, v28.8h, v4.8b
+ ssubw v22.8h, v22.8h, v4.8b
+ sqxtun v16.8b, v28.8h
+ sqxtun v0.8b, v22.8h
.endm
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, x1, lsl #1
- ld1 {v18.8B}, [x0], x1
- ld1 {v16.8B}, [x0], x1
- ld1 {v0.8B}, [x0], x1
- ld1 {v2.8B}, [x0]
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v2.8b}, [x0]
h264_loop_filter_chroma
sub x0, x0, x1, lsl #1
- st1 {v16.8B}, [x0], x1
- st1 {v0.8B}, [x0], x1
+ st1 {v16.8b}, [x0], x1
+ st1 {v0.8b}, [x0], x1
9:
ret
endfunc
@@ -432,14 +432,14 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
sub x0, x0, #2
h_loop_filter_chroma420:
- ld1 {v18.S}[0], [x0], x1
- ld1 {v16.S}[0], [x0], x1
- ld1 {v0.S}[0], [x0], x1
- ld1 {v2.S}[0], [x0], x1
- ld1 {v18.S}[1], [x0], x1
- ld1 {v16.S}[1], [x0], x1
- ld1 {v0.S}[1], [x0], x1
- ld1 {v2.S}[1], [x0], x1
+ ld1 {v18.s}[0], [x0], x1
+ ld1 {v16.s}[0], [x0], x1
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v18.s}[1], [x0], x1
+ ld1 {v16.s}[1], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v2.s}[1], [x0], x1
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
@@ -448,14 +448,14 @@ h_loop_filter_chroma420:
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
sub x0, x0, x1, lsl #3
- st1 {v18.S}[0], [x0], x1
- st1 {v16.S}[0], [x0], x1
- st1 {v0.S}[0], [x0], x1
- st1 {v2.S}[0], [x0], x1
- st1 {v18.S}[1], [x0], x1
- st1 {v16.S}[1], [x0], x1
- st1 {v0.S}[1], [x0], x1
- st1 {v2.S}[1], [x0], x1
+ st1 {v18.s}[0], [x0], x1
+ st1 {v16.s}[0], [x0], x1
+ st1 {v0.s}[0], [x0], x1
+ st1 {v2.s}[0], [x0], x1
+ st1 {v18.s}[1], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ st1 {v0.s}[1], [x0], x1
+ st1 {v2.s}[1], [x0], x1
9:
ret
endfunc
@@ -584,102 +584,102 @@ function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
endfunc
// biweight_16 macs, macd
// Bi-directional weighted prediction for 16-byte-wide rows, two rows per
// loop iteration.
//   x0, x1 = the two source row pointers, both advanced by x2 per row
//   x7     = destination pointer, advanced by x2 per row
//   w3     = row count, decremented by 2 per iteration (loop ends at zero)
//   w5, w6 = byte weights, broadcast into v0 / v1
//   v16    = initial accumulator value (8 x 16-bit), v18 = per-lane shift
//            amounts for sshl; both are prepared by the code that expands
//            this macro
// \macd / \macs are the multiply-accumulate mnemonics applied to the x0 and
// x1 rows respectively (presumably umlal/umlsl - confirm at the expansion
// site); the "2" suffixed form handles the upper eight bytes.
// Results are saturated to unsigned bytes with sqxtun/sqxtun2.
// The macro ends in ret, so it forms the tail of the enclosing function.
.macro biweight_16 macs, macd
- dup v0.16B, w5
- dup v1.16B, w6
- mov v4.16B, v16.16B
- mov v6.16B, v16.16B
+ dup v0.16b, w5
+ dup v1.16b, w6
+ mov v4.16b, v16.16b
+ mov v6.16b, v16.16b
1: subs w3, w3, #2
- ld1 {v20.16B}, [x0], x2
- \macd v4.8H, v0.8B, v20.8B
+ ld1 {v20.16b}, [x0], x2
+ \macd v4.8h, v0.8b, v20.8b
- \macd\()2 v6.8H, v0.16B, v20.16B
+ \macd\()2 v6.8h, v0.16b, v20.16b
- ld1 {v22.16B}, [x1], x2
- \macs v4.8H, v1.8B, v22.8B
+ ld1 {v22.16b}, [x1], x2
+ \macs v4.8h, v1.8b, v22.8b
- \macs\()2 v6.8H, v1.16B, v22.16B
+ \macs\()2 v6.8h, v1.16b, v22.16b
- mov v24.16B, v16.16B
- ld1 {v28.16B}, [x0], x2
- mov v26.16B, v16.16B
- \macd v24.8H, v0.8B, v28.8B
+ mov v24.16b, v16.16b
+ ld1 {v28.16b}, [x0], x2
+ mov v26.16b, v16.16b
+ \macd v24.8h, v0.8b, v28.8b
- \macd\()2 v26.8H, v0.16B, v28.16B
+ \macd\()2 v26.8h, v0.16b, v28.16b
- ld1 {v30.16B}, [x1], x2
- \macs v24.8H, v1.8B, v30.8B
+ ld1 {v30.16b}, [x1], x2
+ \macs v24.8h, v1.8b, v30.8b
- \macs\()2 v26.8H, v1.16B, v30.16B
+ \macs\()2 v26.8h, v1.16b, v30.16b
- sshl v4.8H, v4.8H, v18.8H
- sshl v6.8H, v6.8H, v18.8H
- sqxtun v4.8B, v4.8H
- sqxtun2 v4.16B, v6.8H
- sshl v24.8H, v24.8H, v18.8H
- sshl v26.8H, v26.8H, v18.8H
- sqxtun v24.8B, v24.8H
- sqxtun2 v24.16B, v26.8H
- mov v6.16B, v16.16B
- st1 {v4.16B}, [x7], x2
- mov v4.16B, v16.16B
- st1 {v24.16B}, [x7], x2
+ sshl v4.8h, v4.8h, v18.8h
+ sshl v6.8h, v6.8h, v18.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v6.8h
+ sshl v24.8h, v24.8h, v18.8h
+ sshl v26.8h, v26.8h, v18.8h
+ sqxtun v24.8b, v24.8h
+ sqxtun2 v24.16b, v26.8h
+ mov v6.16b, v16.16b
+ st1 {v4.16b}, [x7], x2
+ mov v4.16b, v16.16b
+ st1 {v24.16b}, [x7], x2
b.ne 1b
ret
.endm
// biweight_8 macs, macd
// 8-byte-wide variant of the bi-weighted prediction loop, two rows per
// iteration.
//   x0, x1 = source row pointers (stride x2), x7 = destination (stride x2)
//   w3     = row count, decremented by 2 per iteration
//   w5, w6 = byte weights broadcast into v0 / v1
//   v16    = initial accumulator value, v18 = per-lane sshl shift amounts
//            (both set up by the code that expands this macro)
// \macd accumulates the x0 row, \macs the x1 row, into v2 / v20; the sums
// are shifted, saturated to unsigned bytes and stored. The accumulators are
// re-seeded from v16 before the next iteration. Ends in ret.
.macro biweight_8 macs, macd
- dup v0.8B, w5
- dup v1.8B, w6
- mov v2.16B, v16.16B
- mov v20.16B, v16.16B
+ dup v0.8b, w5
+ dup v1.8b, w6
+ mov v2.16b, v16.16b
+ mov v20.16b, v16.16b
1: subs w3, w3, #2
- ld1 {v4.8B}, [x0], x2
- \macd v2.8H, v0.8B, v4.8B
- ld1 {v5.8B}, [x1], x2
- \macs v2.8H, v1.8B, v5.8B
- ld1 {v6.8B}, [x0], x2
- \macd v20.8H, v0.8B, v6.8B
- ld1 {v7.8B}, [x1], x2
- \macs v20.8H, v1.8B, v7.8B
- sshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- sshl v20.8H, v20.8H, v18.8H
- sqxtun v4.8B, v20.8H
- mov v20.16B, v16.16B
- st1 {v2.8B}, [x7], x2
- mov v2.16B, v16.16B
- st1 {v4.8B}, [x7], x2
+ ld1 {v4.8b}, [x0], x2
+ \macd v2.8h, v0.8b, v4.8b
+ ld1 {v5.8b}, [x1], x2
+ \macs v2.8h, v1.8b, v5.8b
+ ld1 {v6.8b}, [x0], x2
+ \macd v20.8h, v0.8b, v6.8b
+ ld1 {v7.8b}, [x1], x2
+ \macs v20.8h, v1.8b, v7.8b
+ sshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ sshl v20.8h, v20.8h, v18.8h
+ sqxtun v4.8b, v20.8h
+ mov v20.16b, v16.16b
+ st1 {v2.8b}, [x7], x2
+ mov v2.16b, v16.16b
+ st1 {v4.8b}, [x7], x2
b.ne 1b
ret
.endm
// biweight_4 macs, macd
// 4-byte-wide variant of the bi-weighted prediction loop. Two 4-byte rows
// are packed into the two 32-bit lanes of one 8-byte register, so each
// multiply-accumulate covers two rows and a full iteration covers four.
//   x0, x1 = source row pointers (stride x2), x7 = destination (stride x2)
//   w3     = row count, decremented by 4 per iteration
//   w5, w6 = byte weights broadcast into v0 / v1
//   v16    = initial accumulator value, v18 = per-lane sshl shift amounts
// If the count goes negative after the subtraction (b.lt), only two rows
// remained: control jumps to label 2, which finishes those two rows and
// returns. Otherwise four rows are written and the loop repeats until the
// count reaches zero.
.macro biweight_4 macs, macd
- dup v0.8B, w5
- dup v1.8B, w6
- mov v2.16B, v16.16B
- mov v20.16B,v16.16B
+ dup v0.8b, w5
+ dup v1.8b, w6
+ mov v2.16b, v16.16b
+ mov v20.16b,v16.16b
1: subs w3, w3, #4
- ld1 {v4.S}[0], [x0], x2
- ld1 {v4.S}[1], [x0], x2
- \macd v2.8H, v0.8B, v4.8B
- ld1 {v5.S}[0], [x1], x2
- ld1 {v5.S}[1], [x1], x2
- \macs v2.8H, v1.8B, v5.8B
+ ld1 {v4.s}[0], [x0], x2
+ ld1 {v4.s}[1], [x0], x2
+ \macd v2.8h, v0.8b, v4.8b
+ ld1 {v5.s}[0], [x1], x2
+ ld1 {v5.s}[1], [x1], x2
+ \macs v2.8h, v1.8b, v5.8b
b.lt 2f
- ld1 {v6.S}[0], [x0], x2
- ld1 {v6.S}[1], [x0], x2
- \macd v20.8H, v0.8B, v6.8B
- ld1 {v7.S}[0], [x1], x2
- ld1 {v7.S}[1], [x1], x2
- \macs v20.8H, v1.8B, v7.8B
- sshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- sshl v20.8H, v20.8H, v18.8H
- sqxtun v4.8B, v20.8H
- mov v20.16B, v16.16B
- st1 {v2.S}[0], [x7], x2
- st1 {v2.S}[1], [x7], x2
- mov v2.16B, v16.16B
- st1 {v4.S}[0], [x7], x2
- st1 {v4.S}[1], [x7], x2
+ ld1 {v6.s}[0], [x0], x2
+ ld1 {v6.s}[1], [x0], x2
+ \macd v20.8h, v0.8b, v6.8b
+ ld1 {v7.s}[0], [x1], x2
+ ld1 {v7.s}[1], [x1], x2
+ \macs v20.8h, v1.8b, v7.8b
+ sshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ sshl v20.8h, v20.8h, v18.8h
+ sqxtun v4.8b, v20.8h
+ mov v20.16b, v16.16b
+ st1 {v2.s}[0], [x7], x2
+ st1 {v2.s}[1], [x7], x2
+ mov v2.16b, v16.16b
+ st1 {v4.s}[0], [x7], x2
+ st1 {v4.s}[1], [x7], x2
b.ne 1b
ret
// Tail: only the first pair of rows (already accumulated in v2) is left.
-2: sshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- st1 {v2.S}[0], [x7], x2
- st1 {v2.S}[1], [x7], x2
+2: sshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ st1 {v2.s}[0], [x7], x2
+ st1 {v2.s}[1], [x7], x2
ret
.endm
@@ -689,10 +689,10 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1
add w7, w7, #1
eor w8, w8, w6, lsr #30
orr w7, w7, #1
- dup v18.8H, w4
+ dup v18.8h, w4
lsl w7, w7, w4
- not v18.16B, v18.16B
- dup v16.8H, w7
+ not v18.16b, v18.16b
+ dup v16.8h, w7
mov x7, x0
cbz w8, 10f
subs w8, w8, #1
@@ -716,78 +716,78 @@ endfunc
biweight_func 4
// weight_16 add
// Single-source weighted prediction for 16-byte-wide rows, two rows per
// loop iteration.
//   x0 = source row pointer, x5 = destination pointer, both stride x1
//   w2 = row count, decremented by 2 per iteration
//   w4 = byte weight, broadcast into v0
//   v16 = value combined with each product, v18 = per-lane srshl shift
//         amounts (both set up by the code that expands this macro)
// \add is the mnemonic that combines v16 (first source operand) with the
// widened product v0 * pixel; the result is shifted with rounding (srshl),
// saturated to unsigned bytes and stored. Ends in ret.
.macro weight_16 add
- dup v0.16B, w4
+ dup v0.16b, w4
1: subs w2, w2, #2
- ld1 {v20.16B}, [x0], x1
- umull v4.8H, v0.8B, v20.8B
- umull2 v6.8H, v0.16B, v20.16B
- ld1 {v28.16B}, [x0], x1
- umull v24.8H, v0.8B, v28.8B
- umull2 v26.8H, v0.16B, v28.16B
- \add v4.8H, v16.8H, v4.8H
- srshl v4.8H, v4.8H, v18.8H
- \add v6.8H, v16.8H, v6.8H
- srshl v6.8H, v6.8H, v18.8H
- sqxtun v4.8B, v4.8H
- sqxtun2 v4.16B, v6.8H
- \add v24.8H, v16.8H, v24.8H
- srshl v24.8H, v24.8H, v18.8H
- \add v26.8H, v16.8H, v26.8H
- srshl v26.8H, v26.8H, v18.8H
- sqxtun v24.8B, v24.8H
- sqxtun2 v24.16B, v26.8H
- st1 {v4.16B}, [x5], x1
- st1 {v24.16B}, [x5], x1
+ ld1 {v20.16b}, [x0], x1
+ umull v4.8h, v0.8b, v20.8b
+ umull2 v6.8h, v0.16b, v20.16b
+ ld1 {v28.16b}, [x0], x1
+ umull v24.8h, v0.8b, v28.8b
+ umull2 v26.8h, v0.16b, v28.16b
+ \add v4.8h, v16.8h, v4.8h
+ srshl v4.8h, v4.8h, v18.8h
+ \add v6.8h, v16.8h, v6.8h
+ srshl v6.8h, v6.8h, v18.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v6.8h
+ \add v24.8h, v16.8h, v24.8h
+ srshl v24.8h, v24.8h, v18.8h
+ \add v26.8h, v16.8h, v26.8h
+ srshl v26.8h, v26.8h, v18.8h
+ sqxtun v24.8b, v24.8h
+ sqxtun2 v24.16b, v26.8h
+ st1 {v4.16b}, [x5], x1
+ st1 {v24.16b}, [x5], x1
b.ne 1b
ret
.endm
// weight_8 add
// 8-byte-wide variant of the single-source weighted prediction loop, two
// rows per iteration.
//   x0 = source row pointer, x5 = destination pointer, both stride x1
//   w2 = row count, decremented by 2 per iteration
//   w4 = byte weight, broadcast into v0
//   v16 = value combined with each product via \add, v18 = per-lane srshl
//         shift amounts (both set up by the code that expands this macro)
// Each row is widened and multiplied by the weight, combined with v16,
// shifted with rounding, saturated to unsigned bytes and stored.
// Ends in ret.
.macro weight_8 add
- dup v0.8B, w4
+ dup v0.8b, w4
1: subs w2, w2, #2
- ld1 {v4.8B}, [x0], x1
- umull v2.8H, v0.8B, v4.8B
- ld1 {v6.8B}, [x0], x1
- umull v20.8H, v0.8B, v6.8B
- \add v2.8H, v16.8H, v2.8H
- srshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- \add v20.8H, v16.8H, v20.8H
- srshl v20.8H, v20.8H, v18.8H
- sqxtun v4.8B, v20.8H
- st1 {v2.8B}, [x5], x1
- st1 {v4.8B}, [x5], x1
+ ld1 {v4.8b}, [x0], x1
+ umull v2.8h, v0.8b, v4.8b
+ ld1 {v6.8b}, [x0], x1
+ umull v20.8h, v0.8b, v6.8b
+ \add v2.8h, v16.8h, v2.8h
+ srshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ \add v20.8h, v16.8h, v20.8h
+ srshl v20.8h, v20.8h, v18.8h
+ sqxtun v4.8b, v20.8h
+ st1 {v2.8b}, [x5], x1
+ st1 {v4.8b}, [x5], x1
b.ne 1b
ret
.endm
// weight_4 add
// 4-byte-wide variant of the single-source weighted prediction loop. Two
// 4-byte rows share one 8-byte register (32-bit lanes 0 and 1), so a full
// iteration handles four rows.
//   x0 = source row pointer, x5 = destination pointer, both stride x1
//   w2 = row count, decremented by 4 per iteration
//   w4 = byte weight, broadcast into v0
//   v16 = value combined with each product via \add, v18 = per-lane srshl
//         shift amounts (both set up by the code that expands this macro)
// If the count goes negative after the subtraction (b.lt), only two rows
// remained; label 2 finishes them and returns.
.macro weight_4 add
- dup v0.8B, w4
+ dup v0.8b, w4
1: subs w2, w2, #4
- ld1 {v4.S}[0], [x0], x1
- ld1 {v4.S}[1], [x0], x1
- umull v2.8H, v0.8B, v4.8B
+ ld1 {v4.s}[0], [x0], x1
+ ld1 {v4.s}[1], [x0], x1
+ umull v2.8h, v0.8b, v4.8b
b.lt 2f
- ld1 {v6.S}[0], [x0], x1
- ld1 {v6.S}[1], [x0], x1
- umull v20.8H, v0.8B, v6.8B
- \add v2.8H, v16.8H, v2.8H
- srshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- \add v20.8H, v16.8H, v20.8H
- srshl v20.8H, v20.8h, v18.8H
- sqxtun v4.8B, v20.8H
- st1 {v2.S}[0], [x5], x1
- st1 {v2.S}[1], [x5], x1
- st1 {v4.S}[0], [x5], x1
- st1 {v4.S}[1], [x5], x1
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v6.s}[1], [x0], x1
+ umull v20.8h, v0.8b, v6.8b
+ \add v2.8h, v16.8h, v2.8h
+ srshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ \add v20.8h, v16.8h, v20.8h
+ srshl v20.8h, v20.8h, v18.8h
+ sqxtun v4.8b, v20.8h
+ st1 {v2.s}[0], [x5], x1
+ st1 {v2.s}[1], [x5], x1
+ st1 {v4.s}[0], [x5], x1
+ st1 {v4.s}[1], [x5], x1
b.ne 1b
ret
// Tail: only the first pair of rows (product already in v2) is left.
-2: \add v2.8H, v16.8H, v2.8H
- srshl v2.8H, v2.8H, v18.8H
- sqxtun v2.8B, v2.8H
- st1 {v2.S}[0], [x5], x1
- st1 {v2.S}[1], [x5], x1
+2: \add v2.8h, v16.8h, v2.8h
+ srshl v2.8h, v2.8h, v18.8h
+ sqxtun v2.8b, v2.8h
+ st1 {v2.s}[0], [x5], x1
+ st1 {v2.s}[1], [x5], x1
ret
.endm
@@ -796,18 +796,18 @@ function ff_weight_h264_pixels_\w\()_neon, export=1
cmp w3, #1
mov w6, #1
lsl w5, w5, w3
- dup v16.8H, w5
+ dup v16.8h, w5
mov x5, x0
b.le 20f
sub w6, w6, w3
- dup v18.8H, w6
+ dup v18.8h, w6
cmp w4, #0
b.lt 10f
weight_\w shadd
10: neg w4, w4
weight_\w shsub
20: neg w6, w3
- dup v18.8H, w6
+ dup v18.8h, w6
cmp w4, #0
b.lt 10f
weight_\w add
@@ -825,7 +825,7 @@ endfunc
ldr w6, [x4]
ccmp w3, #0, #0, ne
lsl w2, w2, #2
- mov v24.S[0], w6
+ mov v24.s[0], w6
lsl w3, w3, #2
and w8, w6, w6, lsl #16
b.eq 1f
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 375da31d65..1bab2ca7c8 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -25,54 +25,54 @@
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
AARCH64_VALID_CALL_TARGET
- ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
sxtw x2, w2
- movi v30.8H, #0
+ movi v30.8h, #0
- add v4.4H, v0.4H, v2.4H
- sshr v16.4H, v1.4H, #1
- st1 {v30.8H}, [x1], #16
- sshr v17.4H, v3.4H, #1
- st1 {v30.8H}, [x1], #16
- sub v5.4H, v0.4H, v2.4H
- sub v6.4H, v16.4H, v3.4H
- add v7.4H, v1.4H, v17.4H
- add v0.4H, v4.4H, v7.4H
- add v1.4H, v5.4H, v6.4H
- sub v2.4H, v5.4H, v6.4H
- sub v3.4H, v4.4H, v7.4H
+ add v4.4h, v0.4h, v2.4h
+ sshr v16.4h, v1.4h, #1
+ st1 {v30.8h}, [x1], #16
+ sshr v17.4h, v3.4h, #1
+ st1 {v30.8h}, [x1], #16
+ sub v5.4h, v0.4h, v2.4h
+ sub v6.4h, v16.4h, v3.4h
+ add v7.4h, v1.4h, v17.4h
+ add v0.4h, v4.4h, v7.4h
+ add v1.4h, v5.4h, v6.4h
+ sub v2.4h, v5.4h, v6.4h
+ sub v3.4h, v4.4h, v7.4h
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
- add v4.4H, v0.4H, v2.4H
- ld1 {v18.S}[0], [x0], x2
- sshr v16.4H, v3.4H, #1
- sshr v17.4H, v1.4H, #1
- ld1 {v18.S}[1], [x0], x2
- sub v5.4H, v0.4H, v2.4H
- ld1 {v19.S}[1], [x0], x2
- add v6.4H, v16.4H, v1.4H
- ins v4.D[1], v5.D[0]
- sub v7.4H, v17.4H, v3.4H
- ld1 {v19.S}[0], [x0], x2
- ins v6.D[1], v7.D[0]
+ add v4.4h, v0.4h, v2.4h
+ ld1 {v18.s}[0], [x0], x2
+ sshr v16.4h, v3.4h, #1
+ sshr v17.4h, v1.4h, #1
+ ld1 {v18.s}[1], [x0], x2
+ sub v5.4h, v0.4h, v2.4h
+ ld1 {v19.s}[1], [x0], x2
+ add v6.4h, v16.4h, v1.4h
+ ins v4.d[1], v5.d[0]
+ sub v7.4h, v17.4h, v3.4h
+ ld1 {v19.s}[0], [x0], x2
+ ins v6.d[1], v7.d[0]
sub x0, x0, x2, lsl #2
- add v0.8H, v4.8H, v6.8H
- sub v1.8H, v4.8H, v6.8H
+ add v0.8h, v4.8h, v6.8h
+ sub v1.8h, v4.8h, v6.8h
- srshr v0.8H, v0.8H, #6
- srshr v1.8H, v1.8H, #6
+ srshr v0.8h, v0.8h, #6
+ srshr v1.8h, v1.8h, #6
- uaddw v0.8H, v0.8H, v18.8B
- uaddw v1.8H, v1.8H, v19.8B
+ uaddw v0.8h, v0.8h, v18.8b
+ uaddw v1.8h, v1.8h, v19.8b
- sqxtun v0.8B, v0.8H
- sqxtun v1.8B, v1.8H
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
- st1 {v0.S}[0], [x0], x2
- st1 {v0.S}[1], [x0], x2
- st1 {v1.S}[1], [x0], x2
- st1 {v1.S}[0], [x0], x2
+ st1 {v0.s}[0], [x0], x2
+ st1 {v0.s}[1], [x0], x2
+ st1 {v1.s}[1], [x0], x2
+ st1 {v1.s}[0], [x0], x2
sub x1, x1, #32
ret
@@ -83,22 +83,22 @@ function ff_h264_idct_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET
sxtw x2, w2
mov w3, #0
- ld1r {v2.8H}, [x1]
+ ld1r {v2.8h}, [x1]
strh w3, [x1]
- srshr v2.8H, v2.8H, #6
- ld1 {v0.S}[0], [x0], x2
- ld1 {v0.S}[1], [x0], x2
- uaddw v3.8H, v2.8H, v0.8B
- ld1 {v1.S}[0], [x0], x2
- ld1 {v1.S}[1], [x0], x2
- uaddw v4.8H, v2.8H, v1.8B
- sqxtun v0.8B, v3.8H
- sqxtun v1.8B, v4.8H
+ srshr v2.8h, v2.8h, #6
+ ld1 {v0.s}[0], [x0], x2
+ ld1 {v0.s}[1], [x0], x2
+ uaddw v3.8h, v2.8h, v0.8b
+ ld1 {v1.s}[0], [x0], x2
+ ld1 {v1.s}[1], [x0], x2
+ uaddw v4.8h, v2.8h, v1.8b
+ sqxtun v0.8b, v3.8h
+ sqxtun v1.8b, v4.8h
sub x0, x0, x2, lsl #2
- st1 {v0.S}[0], [x0], x2
- st1 {v0.S}[1], [x0], x2
- st1 {v1.S}[0], [x0], x2
- st1 {v1.S}[1], [x0], x2
+ st1 {v0.s}[0], [x0], x2
+ st1 {v0.s}[1], [x0], x2
+ st1 {v1.s}[0], [x0], x2
+ st1 {v1.s}[1], [x0], x2
ret
endfunc
@@ -194,71 +194,71 @@ endfunc
.if \pass == 0
va .req v18
vb .req v30
- sshr v18.8H, v26.8H, #1
- add v16.8H, v24.8H, v28.8H
- ld1 {v30.8H, v31.8H}, [x1]
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
- sub v17.8H, v24.8H, v28.8H
- sshr v19.8H, v30.8H, #1
- sub v18.8H, v18.8H, v30.8H
- add v19.8H, v19.8H, v26.8H
+ sshr v18.8h, v26.8h, #1
+ add v16.8h, v24.8h, v28.8h
+ ld1 {v30.8h, v31.8h}, [x1]
+ st1 {v19.8h}, [x1], #16
+ st1 {v19.8h}, [x1], #16
+ sub v17.8h, v24.8h, v28.8h
+ sshr v19.8h, v30.8h, #1
+ sub v18.8h, v18.8h, v30.8h
+ add v19.8h, v19.8h, v26.8h
.else
va .req v30
vb .req v18
- sshr v30.8H, v26.8H, #1
- sshr v19.8H, v18.8H, #1
- add v16.8H, v24.8H, v28.8H
- sub v17.8H, v24.8H, v28.8H
- sub v30.8H, v30.8H, v18.8H
- add v19.8H, v19.8H, v26.8H
+ sshr v30.8h, v26.8h, #1
+ sshr v19.8h, v18.8h, #1
+ add v16.8h, v24.8h, v28.8h
+ sub v17.8h, v24.8h, v28.8h
+ sub v30.8h, v30.8h, v18.8h
+ add v19.8h, v19.8h, v26.8h
.endif
- add v26.8H, v17.8H, va.8H
- sub v28.8H, v17.8H, va.8H
- add v24.8H, v16.8H, v19.8H
- sub vb.8H, v16.8H, v19.8H
- sub v16.8H, v29.8H, v27.8H
- add v17.8H, v31.8H, v25.8H
- sub va.8H, v31.8H, v25.8H
- add v19.8H, v29.8H, v27.8H
- sub v16.8H, v16.8H, v31.8H
- sub v17.8H, v17.8H, v27.8H
- add va.8H, va.8H, v29.8H
- add v19.8H, v19.8H, v25.8H
- sshr v25.8H, v25.8H, #1
- sshr v27.8H, v27.8H, #1
- sshr v29.8H, v29.8H, #1
- sshr v31.8H, v31.8H, #1
- sub v16.8H, v16.8H, v31.8H
- sub v17.8H, v17.8H, v27.8H
- add va.8H, va.8H, v29.8H
- add v19.8H, v19.8H, v25.8H
- sshr v25.8H, v16.8H, #2
- sshr v27.8H, v17.8H, #2
- sshr v29.8H, va.8H, #2
- sshr v31.8H, v19.8H, #2
- sub v19.8H, v19.8H, v25.8H
- sub va.8H, v27.8H, va.8H
- add v17.8H, v17.8H, v29.8H
- add v16.8H, v16.8H, v31.8H
+ add v26.8h, v17.8h, va.8h
+ sub v28.8h, v17.8h, va.8h
+ add v24.8h, v16.8h, v19.8h
+ sub vb.8h, v16.8h, v19.8h
+ sub v16.8h, v29.8h, v27.8h
+ add v17.8h, v31.8h, v25.8h
+ sub va.8h, v31.8h, v25.8h
+ add v19.8h, v29.8h, v27.8h
+ sub v16.8h, v16.8h, v31.8h
+ sub v17.8h, v17.8h, v27.8h
+ add va.8h, va.8h, v29.8h
+ add v19.8h, v19.8h, v25.8h
+ sshr v25.8h, v25.8h, #1
+ sshr v27.8h, v27.8h, #1
+ sshr v29.8h, v29.8h, #1
+ sshr v31.8h, v31.8h, #1
+ sub v16.8h, v16.8h, v31.8h
+ sub v17.8h, v17.8h, v27.8h
+ add va.8h, va.8h, v29.8h
+ add v19.8h, v19.8h, v25.8h
+ sshr v25.8h, v16.8h, #2
+ sshr v27.8h, v17.8h, #2
+ sshr v29.8h, va.8h, #2
+ sshr v31.8h, v19.8h, #2
+ sub v19.8h, v19.8h, v25.8h
+ sub va.8h, v27.8h, va.8h
+ add v17.8h, v17.8h, v29.8h
+ add v16.8h, v16.8h, v31.8h
.if \pass == 0
- sub v31.8H, v24.8H, v19.8H
- add v24.8H, v24.8H, v19.8H
- add v25.8H, v26.8H, v18.8H
- sub v18.8H, v26.8H, v18.8H
- add v26.8H, v28.8H, v17.8H
- add v27.8H, v30.8H, v16.8H
- sub v29.8H, v28.8H, v17.8H
- sub v28.8H, v30.8H, v16.8H
+ sub v31.8h, v24.8h, v19.8h
+ add v24.8h, v24.8h, v19.8h
+ add v25.8h, v26.8h, v18.8h
+ sub v18.8h, v26.8h, v18.8h
+ add v26.8h, v28.8h, v17.8h
+ add v27.8h, v30.8h, v16.8h
+ sub v29.8h, v28.8h, v17.8h
+ sub v28.8h, v30.8h, v16.8h
.else
- sub v31.8H, v24.8H, v19.8H
- add v24.8H, v24.8H, v19.8H
- add v25.8H, v26.8H, v30.8H
- sub v30.8H, v26.8H, v30.8H
- add v26.8H, v28.8H, v17.8H
- sub v29.8H, v28.8H, v17.8H
- add v27.8H, v18.8H, v16.8H
- sub v28.8H, v18.8H, v16.8H
+ sub v31.8h, v24.8h, v19.8h
+ add v24.8h, v24.8h, v19.8h
+ add v25.8h, v26.8h, v30.8h
+ sub v30.8h, v26.8h, v30.8h
+ add v26.8h, v28.8h, v17.8h
+ sub v29.8h, v28.8h, v17.8h
+ add v27.8h, v18.8h, v16.8h
+ sub v28.8h, v18.8h, v16.8h
.endif
.unreq va
.unreq vb
@@ -267,63 +267,63 @@ endfunc
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
AARCH64_VALID_CALL_TARGET
- movi v19.8H, #0
+ movi v19.8h, #0
sxtw x2, w2
- ld1 {v24.8H, v25.8H}, [x1]
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
- ld1 {v26.8H, v27.8H}, [x1]
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
- ld1 {v28.8H, v29.8H}, [x1]
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
+ ld1 {v24.8h, v25.8h}, [x1]
+ st1 {v19.8h}, [x1], #16
+ st1 {v19.8h}, [x1], #16
+ ld1 {v26.8h, v27.8h}, [x1]
+ st1 {v19.8h}, [x1], #16
+ st1 {v19.8h}, [x1], #16
+ ld1 {v28.8h, v29.8h}, [x1]
+ st1 {v19.8h}, [x1], #16
+ st1 {v19.8h}, [x1], #16
idct8x8_cols 0
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
idct8x8_cols 1
mov x3, x0
- srshr v24.8H, v24.8H, #6
- ld1 {v0.8B}, [x0], x2
- srshr v25.8H, v25.8H, #6
- ld1 {v1.8B}, [x0], x2
- srshr v26.8H, v26.8H, #6
- ld1 {v2.8B}, [x0], x2
- srshr v27.8H, v27.8H, #6
- ld1 {v3.8B}, [x0], x2
- srshr v28.8H, v28.8H, #6
- ld1 {v4.8B}, [x0], x2
- srshr v29.8H, v29.8H, #6
- ld1 {v5.8B}, [x0], x2
- srshr v30.8H, v30.8H, #6
- ld1 {v6.8B}, [x0], x2
- srshr v31.8H, v31.8H, #6
- ld1 {v7.8B}, [x0], x2
- uaddw v24.8H, v24.8H, v0.8B
- uaddw v25.8H, v25.8H, v1.8B
- uaddw v26.8H, v26.8H, v2.8B
- sqxtun v0.8B, v24.8H
- uaddw v27.8H, v27.8H, v3.8B
- sqxtun v1.8B, v25.8H
- uaddw v28.8H, v28.8H, v4.8B
- sqxtun v2.8B, v26.8H
- st1 {v0.8B}, [x3], x2
- uaddw v29.8H, v29.8H, v5.8B
- sqxtun v3.8B, v27.8H
- st1 {v1.8B}, [x3], x2
- uaddw v30.8H, v30.8H, v6.8B
- sqxtun v4.8B, v28.8H
- st1 {v2.8B}, [x3], x2
- uaddw v31.8H, v31.8H, v7.8B
- sqxtun v5.8B, v29.8H
- st1 {v3.8B}, [x3], x2
- sqxtun v6.8B, v30.8H
- sqxtun v7.8B, v31.8H
- st1 {v4.8B}, [x3], x2
- st1 {v5.8B}, [x3], x2
- st1 {v6.8B}, [x3], x2
- st1 {v7.8B}, [x3], x2
+ srshr v24.8h, v24.8h, #6
+ ld1 {v0.8b}, [x0], x2
+ srshr v25.8h, v25.8h, #6
+ ld1 {v1.8b}, [x0], x2
+ srshr v26.8h, v26.8h, #6
+ ld1 {v2.8b}, [x0], x2
+ srshr v27.8h, v27.8h, #6
+ ld1 {v3.8b}, [x0], x2
+ srshr v28.8h, v28.8h, #6
+ ld1 {v4.8b}, [x0], x2
+ srshr v29.8h, v29.8h, #6
+ ld1 {v5.8b}, [x0], x2
+ srshr v30.8h, v30.8h, #6
+ ld1 {v6.8b}, [x0], x2
+ srshr v31.8h, v31.8h, #6
+ ld1 {v7.8b}, [x0], x2
+ uaddw v24.8h, v24.8h, v0.8b
+ uaddw v25.8h, v25.8h, v1.8b
+ uaddw v26.8h, v26.8h, v2.8b
+ sqxtun v0.8b, v24.8h
+ uaddw v27.8h, v27.8h, v3.8b
+ sqxtun v1.8b, v25.8h
+ uaddw v28.8h, v28.8h, v4.8b
+ sqxtun v2.8b, v26.8h
+ st1 {v0.8b}, [x3], x2
+ uaddw v29.8h, v29.8h, v5.8b
+ sqxtun v3.8b, v27.8h
+ st1 {v1.8b}, [x3], x2
+ uaddw v30.8h, v30.8h, v6.8b
+ sqxtun v4.8b, v28.8h
+ st1 {v2.8b}, [x3], x2
+ uaddw v31.8h, v31.8h, v7.8b
+ sqxtun v5.8b, v29.8h
+ st1 {v3.8b}, [x3], x2
+ sqxtun v6.8b, v30.8h
+ sqxtun v7.8b, v31.8h
+ st1 {v4.8b}, [x3], x2
+ st1 {v5.8b}, [x3], x2
+ st1 {v6.8b}, [x3], x2
+ st1 {v7.8b}, [x3], x2
sub x1, x1, #128
ret
@@ -334,42 +334,42 @@ function ff_h264_idct8_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET
mov w3, #0
sxtw x2, w2
- ld1r {v31.8H}, [x1]
+ ld1r {v31.8h}, [x1]
strh w3, [x1]
- ld1 {v0.8B}, [x0], x2
- srshr v31.8H, v31.8H, #6
- ld1 {v1.8B}, [x0], x2
- ld1 {v2.8B}, [x0], x2
- uaddw v24.8H, v31.8H, v0.8B
- ld1 {v3.8B}, [x0], x2
- uaddw v25.8H, v31.8H, v1.8B
- ld1 {v4.8B}, [x0], x2
- uaddw v26.8H, v31.8H, v2.8B
- ld1 {v5.8B}, [x0], x2
- uaddw v27.8H, v31.8H, v3.8B
- ld1 {v6.8B}, [x0], x2
- uaddw v28.8H, v31.8H, v4.8B
- ld1 {v7.8B}, [x0], x2
- uaddw v29.8H, v31.8H, v5.8B
- uaddw v30.8H, v31.8H, v6.8B
- uaddw v31.8H, v31.8H, v7.8B
- sqxtun v0.8B, v24.8H
- sqxtun v1.8B, v25.8H
- sqxtun v2.8B, v26.8H
- sqxtun v3.8B, v27.8H
+ ld1 {v0.8b}, [x0], x2
+ srshr v31.8h, v31.8h, #6
+ ld1 {v1.8b}, [x0], x2
+ ld1 {v2.8b}, [x0], x2
+ uaddw v24.8h, v31.8h, v0.8b
+ ld1 {v3.8b}, [x0], x2
+ uaddw v25.8h, v31.8h, v1.8b
+ ld1 {v4.8b}, [x0], x2
+ uaddw v26.8h, v31.8h, v2.8b
+ ld1 {v5.8b}, [x0], x2
+ uaddw v27.8h, v31.8h, v3.8b
+ ld1 {v6.8b}, [x0], x2
+ uaddw v28.8h, v31.8h, v4.8b
+ ld1 {v7.8b}, [x0], x2
+ uaddw v29.8h, v31.8h, v5.8b
+ uaddw v30.8h, v31.8h, v6.8b
+ uaddw v31.8h, v31.8h, v7.8b
+ sqxtun v0.8b, v24.8h
+ sqxtun v1.8b, v25.8h
+ sqxtun v2.8b, v26.8h
+ sqxtun v3.8b, v27.8h
sub x0, x0, x2, lsl #3
- st1 {v0.8B}, [x0], x2
- sqxtun v4.8B, v28.8H
- st1 {v1.8B}, [x0], x2
- sqxtun v5.8B, v29.8H
- st1 {v2.8B}, [x0], x2
- sqxtun v6.8B, v30.8H
- st1 {v3.8B}, [x0], x2
- sqxtun v7.8B, v31.8H
- st1 {v4.8B}, [x0], x2
- st1 {v5.8B}, [x0], x2
- st1 {v6.8B}, [x0], x2
- st1 {v7.8B}, [x0], x2
+ st1 {v0.8b}, [x0], x2
+ sqxtun v4.8b, v28.8h
+ st1 {v1.8b}, [x0], x2
+ sqxtun v5.8b, v29.8h
+ st1 {v2.8b}, [x0], x2
+ sqxtun v6.8b, v30.8h
+ st1 {v3.8b}, [x0], x2
+ sqxtun v7.8b, v31.8h
+ st1 {v4.8b}, [x0], x2
+ st1 {v5.8b}, [x0], x2
+ st1 {v6.8b}, [x0], x2
+ st1 {v7.8b}, [x0], x2
ret
endfunc
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 451fd8af24..21906327cd 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -27,127 +27,127 @@
// lowpass_const r
// Loads the two filter coefficients used by the lowpass_* macros into v6.
// \r is a scratch 32-bit general register that receives (20 << 16) | 5;
// moving it into v6.s[0] leaves 5 in v6.h[0] and 20 in v6.h[1].
.macro lowpass_const r
movz \r, #20, lsl #16
movk \r, #5
- mov v6.S[0], \r
+ mov v6.s[0], \r
.endm
// lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
// Horizontal 6-tap filter on two rows at once, 8 output pixels per row.
//   \r0:\r1 = first row (two consecutive 8-byte registers)
//   \r2:\r3 = second row
//   \d0, \d1 = destination registers for the first / second row
// For each output x: in[x] + in[x+5] + v6.h[1]*(in[x+2] + in[x+3])
//                    - v6.h[0]*(in[x+1] + in[x+4]),
// where the shifted inputs are formed with ext across the register pair.
// v6.h[0] / v6.h[1] must already hold the coefficients (lowpass_const
// loads 5 and 20 there).
// With \narrow non-zero the 16-bit sums are rounded, shifted right by 5 and
// saturated to unsigned bytes; otherwise they are left as 8 x 16-bit.
//trashes v0-v5
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
- ext v2.8B, \r0\().8B, \r1\().8B, #2
- ext v3.8B, \r0\().8B, \r1\().8B, #3
- uaddl v2.8H, v2.8B, v3.8B
- ext v4.8B, \r0\().8B, \r1\().8B, #1
- ext v5.8B, \r0\().8B, \r1\().8B, #4
- uaddl v4.8H, v4.8B, v5.8B
- ext v1.8B, \r0\().8B, \r1\().8B, #5
- uaddl \d0\().8H, \r0\().8B, v1.8B
- ext v0.8B, \r2\().8B, \r3\().8B, #2
- mla \d0\().8H, v2.8H, v6.H[1]
- ext v1.8B, \r2\().8B, \r3\().8B, #3
- uaddl v0.8H, v0.8B, v1.8B
- ext v1.8B, \r2\().8B, \r3\().8B, #1
- mls \d0\().8H, v4.8H, v6.H[0]
- ext v3.8B, \r2\().8B, \r3\().8B, #4
- uaddl v1.8H, v1.8B, v3.8B
- ext v2.8B, \r2\().8B, \r3\().8B, #5
- uaddl \d1\().8H, \r2\().8B, v2.8B
- mla \d1\().8H, v0.8H, v6.H[1]
- mls \d1\().8H, v1.8H, v6.H[0]
+ ext v2.8b, \r0\().8b, \r1\().8b, #2
+ ext v3.8b, \r0\().8b, \r1\().8b, #3
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, \r0\().8b, \r1\().8b, #1
+ ext v5.8b, \r0\().8b, \r1\().8b, #4
+ uaddl v4.8h, v4.8b, v5.8b
+ ext v1.8b, \r0\().8b, \r1\().8b, #5
+ uaddl \d0\().8h, \r0\().8b, v1.8b
+ ext v0.8b, \r2\().8b, \r3\().8b, #2
+ mla \d0\().8h, v2.8h, v6.h[1]
+ ext v1.8b, \r2\().8b, \r3\().8b, #3
+ uaddl v0.8h, v0.8b, v1.8b
+ ext v1.8b, \r2\().8b, \r3\().8b, #1
+ mls \d0\().8h, v4.8h, v6.h[0]
+ ext v3.8b, \r2\().8b, \r3\().8b, #4
+ uaddl v1.8h, v1.8b, v3.8b
+ ext v2.8b, \r2\().8b, \r3\().8b, #5
+ uaddl \d1\().8h, \r2\().8b, v2.8b
+ mla \d1\().8h, v0.8h, v6.h[1]
+ mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
- sqrshrun \d0\().8B, \d0\().8H, #5
- sqrshrun \d1\().8B, \d1\().8H, #5
+ sqrshrun \d0\().8b, \d0\().8h, #5
+ sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm
// lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
// Vertical 6-tap filter producing two output rows from seven consecutive
// 8-byte input rows \r0..\r6:
//   \d0 = r0 + r5 + v6.h[1]*(r2 + r3) - v6.h[0]*(r1 + r4)
//   \d1 = r1 + r6 + v6.h[1]*(r3 + r4) - v6.h[0]*(r2 + r5)
// v6.h[0] / v6.h[1] must already hold the coefficients (lowpass_const
// loads 5 and 20 there).
// With \narrow non-zero the 16-bit sums are rounded, shifted right by 5 and
// saturated to unsigned bytes; otherwise they are left as 8 x 16-bit.
//trashes v0-v4
.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
- uaddl v2.8H, \r2\().8B, \r3\().8B
- uaddl v0.8H, \r3\().8B, \r4\().8B
- uaddl v4.8H, \r1\().8B, \r4\().8B
- uaddl v1.8H, \r2\().8B, \r5\().8B
- uaddl \d0\().8H, \r0\().8B, \r5\().8B
- uaddl \d1\().8H, \r1\().8B, \r6\().8B
- mla \d0\().8H, v2.8H, v6.H[1]
- mls \d0\().8H, v4.8H, v6.H[0]
- mla \d1\().8H, v0.8H, v6.H[1]
- mls \d1\().8H, v1.8H, v6.H[0]
+ uaddl v2.8h, \r2\().8b, \r3\().8b
+ uaddl v0.8h, \r3\().8b, \r4\().8b
+ uaddl v4.8h, \r1\().8b, \r4\().8b
+ uaddl v1.8h, \r2\().8b, \r5\().8b
+ uaddl \d0\().8h, \r0\().8b, \r5\().8b
+ uaddl \d1\().8h, \r1\().8b, \r6\().8b
+ mla \d0\().8h, v2.8h, v6.h[1]
+ mls \d0\().8h, v4.8h, v6.h[0]
+ mla \d1\().8h, v0.8h, v6.h[1]
+ mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
- sqrshrun \d0\().8B, \d0\().8H, #5
- sqrshrun \d1\().8B, \d1\().8H, #5
+ sqrshrun \d0\().8b, \d0\().8h, #5
+ sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm
// lowpass_8H r0, r1
// Horizontal 6-tap filter on two rows, each held in a single 16-byte
// register, computed in place without narrowing: on exit \r0 and \r1 hold
// 8 x 16-bit sums. The shifted inputs are obtained by rotating the register
// with ext and using its low eight bytes:
//   out[x] = in[x] + in[x+5] + v6.h[1]*(in[x+2] + in[x+3])
//            - v6.h[0]*(in[x+1] + in[x+4])
// v6.h[0] / v6.h[1] must already hold the coefficients (lowpass_const
// loads 5 and 20 there).
//trashes v0-v5, v7, v30-v31
.macro lowpass_8H r0, r1
- ext v0.16B, \r0\().16B, \r0\().16B, #2
- ext v1.16B, \r0\().16B, \r0\().16B, #3
- uaddl v0.8H, v0.8B, v1.8B
- ext v2.16B, \r0\().16B, \r0\().16B, #1
- ext v3.16B, \r0\().16B, \r0\().16B, #4
- uaddl v2.8H, v2.8B, v3.8B
- ext v30.16B, \r0\().16B, \r0\().16B, #5
- uaddl \r0\().8H, \r0\().8B, v30.8B
- ext v4.16B, \r1\().16B, \r1\().16B, #2
- mla \r0\().8H, v0.8H, v6.H[1]
- ext v5.16B, \r1\().16B, \r1\().16B, #3
- uaddl v4.8H, v4.8B, v5.8B
- ext v7.16B, \r1\().16B, \r1\().16B, #1
- mls \r0\().8H, v2.8H, v6.H[0]
- ext v0.16B, \r1\().16B, \r1\().16B, #4
- uaddl v7.8H, v7.8B, v0.8B
- ext v31.16B, \r1\().16B, \r1\().16B, #5
- uaddl \r1\().8H, \r1\().8B, v31.8B
- mla \r1\().8H, v4.8H, v6.H[1]
- mls \r1\().8H, v7.8H, v6.H[0]
+ ext v0.16b, \r0\().16b, \r0\().16b, #2
+ ext v1.16b, \r0\().16b, \r0\().16b, #3
+ uaddl v0.8h, v0.8b, v1.8b
+ ext v2.16b, \r0\().16b, \r0\().16b, #1
+ ext v3.16b, \r0\().16b, \r0\().16b, #4
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v30.16b, \r0\().16b, \r0\().16b, #5
+ uaddl \r0\().8h, \r0\().8b, v30.8b
+ ext v4.16b, \r1\().16b, \r1\().16b, #2
+ mla \r0\().8h, v0.8h, v6.h[1]
+ ext v5.16b, \r1\().16b, \r1\().16b, #3
+ uaddl v4.8h, v4.8b, v5.8b
+ ext v7.16b, \r1\().16b, \r1\().16b, #1
+ mls \r0\().8h, v2.8h, v6.h[0]
+ ext v0.16b, \r1\().16b, \r1\().16b, #4
+ uaddl v7.8h, v7.8b, v0.8b
+ ext v31.16b, \r1\().16b, \r1\().16b, #5
+ uaddl \r1\().8h, \r1\().8b, v31.8b
+ mla \r1\().8h, v4.8h, v6.h[1]
+ mls \r1\().8h, v7.8h, v6.h[0]
.endm
// lowpass_8_1 r0, r1, d0, narrow=1
// Single-row version of the horizontal 6-tap filter: \r0:\r1 is one input
// row (two consecutive 8-byte registers), \d0 receives 8 outputs:
//   out[x] = in[x] + in[x+5] + v6.h[1]*(in[x+2] + in[x+3])
//            - v6.h[0]*(in[x+1] + in[x+4])
// v6.h[0] / v6.h[1] must already hold the coefficients (lowpass_const
// loads 5 and 20 there).
// With \narrow non-zero the sums are rounded, shifted right by 5 and
// saturated to unsigned bytes; otherwise \d0 is left as 8 x 16-bit.
// trashes v2-v5, v30
.macro lowpass_8_1 r0, r1, d0, narrow=1
- ext v2.8B, \r0\().8B, \r1\().8B, #2
- ext v3.8B, \r0\().8B, \r1\().8B, #3
- uaddl v2.8H, v2.8B, v3.8B
- ext v4.8B, \r0\().8B, \r1\().8B, #1
- ext v5.8B, \r0\().8B, \r1\().8B, #4
- uaddl v4.8H, v4.8B, v5.8B
- ext v30.8B, \r0\().8B, \r1\().8B, #5
- uaddl \d0\().8H, \r0\().8B, v30.8B
- mla \d0\().8H, v2.8H, v6.H[1]
- mls \d0\().8H, v4.8H, v6.H[0]
+ ext v2.8b, \r0\().8b, \r1\().8b, #2
+ ext v3.8b, \r0\().8b, \r1\().8b, #3
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, \r0\().8b, \r1\().8b, #1
+ ext v5.8b, \r0\().8b, \r1\().8b, #4
+ uaddl v4.8h, v4.8b, v5.8b
+ ext v30.8b, \r0\().8b, \r1\().8b, #5
+ uaddl \d0\().8h, \r0\().8b, v30.8b
+ mla \d0\().8h, v2.8h, v6.h[1]
+ mls \d0\().8h, v4.8h, v6.h[0]
.if \narrow
- sqrshrun \d0\().8B, \d0\().8H, #5
+ sqrshrun \d0\().8b, \d0\().8h, #5
.endif
.endm
// lowpass_8.16 r0, r1, r2, r3, r4, r5
// 6-tap filter across six rows of 8 x signed 16-bit values, evaluated in
// 32-bit precision:
//   sum = (r0 + r5) + 20*(r2 + r3) - 5*(r1 + r4)
// The multiplications are built from shifts and adds: 20*x = (x<<4)+(x<<2),
// 5*x = x + (x<<2). The sum is rounded and shifted right by 10, narrowed to
// 16 bits, then saturated to unsigned bytes and written to \r0 (8 bytes).
// Note that v6 is overwritten here, so any coefficients previously placed
// in v6 are lost.
// trashes v0-v7
.macro lowpass_8.16 r0, r1, r2, r3, r4, r5
- saddl v5.4S, \r2\().4H, \r3\().4H
- saddl2 v1.4S, \r2\().8H, \r3\().8H
- saddl v6.4S, \r1\().4H, \r4\().4H
- saddl2 v2.4S, \r1\().8H, \r4\().8H
- saddl v0.4S, \r0\().4H, \r5\().4H
- saddl2 v4.4S, \r0\().8H, \r5\().8H
-
- shl v3.4S, v5.4S, #4
- shl v5.4S, v5.4S, #2
- shl v7.4S, v6.4S, #2
- add v5.4S, v5.4S, v3.4S
- add v6.4S, v6.4S, v7.4S
-
- shl v3.4S, v1.4S, #4
- shl v1.4S, v1.4S, #2
- shl v7.4S, v2.4S, #2
- add v1.4S, v1.4S, v3.4S
- add v2.4S, v2.4S, v7.4S
-
- add v5.4S, v5.4S, v0.4S
- sub v5.4S, v5.4S, v6.4S
-
- add v1.4S, v1.4S, v4.4S
- sub v1.4S, v1.4S, v2.4S
-
- rshrn v5.4H, v5.4S, #10
- rshrn2 v5.8H, v1.4S, #10
-
- sqxtun \r0\().8B, v5.8H
+ saddl v5.4s, \r2\().4h, \r3\().4h
+ saddl2 v1.4s, \r2\().8h, \r3\().8h
+ saddl v6.4s, \r1\().4h, \r4\().4h
+ saddl2 v2.4s, \r1\().8h, \r4\().8h
+ saddl v0.4s, \r0\().4h, \r5\().4h
+ saddl2 v4.4s, \r0\().8h, \r5\().8h
+
+ shl v3.4s, v5.4s, #4
+ shl v5.4s, v5.4s, #2
+ shl v7.4s, v6.4s, #2
+ add v5.4s, v5.4s, v3.4s
+ add v6.4s, v6.4s, v7.4s
+
+ shl v3.4s, v1.4s, #4
+ shl v1.4s, v1.4s, #2
+ shl v7.4s, v2.4s, #2
+ add v1.4s, v1.4s, v3.4s
+ add v2.4s, v2.4s, v7.4s
+
+ add v5.4s, v5.4s, v0.4s
+ sub v5.4s, v5.4s, v6.4s
+
+ add v1.4s, v1.4s, v4.4s
+ sub v1.4s, v1.4s, v2.4s
+
+ rshrn v5.4h, v5.4s, #10
+ rshrn2 v5.8h, v1.4s, #10
+
+ sqxtun \r0\().8b, v5.8h
.endm
function put_h264_qpel16_h_lowpass_neon_packed
@@ -176,19 +176,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
endfunc
// \type\()_h264_qpel8_h_lowpass_neon
// Horizontal lowpass over 8-pixel-wide rows, two rows per loop iteration.
//   x1  = source pointer (16 bytes read per row), x2 = source stride
//   x0  = destination pointer, x3 = destination stride
//   x12 = row count, decremented by 2 per iteration (loop ends at zero)
// lowpass_8 filters both rows into v28 / v16. When \type is "avg" the two
// existing destination rows are loaded and averaged with rounding (urhadd)
// into the result, and x0 is rewound by one stride before the stores.
// No stack frame is set up; registers and v6 must be prepared by the caller.
function \type\()_h264_qpel8_h_lowpass_neon
-1: ld1 {v28.8B, v29.8B}, [x1], x2
- ld1 {v16.8B, v17.8B}, [x1], x2
+1: ld1 {v28.8b, v29.8b}, [x1], x2
+ ld1 {v16.8b, v17.8b}, [x1], x2
subs x12, x12, #2
lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
- ld1 {v2.8B}, [x0], x3
- ld1 {v3.8B}, [x0]
- urhadd v28.8B, v28.8B, v2.8B
- urhadd v16.8B, v16.8B, v3.8B
+ ld1 {v2.8b}, [x0], x3
+ ld1 {v3.8b}, [x0]
+ urhadd v28.8b, v28.8b, v2.8b
+ urhadd v16.8b, v16.8b, v3.8b
sub x0, x0, x3
.endif
- st1 {v28.8B}, [x0], x3
- st1 {v16.8B}, [x0], x3
+ st1 {v28.8b}, [x0], x3
+ st1 {v16.8b}, [x0], x3
b.ne 1b
ret
endfunc
@@ -213,23 +213,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
// \type\()_h264_qpel8_h_lowpass_l2_neon
// Horizontal lowpass over 8-pixel-wide rows, averaged with a second source,
// two rows per loop iteration.
//   x1  = filtered source pointer (16 bytes read per row), stride x2
//   x3  = second source pointer (8 bytes per row), stride x2
//   x0  = destination pointer, stride x2
//   x12 = row count, decremented by 2 per iteration (loop ends at zero)
// lowpass_8 filters both rows into v26 / v27, which are then averaged with
// rounding (urhadd) against the rows from x3. When \type is "avg" the
// existing destination rows are averaged in as well, and x0 is rewound by
// one stride before the stores.
function \type\()_h264_qpel8_h_lowpass_l2_neon
-1: ld1 {v26.8B, v27.8B}, [x1], x2
- ld1 {v16.8B, v17.8B}, [x1], x2
- ld1 {v28.8B}, [x3], x2
- ld1 {v29.8B}, [x3], x2
+1: ld1 {v26.8b, v27.8b}, [x1], x2
+ ld1 {v16.8b, v17.8b}, [x1], x2
+ ld1 {v28.8b}, [x3], x2
+ ld1 {v29.8b}, [x3], x2
subs x12, x12, #2
lowpass_8 v26, v27, v16, v17, v26, v27
- urhadd v26.8B, v26.8B, v28.8B
- urhadd v27.8B, v27.8B, v29.8B
+ urhadd v26.8b, v26.8b, v28.8b
+ urhadd v27.8b, v27.8b, v29.8b
.ifc \type,avg
- ld1 {v2.8B}, [x0], x2
- ld1 {v3.8B}, [x0]
- urhadd v26.8B, v26.8B, v2.8B
- urhadd v27.8B, v27.8B, v3.8B
+ ld1 {v2.8b}, [x0], x2
+ ld1 {v3.8b}, [x0]
+ urhadd v26.8b, v26.8b, v2.8b
+ urhadd v27.8b, v27.8b, v3.8b
sub x0, x0, x2
.endif
- st1 {v26.8B}, [x0], x2
- st1 {v27.8B}, [x0], x2
+ st1 {v26.8b}, [x0], x2
+ st1 {v27.8b}, [x0], x2
b.ne 1b
ret
endfunc
@@ -270,52 +270,52 @@ function \type\()_h264_qpel16_v_lowpass_neon
endfunc
// \type\()_h264_qpel8_v_lowpass_neon
// Vertical lowpass producing an 8x8 block in one pass (no loop).
//   x1 = source pointer, x3 = source stride; 13 rows of 8 bytes are read
//        into v16..v28 (8 output rows need 5 extra rows for the 6-tap filter)
//   x0 = destination pointer, x2 = destination stride
// Four lowpass_8_v expansions each produce two output rows, overwriting
// v16..v23 in place. When \type is "avg" the eight existing destination
// rows are loaded into v24..v31 and averaged with rounding (urhadd) into
// the result, after which x0 is rewound by eight strides before the stores.
function \type\()_h264_qpel8_v_lowpass_neon
- ld1 {v16.8B}, [x1], x3
- ld1 {v17.8B}, [x1], x3
- ld1 {v18.8B}, [x1], x3
- ld1 {v19.8B}, [x1], x3
- ld1 {v20.8B}, [x1], x3
- ld1 {v21.8B}, [x1], x3
- ld1 {v22.8B}, [x1], x3
- ld1 {v23.8B}, [x1], x3
- ld1 {v24.8B}, [x1], x3
- ld1 {v25.8B}, [x1], x3
- ld1 {v26.8B}, [x1], x3
- ld1 {v27.8B}, [x1], x3
- ld1 {v28.8B}, [x1]
+ ld1 {v16.8b}, [x1], x3
+ ld1 {v17.8b}, [x1], x3
+ ld1 {v18.8b}, [x1], x3
+ ld1 {v19.8b}, [x1], x3
+ ld1 {v20.8b}, [x1], x3
+ ld1 {v21.8b}, [x1], x3
+ ld1 {v22.8b}, [x1], x3
+ ld1 {v23.8b}, [x1], x3
+ ld1 {v24.8b}, [x1], x3
+ ld1 {v25.8b}, [x1], x3
+ ld1 {v26.8b}, [x1], x3
+ ld1 {v27.8b}, [x1], x3
+ ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg
- ld1 {v24.8B}, [x0], x2
- ld1 {v25.8B}, [x0], x2
- ld1 {v26.8B}, [x0], x2
- urhadd v16.8B, v16.8B, v24.8B
- ld1 {v27.8B}, [x0], x2
- urhadd v17.8B, v17.8B, v25.8B
- ld1 {v28.8B}, [x0], x2
- urhadd v18.8B, v18.8B, v26.8B
- ld1 {v29.8B}, [x0], x2
- urhadd v19.8B, v19.8B, v27.8B
- ld1 {v30.8B}, [x0], x2
- urhadd v20.8B, v20.8B, v28.8B
- ld1 {v31.8B}, [x0], x2
- urhadd v21.8B, v21.8B, v29.8B
- urhadd v22.8B, v22.8B, v30.8B
- urhadd v23.8B, v23.8B, v31.8B
+ ld1 {v24.8b}, [x0], x2
+ ld1 {v25.8b}, [x0], x2
+ ld1 {v26.8b}, [x0], x2
+ urhadd v16.8b, v16.8b, v24.8b
+ ld1 {v27.8b}, [x0], x2
+ urhadd v17.8b, v17.8b, v25.8b
+ ld1 {v28.8b}, [x0], x2
+ urhadd v18.8b, v18.8b, v26.8b
+ ld1 {v29.8b}, [x0], x2
+ urhadd v19.8b, v19.8b, v27.8b
+ ld1 {v30.8b}, [x0], x2
+ urhadd v20.8b, v20.8b, v28.8b
+ ld1 {v31.8b}, [x0], x2
+ urhadd v21.8b, v21.8b, v29.8b
+ urhadd v22.8b, v22.8b, v30.8b
+ urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x2, lsl #3
.endif
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
- st1 {v18.8B}, [x0], x2
- st1 {v19.8B}, [x0], x2
- st1 {v20.8B}, [x0], x2
- st1 {v21.8B}, [x0], x2
- st1 {v22.8B}, [x0], x2
- st1 {v23.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x0], x2
+ st1 {v20.8b}, [x0], x2
+ st1 {v21.8b}, [x0], x2
+ st1 {v22.8b}, [x0], x2
+ st1 {v23.8b}, [x0], x2
ret
endfunc
@@ -343,70 +343,70 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
- ld1 {v16.8B}, [x1], x3
- ld1 {v17.8B}, [x1], x3
- ld1 {v18.8B}, [x1], x3
- ld1 {v19.8B}, [x1], x3
- ld1 {v20.8B}, [x1], x3
- ld1 {v21.8B}, [x1], x3
- ld1 {v22.8B}, [x1], x3
- ld1 {v23.8B}, [x1], x3
- ld1 {v24.8B}, [x1], x3
- ld1 {v25.8B}, [x1], x3
- ld1 {v26.8B}, [x1], x3
- ld1 {v27.8B}, [x1], x3
- ld1 {v28.8B}, [x1]
+ ld1 {v16.8b}, [x1], x3
+ ld1 {v17.8b}, [x1], x3
+ ld1 {v18.8b}, [x1], x3
+ ld1 {v19.8b}, [x1], x3
+ ld1 {v20.8b}, [x1], x3
+ ld1 {v21.8b}, [x1], x3
+ ld1 {v22.8b}, [x1], x3
+ ld1 {v23.8b}, [x1], x3
+ ld1 {v24.8b}, [x1], x3
+ ld1 {v25.8b}, [x1], x3
+ ld1 {v26.8b}, [x1], x3
+ ld1 {v27.8b}, [x1], x3
+ ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
- ld1 {v24.8B}, [x12], x2
- ld1 {v25.8B}, [x12], x2
- ld1 {v26.8B}, [x12], x2
- ld1 {v27.8B}, [x12], x2
- ld1 {v28.8B}, [x12], x2
- urhadd v16.8B, v24.8B, v16.8B
- urhadd v17.8B, v25.8B, v17.8B
- ld1 {v29.8B}, [x12], x2
- urhadd v18.8B, v26.8B, v18.8B
- urhadd v19.8B, v27.8B, v19.8B
- ld1 {v30.8B}, [x12], x2
- urhadd v20.8B, v28.8B, v20.8B
- urhadd v21.8B, v29.8B, v21.8B
- ld1 {v31.8B}, [x12], x2
- urhadd v22.8B, v30.8B, v22.8B
- urhadd v23.8B, v31.8B, v23.8B
+ ld1 {v24.8b}, [x12], x2
+ ld1 {v25.8b}, [x12], x2
+ ld1 {v26.8b}, [x12], x2
+ ld1 {v27.8b}, [x12], x2
+ ld1 {v28.8b}, [x12], x2
+ urhadd v16.8b, v24.8b, v16.8b
+ urhadd v17.8b, v25.8b, v17.8b
+ ld1 {v29.8b}, [x12], x2
+ urhadd v18.8b, v26.8b, v18.8b
+ urhadd v19.8b, v27.8b, v19.8b
+ ld1 {v30.8b}, [x12], x2
+ urhadd v20.8b, v28.8b, v20.8b
+ urhadd v21.8b, v29.8b, v21.8b
+ ld1 {v31.8b}, [x12], x2
+ urhadd v22.8b, v30.8b, v22.8b
+ urhadd v23.8b, v31.8b, v23.8b
.ifc \type,avg
- ld1 {v24.8B}, [x0], x3
- ld1 {v25.8B}, [x0], x3
- ld1 {v26.8B}, [x0], x3
- urhadd v16.8B, v16.8B, v24.8B
- ld1 {v27.8B}, [x0], x3
- urhadd v17.8B, v17.8B, v25.8B
- ld1 {v28.8B}, [x0], x3
- urhadd v18.8B, v18.8B, v26.8B
- ld1 {v29.8B}, [x0], x3
- urhadd v19.8B, v19.8B, v27.8B
- ld1 {v30.8B}, [x0], x3
- urhadd v20.8B, v20.8B, v28.8B
- ld1 {v31.8B}, [x0], x3
- urhadd v21.8B, v21.8B, v29.8B
- urhadd v22.8B, v22.8B, v30.8B
- urhadd v23.8B, v23.8B, v31.8B
+ ld1 {v24.8b}, [x0], x3
+ ld1 {v25.8b}, [x0], x3
+ ld1 {v26.8b}, [x0], x3
+ urhadd v16.8b, v16.8b, v24.8b
+ ld1 {v27.8b}, [x0], x3
+ urhadd v17.8b, v17.8b, v25.8b
+ ld1 {v28.8b}, [x0], x3
+ urhadd v18.8b, v18.8b, v26.8b
+ ld1 {v29.8b}, [x0], x3
+ urhadd v19.8b, v19.8b, v27.8b
+ ld1 {v30.8b}, [x0], x3
+ urhadd v20.8b, v20.8b, v28.8b
+ ld1 {v31.8b}, [x0], x3
+ urhadd v21.8b, v21.8b, v29.8b
+ urhadd v22.8b, v22.8b, v30.8b
+ urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x3, lsl #3
.endif
- st1 {v16.8B}, [x0], x3
- st1 {v17.8B}, [x0], x3
- st1 {v18.8B}, [x0], x3
- st1 {v19.8B}, [x0], x3
- st1 {v20.8B}, [x0], x3
- st1 {v21.8B}, [x0], x3
- st1 {v22.8B}, [x0], x3
- st1 {v23.8B}, [x0], x3
+ st1 {v16.8b}, [x0], x3
+ st1 {v17.8b}, [x0], x3
+ st1 {v18.8b}, [x0], x3
+ st1 {v19.8b}, [x0], x3
+ st1 {v20.8b}, [x0], x3
+ st1 {v21.8b}, [x0], x3
+ st1 {v22.8b}, [x0], x3
+ st1 {v23.8b}, [x0], x3
ret
endfunc
@@ -417,19 +417,19 @@ endfunc
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const w12
- ld1 {v16.8H}, [x1], x3
- ld1 {v17.8H}, [x1], x3
- ld1 {v18.8H}, [x1], x3
- ld1 {v19.8H}, [x1], x3
- ld1 {v20.8H}, [x1], x3
- ld1 {v21.8H}, [x1], x3
- ld1 {v22.8H}, [x1], x3
- ld1 {v23.8H}, [x1], x3
- ld1 {v24.8H}, [x1], x3
- ld1 {v25.8H}, [x1], x3
- ld1 {v26.8H}, [x1], x3
- ld1 {v27.8H}, [x1], x3
- ld1 {v28.8H}, [x1]
+ ld1 {v16.8h}, [x1], x3
+ ld1 {v17.8h}, [x1], x3
+ ld1 {v18.8h}, [x1], x3
+ ld1 {v19.8h}, [x1], x3
+ ld1 {v20.8h}, [x1], x3
+ ld1 {v21.8h}, [x1], x3
+ ld1 {v22.8h}, [x1], x3
+ ld1 {v23.8h}, [x1], x3
+ ld1 {v24.8h}, [x1], x3
+ ld1 {v25.8h}, [x1], x3
+ ld1 {v26.8h}, [x1], x3
+ ld1 {v27.8h}, [x1], x3
+ ld1 {v28.8h}, [x1]
lowpass_8H v16, v17
lowpass_8H v18, v19
lowpass_8H v20, v21
@@ -458,33 +458,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
- ld1 {v0.8B}, [x0], x2
- ld1 {v1.8B}, [x0], x2
- ld1 {v2.8B}, [x0], x2
- urhadd v16.8B, v16.8B, v0.8B
- ld1 {v3.8B}, [x0], x2
- urhadd v17.8B, v17.8B, v1.8B
- ld1 {v4.8B}, [x0], x2
- urhadd v18.8B, v18.8B, v2.8B
- ld1 {v5.8B}, [x0], x2
- urhadd v19.8B, v19.8B, v3.8B
- ld1 {v6.8B}, [x0], x2
- urhadd v20.8B, v20.8B, v4.8B
- ld1 {v7.8B}, [x0], x2
- urhadd v21.8B, v21.8B, v5.8B
- urhadd v22.8B, v22.8B, v6.8B
- urhadd v23.8B, v23.8B, v7.8B
+ ld1 {v0.8b}, [x0], x2
+ ld1 {v1.8b}, [x0], x2
+ ld1 {v2.8b}, [x0], x2
+ urhadd v16.8b, v16.8b, v0.8b
+ ld1 {v3.8b}, [x0], x2
+ urhadd v17.8b, v17.8b, v1.8b
+ ld1 {v4.8b}, [x0], x2
+ urhadd v18.8b, v18.8b, v2.8b
+ ld1 {v5.8b}, [x0], x2
+ urhadd v19.8b, v19.8b, v3.8b
+ ld1 {v6.8b}, [x0], x2
+ urhadd v20.8b, v20.8b, v4.8b
+ ld1 {v7.8b}, [x0], x2
+ urhadd v21.8b, v21.8b, v5.8b
+ urhadd v22.8b, v22.8b, v6.8b
+ urhadd v23.8b, v23.8b, v7.8b
sub x0, x0, x2, lsl #3
.endif
- st1 {v16.8B}, [x0], x2
- st1 {v17.8B}, [x0], x2
- st1 {v18.8B}, [x0], x2
- st1 {v19.8B}, [x0], x2
- st1 {v20.8B}, [x0], x2
- st1 {v21.8B}, [x0], x2
- st1 {v22.8B}, [x0], x2
- st1 {v23.8B}, [x0], x2
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x0], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x0], x2
+ st1 {v20.8b}, [x0], x2
+ st1 {v21.8b}, [x0], x2
+ st1 {v22.8b}, [x0], x2
+ st1 {v23.8b}, [x0], x2
ret x10
endfunc
@@ -498,45 +498,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
- ld1 {v0.8B, v1.8B}, [x2], #16
- ld1 {v2.8B, v3.8B}, [x2], #16
- urhadd v0.8B, v0.8B, v16.8B
- urhadd v1.8B, v1.8B, v17.8B
- ld1 {v4.8B, v5.8B}, [x2], #16
- urhadd v2.8B, v2.8B, v18.8B
- urhadd v3.8B, v3.8B, v19.8B
- ld1 {v6.8B, v7.8B}, [x2], #16
- urhadd v4.8B, v4.8B, v20.8B
- urhadd v5.8B, v5.8B, v21.8B
- urhadd v6.8B, v6.8B, v22.8B
- urhadd v7.8B, v7.8B, v23.8B
+ ld1 {v0.8b, v1.8b}, [x2], #16
+ ld1 {v2.8b, v3.8b}, [x2], #16
+ urhadd v0.8b, v0.8b, v16.8b
+ urhadd v1.8b, v1.8b, v17.8b
+ ld1 {v4.8b, v5.8b}, [x2], #16
+ urhadd v2.8b, v2.8b, v18.8b
+ urhadd v3.8b, v3.8b, v19.8b
+ ld1 {v6.8b, v7.8b}, [x2], #16
+ urhadd v4.8b, v4.8b, v20.8b
+ urhadd v5.8b, v5.8b, v21.8b
+ urhadd v6.8b, v6.8b, v22.8b
+ urhadd v7.8b, v7.8b, v23.8b
.ifc \type,avg
- ld1 {v16.8B}, [x0], x3
- ld1 {v17.8B}, [x0], x3
- ld1 {v18.8B}, [x0], x3
- urhadd v0.8B, v0.8B, v16.8B
- ld1 {v19.8B}, [x0], x3
- urhadd v1.8B, v1.8B, v17.8B
- ld1 {v20.8B}, [x0], x3
- urhadd v2.8B, v2.8B, v18.8B
- ld1 {v21.8B}, [x0], x3
- urhadd v3.8B, v3.8B, v19.8B
- ld1 {v22.8B}, [x0], x3
- urhadd v4.8B, v4.8B, v20.8B
- ld1 {v23.8B}, [x0], x3
- urhadd v5.8B, v5.8B, v21.8B
- urhadd v6.8B, v6.8B, v22.8B
- urhadd v7.8B, v7.8B, v23.8B
+ ld1 {v16.8b}, [x0], x3
+ ld1 {v17.8b}, [x0], x3
+ ld1 {v18.8b}, [x0], x3
+ urhadd v0.8b, v0.8b, v16.8b
+ ld1 {v19.8b}, [x0], x3
+ urhadd v1.8b, v1.8b, v17.8b
+ ld1 {v20.8b}, [x0], x3
+ urhadd v2.8b, v2.8b, v18.8b
+ ld1 {v21.8b}, [x0], x3
+ urhadd v3.8b, v3.8b, v19.8b
+ ld1 {v22.8b}, [x0], x3
+ urhadd v4.8b, v4.8b, v20.8b
+ ld1 {v23.8b}, [x0], x3
+ urhadd v5.8b, v5.8b, v21.8b
+ urhadd v6.8b, v6.8b, v22.8b
+ urhadd v7.8b, v7.8b, v23.8b
sub x0, x0, x3, lsl #3
.endif
- st1 {v0.8B}, [x0], x3
- st1 {v1.8B}, [x0], x3
- st1 {v2.8B}, [x0], x3
- st1 {v3.8B}, [x0], x3
- st1 {v4.8B}, [x0], x3
- st1 {v5.8B}, [x0], x3
- st1 {v6.8B}, [x0], x3
- st1 {v7.8B}, [x0], x3
+ st1 {v0.8b}, [x0], x3
+ st1 {v1.8b}, [x0], x3
+ st1 {v2.8b}, [x0], x3
+ st1 {v3.8b}, [x0], x3
+ st1 {v4.8b}, [x0], x3
+ st1 {v5.8b}, [x0], x3
+ st1 {v6.8b}, [x0], x3
+ st1 {v7.8b}, [x0], x3
ret x10
endfunc
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index a491c173bb..e7c1549c40 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -26,295 +26,295 @@
.if \avg
mov x12, x0
.endif
-1: ld1 {v0.16B}, [x1], x2
- ld1 {v1.16B}, [x1], x2
- ld1 {v2.16B}, [x1], x2
- ld1 {v3.16B}, [x1], x2
+1: ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x1], x2
+ ld1 {v2.16b}, [x1], x2
+ ld1 {v3.16b}, [x1], x2
.if \avg
- ld1 {v4.16B}, [x12], x2
- urhadd v0.16B, v0.16B, v4.16B
- ld1 {v5.16B}, [x12], x2
- urhadd v1.16B, v1.16B, v5.16B
- ld1 {v6.16B}, [x12], x2
- urhadd v2.16B, v2.16B, v6.16B
- ld1 {v7.16B}, [x12], x2
- urhadd v3.16B, v3.16B, v7.16B
+ ld1 {v4.16b}, [x12], x2
+ urhadd v0.16b, v0.16b, v4.16b
+ ld1 {v5.16b}, [x12], x2
+ urhadd v1.16b, v1.16b, v5.16b
+ ld1 {v6.16b}, [x12], x2
+ urhadd v2.16b, v2.16b, v6.16b
+ ld1 {v7.16b}, [x12], x2
+ urhadd v3.16b, v3.16b, v7.16b
.endif
subs w3, w3, #4
- st1 {v0.16B}, [x0], x2
- st1 {v1.16B}, [x0], x2
- st1 {v2.16B}, [x0], x2
- st1 {v3.16B}, [x0], x2
+ st1 {v0.16b}, [x0], x2
+ st1 {v1.16b}, [x0], x2
+ st1 {v2.16b}, [x0], x2
+ st1 {v3.16b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_x2 rnd=1, avg=0
-1: ld1 {v0.16B, v1.16B}, [x1], x2
- ld1 {v2.16B, v3.16B}, [x1], x2
+1: ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v2.16b, v3.16b}, [x1], x2
subs w3, w3, #2
- ext v1.16B, v0.16B, v1.16B, #1
- avg v0.16B, v0.16B, v1.16B
- ext v3.16B, v2.16B, v3.16B, #1
- avg v2.16B, v2.16B, v3.16B
+ ext v1.16b, v0.16b, v1.16b, #1
+ avg v0.16b, v0.16b, v1.16b
+ ext v3.16b, v2.16b, v3.16b, #1
+ avg v2.16b, v2.16b, v3.16b
.if \avg
- ld1 {v1.16B}, [x0], x2
- ld1 {v3.16B}, [x0]
- urhadd v0.16B, v0.16B, v1.16B
- urhadd v2.16B, v2.16B, v3.16B
+ ld1 {v1.16b}, [x0], x2
+ ld1 {v3.16b}, [x0]
+ urhadd v0.16b, v0.16b, v1.16b
+ urhadd v2.16b, v2.16b, v3.16b
sub x0, x0, x2
.endif
- st1 {v0.16B}, [x0], x2
- st1 {v2.16B}, [x0], x2
+ st1 {v0.16b}, [x0], x2
+ st1 {v2.16b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_y2 rnd=1, avg=0
sub w3, w3, #2
- ld1 {v0.16B}, [x1], x2
- ld1 {v1.16B}, [x1], x2
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x1], x2
1: subs w3, w3, #2
- avg v2.16B, v0.16B, v1.16B
- ld1 {v0.16B}, [x1], x2
- avg v3.16B, v0.16B, v1.16B
- ld1 {v1.16B}, [x1], x2
+ avg v2.16b, v0.16b, v1.16b
+ ld1 {v0.16b}, [x1], x2
+ avg v3.16b, v0.16b, v1.16b
+ ld1 {v1.16b}, [x1], x2
.if \avg
- ld1 {v4.16B}, [x0], x2
- ld1 {v5.16B}, [x0]
- urhadd v2.16B, v2.16B, v4.16B
- urhadd v3.16B, v3.16B, v5.16B
+ ld1 {v4.16b}, [x0], x2
+ ld1 {v5.16b}, [x0]
+ urhadd v2.16b, v2.16b, v4.16b
+ urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
- st1 {v2.16B}, [x0], x2
- st1 {v3.16B}, [x0], x2
+ st1 {v2.16b}, [x0], x2
+ st1 {v3.16b}, [x0], x2
b.ne 1b
- avg v2.16B, v0.16B, v1.16B
- ld1 {v0.16B}, [x1], x2
- avg v3.16B, v0.16B, v1.16B
+ avg v2.16b, v0.16b, v1.16b
+ ld1 {v0.16b}, [x1], x2
+ avg v3.16b, v0.16b, v1.16b
.if \avg
- ld1 {v4.16B}, [x0], x2
- ld1 {v5.16B}, [x0]
- urhadd v2.16B, v2.16B, v4.16B
- urhadd v3.16B, v3.16B, v5.16B
+ ld1 {v4.16b}, [x0], x2
+ ld1 {v5.16b}, [x0]
+ urhadd v2.16b, v2.16b, v4.16b
+ urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
- st1 {v2.16B}, [x0], x2
- st1 {v3.16B}, [x0], x2
+ st1 {v2.16b}, [x0], x2
+ st1 {v3.16b}, [x0], x2
ret
.endm
.macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2
- ld1 {v0.16B, v1.16B}, [x1], x2
- ld1 {v4.16B, v5.16B}, [x1], x2
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v4.16b, v5.16b}, [x1], x2
NRND movi v26.8H, #1
- ext v1.16B, v0.16B, v1.16B, #1
- ext v5.16B, v4.16B, v5.16B, #1
- uaddl v16.8H, v0.8B, v1.8B
- uaddl2 v20.8H, v0.16B, v1.16B
- uaddl v18.8H, v4.8B, v5.8B
- uaddl2 v22.8H, v4.16B, v5.16B
+ ext v1.16b, v0.16b, v1.16b, #1
+ ext v5.16b, v4.16b, v5.16b, #1
+ uaddl v16.8h, v0.8b, v1.8b
+ uaddl2 v20.8h, v0.16b, v1.16b
+ uaddl v18.8h, v4.8b, v5.8b
+ uaddl2 v22.8h, v4.16b, v5.16b
1: subs w3, w3, #2
- ld1 {v0.16B, v1.16B}, [x1], x2
- add v24.8H, v16.8H, v18.8H
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
- ext v30.16B, v0.16B, v1.16B, #1
- add v1.8H, v20.8H, v22.8H
- mshrn v28.8B, v24.8H, #2
+ ext v30.16b, v0.16b, v1.16b, #1
+ add v1.8h, v20.8h, v22.8h
+ mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
- mshrn2 v28.16B, v1.8H, #2
+ mshrn2 v28.16b, v1.8h, #2
.if \avg
- ld1 {v16.16B}, [x0]
- urhadd v28.16B, v28.16B, v16.16B
+ ld1 {v16.16b}, [x0]
+ urhadd v28.16b, v28.16b, v16.16b
.endif
- uaddl v16.8H, v0.8B, v30.8B
- ld1 {v2.16B, v3.16B}, [x1], x2
- uaddl2 v20.8H, v0.16B, v30.16B
- st1 {v28.16B}, [x0], x2
- add v24.8H, v16.8H, v18.8H
+ uaddl v16.8h, v0.8b, v30.8b
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ uaddl2 v20.8h, v0.16b, v30.16b
+ st1 {v28.16b}, [x0], x2
+ add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
- ext v3.16B, v2.16B, v3.16B, #1
- add v0.8H, v20.8H, v22.8H
- mshrn v30.8B, v24.8H, #2
+ ext v3.16b, v2.16b, v3.16b, #1
+ add v0.8h, v20.8h, v22.8h
+ mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
- mshrn2 v30.16B, v0.8H, #2
+ mshrn2 v30.16b, v0.8h, #2
.if \avg
- ld1 {v18.16B}, [x0]
- urhadd v30.16B, v30.16B, v18.16B
+ ld1 {v18.16b}, [x0]
+ urhadd v30.16b, v30.16b, v18.16b
.endif
- uaddl v18.8H, v2.8B, v3.8B
- uaddl2 v22.8H, v2.16B, v3.16B
- st1 {v30.16B}, [x0], x2
+ uaddl v18.8h, v2.8b, v3.8b
+ uaddl2 v22.8h, v2.16b, v3.16b
+ st1 {v30.16b}, [x0], x2
b.gt 1b
- ld1 {v0.16B, v1.16B}, [x1], x2
- add v24.8H, v16.8H, v18.8H
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
- ext v30.16B, v0.16B, v1.16B, #1
- add v1.8H, v20.8H, v22.8H
- mshrn v28.8B, v24.8H, #2
+ ext v30.16b, v0.16b, v1.16b, #1
+ add v1.8h, v20.8h, v22.8h
+ mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
- mshrn2 v28.16B, v1.8H, #2
+ mshrn2 v28.16b, v1.8h, #2
.if \avg
- ld1 {v16.16B}, [x0]
- urhadd v28.16B, v28.16B, v16.16B
+ ld1 {v16.16b}, [x0]
+ urhadd v28.16b, v28.16b, v16.16b
.endif
- uaddl v16.8H, v0.8B, v30.8B
- uaddl2 v20.8H, v0.16B, v30.16B
- st1 {v28.16B}, [x0], x2
- add v24.8H, v16.8H, v18.8H
+ uaddl v16.8h, v0.8b, v30.8b
+ uaddl2 v20.8h, v0.16b, v30.16b
+ st1 {v28.16b}, [x0], x2
+ add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
- add v0.8H, v20.8H, v22.8H
- mshrn v30.8B, v24.8H, #2
+ add v0.8h, v20.8h, v22.8h
+ mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
- mshrn2 v30.16B, v0.8H, #2
+ mshrn2 v30.16b, v0.8h, #2
.if \avg
- ld1 {v18.16B}, [x0]
- urhadd v30.16B, v30.16B, v18.16B
+ ld1 {v18.16b}, [x0]
+ urhadd v30.16b, v30.16b, v18.16b
.endif
- st1 {v30.16B}, [x0], x2
+ st1 {v30.16b}, [x0], x2
ret
.endm
.macro pixels8 rnd=1, avg=0
-1: ld1 {v0.8B}, [x1], x2
- ld1 {v1.8B}, [x1], x2
- ld1 {v2.8B}, [x1], x2
- ld1 {v3.8B}, [x1], x2
+1: ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
+ ld1 {v2.8b}, [x1], x2
+ ld1 {v3.8b}, [x1], x2
.if \avg
- ld1 {v4.8B}, [x0], x2
- urhadd v0.8B, v0.8B, v4.8B
- ld1 {v5.8B}, [x0], x2
- urhadd v1.8B, v1.8B, v5.8B
- ld1 {v6.8B}, [x0], x2
- urhadd v2.8B, v2.8B, v6.8B
- ld1 {v7.8B}, [x0], x2
- urhadd v3.8B, v3.8B, v7.8B
+ ld1 {v4.8b}, [x0], x2
+ urhadd v0.8b, v0.8b, v4.8b
+ ld1 {v5.8b}, [x0], x2
+ urhadd v1.8b, v1.8b, v5.8b
+ ld1 {v6.8b}, [x0], x2
+ urhadd v2.8b, v2.8b, v6.8b
+ ld1 {v7.8b}, [x0], x2
+ urhadd v3.8b, v3.8b, v7.8b
sub x0, x0, x2, lsl #2
.endif
subs w3, w3, #4
- st1 {v0.8B}, [x0], x2
- st1 {v1.8B}, [x0], x2
- st1 {v2.8B}, [x0], x2
- st1 {v3.8B}, [x0], x2
+ st1 {v0.8b}, [x0], x2
+ st1 {v1.8b}, [x0], x2
+ st1 {v2.8b}, [x0], x2
+ st1 {v3.8b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_x2 rnd=1, avg=0
-1: ld1 {v0.8B, v1.8B}, [x1], x2
- ext v1.8B, v0.8B, v1.8B, #1
- ld1 {v2.8B, v3.8B}, [x1], x2
- ext v3.8B, v2.8B, v3.8B, #1
+1: ld1 {v0.8b, v1.8b}, [x1], x2
+ ext v1.8b, v0.8b, v1.8b, #1
+ ld1 {v2.8b, v3.8b}, [x1], x2
+ ext v3.8b, v2.8b, v3.8b, #1
subs w3, w3, #2
- avg v0.8B, v0.8B, v1.8B
- avg v2.8B, v2.8B, v3.8B
+ avg v0.8b, v0.8b, v1.8b
+ avg v2.8b, v2.8b, v3.8b
.if \avg
- ld1 {v4.8B}, [x0], x2
- ld1 {v5.8B}, [x0]
- urhadd v0.8B, v0.8B, v4.8B
- urhadd v2.8B, v2.8B, v5.8B
+ ld1 {v4.8b}, [x0], x2
+ ld1 {v5.8b}, [x0]
+ urhadd v0.8b, v0.8b, v4.8b
+ urhadd v2.8b, v2.8b, v5.8b
sub x0, x0, x2
.endif
- st1 {v0.8B}, [x0], x2
- st1 {v2.8B}, [x0], x2
+ st1 {v0.8b}, [x0], x2
+ st1 {v2.8b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_y2 rnd=1, avg=0
sub w3, w3, #2
- ld1 {v0.8B}, [x1], x2
- ld1 {v1.8B}, [x1], x2
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
1: subs w3, w3, #2
- avg v4.8B, v0.8B, v1.8B
- ld1 {v0.8B}, [x1], x2
- avg v5.8B, v0.8B, v1.8B
- ld1 {v1.8B}, [x1], x2
+ avg v4.8b, v0.8b, v1.8b
+ ld1 {v0.8b}, [x1], x2
+ avg v5.8b, v0.8b, v1.8b
+ ld1 {v1.8b}, [x1], x2
.if \avg
- ld1 {v2.8B}, [x0], x2
- ld1 {v3.8B}, [x0]
- urhadd v4.8B, v4.8B, v2.8B
- urhadd v5.8B, v5.8B, v3.8B
+ ld1 {v2.8b}, [x0], x2
+ ld1 {v3.8b}, [x0]
+ urhadd v4.8b, v4.8b, v2.8b
+ urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
- st1 {v4.8B}, [x0], x2
- st1 {v5.8B}, [x0], x2
+ st1 {v4.8b}, [x0], x2
+ st1 {v5.8b}, [x0], x2
b.ne 1b
- avg v4.8B, v0.8B, v1.8B
- ld1 {v0.8B}, [x1], x2
- avg v5.8B, v0.8B, v1.8B
+ avg v4.8b, v0.8b, v1.8b
+ ld1 {v0.8b}, [x1], x2
+ avg v5.8b, v0.8b, v1.8b
.if \avg
- ld1 {v2.8B}, [x0], x2
- ld1 {v3.8B}, [x0]
- urhadd v4.8B, v4.8B, v2.8B
- urhadd v5.8B, v5.8B, v3.8B
+ ld1 {v2.8b}, [x0], x2
+ ld1 {v3.8b}, [x0]
+ urhadd v4.8b, v4.8b, v2.8b
+ urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
- st1 {v4.8B}, [x0], x2
- st1 {v5.8B}, [x0], x2
+ st1 {v4.8b}, [x0], x2
+ st1 {v5.8b}, [x0], x2
ret
.endm
.macro pixels8_xy2 rnd=1, avg=0
sub w3, w3, #2
- ld1 {v0.16B}, [x1], x2
- ld1 {v1.16B}, [x1], x2
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x1], x2
NRND movi v19.8H, #1
- ext v4.16B, v0.16B, v4.16B, #1
- ext v6.16B, v1.16B, v6.16B, #1
- uaddl v16.8H, v0.8B, v4.8B
- uaddl v17.8H, v1.8B, v6.8B
+ ext v4.16b, v0.16b, v4.16b, #1
+ ext v6.16b, v1.16b, v6.16b, #1
+ uaddl v16.8h, v0.8b, v4.8b
+ uaddl v17.8h, v1.8b, v6.8b
1: subs w3, w3, #2
- ld1 {v0.16B}, [x1], x2
- add v18.8H, v16.8H, v17.8H
- ext v4.16B, v0.16B, v4.16B, #1
+ ld1 {v0.16b}, [x1], x2
+ add v18.8h, v16.8h, v17.8h
+ ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
- uaddl v16.8H, v0.8B, v4.8B
- mshrn v5.8B, v18.8H, #2
- ld1 {v1.16B}, [x1], x2
- add v18.8H, v16.8H, v17.8H
+ uaddl v16.8h, v0.8b, v4.8b
+ mshrn v5.8b, v18.8h, #2
+ ld1 {v1.16b}, [x1], x2
+ add v18.8h, v16.8h, v17.8h
.if \avg
- ld1 {v7.8B}, [x0]
- urhadd v5.8B, v5.8B, v7.8B
+ ld1 {v7.8b}, [x0]
+ urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
- st1 {v5.8B}, [x0], x2
- mshrn v7.8B, v18.8H, #2
+ st1 {v5.8b}, [x0], x2
+ mshrn v7.8b, v18.8h, #2
.if \avg
- ld1 {v5.8B}, [x0]
- urhadd v7.8B, v7.8B, v5.8B
+ ld1 {v5.8b}, [x0]
+ urhadd v7.8b, v7.8b, v5.8b
.endif
- ext v6.16B, v1.16B, v6.16B, #1
- uaddl v17.8H, v1.8B, v6.8B
- st1 {v7.8B}, [x0], x2
+ ext v6.16b, v1.16b, v6.16b, #1
+ uaddl v17.8h, v1.8b, v6.8b
+ st1 {v7.8b}, [x0], x2
b.gt 1b
- ld1 {v0.16B}, [x1], x2
- add v18.8H, v16.8H, v17.8H
- ext v4.16B, v0.16B, v4.16B, #1
+ ld1 {v0.16b}, [x1], x2
+ add v18.8h, v16.8h, v17.8h
+ ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
- uaddl v16.8H, v0.8B, v4.8B
- mshrn v5.8B, v18.8H, #2
- add v18.8H, v16.8H, v17.8H
+ uaddl v16.8h, v0.8b, v4.8b
+ mshrn v5.8b, v18.8h, #2
+ add v18.8h, v16.8h, v17.8h
.if \avg
- ld1 {v7.8B}, [x0]
- urhadd v5.8B, v5.8B, v7.8B
+ ld1 {v7.8b}, [x0]
+ urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
- st1 {v5.8B}, [x0], x2
- mshrn v7.8B, v18.8H, #2
+ st1 {v5.8b}, [x0], x2
+ mshrn v7.8b, v18.8h, #2
.if \avg
- ld1 {v5.8B}, [x0]
- urhadd v7.8B, v7.8B, v5.8B
+ ld1 {v5.8b}, [x0]
+ urhadd v7.8b, v7.8b, v5.8b
.endif
- st1 {v7.8B}, [x0], x2
+ st1 {v7.8b}, [x0], x2
ret
.endm
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cf86e5081d..7500c324bd 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1099,7 +1099,7 @@ function vsse_intra16_neon, export=1
cbnz w4, 2b
3:
- add v16.4s, v16.4s, v17.4S
+ add v16.4s, v16.4s, v17.4s
uaddlv d17, v16.4s
fmov w0, s17
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index bc105e4861..f6fb13bea0 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -28,146 +28,146 @@
.endm
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
- trn1 \r8\().8B, \r0\().8B, \r1\().8B
- trn2 \r9\().8B, \r0\().8B, \r1\().8B
- trn1 \r1\().8B, \r2\().8B, \r3\().8B
- trn2 \r3\().8B, \r2\().8B, \r3\().8B
- trn1 \r0\().8B, \r4\().8B, \r5\().8B
- trn2 \r5\().8B, \r4\().8B, \r5\().8B
- trn1 \r2\().8B, \r6\().8B, \r7\().8B
- trn2 \r7\().8B, \r6\().8B, \r7\().8B
-
- trn1 \r4\().4H, \r0\().4H, \r2\().4H
- trn2 \r2\().4H, \r0\().4H, \r2\().4H
- trn1 \r6\().4H, \r5\().4H, \r7\().4H
- trn2 \r7\().4H, \r5\().4H, \r7\().4H
- trn1 \r5\().4H, \r9\().4H, \r3\().4H
- trn2 \r9\().4H, \r9\().4H, \r3\().4H
- trn1 \r3\().4H, \r8\().4H, \r1\().4H
- trn2 \r8\().4H, \r8\().4H, \r1\().4H
-
- trn1 \r0\().2S, \r3\().2S, \r4\().2S
- trn2 \r4\().2S, \r3\().2S, \r4\().2S
-
- trn1 \r1\().2S, \r5\().2S, \r6\().2S
- trn2 \r5\().2S, \r5\().2S, \r6\().2S
-
- trn2 \r6\().2S, \r8\().2S, \r2\().2S
- trn1 \r2\().2S, \r8\().2S, \r2\().2S
-
- trn1 \r3\().2S, \r9\().2S, \r7\().2S
- trn2 \r7\().2S, \r9\().2S, \r7\().2S
+ trn1 \r8\().8b, \r0\().8b, \r1\().8b
+ trn2 \r9\().8b, \r0\().8b, \r1\().8b
+ trn1 \r1\().8b, \r2\().8b, \r3\().8b
+ trn2 \r3\().8b, \r2\().8b, \r3\().8b
+ trn1 \r0\().8b, \r4\().8b, \r5\().8b
+ trn2 \r5\().8b, \r4\().8b, \r5\().8b
+ trn1 \r2\().8b, \r6\().8b, \r7\().8b
+ trn2 \r7\().8b, \r6\().8b, \r7\().8b
+
+ trn1 \r4\().4h, \r0\().4h, \r2\().4h
+ trn2 \r2\().4h, \r0\().4h, \r2\().4h
+ trn1 \r6\().4h, \r5\().4h, \r7\().4h
+ trn2 \r7\().4h, \r5\().4h, \r7\().4h
+ trn1 \r5\().4h, \r9\().4h, \r3\().4h
+ trn2 \r9\().4h, \r9\().4h, \r3\().4h
+ trn1 \r3\().4h, \r8\().4h, \r1\().4h
+ trn2 \r8\().4h, \r8\().4h, \r1\().4h
+
+ trn1 \r0\().2s, \r3\().2s, \r4\().2s
+ trn2 \r4\().2s, \r3\().2s, \r4\().2s
+
+ trn1 \r1\().2s, \r5\().2s, \r6\().2s
+ trn2 \r5\().2s, \r5\().2s, \r6\().2s
+
+ trn2 \r6\().2s, \r8\().2s, \r2\().2s
+ trn1 \r2\().2s, \r8\().2s, \r2\().2s
+
+ trn1 \r3\().2s, \r9\().2s, \r7\().2s
+ trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
- trn1 \t0\().16B, \r0\().16B, \r1\().16B
- trn2 \t1\().16B, \r0\().16B, \r1\().16B
- trn1 \r1\().16B, \r2\().16B, \r3\().16B
- trn2 \r3\().16B, \r2\().16B, \r3\().16B
- trn1 \r0\().16B, \r4\().16B, \r5\().16B
- trn2 \r5\().16B, \r4\().16B, \r5\().16B
- trn1 \r2\().16B, \r6\().16B, \r7\().16B
- trn2 \r7\().16B, \r6\().16B, \r7\().16B
-
- trn1 \r4\().8H, \r0\().8H, \r2\().8H
- trn2 \r2\().8H, \r0\().8H, \r2\().8H
- trn1 \r6\().8H, \r5\().8H, \r7\().8H
- trn2 \r7\().8H, \r5\().8H, \r7\().8H
- trn1 \r5\().8H, \t1\().8H, \r3\().8H
- trn2 \t1\().8H, \t1\().8H, \r3\().8H
- trn1 \r3\().8H, \t0\().8H, \r1\().8H
- trn2 \t0\().8H, \t0\().8H, \r1\().8H
-
- trn1 \r0\().4S, \r3\().4S, \r4\().4S
- trn2 \r4\().4S, \r3\().4S, \r4\().4S
-
- trn1 \r1\().4S, \r5\().4S, \r6\().4S
- trn2 \r5\().4S, \r5\().4S, \r6\().4S
-
- trn2 \r6\().4S, \t0\().4S, \r2\().4S
- trn1 \r2\().4S, \t0\().4S, \r2\().4S
-
- trn1 \r3\().4S, \t1\().4S, \r7\().4S
- trn2 \r7\().4S, \t1\().4S, \r7\().4S
+ trn1 \t0\().16b, \r0\().16b, \r1\().16b
+ trn2 \t1\().16b, \r0\().16b, \r1\().16b
+ trn1 \r1\().16b, \r2\().16b, \r3\().16b
+ trn2 \r3\().16b, \r2\().16b, \r3\().16b
+ trn1 \r0\().16b, \r4\().16b, \r5\().16b
+ trn2 \r5\().16b, \r4\().16b, \r5\().16b
+ trn1 \r2\().16b, \r6\().16b, \r7\().16b
+ trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+ trn1 \r4\().8h, \r0\().8h, \r2\().8h
+ trn2 \r2\().8h, \r0\().8h, \r2\().8h
+ trn1 \r6\().8h, \r5\().8h, \r7\().8h
+ trn2 \r7\().8h, \r5\().8h, \r7\().8h
+ trn1 \r5\().8h, \t1\().8h, \r3\().8h
+ trn2 \t1\().8h, \t1\().8h, \r3\().8h
+ trn1 \r3\().8h, \t0\().8h, \r1\().8h
+ trn2 \t0\().8h, \t0\().8h, \r1\().8h
+
+ trn1 \r0\().4s, \r3\().4s, \r4\().4s
+ trn2 \r4\().4s, \r3\().4s, \r4\().4s
+
+ trn1 \r1\().4s, \r5\().4s, \r6\().4s
+ trn2 \r5\().4s, \r5\().4s, \r6\().4s
+
+ trn2 \r6\().4s, \t0\().4s, \r2\().4s
+ trn1 \r2\().4s, \t0\().4s, \r2\().4s
+
+ trn1 \r3\().4s, \t1\().4s, \r7\().4s
+ trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
- trn1 \t4\().16B, \r0\().16B, \r1\().16B
- trn2 \t5\().16B, \r0\().16B, \r1\().16B
- trn1 \t6\().16B, \r2\().16B, \r3\().16B
- trn2 \t7\().16B, \r2\().16B, \r3\().16B
-
- trn1 \r0\().8H, \t4\().8H, \t6\().8H
- trn2 \r2\().8H, \t4\().8H, \t6\().8H
- trn1 \r1\().8H, \t5\().8H, \t7\().8H
- trn2 \r3\().8H, \t5\().8H, \t7\().8H
+ trn1 \t4\().16b, \r0\().16b, \r1\().16b
+ trn2 \t5\().16b, \r0\().16b, \r1\().16b
+ trn1 \t6\().16b, \r2\().16b, \r3\().16b
+ trn2 \t7\().16b, \r2\().16b, \r3\().16b
+
+ trn1 \r0\().8h, \t4\().8h, \t6\().8h
+ trn2 \r2\().8h, \t4\().8h, \t6\().8h
+ trn1 \r1\().8h, \t5\().8h, \t7\().8h
+ trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
- trn1 \t4\().8B, \r0\().8B, \r1\().8B
- trn2 \t5\().8B, \r0\().8B, \r1\().8B
- trn1 \t6\().8B, \r2\().8B, \r3\().8B
- trn2 \t7\().8B, \r2\().8B, \r3\().8B
-
- trn1 \r0\().4H, \t4\().4H, \t6\().4H
- trn2 \r2\().4H, \t4\().4H, \t6\().4H
- trn1 \r1\().4H, \t5\().4H, \t7\().4H
- trn2 \r3\().4H, \t5\().4H, \t7\().4H
+ trn1 \t4\().8b, \r0\().8b, \r1\().8b
+ trn2 \t5\().8b, \r0\().8b, \r1\().8b
+ trn1 \t6\().8b, \r2\().8b, \r3\().8b
+ trn2 \t7\().8b, \r2\().8b, \r3\().8b
+
+ trn1 \r0\().4h, \t4\().4h, \t6\().4h
+ trn2 \r2\().4h, \t4\().4h, \t6\().4h
+ trn1 \r1\().4h, \t5\().4h, \t7\().4h
+ trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
- trn1 \r4\().4H, \r0\().4H, \r1\().4H
- trn2 \r5\().4H, \r0\().4H, \r1\().4H
- trn1 \r6\().4H, \r2\().4H, \r3\().4H
- trn2 \r7\().4H, \r2\().4H, \r3\().4H
-
- trn1 \r0\().2S, \r4\().2S, \r6\().2S
- trn2 \r2\().2S, \r4\().2S, \r6\().2S
- trn1 \r1\().2S, \r5\().2S, \r7\().2S
- trn2 \r3\().2S, \r5\().2S, \r7\().2S
+ trn1 \r4\().4h, \r0\().4h, \r1\().4h
+ trn2 \r5\().4h, \r0\().4h, \r1\().4h
+ trn1 \r6\().4h, \r2\().4h, \r3\().4h
+ trn2 \r7\().4h, \r2\().4h, \r3\().4h
+
+ trn1 \r0\().2s, \r4\().2s, \r6\().2s
+ trn2 \r2\().2s, \r4\().2s, \r6\().2s
+ trn1 \r1\().2s, \r5\().2s, \r7\().2s
+ trn2 \r3\().2s, \r5\().2s, \r7\().2s
.endm
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
- trn1 \t4\().8H, \r0\().8H, \r1\().8H
- trn2 \t5\().8H, \r0\().8H, \r1\().8H
- trn1 \t6\().8H, \r2\().8H, \r3\().8H
- trn2 \t7\().8H, \r2\().8H, \r3\().8H
-
- trn1 \r0\().4S, \t4\().4S, \t6\().4S
- trn2 \r2\().4S, \t4\().4S, \t6\().4S
- trn1 \r1\().4S, \t5\().4S, \t7\().4S
- trn2 \r3\().4S, \t5\().4S, \t7\().4S
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
- trn1 \r8\().8H, \r0\().8H, \r1\().8H
- trn2 \r9\().8H, \r0\().8H, \r1\().8H
- trn1 \r1\().8H, \r2\().8H, \r3\().8H
- trn2 \r3\().8H, \r2\().8H, \r3\().8H
- trn1 \r0\().8H, \r4\().8H, \r5\().8H
- trn2 \r5\().8H, \r4\().8H, \r5\().8H
- trn1 \r2\().8H, \r6\().8H, \r7\().8H
- trn2 \r7\().8H, \r6\().8H, \r7\().8H
-
- trn1 \r4\().4S, \r0\().4S, \r2\().4S
- trn2 \r2\().4S, \r0\().4S, \r2\().4S
- trn1 \r6\().4S, \r5\().4S, \r7\().4S
- trn2 \r7\().4S, \r5\().4S, \r7\().4S
- trn1 \r5\().4S, \r9\().4S, \r3\().4S
- trn2 \r9\().4S, \r9\().4S, \r3\().4S
- trn1 \r3\().4S, \r8\().4S, \r1\().4S
- trn2 \r8\().4S, \r8\().4S, \r1\().4S
-
- trn1 \r0\().2D, \r3\().2D, \r4\().2D
- trn2 \r4\().2D, \r3\().2D, \r4\().2D
-
- trn1 \r1\().2D, \r5\().2D, \r6\().2D
- trn2 \r5\().2D, \r5\().2D, \r6\().2D
-
- trn2 \r6\().2D, \r8\().2D, \r2\().2D
- trn1 \r2\().2D, \r8\().2D, \r2\().2D
-
- trn1 \r3\().2D, \r9\().2D, \r7\().2D
- trn2 \r7\().2D, \r9\().2D, \r7\().2D
+ trn1 \r8\().8h, \r0\().8h, \r1\().8h
+ trn2 \r9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \r9\().4s, \r3\().4s
+ trn2 \r9\().4s, \r9\().4s, \r3\().4s
+ trn1 \r3\().4s, \r8\().4s, \r1\().4s
+ trn2 \r8\().4s, \r8\().4s, \r1\().4s
+
+ trn1 \r0\().2d, \r3\().2d, \r4\().2d
+ trn2 \r4\().2d, \r3\().2d, \r4\().2d
+
+ trn1 \r1\().2d, \r5\().2d, \r6\().2d
+ trn2 \r5\().2d, \r5\().2d, \r6\().2d
+
+ trn2 \r6\().2d, \r8\().2d, \r2\().2d
+ trn1 \r2\().2d, \r8\().2d, \r2\().2d
+
+ trn1 \r3\().2d, \r9\().2d, \r7\().2d
+ trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
index d23717e760..1fdde6ccb6 100644
--- a/libavcodec/aarch64/sbrdsp_neon.S
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
add x3, x0, #192*4
add x4, x0, #256*4
mov x5, #64
-1: ld1 {v0.4S}, [x0]
- ld1 {v1.4S}, [x1], #16
- fadd v0.4S, v0.4S, v1.4S
- ld1 {v2.4S}, [x2], #16
- fadd v0.4S, v0.4S, v2.4S
- ld1 {v3.4S}, [x3], #16
- fadd v0.4S, v0.4S, v3.4S
- ld1 {v4.4S}, [x4], #16
- fadd v0.4S, v0.4S, v4.4S
- st1 {v0.4S}, [x0], #16
+1: ld1 {v0.4s}, [x0]
+ ld1 {v1.4s}, [x1], #16
+ fadd v0.4s, v0.4s, v1.4s
+ ld1 {v2.4s}, [x2], #16
+ fadd v0.4s, v0.4s, v2.4s
+ ld1 {v3.4s}, [x3], #16
+ fadd v0.4s, v0.4s, v3.4s
+ ld1 {v4.4s}, [x4], #16
+ fadd v0.4s, v0.4s, v4.4s
+ st1 {v0.4s}, [x0], #16
subs x5, x5, #4
b.gt 1b
ret
endfunc
function ff_sbr_sum_square_neon, export=1
- movi v0.4S, #0
-1: ld1 {v1.4S}, [x0], #16
- fmla v0.4S, v1.4S, v1.4S
+ movi v0.4s, #0
+1: ld1 {v1.4s}, [x0], #16
+ fmla v0.4s, v1.4s, v1.4s
subs w1, w1, #2
b.gt 1b
- faddp v0.4S, v0.4S, v0.4S
- faddp v0.4S, v0.4S, v0.4S
+ faddp v0.4s, v0.4s, v0.4s
+ faddp v0.4s, v0.4s, v0.4s
ret
endfunc
function ff_sbr_neg_odd_64_neon, export=1
mov x1, x0
- movi v5.4S, #1<<7, lsl #24
- ld2 {v0.4S, v1.4S}, [x0], #32
- eor v1.16B, v1.16B, v5.16B
- ld2 {v2.4S, v3.4S}, [x0], #32
+ movi v5.4s, #1<<7, lsl #24
+ ld2 {v0.4s, v1.4s}, [x0], #32
+ eor v1.16b, v1.16b, v5.16b
+ ld2 {v2.4s, v3.4s}, [x0], #32
.rept 3
- st2 {v0.4S, v1.4S}, [x1], #32
- eor v3.16B, v3.16B, v5.16B
- ld2 {v0.4S, v1.4S}, [x0], #32
- st2 {v2.4S, v3.4S}, [x1], #32
- eor v1.16B, v1.16B, v5.16B
- ld2 {v2.4S, v3.4S}, [x0], #32
+ st2 {v0.4s, v1.4s}, [x1], #32
+ eor v3.16b, v3.16b, v5.16b
+ ld2 {v0.4s, v1.4s}, [x0], #32
+ st2 {v2.4s, v3.4s}, [x1], #32
+ eor v1.16b, v1.16b, v5.16b
+ ld2 {v2.4s, v3.4s}, [x0], #32
.endr
- eor v3.16B, v3.16B, v5.16B
- st2 {v0.4S, v1.4S}, [x1], #32
- st2 {v2.4S, v3.4S}, [x1], #32
+ eor v3.16b, v3.16b, v5.16b
+ st2 {v0.4s, v1.4s}, [x1], #32
+ st2 {v2.4s, v3.4s}, [x1], #32
ret
endfunc
@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
add x2, x0, #64*4
mov x3, #-16
mov x4, #-4
- movi v6.4S, #1<<7, lsl #24
- ld1 {v0.2S}, [x0], #8
- st1 {v0.2S}, [x2], #8
+ movi v6.4s, #1<<7, lsl #24
+ ld1 {v0.2s}, [x0], #8
+ st1 {v0.2s}, [x2], #8
.rept 7
- ld1 {v1.4S}, [x1], x3
- ld1 {v2.4S}, [x0], #16
- eor v1.16B, v1.16B, v6.16B
- rev64 v1.4S, v1.4S
- ext v1.16B, v1.16B, v1.16B, #8
- st2 {v1.4S, v2.4S}, [x2], #32
+ ld1 {v1.4s}, [x1], x3
+ ld1 {v2.4s}, [x0], #16
+ eor v1.16b, v1.16b, v6.16b
+ rev64 v1.4s, v1.4s
+ ext v1.16b, v1.16b, v1.16b, #8
+ st2 {v1.4s, v2.4s}, [x2], #32
.endr
add x1, x1, #8
- ld1 {v1.2S}, [x1], x4
- ld1 {v2.2S}, [x0], #8
- ld1 {v1.S}[3], [x1]
- ld1 {v2.S}[2], [x0]
- eor v1.16B, v1.16B, v6.16B
- rev64 v1.4S, v1.4S
- st2 {v1.2S, v2.2S}, [x2], #16
- st2 {v1.S, v2.S}[2], [x2]
+ ld1 {v1.2s}, [x1], x4
+ ld1 {v2.2s}, [x0], #8
+ ld1 {v1.s}[3], [x1]
+ ld1 {v2.s}[2], [x0]
+ eor v1.16b, v1.16b, v6.16b
+ rev64 v1.4s, v1.4s
+ st2 {v1.2s, v2.2s}, [x2], #16
+ st2 {v1.s, v2.s}[2], [x2]
ret
endfunc
@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
add x2, x1, #60*4
mov x3, #-16
mov x4, #32
- movi v6.4S, #1<<7, lsl #24
-1: ld1 {v0.4S}, [x2], x3
- ld1 {v1.4S}, [x1], #16
- eor v0.16B, v0.16B, v6.16B
- rev64 v0.4S, v0.4S
- ext v0.16B, v0.16B, v0.16B, #8
- st2 {v0.4S, v1.4S}, [x0], #32
+ movi v6.4s, #1<<7, lsl #24
+1: ld1 {v0.4s}, [x2], x3
+ ld1 {v1.4s}, [x1], #16
+ eor v0.16b, v0.16b, v6.16b
+ rev64 v0.4s, v0.4s
+ ext v0.16b, v0.16b, v0.16b, #8
+ st2 {v0.4s, v1.4s}, [x0], #32
subs x4, x4, #4
b.gt 1b
ret
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
add x2, x0, #60*4
mov x3, #-32
mov x4, #32
- movi v2.4S, #1<<7, lsl #24
-1: ld2 {v0.4S, v1.4S}, [x1], x3
- eor v0.16B, v0.16B, v2.16B
- rev64 v1.4S, v1.4S
- ext v1.16B, v1.16B, v1.16B, #8
- st1 {v0.4S}, [x2]
- st1 {v1.4S}, [x0], #16
+ movi v2.4s, #1<<7, lsl #24
+1: ld2 {v0.4s, v1.4s}, [x1], x3
+ eor v0.16b, v0.16b, v2.16b
+ rev64 v1.4s, v1.4s
+ ext v1.16b, v1.16b, v1.16b, #8
+ st1 {v0.4s}, [x2]
+ st1 {v1.4s}, [x0], #16
sub x2, x2, #16
subs x4, x4, #4
b.gt 1b
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
add x3, x0, #124*4
mov x4, #64
mov x5, #-16
-1: ld1 {v0.4S}, [x1], #16
- ld1 {v1.4S}, [x2], x5
- rev64 v2.4S, v0.4S
- ext v2.16B, v2.16B, v2.16B, #8
- rev64 v3.4S, v1.4S
- ext v3.16B, v3.16B, v3.16B, #8
- fadd v1.4S, v1.4S, v2.4S
- fsub v0.4S, v0.4S, v3.4S
- st1 {v0.4S}, [x0], #16
- st1 {v1.4S}, [x3], x5
+1: ld1 {v0.4s}, [x1], #16
+ ld1 {v1.4s}, [x2], x5
+ rev64 v2.4s, v0.4s
+ ext v2.16b, v2.16b, v2.16b, #8
+ rev64 v3.4s, v1.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ fadd v1.4s, v1.4s, v2.4s
+ fsub v0.4s, v0.4s, v3.4s
+ st1 {v0.4s}, [x0], #16
+ st1 {v1.4s}, [x3], x5
subs x4, x4, #4
b.gt 1b
ret
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
sxtw x4, w4
sxtw x5, w5
movrel x6, factors
- ld1 {v7.4S}, [x6]
- dup v1.4S, v0.S[0]
- mov v2.8B, v1.8B
- mov v2.S[2], v7.S[0]
- mov v2.S[3], v7.S[0]
- fmul v1.4S, v1.4S, v2.4S
- ld1 {v0.D}[0], [x3]
- ld1 {v0.D}[1], [x2]
- fmul v0.4S, v0.4S, v1.4S
- fmul v1.4S, v0.4S, v7.4S
- rev64 v0.4S, v0.4S
+ ld1 {v7.4s}, [x6]
+ dup v1.4s, v0.s[0]
+ mov v2.8b, v1.8b
+ mov v2.s[2], v7.s[0]
+ mov v2.s[3], v7.s[0]
+ fmul v1.4s, v1.4s, v2.4s
+ ld1 {v0.d}[0], [x3]
+ ld1 {v0.d}[1], [x2]
+ fmul v0.4s, v0.4s, v1.4s
+ fmul v1.4s, v0.4s, v7.4s
+ rev64 v0.4s, v0.4s
sub x7, x5, x4
add x0, x0, x4, lsl #3
add x1, x1, x4, lsl #3
sub x1, x1, #16
-1: ld1 {v2.4S}, [x1], #16
- ld1 {v3.2S}, [x1]
- fmul v4.4S, v2.4S, v1.4S
- fmul v5.4S, v2.4S, v0.4S
- faddp v4.4S, v4.4S, v4.4S
- faddp v5.4S, v5.4S, v5.4S
- faddp v4.4S, v4.4S, v4.4S
- faddp v5.4S, v5.4S, v5.4S
- mov v4.S[1], v5.S[0]
- fadd v4.2S, v4.2S, v3.2S
- st1 {v4.2S}, [x0], #8
+1: ld1 {v2.4s}, [x1], #16
+ ld1 {v3.2s}, [x1]
+ fmul v4.4s, v2.4s, v1.4s
+ fmul v5.4s, v2.4s, v0.4s
+ faddp v4.4s, v4.4s, v4.4s
+ faddp v5.4s, v5.4s, v5.4s
+ faddp v4.4s, v4.4s, v4.4s
+ faddp v5.4s, v5.4s, v5.4s
+ mov v4.s[1], v5.s[0]
+ fadd v4.2s, v4.2s, v3.2s
+ st1 {v4.2s}, [x0], #8
sub x1, x1, #8
subs x7, x7, #1
b.gt 1b
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
sxtw x4, w4
mov x5, #40*2*4
add x1, x1, x4, lsl #3
-1: ld1 {v0.2S}, [x1], x5
- ld1 {v1.S}[0], [x2], #4
- fmul v2.4S, v0.4S, v1.S[0]
- st1 {v2.2S}, [x0], #8
+1: ld1 {v0.2s}, [x1], x5
+ ld1 {v1.s}[0], [x2], #4
+ fmul v2.4s, v0.4s, v1.s[0]
+ st1 {v2.2s}, [x0], #8
subs x3, x3, #1
b.gt 1b
ret
@@ -227,46 +227,46 @@ endfunc
function ff_sbr_autocorrelate_neon, export=1
mov x2, #38
movrel x3, factors
- ld1 {v0.4S}, [x3]
- movi v1.4S, #0
- movi v2.4S, #0
- movi v3.4S, #0
- ld1 {v4.2S}, [x0], #8
- ld1 {v5.2S}, [x0], #8
- fmul v16.2S, v4.2S, v4.2S
- fmul v17.2S, v5.2S, v4.S[0]
- fmul v18.2S, v5.2S, v4.S[1]
-1: ld1 {v5.D}[1], [x0], #8
- fmla v1.2S, v4.2S, v4.2S
- fmla v2.4S, v5.4S, v4.S[0]
- fmla v3.4S, v5.4S, v4.S[1]
- mov v4.D[0], v5.D[0]
- mov v5.D[0], v5.D[1]
+ ld1 {v0.4s}, [x3]
+ movi v1.4s, #0
+ movi v2.4s, #0
+ movi v3.4s, #0
+ ld1 {v4.2s}, [x0], #8
+ ld1 {v5.2s}, [x0], #8
+ fmul v16.2s, v4.2s, v4.2s
+ fmul v17.2s, v5.2s, v4.s[0]
+ fmul v18.2s, v5.2s, v4.s[1]
+1: ld1 {v5.d}[1], [x0], #8
+ fmla v1.2s, v4.2s, v4.2s
+ fmla v2.4s, v5.4s, v4.s[0]
+ fmla v3.4s, v5.4s, v4.s[1]
+ mov v4.d[0], v5.d[0]
+ mov v5.d[0], v5.d[1]
subs x2, x2, #1
b.gt 1b
- fmul v19.2S, v4.2S, v4.2S
- fmul v20.2S, v5.2S, v4.S[0]
- fmul v21.2S, v5.2S, v4.S[1]
- fadd v22.4S, v2.4S, v20.4S
- fsub v22.4S, v22.4S, v17.4S
- fadd v23.4S, v3.4S, v21.4S
- fsub v23.4S, v23.4S, v18.4S
- rev64 v23.4S, v23.4S
- fmul v23.4S, v23.4S, v0.4S
- fadd v22.4S, v22.4S, v23.4S
- st1 {v22.4S}, [x1], #16
- fadd v23.2S, v1.2S, v19.2S
- fsub v23.2S, v23.2S, v16.2S
- faddp v23.2S, v23.2S, v23.2S
- st1 {v23.S}[0], [x1]
+ fmul v19.2s, v4.2s, v4.2s
+ fmul v20.2s, v5.2s, v4.s[0]
+ fmul v21.2s, v5.2s, v4.s[1]
+ fadd v22.4s, v2.4s, v20.4s
+ fsub v22.4s, v22.4s, v17.4s
+ fadd v23.4s, v3.4s, v21.4s
+ fsub v23.4s, v23.4s, v18.4s
+ rev64 v23.4s, v23.4s
+ fmul v23.4s, v23.4s, v0.4s
+ fadd v22.4s, v22.4s, v23.4s
+ st1 {v22.4s}, [x1], #16
+ fadd v23.2s, v1.2s, v19.2s
+ fsub v23.2s, v23.2s, v16.2s
+ faddp v23.2s, v23.2s, v23.2s
+ st1 {v23.s}[0], [x1]
add x1, x1, #8
- rev64 v3.2S, v3.2S
- fmul v3.2S, v3.2S, v0.2S
- fadd v2.2S, v2.2S, v3.2S
- st1 {v2.2S}, [x1]
+ rev64 v3.2s, v3.2s
+ fmul v3.2s, v3.2s, v0.2s
+ fadd v2.2s, v2.2s, v3.2s
+ st1 {v2.2s}, [x1]
add x1, x1, #16
- faddp v1.2S, v1.2S, v1.2S
- st1 {v1.S}[0], [x1]
+ faddp v1.2s, v1.2s, v1.2s
+ st1 {v1.s}[0], [x1]
ret
endfunc
@@ -278,25 +278,25 @@ endfunc
1: and x3, x3, #0x1ff
add x8, x7, x3, lsl #3
add x3, x3, #2
- ld1 {v2.4S}, [x0]
- ld1 {v3.2S}, [x1], #8
- ld1 {v4.2S}, [x2], #8
- ld1 {v5.4S}, [x8]
- mov v6.16B, v2.16B
- zip1 v3.4S, v3.4S, v3.4S
- zip1 v4.4S, v4.4S, v4.4S
- fmla v6.4S, v1.4S, v3.4S
- fmla v2.4S, v5.4S, v4.4S
- fcmeq v7.4S, v3.4S, #0
- bif v2.16B, v6.16B, v7.16B
- st1 {v2.4S}, [x0], #16
+ ld1 {v2.4s}, [x0]
+ ld1 {v3.2s}, [x1], #8
+ ld1 {v4.2s}, [x2], #8
+ ld1 {v5.4s}, [x8]
+ mov v6.16b, v2.16b
+ zip1 v3.4s, v3.4s, v3.4s
+ zip1 v4.4s, v4.4s, v4.4s
+ fmla v6.4s, v1.4s, v3.4s
+ fmla v2.4s, v5.4s, v4.4s
+ fcmeq v7.4s, v3.4s, #0
+ bif v2.16b, v6.16b, v7.16b
+ st1 {v2.4s}, [x0], #16
subs x5, x5, #2
b.gt 1b
.endm
function ff_sbr_hf_apply_noise_0_neon, export=1
movrel x9, phi_noise_0
- ld1 {v1.4S}, [x9]
+ ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
movrel x9, phi_noise_1
and x4, x4, #1
add x9, x9, x4, lsl #4
- ld1 {v1.4S}, [x9]
+ ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
function ff_sbr_hf_apply_noise_2_neon, export=1
movrel x9, phi_noise_2
- ld1 {v1.4S}, [x9]
+ ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
movrel x9, phi_noise_3
and x4, x4, #1
add x9, x9, x4, lsl #4
- ld1 {v1.4S}, [x9]
+ ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
index 210182ff21..a4438e9922 100644
--- a/libavcodec/aarch64/simple_idct_neon.S
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -54,7 +54,7 @@ endconst
prfm pldl1keep, [\data]
mov x10, x30
movrel x3, idct_coeff_neon
- ld1 {v0.2D}, [x3]
+ ld1 {v0.2d}, [x3]
.endm
.macro idct_end
@@ -74,146 +74,146 @@ endconst
.endm
.macro idct_col4_top y1, y2, y3, y4, i, l
- smull\i v7.4S, \y3\l, z2
- smull\i v16.4S, \y3\l, z6
- smull\i v17.4S, \y2\l, z1
- add v19.4S, v23.4S, v7.4S
- smull\i v18.4S, \y2\l, z3
- add v20.4S, v23.4S, v16.4S
- smull\i v5.4S, \y2\l, z5
- sub v21.4S, v23.4S, v16.4S
- smull\i v6.4S, \y2\l, z7
- sub v22.4S, v23.4S, v7.4S
-
- smlal\i v17.4S, \y4\l, z3
- smlsl\i v18.4S, \y4\l, z7
- smlsl\i v5.4S, \y4\l, z1
- smlsl\i v6.4S, \y4\l, z5
+ smull\i v7.4s, \y3\l, z2
+ smull\i v16.4s, \y3\l, z6
+ smull\i v17.4s, \y2\l, z1
+ add v19.4s, v23.4s, v7.4s
+ smull\i v18.4s, \y2\l, z3
+ add v20.4s, v23.4s, v16.4s
+ smull\i v5.4s, \y2\l, z5
+ sub v21.4s, v23.4s, v16.4s
+ smull\i v6.4s, \y2\l, z7
+ sub v22.4s, v23.4s, v7.4s
+
+ smlal\i v17.4s, \y4\l, z3
+ smlsl\i v18.4s, \y4\l, z7
+ smlsl\i v5.4s, \y4\l, z1
+ smlsl\i v6.4s, \y4\l, z5
.endm
.macro idct_row4_neon y1, y2, y3, y4, pass
- ld1 {\y1\().2D,\y2\().2D}, [x2], #32
- movi v23.4S, #1<<2, lsl #8
- orr v5.16B, \y1\().16B, \y2\().16B
- ld1 {\y3\().2D,\y4\().2D}, [x2], #32
- orr v6.16B, \y3\().16B, \y4\().16B
- orr v5.16B, v5.16B, v6.16B
- mov x3, v5.D[1]
- smlal v23.4S, \y1\().4H, z4
+ ld1 {\y1\().2d,\y2\().2d}, [x2], #32
+ movi v23.4s, #1<<2, lsl #8
+ orr v5.16b, \y1\().16b, \y2\().16b
+ ld1 {\y3\().2d,\y4\().2d}, [x2], #32
+ orr v6.16b, \y3\().16b, \y4\().16b
+ orr v5.16b, v5.16b, v6.16b
+ mov x3, v5.d[1]
+ smlal v23.4s, \y1\().4h, z4
- idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
+ idct_col4_top \y1, \y2, \y3, \y4, 1, .4h
cmp x3, #0
b.eq \pass\()f
- smull2 v7.4S, \y1\().8H, z4
- smlal2 v17.4S, \y2\().8H, z5
- smlsl2 v18.4S, \y2\().8H, z1
- smull2 v16.4S, \y3\().8H, z2
- smlal2 v5.4S, \y2\().8H, z7
- add v19.4S, v19.4S, v7.4S
- sub v20.4S, v20.4S, v7.4S
- sub v21.4S, v21.4S, v7.4S
- add v22.4S, v22.4S, v7.4S
- smlal2 v6.4S, \y2\().8H, z3
- smull2 v7.4S, \y3\().8H, z6
- smlal2 v17.4S, \y4\().8H, z7
- smlsl2 v18.4S, \y4\().8H, z5
- smlal2 v5.4S, \y4\().8H, z3
- smlsl2 v6.4S, \y4\().8H, z1
- add v19.4S, v19.4S, v7.4S
- sub v20.4S, v20.4S, v16.4S
- add v21.4S, v21.4S, v16.4S
- sub v22.4S, v22.4S, v7.4S
+ smull2 v7.4s, \y1\().8h, z4
+ smlal2 v17.4s, \y2\().8h, z5
+ smlsl2 v18.4s, \y2\().8h, z1
+ smull2 v16.4s, \y3\().8h, z2
+ smlal2 v5.4s, \y2\().8h, z7
+ add v19.4s, v19.4s, v7.4s
+ sub v20.4s, v20.4s, v7.4s
+ sub v21.4s, v21.4s, v7.4s
+ add v22.4s, v22.4s, v7.4s
+ smlal2 v6.4s, \y2\().8h, z3
+ smull2 v7.4s, \y3\().8h, z6
+ smlal2 v17.4s, \y4\().8h, z7
+ smlsl2 v18.4s, \y4\().8h, z5
+ smlal2 v5.4s, \y4\().8h, z3
+ smlsl2 v6.4s, \y4\().8h, z1
+ add v19.4s, v19.4s, v7.4s
+ sub v20.4s, v20.4s, v16.4s
+ add v21.4s, v21.4s, v16.4s
+ sub v22.4s, v22.4s, v7.4s
-\pass: add \y3\().4S, v19.4S, v17.4S
+\pass: add \y3\().4s, v19.4s, v17.4s
- add \y4\().4S, v20.4S, v18.4S
- shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
- shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
- add v7.4S, v21.4S, v5.4S
- add v16.4S, v22.4S, v6.4S
- shrn \y3\().4H, v7.4S, #ROW_SHIFT
- shrn \y4\().4H, v16.4S, #ROW_SHIFT
- sub v22.4S, v22.4S, v6.4S
- sub v19.4S, v19.4S, v17.4S
- sub v21.4S, v21.4S, v5.4S
- shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
- sub v20.4S, v20.4S, v18.4S
- shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
- shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
- shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
-
- trn1 v16.8H, \y1\().8H, \y2\().8H
- trn2 v17.8H, \y1\().8H, \y2\().8H
- trn1 v18.8H, \y3\().8H, \y4\().8H
- trn2 v19.8H, \y3\().8H, \y4\().8H
- trn1 \y1\().4S, v16.4S, v18.4S
- trn1 \y2\().4S, v17.4S, v19.4S
- trn2 \y3\().4S, v16.4S, v18.4S
- trn2 \y4\().4S, v17.4S, v19.4S
+ add \y4\().4s, v20.4s, v18.4s
+ shrn \y1\().4h, \y3\().4s, #ROW_SHIFT
+ shrn \y2\().4h, \y4\().4s, #ROW_SHIFT
+ add v7.4s, v21.4s, v5.4s
+ add v16.4s, v22.4s, v6.4s
+ shrn \y3\().4h, v7.4s, #ROW_SHIFT
+ shrn \y4\().4h, v16.4s, #ROW_SHIFT
+ sub v22.4s, v22.4s, v6.4s
+ sub v19.4s, v19.4s, v17.4s
+ sub v21.4s, v21.4s, v5.4s
+ shrn2 \y1\().8h, v22.4s, #ROW_SHIFT
+ sub v20.4s, v20.4s, v18.4s
+ shrn2 \y2\().8h, v21.4s, #ROW_SHIFT
+ shrn2 \y3\().8h, v20.4s, #ROW_SHIFT
+ shrn2 \y4\().8h, v19.4s, #ROW_SHIFT
+
+ trn1 v16.8h, \y1\().8h, \y2\().8h
+ trn2 v17.8h, \y1\().8h, \y2\().8h
+ trn1 v18.8h, \y3\().8h, \y4\().8h
+ trn2 v19.8h, \y3\().8h, \y4\().8h
+ trn1 \y1\().4s, v16.4s, v18.4s
+ trn1 \y2\().4s, v17.4s, v19.4s
+ trn2 \y3\().4s, v16.4s, v18.4s
+ trn2 \y4\().4s, v17.4s, v19.4s
.endm
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
- dup v23.4H, z4c
+ dup v23.4h, z4c
.if \i == 1
- add v23.4H, v23.4H, v24.4H
+ add v23.4h, v23.4h, v24.4h
.else
- mov v5.D[0], v24.D[1]
- add v23.4H, v23.4H, v5.4H
+ mov v5.d[0], v24.d[1]
+ add v23.4h, v23.4h, v5.4h
.endif
- smull v23.4S, v23.4H, z4
+ smull v23.4s, v23.4h, z4
idct_col4_top v24, v25, v26, v27, \i, \l
- mov x4, v28.D[\i - 1]
- mov x5, v29.D[\i - 1]
+ mov x4, v28.d[\i - 1]
+ mov x5, v29.d[\i - 1]
cmp x4, #0
b.eq 1f
- smull\i v7.4S, v28\l, z4
- add v19.4S, v19.4S, v7.4S
- sub v20.4S, v20.4S, v7.4S
- sub v21.4S, v21.4S, v7.4S
- add v22.4S, v22.4S, v7.4S
+ smull\i v7.4s, v28\l, z4
+ add v19.4s, v19.4s, v7.4s
+ sub v20.4s, v20.4s, v7.4s
+ sub v21.4s, v21.4s, v7.4s
+ add v22.4s, v22.4s, v7.4s
-1: mov x4, v30.D[\i - 1]
+1: mov x4, v30.d[\i - 1]
cmp x5, #0
b.eq 2f
- smlal\i v17.4S, v29\l, z5
- smlsl\i v18.4S, v29\l, z1
- smlal\i v5.4S, v29\l, z7
- smlal\i v6.4S, v29\l, z3
+ smlal\i v17.4s, v29\l, z5
+ smlsl\i v18.4s, v29\l, z1
+ smlal\i v5.4s, v29\l, z7
+ smlal\i v6.4s, v29\l, z3
-2: mov x5, v31.D[\i - 1]
+2: mov x5, v31.d[\i - 1]
cmp x4, #0
b.eq 3f
- smull\i v7.4S, v30\l, z6
- smull\i v16.4S, v30\l, z2
- add v19.4S, v19.4S, v7.4S
- sub v22.4S, v22.4S, v7.4S
- sub v20.4S, v20.4S, v16.4S
- add v21.4S, v21.4S, v16.4S
+ smull\i v7.4s, v30\l, z6
+ smull\i v16.4s, v30\l, z2
+ add v19.4s, v19.4s, v7.4s
+ sub v22.4s, v22.4s, v7.4s
+ sub v20.4s, v20.4s, v16.4s
+ add v21.4s, v21.4s, v16.4s
3: cmp x5, #0
b.eq 4f
- smlal\i v17.4S, v31\l, z7
- smlsl\i v18.4S, v31\l, z5
- smlal\i v5.4S, v31\l, z3
- smlsl\i v6.4S, v31\l, z1
+ smlal\i v17.4s, v31\l, z7
+ smlsl\i v18.4s, v31\l, z5
+ smlal\i v5.4s, v31\l, z3
+ smlsl\i v6.4s, v31\l, z1
-4: addhn v7.4H, v19.4S, v17.4S
- addhn2 v7.8H, v20.4S, v18.4S
- subhn v18.4H, v20.4S, v18.4S
- subhn2 v18.8H, v19.4S, v17.4S
+4: addhn v7.4h, v19.4s, v17.4s
+ addhn2 v7.8h, v20.4s, v18.4s
+ subhn v18.4h, v20.4s, v18.4s
+ subhn2 v18.8h, v19.4s, v17.4s
- addhn v16.4H, v21.4S, v5.4S
- addhn2 v16.8H, v22.4S, v6.4S
- subhn v17.4H, v22.4S, v6.4S
- subhn2 v17.8H, v21.4S, v5.4S
+ addhn v16.4h, v21.4s, v5.4s
+ addhn2 v16.8h, v22.4s, v6.4s
+ subhn v17.4h, v22.4s, v6.4s
+ subhn2 v17.8h, v21.4s, v5.4s
ret
endfunc
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
- sqshrun v1.8B, v7.8H, #COL_SHIFT-16
- sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
- sqshrun v3.8B, v17.8H, #COL_SHIFT-16
- sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
+ sqshrun v1.8b, v7.8h, #COL_SHIFT-16
+ sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16
+ sqshrun v3.8b, v17.8h, #COL_SHIFT-16
+ sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
- sqshrun v2.8B, v7.8H, #COL_SHIFT-16
- sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
- sqshrun v4.8B, v17.8H, #COL_SHIFT-16
- sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
+ sqshrun v2.8b, v7.8h, #COL_SHIFT-16
+ sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16
+ sqshrun v4.8b, v17.8h, #COL_SHIFT-16
+ sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16
- zip1 v16.4S, v1.4S, v2.4S
- zip2 v17.4S, v1.4S, v2.4S
+ zip1 v16.4s, v1.4s, v2.4s
+ zip2 v17.4s, v1.4s, v2.4s
- st1 {v16.D}[0], [x0], x1
- st1 {v16.D}[1], [x0], x1
+ st1 {v16.d}[0], [x0], x1
+ st1 {v16.d}[1], [x0], x1
- zip1 v18.4S, v3.4S, v4.4S
- zip2 v19.4S, v3.4S, v4.4S
+ zip1 v18.4s, v3.4s, v4.4s
+ zip2 v19.4s, v3.4s, v4.4s
- st1 {v17.D}[0], [x0], x1
- st1 {v17.D}[1], [x0], x1
- st1 {v18.D}[0], [x0], x1
- st1 {v18.D}[1], [x0], x1
- st1 {v19.D}[0], [x0], x1
- st1 {v19.D}[1], [x0], x1
+ st1 {v17.d}[0], [x0], x1
+ st1 {v17.d}[1], [x0], x1
+ st1 {v18.d}[0], [x0], x1
+ st1 {v18.d}[1], [x0], x1
+ st1 {v19.d}[0], [x0], x1
+ st1 {v19.d}[1], [x0], x1
idct_end
endfunc
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
- sshr v1.8H, v7.8H, #COL_SHIFT-16
- sshr v2.8H, v16.8H, #COL_SHIFT-16
- sshr v3.8H, v17.8H, #COL_SHIFT-16
- sshr v4.8H, v18.8H, #COL_SHIFT-16
+ sshr v1.8h, v7.8h, #COL_SHIFT-16
+ sshr v2.8h, v16.8h, #COL_SHIFT-16
+ sshr v3.8h, v17.8h, #COL_SHIFT-16
+ sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
- sshr v7.8H, v7.8H, #COL_SHIFT-16
- sshr v16.8H, v16.8H, #COL_SHIFT-16
- sshr v17.8H, v17.8H, #COL_SHIFT-16
- sshr v18.8H, v18.8H, #COL_SHIFT-16
+ sshr v7.8h, v7.8h, #COL_SHIFT-16
+ sshr v16.8h, v16.8h, #COL_SHIFT-16
+ sshr v17.8h, v17.8h, #COL_SHIFT-16
+ sshr v18.8h, v18.8h, #COL_SHIFT-16
mov x9, x0
- ld1 {v19.D}[0], [x0], x1
- zip1 v23.2D, v1.2D, v7.2D
- zip2 v24.2D, v1.2D, v7.2D
- ld1 {v19.D}[1], [x0], x1
- zip1 v25.2D, v2.2D, v16.2D
- zip2 v26.2D, v2.2D, v16.2D
- ld1 {v20.D}[0], [x0], x1
- zip1 v27.2D, v3.2D, v17.2D
- zip2 v28.2D, v3.2D, v17.2D
- ld1 {v20.D}[1], [x0], x1
- zip1 v29.2D, v4.2D, v18.2D
- zip2 v30.2D, v4.2D, v18.2D
- ld1 {v21.D}[0], [x0], x1
- uaddw v23.8H, v23.8H, v19.8B
- uaddw2 v24.8H, v24.8H, v19.16B
- ld1 {v21.D}[1], [x0], x1
- sqxtun v23.8B, v23.8H
- sqxtun2 v23.16B, v24.8H
- ld1 {v22.D}[0], [x0], x1
- uaddw v24.8H, v25.8H, v20.8B
- uaddw2 v25.8H, v26.8H, v20.16B
- ld1 {v22.D}[1], [x0], x1
- sqxtun v24.8B, v24.8H
- sqxtun2 v24.16B, v25.8H
- st1 {v23.D}[0], [x9], x1
- uaddw v25.8H, v27.8H, v21.8B
- uaddw2 v26.8H, v28.8H, v21.16B
- st1 {v23.D}[1], [x9], x1
- sqxtun v25.8B, v25.8H
- sqxtun2 v25.16B, v26.8H
- st1 {v24.D}[0], [x9], x1
- uaddw v26.8H, v29.8H, v22.8B
- uaddw2 v27.8H, v30.8H, v22.16B
- st1 {v24.D}[1], [x9], x1
- sqxtun v26.8B, v26.8H
- sqxtun2 v26.16B, v27.8H
- st1 {v25.D}[0], [x9], x1
- st1 {v25.D}[1], [x9], x1
- st1 {v26.D}[0], [x9], x1
- st1 {v26.D}[1], [x9], x1
+ ld1 {v19.d}[0], [x0], x1
+ zip1 v23.2d, v1.2d, v7.2d
+ zip2 v24.2d, v1.2d, v7.2d
+ ld1 {v19.d}[1], [x0], x1
+ zip1 v25.2d, v2.2d, v16.2d
+ zip2 v26.2d, v2.2d, v16.2d
+ ld1 {v20.d}[0], [x0], x1
+ zip1 v27.2d, v3.2d, v17.2d
+ zip2 v28.2d, v3.2d, v17.2d
+ ld1 {v20.d}[1], [x0], x1
+ zip1 v29.2d, v4.2d, v18.2d
+ zip2 v30.2d, v4.2d, v18.2d
+ ld1 {v21.d}[0], [x0], x1
+ uaddw v23.8h, v23.8h, v19.8b
+ uaddw2 v24.8h, v24.8h, v19.16b
+ ld1 {v21.d}[1], [x0], x1
+ sqxtun v23.8b, v23.8h
+ sqxtun2 v23.16b, v24.8h
+ ld1 {v22.d}[0], [x0], x1
+ uaddw v24.8h, v25.8h, v20.8b
+ uaddw2 v25.8h, v26.8h, v20.16b
+ ld1 {v22.d}[1], [x0], x1
+ sqxtun v24.8b, v24.8h
+ sqxtun2 v24.16b, v25.8h
+ st1 {v23.d}[0], [x9], x1
+ uaddw v25.8h, v27.8h, v21.8b
+ uaddw2 v26.8h, v28.8h, v21.16b
+ st1 {v23.d}[1], [x9], x1
+ sqxtun v25.8b, v25.8h
+ sqxtun2 v25.16b, v26.8h
+ st1 {v24.d}[0], [x9], x1
+ uaddw v26.8h, v29.8h, v22.8b
+ uaddw2 v27.8h, v30.8h, v22.16b
+ st1 {v24.d}[1], [x9], x1
+ sqxtun v26.8b, v26.8h
+ sqxtun2 v26.16b, v27.8h
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x9], x1
+ st1 {v26.d}[0], [x9], x1
+ st1 {v26.d}[1], [x9], x1
idct_end
endfunc
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
sub x2, x2, #128
bl idct_col4_neon1
- sshr v1.8H, v7.8H, #COL_SHIFT-16
- sshr v2.8H, v16.8H, #COL_SHIFT-16
- sshr v3.8H, v17.8H, #COL_SHIFT-16
- sshr v4.8H, v18.8H, #COL_SHIFT-16
+ sshr v1.8h, v7.8h, #COL_SHIFT-16
+ sshr v2.8h, v16.8h, #COL_SHIFT-16
+ sshr v3.8h, v17.8h, #COL_SHIFT-16
+ sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
- sshr v7.8H, v7.8H, #COL_SHIFT-16
- sshr v16.8H, v16.8H, #COL_SHIFT-16
- sshr v17.8H, v17.8H, #COL_SHIFT-16
- sshr v18.8H, v18.8H, #COL_SHIFT-16
-
- zip1 v23.2D, v1.2D, v7.2D
- zip2 v24.2D, v1.2D, v7.2D
- st1 {v23.2D,v24.2D}, [x2], #32
- zip1 v25.2D, v2.2D, v16.2D
- zip2 v26.2D, v2.2D, v16.2D
- st1 {v25.2D,v26.2D}, [x2], #32
- zip1 v27.2D, v3.2D, v17.2D
- zip2 v28.2D, v3.2D, v17.2D
- st1 {v27.2D,v28.2D}, [x2], #32
- zip1 v29.2D, v4.2D, v18.2D
- zip2 v30.2D, v4.2D, v18.2D
- st1 {v29.2D,v30.2D}, [x2], #32
+ sshr v7.8h, v7.8h, #COL_SHIFT-16
+ sshr v16.8h, v16.8h, #COL_SHIFT-16
+ sshr v17.8h, v17.8h, #COL_SHIFT-16
+ sshr v18.8h, v18.8h, #COL_SHIFT-16
+
+ zip1 v23.2d, v1.2d, v7.2d
+ zip2 v24.2d, v1.2d, v7.2d
+ st1 {v23.2d,v24.2d}, [x2], #32
+ zip1 v25.2d, v2.2d, v16.2d
+ zip2 v26.2d, v2.2d, v16.2d
+ st1 {v25.2d,v26.2d}, [x2], #32
+ zip1 v27.2d, v3.2d, v17.2d
+ zip2 v28.2d, v3.2d, v17.2d
+ st1 {v27.2d,v28.2d}, [x2], #32
+ zip1 v29.2d, v4.2d, v18.2d
+ zip2 v30.2d, v4.2d, v18.2d
+ st1 {v29.2d,v30.2d}, [x2], #32
idct_end
endfunc