diff options
author | Martin Storsjö <martin@martin.st> | 2023-10-17 14:16:24 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2023-10-21 23:25:18 +0300 |
commit | 184103b3105f02f1189fa0047af4269e027dfbd6 (patch) | |
tree | 3e50ad549ed68292f91594c4e6fb26551de90369 /libavcodec/aarch64 | |
parent | 393d1ee541b143633bfba2ff0e821d734fd511c2 (diff) | |
download | ffmpeg-184103b3105f02f1189fa0047af4269e027dfbd6.tar.gz |
aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r-- | libavcodec/aarch64/aacpsdsp_neon.S | 194 |
-rw-r--r-- | libavcodec/aarch64/h264cmc_neon.S | 406 |
-rw-r--r-- | libavcodec/aarch64/h264dsp_neon.S | 594 |
-rw-r--r-- | libavcodec/aarch64/h264idct_neon.S | 390 |
-rw-r--r-- | libavcodec/aarch64/h264qpel_neon.S | 556 |
-rw-r--r-- | libavcodec/aarch64/hpeldsp_neon.S | 362 |
-rw-r--r-- | libavcodec/aarch64/me_cmp_neon.S | 2 |
-rw-r--r-- | libavcodec/aarch64/neon.S | 246 |
-rw-r--r-- | libavcodec/aarch64/sbrdsp_neon.S | 294 |
-rw-r--r-- | libavcodec/aarch64/simple_idct_neon.S | 386 |
10 files changed, 1715 insertions, 1715 deletions
diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S index ff4e6e244a..686c62eb2e 100644 --- a/libavcodec/aarch64/aacpsdsp_neon.S +++ b/libavcodec/aarch64/aacpsdsp_neon.S @@ -19,82 +19,82 @@ #include "libavutil/aarch64/asm.S" function ff_ps_add_squares_neon, export=1 -1: ld1 {v0.4S,v1.4S}, [x1], #32 - fmul v0.4S, v0.4S, v0.4S - fmul v1.4S, v1.4S, v1.4S - faddp v2.4S, v0.4S, v1.4S - ld1 {v3.4S}, [x0] - fadd v3.4S, v3.4S, v2.4S - st1 {v3.4S}, [x0], #16 +1: ld1 {v0.4s,v1.4s}, [x1], #32 + fmul v0.4s, v0.4s, v0.4s + fmul v1.4s, v1.4s, v1.4s + faddp v2.4s, v0.4s, v1.4s + ld1 {v3.4s}, [x0] + fadd v3.4s, v3.4s, v2.4s + st1 {v3.4s}, [x0], #16 subs w2, w2, #4 b.gt 1b ret endfunc function ff_ps_mul_pair_single_neon, export=1 -1: ld1 {v0.4S,v1.4S}, [x1], #32 - ld1 {v2.4S}, [x2], #16 - zip1 v3.4S, v2.4S, v2.4S - zip2 v4.4S, v2.4S, v2.4S - fmul v0.4S, v0.4S, v3.4S - fmul v1.4S, v1.4S, v4.4S - st1 {v0.4S,v1.4S}, [x0], #32 +1: ld1 {v0.4s,v1.4s}, [x1], #32 + ld1 {v2.4s}, [x2], #16 + zip1 v3.4s, v2.4s, v2.4s + zip2 v4.4s, v2.4s, v2.4s + fmul v0.4s, v0.4s, v3.4s + fmul v1.4s, v1.4s, v4.4s + st1 {v0.4s,v1.4s}, [x0], #32 subs w3, w3, #4 b.gt 1b ret endfunc function ff_ps_stereo_interpolate_neon, export=1 - ld1 {v0.4S}, [x2] - ld1 {v1.4S}, [x3] - zip1 v4.4S, v0.4S, v0.4S - zip2 v5.4S, v0.4S, v0.4S - zip1 v6.4S, v1.4S, v1.4S - zip2 v7.4S, v1.4S, v1.4S -1: ld1 {v2.2S}, [x0] - ld1 {v3.2S}, [x1] - fadd v4.4S, v4.4S, v6.4S - fadd v5.4S, v5.4S, v7.4S - mov v2.D[1], v2.D[0] - mov v3.D[1], v3.D[0] - fmul v2.4S, v2.4S, v4.4S - fmla v2.4S, v3.4S, v5.4S - st1 {v2.D}[0], [x0], #8 - st1 {v2.D}[1], [x1], #8 + ld1 {v0.4s}, [x2] + ld1 {v1.4s}, [x3] + zip1 v4.4s, v0.4s, v0.4s + zip2 v5.4s, v0.4s, v0.4s + zip1 v6.4s, v1.4s, v1.4s + zip2 v7.4s, v1.4s, v1.4s +1: ld1 {v2.2s}, [x0] + ld1 {v3.2s}, [x1] + fadd v4.4s, v4.4s, v6.4s + fadd v5.4s, v5.4s, v7.4s + mov v2.d[1], v2.d[0] + mov v3.d[1], v3.d[0] + fmul v2.4s, v2.4s, v4.4s + fmla v2.4s, v3.4s, v5.4s + st1 {v2.d}[0], [x0], 
#8 + st1 {v2.d}[1], [x1], #8 subs w4, w4, #1 b.gt 1b ret endfunc function ff_ps_stereo_interpolate_ipdopd_neon, export=1 - ld1 {v0.4S,v1.4S}, [x2] - ld1 {v6.4S,v7.4S}, [x3] - fneg v2.4S, v1.4S - fneg v3.4S, v7.4S - zip1 v16.4S, v0.4S, v0.4S - zip2 v17.4S, v0.4S, v0.4S - zip1 v18.4S, v2.4S, v1.4S - zip2 v19.4S, v2.4S, v1.4S - zip1 v20.4S, v6.4S, v6.4S - zip2 v21.4S, v6.4S, v6.4S - zip1 v22.4S, v3.4S, v7.4S - zip2 v23.4S, v3.4S, v7.4S -1: ld1 {v2.2S}, [x0] - ld1 {v3.2S}, [x1] - fadd v16.4S, v16.4S, v20.4S - fadd v17.4S, v17.4S, v21.4S - mov v2.D[1], v2.D[0] - mov v3.D[1], v3.D[0] - fmul v4.4S, v2.4S, v16.4S - fmla v4.4S, v3.4S, v17.4S - fadd v18.4S, v18.4S, v22.4S - fadd v19.4S, v19.4S, v23.4S - ext v2.16B, v2.16B, v2.16B, #4 - ext v3.16B, v3.16B, v3.16B, #4 - fmla v4.4S, v2.4S, v18.4S - fmla v4.4S, v3.4S, v19.4S - st1 {v4.D}[0], [x0], #8 - st1 {v4.D}[1], [x1], #8 + ld1 {v0.4s,v1.4s}, [x2] + ld1 {v6.4s,v7.4s}, [x3] + fneg v2.4s, v1.4s + fneg v3.4s, v7.4s + zip1 v16.4s, v0.4s, v0.4s + zip2 v17.4s, v0.4s, v0.4s + zip1 v18.4s, v2.4s, v1.4s + zip2 v19.4s, v2.4s, v1.4s + zip1 v20.4s, v6.4s, v6.4s + zip2 v21.4s, v6.4s, v6.4s + zip1 v22.4s, v3.4s, v7.4s + zip2 v23.4s, v3.4s, v7.4s +1: ld1 {v2.2s}, [x0] + ld1 {v3.2s}, [x1] + fadd v16.4s, v16.4s, v20.4s + fadd v17.4s, v17.4s, v21.4s + mov v2.d[1], v2.d[0] + mov v3.d[1], v3.d[0] + fmul v4.4s, v2.4s, v16.4s + fmla v4.4s, v3.4s, v17.4s + fadd v18.4s, v18.4s, v22.4s + fadd v19.4s, v19.4s, v23.4s + ext v2.16b, v2.16b, v2.16b, #4 + ext v3.16b, v3.16b, v3.16b, #4 + fmla v4.4s, v2.4s, v18.4s + fmla v4.4s, v3.4s, v19.4s + st1 {v4.d}[0], [x0], #8 + st1 {v4.d}[1], [x1], #8 subs w4, w4, #1 b.gt 1b ret @@ -102,46 +102,46 @@ endfunc function ff_ps_hybrid_analysis_neon, export=1 lsl x3, x3, #3 - ld2 {v0.4S,v1.4S}, [x1], #32 - ld2 {v2.2S,v3.2S}, [x1], #16 - ld1 {v24.2S}, [x1], #8 - ld2 {v4.2S,v5.2S}, [x1], #16 - ld2 {v6.4S,v7.4S}, [x1] - rev64 v6.4S, v6.4S - rev64 v7.4S, v7.4S - ext v6.16B, v6.16B, v6.16B, #8 - ext v7.16B, v7.16B, v7.16B, #8 
- rev64 v4.2S, v4.2S - rev64 v5.2S, v5.2S - mov v2.D[1], v3.D[0] - mov v4.D[1], v5.D[0] - mov v5.D[1], v2.D[0] - mov v3.D[1], v4.D[0] - fadd v16.4S, v0.4S, v6.4S - fadd v17.4S, v1.4S, v7.4S - fsub v18.4S, v1.4S, v7.4S - fsub v19.4S, v0.4S, v6.4S - fadd v22.4S, v2.4S, v4.4S - fsub v23.4S, v5.4S, v3.4S - trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5} - trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7} -1: ld2 {v2.4S,v3.4S}, [x2], #32 - ld2 {v4.2S,v5.2S}, [x2], #16 - ld1 {v6.2S}, [x2], #8 + ld2 {v0.4s,v1.4s}, [x1], #32 + ld2 {v2.2s,v3.2s}, [x1], #16 + ld1 {v24.2s}, [x1], #8 + ld2 {v4.2s,v5.2s}, [x1], #16 + ld2 {v6.4s,v7.4s}, [x1] + rev64 v6.4s, v6.4s + rev64 v7.4s, v7.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + rev64 v4.2s, v4.2s + rev64 v5.2s, v5.2s + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v5.d[1], v2.d[0] + mov v3.d[1], v4.d[0] + fadd v16.4s, v0.4s, v6.4s + fadd v17.4s, v1.4s, v7.4s + fsub v18.4s, v1.4s, v7.4s + fsub v19.4s, v0.4s, v6.4s + fadd v22.4s, v2.4s, v4.4s + fsub v23.4s, v5.4s, v3.4s + trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5} + trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7} +1: ld2 {v2.4s,v3.4s}, [x2], #32 + ld2 {v4.2s,v5.2s}, [x2], #16 + ld1 {v6.2s}, [x2], #8 add x2, x2, #8 - mov v4.D[1], v5.D[0] - mov v6.S[1], v6.S[0] - fmul v6.2S, v6.2S, v24.2S - fmul v0.4S, v2.4S, v16.4S - fmul v1.4S, v2.4S, v17.4S - fmls v0.4S, v3.4S, v18.4S - fmla v1.4S, v3.4S, v19.4S - fmla v0.4S, v4.4S, v20.4S - fmla v1.4S, v4.4S, v21.4S - faddp v0.4S, v0.4S, v1.4S - faddp v0.4S, v0.4S, v0.4S - fadd v0.2S, v0.2S, v6.2S - st1 {v0.2S}, [x0], x3 + mov v4.d[1], v5.d[0] + mov v6.s[1], v6.s[0] + fmul v6.2s, v6.2s, v24.2s + fmul v0.4s, v2.4s, v16.4s + fmul v1.4s, v2.4s, v17.4s + fmls v0.4s, v3.4s, v18.4s + fmla v1.4s, v3.4s, v19.4s + fmla v0.4s, v4.4s, v20.4s + fmla v1.4s, v4.4s, v21.4s + faddp v0.4s, v0.4s, v1.4s + faddp v0.4s, v0.4s, v0.4s + fadd v0.2s, v0.2s, 
v6.2s + st1 {v0.2s}, [x0], x3 subs w4, w4, #1 b.gt 1b ret diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S index 88ccd727d0..5b959b87d3 100644 --- a/libavcodec/aarch64/h264cmc_neon.S +++ b/libavcodec/aarch64/h264cmc_neon.S @@ -39,10 +39,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 lsl w10, w10, #1 add w9, w9, w10 add x6, x6, w9, UXTW - ld1r {v22.8H}, [x6] + ld1r {v22.8h}, [x6] .endif .ifc \codec,vc1 - movi v22.8H, #28 + movi v22.8h, #28 .endif mul w7, w4, w5 lsl w14, w5, #3 @@ -55,139 +55,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 add w4, w4, #64 b.eq 2f - dup v0.8B, w4 - dup v1.8B, w12 - ld1 {v4.8B, v5.8B}, [x1], x2 - dup v2.8B, w6 - dup v3.8B, w7 - ext v5.8B, v4.8B, v5.8B, #1 -1: ld1 {v6.8B, v7.8B}, [x1], x2 - umull v16.8H, v4.8B, v0.8B - umlal v16.8H, v5.8B, v1.8B - ext v7.8B, v6.8B, v7.8B, #1 - ld1 {v4.8B, v5.8B}, [x1], x2 - umlal v16.8H, v6.8B, v2.8B + dup v0.8b, w4 + dup v1.8b, w12 + ld1 {v4.8b, v5.8b}, [x1], x2 + dup v2.8b, w6 + dup v3.8b, w7 + ext v5.8b, v4.8b, v5.8b, #1 +1: ld1 {v6.8b, v7.8b}, [x1], x2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v5.8b, v1.8b + ext v7.8b, v6.8b, v7.8b, #1 + ld1 {v4.8b, v5.8b}, [x1], x2 + umlal v16.8h, v6.8b, v2.8b prfm pldl1strm, [x1] - ext v5.8B, v4.8B, v5.8B, #1 - umlal v16.8H, v7.8B, v3.8B - umull v17.8H, v6.8B, v0.8B + ext v5.8b, v4.8b, v5.8b, #1 + umlal v16.8h, v7.8b, v3.8b + umull v17.8h, v6.8b, v0.8b subs w3, w3, #2 - umlal v17.8H, v7.8B, v1.8B - umlal v17.8H, v4.8B, v2.8B - umlal v17.8H, v5.8B, v3.8B + umlal v17.8h, v7.8b, v1.8b + umlal v17.8h, v4.8b, v2.8b + umlal v17.8h, v5.8b, v3.8b prfm pldl1strm, [x1, x2] .ifc \codec,h264 - rshrn v16.8B, v16.8H, #6 - rshrn v17.8B, v17.8H, #6 + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 .else - add v16.8H, v16.8H, v22.8H - add v17.8H, v17.8H, v22.8H - shrn v16.8B, v16.8H, #6 - shrn v17.8B, v17.8H, #6 + add v16.8h, v16.8h, v22.8h + add v17.8h, v17.8h, v22.8h + shrn v16.8b, v16.8h, #6 + shrn v17.8b, 
v17.8h, #6 .endif .ifc \type,avg - ld1 {v20.8B}, [x8], x2 - ld1 {v21.8B}, [x8], x2 - urhadd v16.8B, v16.8B, v20.8B - urhadd v17.8B, v17.8B, v21.8B + ld1 {v20.8b}, [x8], x2 + ld1 {v21.8b}, [x8], x2 + urhadd v16.8b, v16.8b, v20.8b + urhadd v17.8b, v17.8b, v21.8b .endif - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 b.gt 1b ret 2: adds w12, w12, w6 - dup v0.8B, w4 + dup v0.8b, w4 b.eq 5f tst w6, w6 - dup v1.8B, w12 + dup v1.8b, w12 b.eq 4f - ld1 {v4.8B}, [x1], x2 -3: ld1 {v6.8B}, [x1], x2 - umull v16.8H, v4.8B, v0.8B - umlal v16.8H, v6.8B, v1.8B - ld1 {v4.8B}, [x1], x2 - umull v17.8H, v6.8B, v0.8B - umlal v17.8H, v4.8B, v1.8B + ld1 {v4.8b}, [x1], x2 +3: ld1 {v6.8b}, [x1], x2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v6.8b, v1.8b + ld1 {v4.8b}, [x1], x2 + umull v17.8h, v6.8b, v0.8b + umlal v17.8h, v4.8b, v1.8b prfm pldl1strm, [x1] .ifc \codec,h264 - rshrn v16.8B, v16.8H, #6 - rshrn v17.8B, v17.8H, #6 + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 .else - add v16.8H, v16.8H, v22.8H - add v17.8H, v17.8H, v22.8H - shrn v16.8B, v16.8H, #6 - shrn v17.8B, v17.8H, #6 + add v16.8h, v16.8h, v22.8h + add v17.8h, v17.8h, v22.8h + shrn v16.8b, v16.8h, #6 + shrn v17.8b, v17.8h, #6 .endif prfm pldl1strm, [x1, x2] .ifc \type,avg - ld1 {v20.8B}, [x8], x2 - ld1 {v21.8B}, [x8], x2 - urhadd v16.8B, v16.8B, v20.8B - urhadd v17.8B, v17.8B, v21.8B + ld1 {v20.8b}, [x8], x2 + ld1 {v21.8b}, [x8], x2 + urhadd v16.8b, v16.8b, v20.8b + urhadd v17.8b, v17.8b, v21.8b .endif subs w3, w3, #2 - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 b.gt 3b ret -4: ld1 {v4.8B, v5.8B}, [x1], x2 - ld1 {v6.8B, v7.8B}, [x1], x2 - ext v5.8B, v4.8B, v5.8B, #1 - ext v7.8B, v6.8B, v7.8B, #1 +4: ld1 {v4.8b, v5.8b}, [x1], x2 + ld1 {v6.8b, v7.8b}, [x1], x2 + ext v5.8b, v4.8b, v5.8b, #1 + ext v7.8b, v6.8b, v7.8b, #1 prfm pldl1strm, [x1] subs w3, w3, #2 - umull v16.8H, v4.8B, v0.8B - umlal v16.8H, v5.8B, v1.8B 
- umull v17.8H, v6.8B, v0.8B - umlal v17.8H, v7.8B, v1.8B + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v5.8b, v1.8b + umull v17.8h, v6.8b, v0.8b + umlal v17.8h, v7.8b, v1.8b prfm pldl1strm, [x1, x2] .ifc \codec,h264 - rshrn v16.8B, v16.8H, #6 - rshrn v17.8B, v17.8H, #6 + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 .else - add v16.8H, v16.8H, v22.8H - add v17.8H, v17.8H, v22.8H - shrn v16.8B, v16.8H, #6 - shrn v17.8B, v17.8H, #6 + add v16.8h, v16.8h, v22.8h + add v17.8h, v17.8h, v22.8h + shrn v16.8b, v16.8h, #6 + shrn v17.8b, v17.8h, #6 .endif .ifc \type,avg - ld1 {v20.8B}, [x8], x2 - ld1 {v21.8B}, [x8], x2 - urhadd v16.8B, v16.8B, v20.8B - urhadd v17.8B, v17.8B, v21.8B + ld1 {v20.8b}, [x8], x2 + ld1 {v21.8b}, [x8], x2 + urhadd v16.8b, v16.8b, v20.8b + urhadd v17.8b, v17.8b, v21.8b .endif - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 b.gt 4b ret -5: ld1 {v4.8B}, [x1], x2 - ld1 {v5.8B}, [x1], x2 +5: ld1 {v4.8b}, [x1], x2 + ld1 {v5.8b}, [x1], x2 prfm pldl1strm, [x1] subs w3, w3, #2 - umull v16.8H, v4.8B, v0.8B - umull v17.8H, v5.8B, v0.8B + umull v16.8h, v4.8b, v0.8b + umull v17.8h, v5.8b, v0.8b prfm pldl1strm, [x1, x2] .ifc \codec,h264 - rshrn v16.8B, v16.8H, #6 - rshrn v17.8B, v17.8H, #6 + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 .else - add v16.8H, v16.8H, v22.8H - add v17.8H, v17.8H, v22.8H - shrn v16.8B, v16.8H, #6 - shrn v17.8B, v17.8H, #6 + add v16.8h, v16.8h, v22.8h + add v17.8h, v17.8h, v22.8h + shrn v16.8b, v16.8h, #6 + shrn v17.8b, v17.8h, #6 .endif .ifc \type,avg - ld1 {v20.8B}, [x8], x2 - ld1 {v21.8B}, [x8], x2 - urhadd v16.8B, v16.8B, v20.8B - urhadd v17.8B, v17.8B, v21.8B + ld1 {v20.8b}, [x8], x2 + ld1 {v21.8b}, [x8], x2 + urhadd v16.8b, v16.8b, v20.8b + urhadd v17.8b, v17.8b, v21.8b .endif - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 b.gt 5b ret endfunc @@ -209,10 +209,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, 
export=1 lsl w10, w10, #1 add w9, w9, w10 add x6, x6, w9, UXTW - ld1r {v22.8H}, [x6] + ld1r {v22.8h}, [x6] .endif .ifc \codec,vc1 - movi v22.8H, #28 + movi v22.8h, #28 .endif mul w7, w4, w5 lsl w14, w5, #3 @@ -225,133 +225,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 add w4, w4, #64 b.eq 2f - dup v24.8B, w4 - dup v25.8B, w12 - ld1 {v4.8B}, [x1], x2 - dup v26.8B, w6 - dup v27.8B, w7 - ext v5.8B, v4.8B, v5.8B, #1 - trn1 v0.2S, v24.2S, v25.2S - trn1 v2.2S, v26.2S, v27.2S - trn1 v4.2S, v4.2S, v5.2S -1: ld1 {v6.8B}, [x1], x2 - ext v7.8B, v6.8B, v7.8B, #1 - trn1 v6.2S, v6.2S, v7.2S - umull v18.8H, v4.8B, v0.8B - umlal v18.8H, v6.8B, v2.8B - ld1 {v4.8B}, [x1], x2 - ext v5.8B, v4.8B, v5.8B, #1 - trn1 v4.2S, v4.2S, v5.2S + dup v24.8b, w4 + dup v25.8b, w12 + ld1 {v4.8b}, [x1], x2 + dup v26.8b, w6 + dup v27.8b, w7 + ext v5.8b, v4.8b, v5.8b, #1 + trn1 v0.2s, v24.2s, v25.2s + trn1 v2.2s, v26.2s, v27.2s + trn1 v4.2s, v4.2s, v5.2s +1: ld1 {v6.8b}, [x1], x2 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v6.2s, v6.2s, v7.2s + umull v18.8h, v4.8b, v0.8b + umlal v18.8h, v6.8b, v2.8b + ld1 {v4.8b}, [x1], x2 + ext v5.8b, v4.8b, v5.8b, #1 + trn1 v4.2s, v4.2s, v5.2s prfm pldl1strm, [x1] - umull v19.8H, v6.8B, v0.8B - umlal v19.8H, v4.8B, v2.8B - trn1 v30.2D, v18.2D, v19.2D - trn2 v31.2D, v18.2D, v19.2D - add v18.8H, v30.8H, v31.8H + umull v19.8h, v6.8b, v0.8b + umlal v19.8h, v4.8b, v2.8b + trn1 v30.2d, v18.2d, v19.2d + trn2 v31.2d, v18.2d, v19.2d + add v18.8h, v30.8h, v31.8h .ifc \codec,h264 - rshrn v16.8B, v18.8H, #6 + rshrn v16.8b, v18.8h, #6 .else - add v18.8H, v18.8H, v22.8H - shrn v16.8B, v18.8H, #6 + add v18.8h, v18.8h, v22.8h + shrn v16.8b, v18.8h, #6 .endif subs w3, w3, #2 prfm pldl1strm, [x1, x2] .ifc \type,avg - ld1 {v20.S}[0], [x8], x2 - ld1 {v20.S}[1], [x8], x2 - urhadd v16.8B, v16.8B, v20.8B + ld1 {v20.s}[0], [x8], x2 + ld1 {v20.s}[1], [x8], x2 + urhadd v16.8b, v16.8b, v20.8b .endif - st1 {v16.S}[0], [x0], x2 - st1 {v16.S}[1], [x0], x2 + st1 {v16.s}[0], [x0], x2 + 
st1 {v16.s}[1], [x0], x2 b.gt 1b ret 2: adds w12, w12, w6 - dup v30.8B, w4 + dup v30.8b, w4 b.eq 5f tst w6, w6 - dup v31.8B, w12 - trn1 v0.2S, v30.2S, v31.2S - trn2 v1.2S, v30.2S, v31.2S + dup v31.8b, w12 + trn1 v0.2s, v30.2s, v31.2s + trn2 v1.2s, v30.2s, v31.2s b.eq 4f - ext v1.8B, v0.8B, v1.8B, #4 - ld1 {v4.S}[0], [x1], x2 -3: ld1 {v4.S}[1], [x1], x2 - umull v18.8H, v4.8B, v0.8B - ld1 {v4.S}[0], [x1], x2 - umull v19.8H, v4.8B, v1.8B - trn1 v30.2D, v18.2D, v19.2D - trn2 v31.2D, v18.2D, v19.2D - add v18.8H, v30.8H, v31.8H + ext v1.8b, v0.8b, v1.8b, #4 + ld1 {v4.s}[0], [x1], x2 +3: ld1 {v4.s}[1], [x1], x2 + umull v18.8h, v4.8b, v0.8b + ld1 {v4.s}[0], [x1], x2 + umull v19.8h, v4.8b, v1.8b + trn1 v30.2d, v18.2d, v19.2d + trn2 v31.2d, v18.2d, v19.2d + add v18.8h, v30.8h, v31.8h prfm pldl1strm, [x1] .ifc \codec,h264 - rshrn v16.8B, v18.8H, #6 + rshrn v16.8b, v18.8h, #6 .else - add v18.8H, v18.8H, v22.8H - shrn v16.8B, v18.8H, #6 + add v18.8h, v18.8h, v22.8h + shrn v16.8b, v18.8h, #6 .endif .ifc \type,avg - ld1 {v20.S}[0], [x8], x2 - ld1 {v20.S}[1], [x8], x2 - urhadd v16.8B, v16.8B, v20.8B + ld1 {v20.s}[0], [x8], x2 + ld1 {v20.s}[1], [x8], x2 + urhadd v16.8b, v16.8b, v20.8b .endif subs w3, w3, #2 prfm pldl1strm, [x1, x2] - st1 {v16.S}[0], [x0], x2 - st1 {v16.S}[1], [x0], x2 + st1 {v16.s}[0], [x0], x2 + st1 {v16.s}[1], [x0], x2 b.gt 3b ret -4: ld1 {v4.8B}, [x1], x2 - ld1 {v6.8B}, [x1], x2 - ext v5.8B, v4.8B, v5.8B, #1 - ext v7.8B, v6.8B, v7.8B, #1 - trn1 v4.2S, v4.2S, v5.2S - trn1 v6.2S, v6.2S, v7.2S - umull v18.8H, v4.8B, v0.8B - umull v19.8H, v6.8B, v0.8B +4: ld1 {v4.8b}, [x1], x2 + ld1 {v6.8b}, [x1], x2 + ext v5.8b, v4.8b, v5.8b, #1 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v4.2s, v4.2s, v5.2s + trn1 v6.2s, v6.2s, v7.2s + umull v18.8h, v4.8b, v0.8b + umull v19.8h, v6.8b, v0.8b subs w3, w3, #2 - trn1 v30.2D, v18.2D, v19.2D - trn2 v31.2D, v18.2D, v19.2D - add v18.8H, v30.8H, v31.8H + trn1 v30.2d, v18.2d, v19.2d + trn2 v31.2d, v18.2d, v19.2d + add v18.8h, v30.8h, v31.8h prfm 
pldl1strm, [x1] .ifc \codec,h264 - rshrn v16.8B, v18.8H, #6 + rshrn v16.8b, v18.8h, #6 .else - add v18.8H, v18.8H, v22.8H - shrn v16.8B, v18.8H, #6 + add v18.8h, v18.8h, v22.8h + shrn v16.8b, v18.8h, #6 .endif .ifc \type,avg - ld1 {v20.S}[0], [x8], x2 - ld1 {v20.S}[1], [x8], x2 - urhadd v16.8B, v16.8B, v20.8B + ld1 {v20.s}[0], [x8], x2 + ld1 {v20.s}[1], [x8], x2 + urhadd v16.8b, v16.8b, v20.8b .endif prfm pldl1strm, [x1] - st1 {v16.S}[0], [x0], x2 - st1 {v16.S}[1], [x0], x2 + st1 {v16.s}[0], [x0], x2 + st1 {v16.s}[1], [x0], x2 b.gt 4b ret -5: ld1 {v4.S}[0], [x1], x2 - ld1 {v4.S}[1], [x1], x2 - umull v18.8H, v4.8B, v30.8B +5: ld1 {v4.s}[0], [x1], x2 + ld1 {v4.s}[1], [x1], x2 + umull v18.8h, v4.8b, v30.8b subs w3, w3, #2 prfm pldl1strm, [x1] .ifc \codec,h264 - rshrn v16.8B, v18.8H, #6 + rshrn v16.8b, v18.8h, #6 .else - add v18.8H, v18.8H, v22.8H - shrn v16.8B, v18.8H, #6 + add v18.8h, v18.8h, v22.8h + shrn v16.8b, v18.8h, #6 .endif .ifc \type,avg - ld1 {v20.S}[0], [x8], x2 - ld1 {v20.S}[1], [x8], x2 - urhadd v16.8B, v16.8B, v20.8B + ld1 {v20.s}[0], [x8], x2 + ld1 {v20.s}[1], [x8], x2 + urhadd v16.8b, v16.8b, v20.8b .endif prfm pldl1strm, [x1] - st1 {v16.S}[0], [x0], x2 - st1 {v16.S}[1], [x0], x2 + st1 {v16.s}[0], [x0], x2 + st1 {v16.s}[1], [x0], x2 b.gt 5b ret endfunc @@ -372,51 +372,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 sub w4, w7, w13 sub w4, w4, w14 add w4, w4, #64 - dup v0.8B, w4 - dup v2.8B, w12 - dup v1.8B, w6 - dup v3.8B, w7 - trn1 v0.4H, v0.4H, v2.4H - trn1 v1.4H, v1.4H, v3.4H + dup v0.8b, w4 + dup v2.8b, w12 + dup v1.8b, w6 + dup v3.8b, w7 + trn1 v0.4h, v0.4h, v2.4h + trn1 v1.4h, v1.4h, v3.4h 1: - ld1 {v4.S}[0], [x1], x2 - ld1 {v4.S}[1], [x1], x2 - rev64 v5.2S, v4.2S - ld1 {v5.S}[1], [x1] - ext v6.8B, v4.8B, v5.8B, #1 - ext v7.8B, v5.8B, v4.8B, #1 - trn1 v4.4H, v4.4H, v6.4H - trn1 v5.4H, v5.4H, v7.4H - umull v16.8H, v4.8B, v0.8B - umlal v16.8H, v5.8B, v1.8B + ld1 {v4.s}[0], [x1], x2 + ld1 {v4.s}[1], [x1], x2 + rev64 v5.2s, v4.2s + ld1 
{v5.s}[1], [x1] + ext v6.8b, v4.8b, v5.8b, #1 + ext v7.8b, v5.8b, v4.8b, #1 + trn1 v4.4h, v4.4h, v6.4h + trn1 v5.4h, v5.4h, v7.4h + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v5.8b, v1.8b .ifc \type,avg - ld1 {v18.H}[0], [x0], x2 - ld1 {v18.H}[2], [x0] + ld1 {v18.h}[0], [x0], x2 + ld1 {v18.h}[2], [x0] sub x0, x0, x2 .endif - rev64 v17.4S, v16.4S - add v16.8H, v16.8H, v17.8H - rshrn v16.8B, v16.8H, #6 + rev64 v17.4s, v16.4s + add v16.8h, v16.8h, v17.8h + rshrn v16.8b, v16.8h, #6 .ifc \type,avg - urhadd v16.8B, v16.8B, v18.8B + urhadd v16.8b, v16.8b, v18.8b .endif - st1 {v16.H}[0], [x0], x2 - st1 {v16.H}[2], [x0], x2 + st1 {v16.h}[0], [x0], x2 + st1 {v16.h}[2], [x0], x2 subs w3, w3, #2 b.gt 1b ret 2: - ld1 {v16.H}[0], [x1], x2 - ld1 {v16.H}[1], [x1], x2 + ld1 {v16.h}[0], [x1], x2 + ld1 {v16.h}[1], [x1], x2 .ifc \type,avg - ld1 {v18.H}[0], [x0], x2 - ld1 {v18.H}[1], [x0] + ld1 {v18.h}[0], [x0], x2 + ld1 {v18.h}[1], [x0] sub x0, x0, x2 - urhadd v16.8B, v16.8B, v18.8B + urhadd v16.8b, v16.8b, v18.8b .endif - st1 {v16.H}[0], [x0], x2 - st1 {v16.H}[1], [x0], x2 + st1 {v16.h}[0], [x0], x2 + st1 {v16.h}[1], [x0], x2 subs w3, w3, #2 b.gt 2b ret diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index ea221e6862..71c2ddfd0c 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -27,7 +27,7 @@ cmp w2, #0 ldr w6, [x4] ccmp w3, #0, #0, ne - mov v24.S[0], w6 + mov v24.s[0], w6 and w8, w6, w6, lsl #16 b.eq 1f ands w8, w8, w8, lsl #8 @@ -38,95 +38,95 @@ .endm .macro h264_loop_filter_luma - dup v22.16B, w2 // alpha - uxtl v24.8H, v24.8B - uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0) - uxtl v24.4S, v24.4H - uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0) - sli v24.8H, v24.8H, #8 - uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0) - sli v24.4S, v24.4S, #16 - cmhi v21.16B, v22.16B, v21.16B // < alpha - dup v22.16B, w3 // beta - cmlt v23.16B, v24.16B, #0 - cmhi v28.16B, v22.16B, v28.16B // < beta - cmhi v30.16B, v22.16B, 
v30.16B // < beta - bic v21.16B, v21.16B, v23.16B - uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0) - and v21.16B, v21.16B, v28.16B - uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0) - and v21.16B, v21.16B, v30.16B // < beta + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v24.4s, v24.4h + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + sli v24.8h, v24.8h, #8 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + sli v24.4s, v24.4s, #16 + cmhi v21.16b, v22.16b, v21.16b // < alpha + dup v22.16b, w3 // beta + cmlt v23.16b, v24.16b, #0 + cmhi v28.16b, v22.16b, v28.16b // < beta + cmhi v30.16b, v22.16b, v30.16b // < beta + bic v21.16b, v21.16b, v23.16b + uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0) + and v21.16b, v21.16b, v28.16b + uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0) + and v21.16b, v21.16b, v30.16b // < beta shrn v30.8b, v21.8h, #4 mov x7, v30.d[0] - cmhi v17.16B, v22.16B, v17.16B // < beta - cmhi v19.16B, v22.16B, v19.16B // < beta + cmhi v17.16b, v22.16b, v17.16b // < beta + cmhi v19.16b, v22.16b, v19.16b // < beta cbz x7, 9f - and v17.16B, v17.16B, v21.16B - and v19.16B, v19.16B, v21.16B - and v24.16B, v24.16B, v21.16B - urhadd v28.16B, v16.16B, v0.16B - sub v21.16B, v24.16B, v17.16B - uqadd v23.16B, v18.16B, v24.16B - uhadd v20.16B, v20.16B, v28.16B - sub v21.16B, v21.16B, v19.16B - uhadd v28.16B, v4.16B, v28.16B - umin v23.16B, v23.16B, v20.16B - uqsub v22.16B, v18.16B, v24.16B - uqadd v4.16B, v2.16B, v24.16B - umax v23.16B, v23.16B, v22.16B - uqsub v22.16B, v2.16B, v24.16B - umin v28.16B, v4.16B, v28.16B - uxtl v4.8H, v0.8B - umax v28.16B, v28.16B, v22.16B - uxtl2 v20.8H, v0.16B - usubw v4.8H, v4.8H, v16.8B - usubw2 v20.8H, v20.8H, v16.16B - shl v4.8H, v4.8H, #2 - shl v20.8H, v20.8H, #2 - uaddw v4.8H, v4.8H, v18.8B - uaddw2 v20.8H, v20.8H, v18.16B - usubw v4.8H, v4.8H, v2.8B - usubw2 v20.8H, v20.8H, v2.16B - rshrn v4.8B, v4.8H, #3 - rshrn2 v4.16B, v20.8H, #3 - bsl v17.16B, v23.16B, v18.16B - bsl v19.16B, 
v28.16B, v2.16B - neg v23.16B, v21.16B - uxtl v28.8H, v16.8B - smin v4.16B, v4.16B, v21.16B - uxtl2 v21.8H, v16.16B - smax v4.16B, v4.16B, v23.16B - uxtl v22.8H, v0.8B - uxtl2 v24.8H, v0.16B - saddw v28.8H, v28.8H, v4.8B - saddw2 v21.8H, v21.8H, v4.16B - ssubw v22.8H, v22.8H, v4.8B - ssubw2 v24.8H, v24.8H, v4.16B - sqxtun v16.8B, v28.8H - sqxtun2 v16.16B, v21.8H - sqxtun v0.8B, v22.8H - sqxtun2 v0.16B, v24.8H + and v17.16b, v17.16b, v21.16b + and v19.16b, v19.16b, v21.16b + and v24.16b, v24.16b, v21.16b + urhadd v28.16b, v16.16b, v0.16b + sub v21.16b, v24.16b, v17.16b + uqadd v23.16b, v18.16b, v24.16b + uhadd v20.16b, v20.16b, v28.16b + sub v21.16b, v21.16b, v19.16b + uhadd v28.16b, v4.16b, v28.16b + umin v23.16b, v23.16b, v20.16b + uqsub v22.16b, v18.16b, v24.16b + uqadd v4.16b, v2.16b, v24.16b + umax v23.16b, v23.16b, v22.16b + uqsub v22.16b, v2.16b, v24.16b + umin v28.16b, v4.16b, v28.16b + uxtl v4.8h, v0.8b + umax v28.16b, v28.16b, v22.16b + uxtl2 v20.8h, v0.16b + usubw v4.8h, v4.8h, v16.8b + usubw2 v20.8h, v20.8h, v16.16b + shl v4.8h, v4.8h, #2 + shl v20.8h, v20.8h, #2 + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v20.8h, v20.8h, v18.16b + usubw v4.8h, v4.8h, v2.8b + usubw2 v20.8h, v20.8h, v2.16b + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v20.8h, #3 + bsl v17.16b, v23.16b, v18.16b + bsl v19.16b, v28.16b, v2.16b + neg v23.16b, v21.16b + uxtl v28.8h, v16.8b + smin v4.16b, v4.16b, v21.16b + uxtl2 v21.8h, v16.16b + smax v4.16b, v4.16b, v23.16b + uxtl v22.8h, v0.8b + uxtl2 v24.8h, v0.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v21.8h, v21.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v24.8h, v24.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun2 v16.16b, v21.8h + sqxtun v0.8b, v22.8h + sqxtun2 v0.16b, v24.8h .endm function ff_h264_v_loop_filter_luma_neon, export=1 h264_loop_filter_start - ld1 {v0.16B}, [x0], x1 - ld1 {v2.16B}, [x0], x1 - ld1 {v4.16B}, [x0], x1 + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl 
#1 - ld1 {v20.16B}, [x0], x1 - ld1 {v18.16B}, [x0], x1 - ld1 {v16.16B}, [x0], x1 + ld1 {v20.16b}, [x0], x1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 h264_loop_filter_luma sub x0, x0, x1, lsl #1 - st1 {v17.16B}, [x0], x1 - st1 {v16.16B}, [x0], x1 - st1 {v0.16B}, [x0], x1 - st1 {v19.16B}, [x0] + st1 {v17.16b}, [x0], x1 + st1 {v16.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 + st1 {v19.16b}, [x0] 9: ret endfunc @@ -135,22 +135,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1 h264_loop_filter_start sub x0, x0, #4 - ld1 {v6.8B}, [x0], x1 - ld1 {v20.8B}, [x0], x1 - ld1 {v18.8B}, [x0], x1 - ld1 {v16.8B}, [x0], x1 - ld1 {v0.8B}, [x0], x1 - ld1 {v2.8B}, [x0], x1 - ld1 {v4.8B}, [x0], x1 - ld1 {v26.8B}, [x0], x1 - ld1 {v6.D}[1], [x0], x1 - ld1 {v20.D}[1], [x0], x1 - ld1 {v18.D}[1], [x0], x1 - ld1 {v16.D}[1], [x0], x1 - ld1 {v0.D}[1], [x0], x1 - ld1 {v2.D}[1], [x0], x1 - ld1 {v4.D}[1], [x0], x1 - ld1 {v26.D}[1], [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v20.8b}, [x0], x1 + ld1 {v18.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v4.8b}, [x0], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v18.d}[1], [x0], x1 + ld1 {v16.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v26.d}[1], [x0], x1 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 @@ -160,22 +160,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1 sub x0, x0, x1, lsl #4 add x0, x0, #2 - st1 {v17.S}[0], [x0], x1 - st1 {v16.S}[0], [x0], x1 - st1 {v0.S}[0], [x0], x1 - st1 {v19.S}[0], [x0], x1 - st1 {v17.S}[1], [x0], x1 - st1 {v16.S}[1], [x0], x1 - st1 {v0.S}[1], [x0], x1 - st1 {v19.S}[1], [x0], x1 - st1 {v17.S}[2], [x0], x1 - st1 {v16.S}[2], [x0], x1 - st1 {v0.S}[2], [x0], x1 - st1 {v19.S}[2], [x0], x1 - st1 {v17.S}[3], [x0], x1 - st1 {v16.S}[3], [x0], x1 - st1 {v0.S}[3], [x0], x1 - st1 {v19.S}[3], [x0], x1 + st1 {v17.s}[0], [x0], x1 + st1 {v16.s}[0], [x0], x1 + 
st1 {v0.s}[0], [x0], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v17.s}[1], [x0], x1 + st1 {v16.s}[1], [x0], x1 + st1 {v0.s}[1], [x0], x1 + st1 {v19.s}[1], [x0], x1 + st1 {v17.s}[2], [x0], x1 + st1 {v16.s}[2], [x0], x1 + st1 {v0.s}[2], [x0], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v17.s}[3], [x0], x1 + st1 {v16.s}[3], [x0], x1 + st1 {v0.s}[3], [x0], x1 + st1 {v19.s}[3], [x0], x1 9: ret endfunc @@ -377,52 +377,52 @@ function ff_h264_h_loop_filter_luma_intra_neon, export=1 endfunc .macro h264_loop_filter_chroma - dup v22.8B, w2 // alpha - dup v23.8B, w3 // beta - uxtl v24.8H, v24.8B - uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0) - uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0) - uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) - cmhi v26.8B, v22.8B, v26.8B // < alpha - cmhi v28.8B, v23.8B, v28.8B // < beta - cmhi v30.8B, v23.8B, v30.8B // < beta - uxtl v4.8H, v0.8B - and v26.8B, v26.8B, v28.8B - usubw v4.8H, v4.8H, v16.8B - and v26.8B, v26.8B, v30.8B - shl v4.8H, v4.8H, #2 + dup v22.8b, w2 // alpha + dup v23.8b, w3 // beta + uxtl v24.8h, v24.8b + uabd v26.8b, v16.8b, v0.8b // abs(p0 - q0) + uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0) + uabd v30.8b, v2.8b, v0.8b // abs(q1 - q0) + cmhi v26.8b, v22.8b, v26.8b // < alpha + cmhi v28.8b, v23.8b, v28.8b // < beta + cmhi v30.8b, v23.8b, v30.8b // < beta + uxtl v4.8h, v0.8b + and v26.8b, v26.8b, v28.8b + usubw v4.8h, v4.8h, v16.8b + and v26.8b, v26.8b, v30.8b + shl v4.8h, v4.8h, #2 mov x8, v26.d[0] - sli v24.8H, v24.8H, #8 - uaddw v4.8H, v4.8H, v18.8B + sli v24.8h, v24.8h, #8 + uaddw v4.8h, v4.8h, v18.8b cbz x8, 9f - usubw v4.8H, v4.8H, v2.8B - rshrn v4.8B, v4.8H, #3 - smin v4.8B, v4.8B, v24.8B - neg v25.8B, v24.8B - smax v4.8B, v4.8B, v25.8B - uxtl v22.8H, v0.8B - and v4.8B, v4.8B, v26.8B - uxtl v28.8H, v16.8B - saddw v28.8H, v28.8H, v4.8B - ssubw v22.8H, v22.8H, v4.8B - sqxtun v16.8B, v28.8H - sqxtun v0.8B, v22.8H + usubw v4.8h, v4.8h, v2.8b + rshrn v4.8b, v4.8h, #3 + smin v4.8b, v4.8b, v24.8b + neg v25.8b, v24.8b + smax v4.8b, v4.8b, v25.8b + 
uxtl v22.8h, v0.8b + and v4.8b, v4.8b, v26.8b + uxtl v28.8h, v16.8b + saddw v28.8h, v28.8h, v4.8b + ssubw v22.8h, v22.8h, v4.8b + sqxtun v16.8b, v28.8h + sqxtun v0.8b, v22.8h .endm function ff_h264_v_loop_filter_chroma_neon, export=1 h264_loop_filter_start sub x0, x0, x1, lsl #1 - ld1 {v18.8B}, [x0], x1 - ld1 {v16.8B}, [x0], x1 - ld1 {v0.8B}, [x0], x1 - ld1 {v2.8B}, [x0] + ld1 {v18.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x0] h264_loop_filter_chroma sub x0, x0, x1, lsl #1 - st1 {v16.8B}, [x0], x1 - st1 {v0.8B}, [x0], x1 + st1 {v16.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 9: ret endfunc @@ -432,14 +432,14 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 sub x0, x0, #2 h_loop_filter_chroma420: - ld1 {v18.S}[0], [x0], x1 - ld1 {v16.S}[0], [x0], x1 - ld1 {v0.S}[0], [x0], x1 - ld1 {v2.S}[0], [x0], x1 - ld1 {v18.S}[1], [x0], x1 - ld1 {v16.S}[1], [x0], x1 - ld1 {v0.S}[1], [x0], x1 - ld1 {v2.S}[1], [x0], x1 + ld1 {v18.s}[0], [x0], x1 + ld1 {v16.s}[0], [x0], x1 + ld1 {v0.s}[0], [x0], x1 + ld1 {v2.s}[0], [x0], x1 + ld1 {v18.s}[1], [x0], x1 + ld1 {v16.s}[1], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ld1 {v2.s}[1], [x0], x1 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 @@ -448,14 +448,14 @@ h_loop_filter_chroma420: transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 sub x0, x0, x1, lsl #3 - st1 {v18.S}[0], [x0], x1 - st1 {v16.S}[0], [x0], x1 - st1 {v0.S}[0], [x0], x1 - st1 {v2.S}[0], [x0], x1 - st1 {v18.S}[1], [x0], x1 - st1 {v16.S}[1], [x0], x1 - st1 {v0.S}[1], [x0], x1 - st1 {v2.S}[1], [x0], x1 + st1 {v18.s}[0], [x0], x1 + st1 {v16.s}[0], [x0], x1 + st1 {v0.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v18.s}[1], [x0], x1 + st1 {v16.s}[1], [x0], x1 + st1 {v0.s}[1], [x0], x1 + st1 {v2.s}[1], [x0], x1 9: ret endfunc @@ -584,102 +584,102 @@ function ff_h264_h_loop_filter_chroma422_intra_neon, export=1 endfunc .macro biweight_16 macs, macd - dup v0.16B, w5 - dup v1.16B, w6 - mov v4.16B, v16.16B - mov v6.16B, v16.16B + dup v0.16b, w5 
+ dup v1.16b, w6 + mov v4.16b, v16.16b + mov v6.16b, v16.16b 1: subs w3, w3, #2 - ld1 {v20.16B}, [x0], x2 - \macd v4.8H, v0.8B, v20.8B + ld1 {v20.16b}, [x0], x2 + \macd v4.8h, v0.8b, v20.8b \macd\()2 v6.8H, v0.16B, v20.16B - ld1 {v22.16B}, [x1], x2 - \macs v4.8H, v1.8B, v22.8B + ld1 {v22.16b}, [x1], x2 + \macs v4.8h, v1.8b, v22.8b \macs\()2 v6.8H, v1.16B, v22.16B - mov v24.16B, v16.16B - ld1 {v28.16B}, [x0], x2 - mov v26.16B, v16.16B - \macd v24.8H, v0.8B, v28.8B + mov v24.16b, v16.16b + ld1 {v28.16b}, [x0], x2 + mov v26.16b, v16.16b + \macd v24.8h, v0.8b, v28.8b \macd\()2 v26.8H, v0.16B, v28.16B - ld1 {v30.16B}, [x1], x2 - \macs v24.8H, v1.8B, v30.8B + ld1 {v30.16b}, [x1], x2 + \macs v24.8h, v1.8b, v30.8b \macs\()2 v26.8H, v1.16B, v30.16B - sshl v4.8H, v4.8H, v18.8H - sshl v6.8H, v6.8H, v18.8H - sqxtun v4.8B, v4.8H - sqxtun2 v4.16B, v6.8H - sshl v24.8H, v24.8H, v18.8H - sshl v26.8H, v26.8H, v18.8H - sqxtun v24.8B, v24.8H - sqxtun2 v24.16B, v26.8H - mov v6.16B, v16.16B - st1 {v4.16B}, [x7], x2 - mov v4.16B, v16.16B - st1 {v24.16B}, [x7], x2 + sshl v4.8h, v4.8h, v18.8h + sshl v6.8h, v6.8h, v18.8h + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v6.8h + sshl v24.8h, v24.8h, v18.8h + sshl v26.8h, v26.8h, v18.8h + sqxtun v24.8b, v24.8h + sqxtun2 v24.16b, v26.8h + mov v6.16b, v16.16b + st1 {v4.16b}, [x7], x2 + mov v4.16b, v16.16b + st1 {v24.16b}, [x7], x2 b.ne 1b ret .endm .macro biweight_8 macs, macd - dup v0.8B, w5 - dup v1.8B, w6 - mov v2.16B, v16.16B - mov v20.16B, v16.16B + dup v0.8b, w5 + dup v1.8b, w6 + mov v2.16b, v16.16b + mov v20.16b, v16.16b 1: subs w3, w3, #2 - ld1 {v4.8B}, [x0], x2 - \macd v2.8H, v0.8B, v4.8B - ld1 {v5.8B}, [x1], x2 - \macs v2.8H, v1.8B, v5.8B - ld1 {v6.8B}, [x0], x2 - \macd v20.8H, v0.8B, v6.8B - ld1 {v7.8B}, [x1], x2 - \macs v20.8H, v1.8B, v7.8B - sshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - sshl v20.8H, v20.8H, v18.8H - sqxtun v4.8B, v20.8H - mov v20.16B, v16.16B - st1 {v2.8B}, [x7], x2 - mov v2.16B, v16.16B - st1 {v4.8B}, [x7], x2 + ld1 
{v4.8b}, [x0], x2 + \macd v2.8h, v0.8b, v4.8b + ld1 {v5.8b}, [x1], x2 + \macs v2.8h, v1.8b, v5.8b + ld1 {v6.8b}, [x0], x2 + \macd v20.8h, v0.8b, v6.8b + ld1 {v7.8b}, [x1], x2 + \macs v20.8h, v1.8b, v7.8b + sshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + sshl v20.8h, v20.8h, v18.8h + sqxtun v4.8b, v20.8h + mov v20.16b, v16.16b + st1 {v2.8b}, [x7], x2 + mov v2.16b, v16.16b + st1 {v4.8b}, [x7], x2 b.ne 1b ret .endm .macro biweight_4 macs, macd - dup v0.8B, w5 - dup v1.8B, w6 - mov v2.16B, v16.16B - mov v20.16B,v16.16B + dup v0.8b, w5 + dup v1.8b, w6 + mov v2.16b, v16.16b + mov v20.16b,v16.16b 1: subs w3, w3, #4 - ld1 {v4.S}[0], [x0], x2 - ld1 {v4.S}[1], [x0], x2 - \macd v2.8H, v0.8B, v4.8B - ld1 {v5.S}[0], [x1], x2 - ld1 {v5.S}[1], [x1], x2 - \macs v2.8H, v1.8B, v5.8B + ld1 {v4.s}[0], [x0], x2 + ld1 {v4.s}[1], [x0], x2 + \macd v2.8h, v0.8b, v4.8b + ld1 {v5.s}[0], [x1], x2 + ld1 {v5.s}[1], [x1], x2 + \macs v2.8h, v1.8b, v5.8b b.lt 2f - ld1 {v6.S}[0], [x0], x2 - ld1 {v6.S}[1], [x0], x2 - \macd v20.8H, v0.8B, v6.8B - ld1 {v7.S}[0], [x1], x2 - ld1 {v7.S}[1], [x1], x2 - \macs v20.8H, v1.8B, v7.8B - sshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - sshl v20.8H, v20.8H, v18.8H - sqxtun v4.8B, v20.8H - mov v20.16B, v16.16B - st1 {v2.S}[0], [x7], x2 - st1 {v2.S}[1], [x7], x2 - mov v2.16B, v16.16B - st1 {v4.S}[0], [x7], x2 - st1 {v4.S}[1], [x7], x2 + ld1 {v6.s}[0], [x0], x2 + ld1 {v6.s}[1], [x0], x2 + \macd v20.8h, v0.8b, v6.8b + ld1 {v7.s}[0], [x1], x2 + ld1 {v7.s}[1], [x1], x2 + \macs v20.8h, v1.8b, v7.8b + sshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + sshl v20.8h, v20.8h, v18.8h + sqxtun v4.8b, v20.8h + mov v20.16b, v16.16b + st1 {v2.s}[0], [x7], x2 + st1 {v2.s}[1], [x7], x2 + mov v2.16b, v16.16b + st1 {v4.s}[0], [x7], x2 + st1 {v4.s}[1], [x7], x2 b.ne 1b ret -2: sshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - st1 {v2.S}[0], [x7], x2 - st1 {v2.S}[1], [x7], x2 +2: sshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + st1 {v2.s}[0], [x7], x2 + st1 {v2.s}[1], [x7], x2 ret 
.endm @@ -689,10 +689,10 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1 add w7, w7, #1 eor w8, w8, w6, lsr #30 orr w7, w7, #1 - dup v18.8H, w4 + dup v18.8h, w4 lsl w7, w7, w4 - not v18.16B, v18.16B - dup v16.8H, w7 + not v18.16b, v18.16b + dup v16.8h, w7 mov x7, x0 cbz w8, 10f subs w8, w8, #1 @@ -716,78 +716,78 @@ endfunc biweight_func 4 .macro weight_16 add - dup v0.16B, w4 + dup v0.16b, w4 1: subs w2, w2, #2 - ld1 {v20.16B}, [x0], x1 - umull v4.8H, v0.8B, v20.8B - umull2 v6.8H, v0.16B, v20.16B - ld1 {v28.16B}, [x0], x1 - umull v24.8H, v0.8B, v28.8B - umull2 v26.8H, v0.16B, v28.16B - \add v4.8H, v16.8H, v4.8H - srshl v4.8H, v4.8H, v18.8H - \add v6.8H, v16.8H, v6.8H - srshl v6.8H, v6.8H, v18.8H - sqxtun v4.8B, v4.8H - sqxtun2 v4.16B, v6.8H - \add v24.8H, v16.8H, v24.8H - srshl v24.8H, v24.8H, v18.8H - \add v26.8H, v16.8H, v26.8H - srshl v26.8H, v26.8H, v18.8H - sqxtun v24.8B, v24.8H - sqxtun2 v24.16B, v26.8H - st1 {v4.16B}, [x5], x1 - st1 {v24.16B}, [x5], x1 + ld1 {v20.16b}, [x0], x1 + umull v4.8h, v0.8b, v20.8b + umull2 v6.8h, v0.16b, v20.16b + ld1 {v28.16b}, [x0], x1 + umull v24.8h, v0.8b, v28.8b + umull2 v26.8h, v0.16b, v28.16b + \add v4.8h, v16.8h, v4.8h + srshl v4.8h, v4.8h, v18.8h + \add v6.8h, v16.8h, v6.8h + srshl v6.8h, v6.8h, v18.8h + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v6.8h + \add v24.8h, v16.8h, v24.8h + srshl v24.8h, v24.8h, v18.8h + \add v26.8h, v16.8h, v26.8h + srshl v26.8h, v26.8h, v18.8h + sqxtun v24.8b, v24.8h + sqxtun2 v24.16b, v26.8h + st1 {v4.16b}, [x5], x1 + st1 {v24.16b}, [x5], x1 b.ne 1b ret .endm .macro weight_8 add - dup v0.8B, w4 + dup v0.8b, w4 1: subs w2, w2, #2 - ld1 {v4.8B}, [x0], x1 - umull v2.8H, v0.8B, v4.8B - ld1 {v6.8B}, [x0], x1 - umull v20.8H, v0.8B, v6.8B - \add v2.8H, v16.8H, v2.8H - srshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - \add v20.8H, v16.8H, v20.8H - srshl v20.8H, v20.8H, v18.8H - sqxtun v4.8B, v20.8H - st1 {v2.8B}, [x5], x1 - st1 {v4.8B}, [x5], x1 + ld1 {v4.8b}, [x0], x1 + umull v2.8h, v0.8b, v4.8b 
+ ld1 {v6.8b}, [x0], x1 + umull v20.8h, v0.8b, v6.8b + \add v2.8h, v16.8h, v2.8h + srshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + \add v20.8h, v16.8h, v20.8h + srshl v20.8h, v20.8h, v18.8h + sqxtun v4.8b, v20.8h + st1 {v2.8b}, [x5], x1 + st1 {v4.8b}, [x5], x1 b.ne 1b ret .endm .macro weight_4 add - dup v0.8B, w4 + dup v0.8b, w4 1: subs w2, w2, #4 - ld1 {v4.S}[0], [x0], x1 - ld1 {v4.S}[1], [x0], x1 - umull v2.8H, v0.8B, v4.8B + ld1 {v4.s}[0], [x0], x1 + ld1 {v4.s}[1], [x0], x1 + umull v2.8h, v0.8b, v4.8b b.lt 2f - ld1 {v6.S}[0], [x0], x1 - ld1 {v6.S}[1], [x0], x1 - umull v20.8H, v0.8B, v6.8B - \add v2.8H, v16.8H, v2.8H - srshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - \add v20.8H, v16.8H, v20.8H - srshl v20.8H, v20.8h, v18.8H - sqxtun v4.8B, v20.8H - st1 {v2.S}[0], [x5], x1 - st1 {v2.S}[1], [x5], x1 - st1 {v4.S}[0], [x5], x1 - st1 {v4.S}[1], [x5], x1 + ld1 {v6.s}[0], [x0], x1 + ld1 {v6.s}[1], [x0], x1 + umull v20.8h, v0.8b, v6.8b + \add v2.8h, v16.8h, v2.8h + srshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + \add v20.8h, v16.8h, v20.8h + srshl v20.8h, v20.8h, v18.8h + sqxtun v4.8b, v20.8h + st1 {v2.s}[0], [x5], x1 + st1 {v2.s}[1], [x5], x1 + st1 {v4.s}[0], [x5], x1 + st1 {v4.s}[1], [x5], x1 b.ne 1b ret -2: \add v2.8H, v16.8H, v2.8H - srshl v2.8H, v2.8H, v18.8H - sqxtun v2.8B, v2.8H - st1 {v2.S}[0], [x5], x1 - st1 {v2.S}[1], [x5], x1 +2: \add v2.8h, v16.8h, v2.8h + srshl v2.8h, v2.8h, v18.8h + sqxtun v2.8b, v2.8h + st1 {v2.s}[0], [x5], x1 + st1 {v2.s}[1], [x5], x1 ret .endm @@ -796,18 +796,18 @@ function ff_weight_h264_pixels_\w\()_neon, export=1 cmp w3, #1 mov w6, #1 lsl w5, w5, w3 - dup v16.8H, w5 + dup v16.8h, w5 mov x5, x0 b.le 20f sub w6, w6, w3 - dup v18.8H, w6 + dup v18.8h, w6 cmp w4, #0 b.lt 10f weight_\w shadd 10: neg w4, w4 weight_\w shsub 20: neg w6, w3 - dup v18.8H, w6 + dup v18.8h, w6 cmp w4, #0 b.lt 10f weight_\w add @@ -825,7 +825,7 @@ endfunc ldr w6, [x4] ccmp w3, #0, #0, ne lsl w2, w2, #2 - mov v24.S[0], w6 + mov v24.s[0], w6 lsl w3, w3, #2 
and w8, w6, w6, lsl #16 b.eq 1f diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S index 375da31d65..1bab2ca7c8 100644 --- a/libavcodec/aarch64/h264idct_neon.S +++ b/libavcodec/aarch64/h264idct_neon.S @@ -25,54 +25,54 @@ function ff_h264_idct_add_neon, export=1 .L_ff_h264_idct_add_neon: AARCH64_VALID_CALL_TARGET - ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1] sxtw x2, w2 - movi v30.8H, #0 + movi v30.8h, #0 - add v4.4H, v0.4H, v2.4H - sshr v16.4H, v1.4H, #1 - st1 {v30.8H}, [x1], #16 - sshr v17.4H, v3.4H, #1 - st1 {v30.8H}, [x1], #16 - sub v5.4H, v0.4H, v2.4H - sub v6.4H, v16.4H, v3.4H - add v7.4H, v1.4H, v17.4H - add v0.4H, v4.4H, v7.4H - add v1.4H, v5.4H, v6.4H - sub v2.4H, v5.4H, v6.4H - sub v3.4H, v4.4H, v7.4H + add v4.4h, v0.4h, v2.4h + sshr v16.4h, v1.4h, #1 + st1 {v30.8h}, [x1], #16 + sshr v17.4h, v3.4h, #1 + st1 {v30.8h}, [x1], #16 + sub v5.4h, v0.4h, v2.4h + sub v6.4h, v16.4h, v3.4h + add v7.4h, v1.4h, v17.4h + add v0.4h, v4.4h, v7.4h + add v1.4h, v5.4h, v6.4h + sub v2.4h, v5.4h, v6.4h + sub v3.4h, v4.4h, v7.4h transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 - add v4.4H, v0.4H, v2.4H - ld1 {v18.S}[0], [x0], x2 - sshr v16.4H, v3.4H, #1 - sshr v17.4H, v1.4H, #1 - ld1 {v18.S}[1], [x0], x2 - sub v5.4H, v0.4H, v2.4H - ld1 {v19.S}[1], [x0], x2 - add v6.4H, v16.4H, v1.4H - ins v4.D[1], v5.D[0] - sub v7.4H, v17.4H, v3.4H - ld1 {v19.S}[0], [x0], x2 - ins v6.D[1], v7.D[0] + add v4.4h, v0.4h, v2.4h + ld1 {v18.s}[0], [x0], x2 + sshr v16.4h, v3.4h, #1 + sshr v17.4h, v1.4h, #1 + ld1 {v18.s}[1], [x0], x2 + sub v5.4h, v0.4h, v2.4h + ld1 {v19.s}[1], [x0], x2 + add v6.4h, v16.4h, v1.4h + ins v4.d[1], v5.d[0] + sub v7.4h, v17.4h, v3.4h + ld1 {v19.s}[0], [x0], x2 + ins v6.d[1], v7.d[0] sub x0, x0, x2, lsl #2 - add v0.8H, v4.8H, v6.8H - sub v1.8H, v4.8H, v6.8H + add v0.8h, v4.8h, v6.8h + sub v1.8h, v4.8h, v6.8h - srshr v0.8H, v0.8H, #6 - srshr v1.8H, v1.8H, #6 + srshr v0.8h, v0.8h, #6 + srshr v1.8h, v1.8h, #6 - 
uaddw v0.8H, v0.8H, v18.8B - uaddw v1.8H, v1.8H, v19.8B + uaddw v0.8h, v0.8h, v18.8b + uaddw v1.8h, v1.8h, v19.8b - sqxtun v0.8B, v0.8H - sqxtun v1.8B, v1.8H + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h - st1 {v0.S}[0], [x0], x2 - st1 {v0.S}[1], [x0], x2 - st1 {v1.S}[1], [x0], x2 - st1 {v1.S}[0], [x0], x2 + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 sub x1, x1, #32 ret @@ -83,22 +83,22 @@ function ff_h264_idct_dc_add_neon, export=1 AARCH64_VALID_CALL_TARGET sxtw x2, w2 mov w3, #0 - ld1r {v2.8H}, [x1] + ld1r {v2.8h}, [x1] strh w3, [x1] - srshr v2.8H, v2.8H, #6 - ld1 {v0.S}[0], [x0], x2 - ld1 {v0.S}[1], [x0], x2 - uaddw v3.8H, v2.8H, v0.8B - ld1 {v1.S}[0], [x0], x2 - ld1 {v1.S}[1], [x0], x2 - uaddw v4.8H, v2.8H, v1.8B - sqxtun v0.8B, v3.8H - sqxtun v1.8B, v4.8H + srshr v2.8h, v2.8h, #6 + ld1 {v0.s}[0], [x0], x2 + ld1 {v0.s}[1], [x0], x2 + uaddw v3.8h, v2.8h, v0.8b + ld1 {v1.s}[0], [x0], x2 + ld1 {v1.s}[1], [x0], x2 + uaddw v4.8h, v2.8h, v1.8b + sqxtun v0.8b, v3.8h + sqxtun v1.8b, v4.8h sub x0, x0, x2, lsl #2 - st1 {v0.S}[0], [x0], x2 - st1 {v0.S}[1], [x0], x2 - st1 {v1.S}[0], [x0], x2 - st1 {v1.S}[1], [x0], x2 + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v1.s}[1], [x0], x2 ret endfunc @@ -194,71 +194,71 @@ endfunc .if \pass == 0 va .req v18 vb .req v30 - sshr v18.8H, v26.8H, #1 - add v16.8H, v24.8H, v28.8H - ld1 {v30.8H, v31.8H}, [x1] - st1 {v19.8H}, [x1], #16 - st1 {v19.8H}, [x1], #16 - sub v17.8H, v24.8H, v28.8H - sshr v19.8H, v30.8H, #1 - sub v18.8H, v18.8H, v30.8H - add v19.8H, v19.8H, v26.8H + sshr v18.8h, v26.8h, #1 + add v16.8h, v24.8h, v28.8h + ld1 {v30.8h, v31.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + sub v17.8h, v24.8h, v28.8h + sshr v19.8h, v30.8h, #1 + sub v18.8h, v18.8h, v30.8h + add v19.8h, v19.8h, v26.8h .else va .req v30 vb .req v18 - sshr v30.8H, v26.8H, #1 - sshr v19.8H, v18.8H, #1 - add v16.8H, v24.8H, v28.8H - sub v17.8H, v24.8H, 
v28.8H - sub v30.8H, v30.8H, v18.8H - add v19.8H, v19.8H, v26.8H + sshr v30.8h, v26.8h, #1 + sshr v19.8h, v18.8h, #1 + add v16.8h, v24.8h, v28.8h + sub v17.8h, v24.8h, v28.8h + sub v30.8h, v30.8h, v18.8h + add v19.8h, v19.8h, v26.8h .endif - add v26.8H, v17.8H, va.8H - sub v28.8H, v17.8H, va.8H - add v24.8H, v16.8H, v19.8H - sub vb.8H, v16.8H, v19.8H - sub v16.8H, v29.8H, v27.8H - add v17.8H, v31.8H, v25.8H - sub va.8H, v31.8H, v25.8H - add v19.8H, v29.8H, v27.8H - sub v16.8H, v16.8H, v31.8H - sub v17.8H, v17.8H, v27.8H - add va.8H, va.8H, v29.8H - add v19.8H, v19.8H, v25.8H - sshr v25.8H, v25.8H, #1 - sshr v27.8H, v27.8H, #1 - sshr v29.8H, v29.8H, #1 - sshr v31.8H, v31.8H, #1 - sub v16.8H, v16.8H, v31.8H - sub v17.8H, v17.8H, v27.8H - add va.8H, va.8H, v29.8H - add v19.8H, v19.8H, v25.8H - sshr v25.8H, v16.8H, #2 - sshr v27.8H, v17.8H, #2 - sshr v29.8H, va.8H, #2 - sshr v31.8H, v19.8H, #2 - sub v19.8H, v19.8H, v25.8H - sub va.8H, v27.8H, va.8H - add v17.8H, v17.8H, v29.8H - add v16.8H, v16.8H, v31.8H + add v26.8h, v17.8h, va.8h + sub v28.8h, v17.8h, va.8h + add v24.8h, v16.8h, v19.8h + sub vb.8h, v16.8h, v19.8h + sub v16.8h, v29.8h, v27.8h + add v17.8h, v31.8h, v25.8h + sub va.8h, v31.8h, v25.8h + add v19.8h, v29.8h, v27.8h + sub v16.8h, v16.8h, v31.8h + sub v17.8h, v17.8h, v27.8h + add va.8h, va.8h, v29.8h + add v19.8h, v19.8h, v25.8h + sshr v25.8h, v25.8h, #1 + sshr v27.8h, v27.8h, #1 + sshr v29.8h, v29.8h, #1 + sshr v31.8h, v31.8h, #1 + sub v16.8h, v16.8h, v31.8h + sub v17.8h, v17.8h, v27.8h + add va.8h, va.8h, v29.8h + add v19.8h, v19.8h, v25.8h + sshr v25.8h, v16.8h, #2 + sshr v27.8h, v17.8h, #2 + sshr v29.8h, va.8h, #2 + sshr v31.8h, v19.8h, #2 + sub v19.8h, v19.8h, v25.8h + sub va.8h, v27.8h, va.8h + add v17.8h, v17.8h, v29.8h + add v16.8h, v16.8h, v31.8h .if \pass == 0 - sub v31.8H, v24.8H, v19.8H - add v24.8H, v24.8H, v19.8H - add v25.8H, v26.8H, v18.8H - sub v18.8H, v26.8H, v18.8H - add v26.8H, v28.8H, v17.8H - add v27.8H, v30.8H, v16.8H - sub v29.8H, 
v28.8H, v17.8H - sub v28.8H, v30.8H, v16.8H + sub v31.8h, v24.8h, v19.8h + add v24.8h, v24.8h, v19.8h + add v25.8h, v26.8h, v18.8h + sub v18.8h, v26.8h, v18.8h + add v26.8h, v28.8h, v17.8h + add v27.8h, v30.8h, v16.8h + sub v29.8h, v28.8h, v17.8h + sub v28.8h, v30.8h, v16.8h .else - sub v31.8H, v24.8H, v19.8H - add v24.8H, v24.8H, v19.8H - add v25.8H, v26.8H, v30.8H - sub v30.8H, v26.8H, v30.8H - add v26.8H, v28.8H, v17.8H - sub v29.8H, v28.8H, v17.8H - add v27.8H, v18.8H, v16.8H - sub v28.8H, v18.8H, v16.8H + sub v31.8h, v24.8h, v19.8h + add v24.8h, v24.8h, v19.8h + add v25.8h, v26.8h, v30.8h + sub v30.8h, v26.8h, v30.8h + add v26.8h, v28.8h, v17.8h + sub v29.8h, v28.8h, v17.8h + add v27.8h, v18.8h, v16.8h + sub v28.8h, v18.8h, v16.8h .endif .unreq va .unreq vb @@ -267,63 +267,63 @@ endfunc function ff_h264_idct8_add_neon, export=1 .L_ff_h264_idct8_add_neon: AARCH64_VALID_CALL_TARGET - movi v19.8H, #0 + movi v19.8h, #0 sxtw x2, w2 - ld1 {v24.8H, v25.8H}, [x1] - st1 {v19.8H}, [x1], #16 - st1 {v19.8H}, [x1], #16 - ld1 {v26.8H, v27.8H}, [x1] - st1 {v19.8H}, [x1], #16 - st1 {v19.8H}, [x1], #16 - ld1 {v28.8H, v29.8H}, [x1] - st1 {v19.8H}, [x1], #16 - st1 {v19.8H}, [x1], #16 + ld1 {v24.8h, v25.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + ld1 {v26.8h, v27.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + ld1 {v28.8h, v29.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 idct8x8_cols 0 transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 idct8x8_cols 1 mov x3, x0 - srshr v24.8H, v24.8H, #6 - ld1 {v0.8B}, [x0], x2 - srshr v25.8H, v25.8H, #6 - ld1 {v1.8B}, [x0], x2 - srshr v26.8H, v26.8H, #6 - ld1 {v2.8B}, [x0], x2 - srshr v27.8H, v27.8H, #6 - ld1 {v3.8B}, [x0], x2 - srshr v28.8H, v28.8H, #6 - ld1 {v4.8B}, [x0], x2 - srshr v29.8H, v29.8H, #6 - ld1 {v5.8B}, [x0], x2 - srshr v30.8H, v30.8H, #6 - ld1 {v6.8B}, [x0], x2 - srshr v31.8H, v31.8H, #6 - ld1 {v7.8B}, [x0], x2 - uaddw v24.8H, v24.8H, v0.8B - uaddw v25.8H, v25.8H, 
v1.8B - uaddw v26.8H, v26.8H, v2.8B - sqxtun v0.8B, v24.8H - uaddw v27.8H, v27.8H, v3.8B - sqxtun v1.8B, v25.8H - uaddw v28.8H, v28.8H, v4.8B - sqxtun v2.8B, v26.8H - st1 {v0.8B}, [x3], x2 - uaddw v29.8H, v29.8H, v5.8B - sqxtun v3.8B, v27.8H - st1 {v1.8B}, [x3], x2 - uaddw v30.8H, v30.8H, v6.8B - sqxtun v4.8B, v28.8H - st1 {v2.8B}, [x3], x2 - uaddw v31.8H, v31.8H, v7.8B - sqxtun v5.8B, v29.8H - st1 {v3.8B}, [x3], x2 - sqxtun v6.8B, v30.8H - sqxtun v7.8B, v31.8H - st1 {v4.8B}, [x3], x2 - st1 {v5.8B}, [x3], x2 - st1 {v6.8B}, [x3], x2 - st1 {v7.8B}, [x3], x2 + srshr v24.8h, v24.8h, #6 + ld1 {v0.8b}, [x0], x2 + srshr v25.8h, v25.8h, #6 + ld1 {v1.8b}, [x0], x2 + srshr v26.8h, v26.8h, #6 + ld1 {v2.8b}, [x0], x2 + srshr v27.8h, v27.8h, #6 + ld1 {v3.8b}, [x0], x2 + srshr v28.8h, v28.8h, #6 + ld1 {v4.8b}, [x0], x2 + srshr v29.8h, v29.8h, #6 + ld1 {v5.8b}, [x0], x2 + srshr v30.8h, v30.8h, #6 + ld1 {v6.8b}, [x0], x2 + srshr v31.8h, v31.8h, #6 + ld1 {v7.8b}, [x0], x2 + uaddw v24.8h, v24.8h, v0.8b + uaddw v25.8h, v25.8h, v1.8b + uaddw v26.8h, v26.8h, v2.8b + sqxtun v0.8b, v24.8h + uaddw v27.8h, v27.8h, v3.8b + sqxtun v1.8b, v25.8h + uaddw v28.8h, v28.8h, v4.8b + sqxtun v2.8b, v26.8h + st1 {v0.8b}, [x3], x2 + uaddw v29.8h, v29.8h, v5.8b + sqxtun v3.8b, v27.8h + st1 {v1.8b}, [x3], x2 + uaddw v30.8h, v30.8h, v6.8b + sqxtun v4.8b, v28.8h + st1 {v2.8b}, [x3], x2 + uaddw v31.8h, v31.8h, v7.8b + sqxtun v5.8b, v29.8h + st1 {v3.8b}, [x3], x2 + sqxtun v6.8b, v30.8h + sqxtun v7.8b, v31.8h + st1 {v4.8b}, [x3], x2 + st1 {v5.8b}, [x3], x2 + st1 {v6.8b}, [x3], x2 + st1 {v7.8b}, [x3], x2 sub x1, x1, #128 ret @@ -334,42 +334,42 @@ function ff_h264_idct8_dc_add_neon, export=1 AARCH64_VALID_CALL_TARGET mov w3, #0 sxtw x2, w2 - ld1r {v31.8H}, [x1] + ld1r {v31.8h}, [x1] strh w3, [x1] - ld1 {v0.8B}, [x0], x2 - srshr v31.8H, v31.8H, #6 - ld1 {v1.8B}, [x0], x2 - ld1 {v2.8B}, [x0], x2 - uaddw v24.8H, v31.8H, v0.8B - ld1 {v3.8B}, [x0], x2 - uaddw v25.8H, v31.8H, v1.8B - ld1 {v4.8B}, [x0], x2 - uaddw 
v26.8H, v31.8H, v2.8B - ld1 {v5.8B}, [x0], x2 - uaddw v27.8H, v31.8H, v3.8B - ld1 {v6.8B}, [x0], x2 - uaddw v28.8H, v31.8H, v4.8B - ld1 {v7.8B}, [x0], x2 - uaddw v29.8H, v31.8H, v5.8B - uaddw v30.8H, v31.8H, v6.8B - uaddw v31.8H, v31.8H, v7.8B - sqxtun v0.8B, v24.8H - sqxtun v1.8B, v25.8H - sqxtun v2.8B, v26.8H - sqxtun v3.8B, v27.8H + ld1 {v0.8b}, [x0], x2 + srshr v31.8h, v31.8h, #6 + ld1 {v1.8b}, [x0], x2 + ld1 {v2.8b}, [x0], x2 + uaddw v24.8h, v31.8h, v0.8b + ld1 {v3.8b}, [x0], x2 + uaddw v25.8h, v31.8h, v1.8b + ld1 {v4.8b}, [x0], x2 + uaddw v26.8h, v31.8h, v2.8b + ld1 {v5.8b}, [x0], x2 + uaddw v27.8h, v31.8h, v3.8b + ld1 {v6.8b}, [x0], x2 + uaddw v28.8h, v31.8h, v4.8b + ld1 {v7.8b}, [x0], x2 + uaddw v29.8h, v31.8h, v5.8b + uaddw v30.8h, v31.8h, v6.8b + uaddw v31.8h, v31.8h, v7.8b + sqxtun v0.8b, v24.8h + sqxtun v1.8b, v25.8h + sqxtun v2.8b, v26.8h + sqxtun v3.8b, v27.8h sub x0, x0, x2, lsl #3 - st1 {v0.8B}, [x0], x2 - sqxtun v4.8B, v28.8H - st1 {v1.8B}, [x0], x2 - sqxtun v5.8B, v29.8H - st1 {v2.8B}, [x0], x2 - sqxtun v6.8B, v30.8H - st1 {v3.8B}, [x0], x2 - sqxtun v7.8B, v31.8H - st1 {v4.8B}, [x0], x2 - st1 {v5.8B}, [x0], x2 - st1 {v6.8B}, [x0], x2 - st1 {v7.8B}, [x0], x2 + st1 {v0.8b}, [x0], x2 + sqxtun v4.8b, v28.8h + st1 {v1.8b}, [x0], x2 + sqxtun v5.8b, v29.8h + st1 {v2.8b}, [x0], x2 + sqxtun v6.8b, v30.8h + st1 {v3.8b}, [x0], x2 + sqxtun v7.8b, v31.8h + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 ret endfunc diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S index 451fd8af24..21906327cd 100644 --- a/libavcodec/aarch64/h264qpel_neon.S +++ b/libavcodec/aarch64/h264qpel_neon.S @@ -27,127 +27,127 @@ .macro lowpass_const r movz \r, #20, lsl #16 movk \r, #5 - mov v6.S[0], \r + mov v6.s[0], \r .endm //trashes v0-v5 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 - ext v2.8B, \r0\().8B, \r1\().8B, #2 - ext v3.8B, \r0\().8B, \r1\().8B, #3 - uaddl v2.8H, v2.8B, v3.8B - ext v4.8B, 
\r0\().8B, \r1\().8B, #1 - ext v5.8B, \r0\().8B, \r1\().8B, #4 - uaddl v4.8H, v4.8B, v5.8B - ext v1.8B, \r0\().8B, \r1\().8B, #5 - uaddl \d0\().8H, \r0\().8B, v1.8B - ext v0.8B, \r2\().8B, \r3\().8B, #2 - mla \d0\().8H, v2.8H, v6.H[1] - ext v1.8B, \r2\().8B, \r3\().8B, #3 - uaddl v0.8H, v0.8B, v1.8B - ext v1.8B, \r2\().8B, \r3\().8B, #1 - mls \d0\().8H, v4.8H, v6.H[0] - ext v3.8B, \r2\().8B, \r3\().8B, #4 - uaddl v1.8H, v1.8B, v3.8B - ext v2.8B, \r2\().8B, \r3\().8B, #5 - uaddl \d1\().8H, \r2\().8B, v2.8B - mla \d1\().8H, v0.8H, v6.H[1] - mls \d1\().8H, v1.8H, v6.H[0] + ext v2.8b, \r0\().8b, \r1\().8b, #2 + ext v3.8b, \r0\().8b, \r1\().8b, #3 + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, \r0\().8b, \r1\().8b, #1 + ext v5.8b, \r0\().8b, \r1\().8b, #4 + uaddl v4.8h, v4.8b, v5.8b + ext v1.8b, \r0\().8b, \r1\().8b, #5 + uaddl \d0\().8h, \r0\().8b, v1.8b + ext v0.8b, \r2\().8b, \r3\().8b, #2 + mla \d0\().8h, v2.8h, v6.h[1] + ext v1.8b, \r2\().8b, \r3\().8b, #3 + uaddl v0.8h, v0.8b, v1.8b + ext v1.8b, \r2\().8b, \r3\().8b, #1 + mls \d0\().8h, v4.8h, v6.h[0] + ext v3.8b, \r2\().8b, \r3\().8b, #4 + uaddl v1.8h, v1.8b, v3.8b + ext v2.8b, \r2\().8b, \r3\().8b, #5 + uaddl \d1\().8h, \r2\().8b, v2.8b + mla \d1\().8h, v0.8h, v6.h[1] + mls \d1\().8h, v1.8h, v6.h[0] .if \narrow - sqrshrun \d0\().8B, \d0\().8H, #5 - sqrshrun \d1\().8B, \d1\().8H, #5 + sqrshrun \d0\().8b, \d0\().8h, #5 + sqrshrun \d1\().8b, \d1\().8h, #5 .endif .endm //trashes v0-v4 .macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1 - uaddl v2.8H, \r2\().8B, \r3\().8B - uaddl v0.8H, \r3\().8B, \r4\().8B - uaddl v4.8H, \r1\().8B, \r4\().8B - uaddl v1.8H, \r2\().8B, \r5\().8B - uaddl \d0\().8H, \r0\().8B, \r5\().8B - uaddl \d1\().8H, \r1\().8B, \r6\().8B - mla \d0\().8H, v2.8H, v6.H[1] - mls \d0\().8H, v4.8H, v6.H[0] - mla \d1\().8H, v0.8H, v6.H[1] - mls \d1\().8H, v1.8H, v6.H[0] + uaddl v2.8h, \r2\().8b, \r3\().8b + uaddl v0.8h, \r3\().8b, \r4\().8b + uaddl v4.8h, \r1\().8b, \r4\().8b + uaddl v1.8h, 
\r2\().8b, \r5\().8b + uaddl \d0\().8h, \r0\().8b, \r5\().8b + uaddl \d1\().8h, \r1\().8b, \r6\().8b + mla \d0\().8h, v2.8h, v6.h[1] + mls \d0\().8h, v4.8h, v6.h[0] + mla \d1\().8h, v0.8h, v6.h[1] + mls \d1\().8h, v1.8h, v6.h[0] .if \narrow - sqrshrun \d0\().8B, \d0\().8H, #5 - sqrshrun \d1\().8B, \d1\().8H, #5 + sqrshrun \d0\().8b, \d0\().8h, #5 + sqrshrun \d1\().8b, \d1\().8h, #5 .endif .endm //trashes v0-v5, v7, v30-v31 .macro lowpass_8H r0, r1 - ext v0.16B, \r0\().16B, \r0\().16B, #2 - ext v1.16B, \r0\().16B, \r0\().16B, #3 - uaddl v0.8H, v0.8B, v1.8B - ext v2.16B, \r0\().16B, \r0\().16B, #1 - ext v3.16B, \r0\().16B, \r0\().16B, #4 - uaddl v2.8H, v2.8B, v3.8B - ext v30.16B, \r0\().16B, \r0\().16B, #5 - uaddl \r0\().8H, \r0\().8B, v30.8B - ext v4.16B, \r1\().16B, \r1\().16B, #2 - mla \r0\().8H, v0.8H, v6.H[1] - ext v5.16B, \r1\().16B, \r1\().16B, #3 - uaddl v4.8H, v4.8B, v5.8B - ext v7.16B, \r1\().16B, \r1\().16B, #1 - mls \r0\().8H, v2.8H, v6.H[0] - ext v0.16B, \r1\().16B, \r1\().16B, #4 - uaddl v7.8H, v7.8B, v0.8B - ext v31.16B, \r1\().16B, \r1\().16B, #5 - uaddl \r1\().8H, \r1\().8B, v31.8B - mla \r1\().8H, v4.8H, v6.H[1] - mls \r1\().8H, v7.8H, v6.H[0] + ext v0.16b, \r0\().16b, \r0\().16b, #2 + ext v1.16b, \r0\().16b, \r0\().16b, #3 + uaddl v0.8h, v0.8b, v1.8b + ext v2.16b, \r0\().16b, \r0\().16b, #1 + ext v3.16b, \r0\().16b, \r0\().16b, #4 + uaddl v2.8h, v2.8b, v3.8b + ext v30.16b, \r0\().16b, \r0\().16b, #5 + uaddl \r0\().8h, \r0\().8b, v30.8b + ext v4.16b, \r1\().16b, \r1\().16b, #2 + mla \r0\().8h, v0.8h, v6.h[1] + ext v5.16b, \r1\().16b, \r1\().16b, #3 + uaddl v4.8h, v4.8b, v5.8b + ext v7.16b, \r1\().16b, \r1\().16b, #1 + mls \r0\().8h, v2.8h, v6.h[0] + ext v0.16b, \r1\().16b, \r1\().16b, #4 + uaddl v7.8h, v7.8b, v0.8b + ext v31.16b, \r1\().16b, \r1\().16b, #5 + uaddl \r1\().8h, \r1\().8b, v31.8b + mla \r1\().8h, v4.8h, v6.h[1] + mls \r1\().8h, v7.8h, v6.h[0] .endm // trashes v2-v5, v30 .macro lowpass_8_1 r0, r1, d0, narrow=1 - ext v2.8B, \r0\().8B, 
\r1\().8B, #2 - ext v3.8B, \r0\().8B, \r1\().8B, #3 - uaddl v2.8H, v2.8B, v3.8B - ext v4.8B, \r0\().8B, \r1\().8B, #1 - ext v5.8B, \r0\().8B, \r1\().8B, #4 - uaddl v4.8H, v4.8B, v5.8B - ext v30.8B, \r0\().8B, \r1\().8B, #5 - uaddl \d0\().8H, \r0\().8B, v30.8B - mla \d0\().8H, v2.8H, v6.H[1] - mls \d0\().8H, v4.8H, v6.H[0] + ext v2.8b, \r0\().8b, \r1\().8b, #2 + ext v3.8b, \r0\().8b, \r1\().8b, #3 + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, \r0\().8b, \r1\().8b, #1 + ext v5.8b, \r0\().8b, \r1\().8b, #4 + uaddl v4.8h, v4.8b, v5.8b + ext v30.8b, \r0\().8b, \r1\().8b, #5 + uaddl \d0\().8h, \r0\().8b, v30.8b + mla \d0\().8h, v2.8h, v6.h[1] + mls \d0\().8h, v4.8h, v6.h[0] .if \narrow - sqrshrun \d0\().8B, \d0\().8H, #5 + sqrshrun \d0\().8b, \d0\().8h, #5 .endif .endm // trashed v0-v7 .macro lowpass_8.16 r0, r1, r2, r3, r4, r5 - saddl v5.4S, \r2\().4H, \r3\().4H - saddl2 v1.4S, \r2\().8H, \r3\().8H - saddl v6.4S, \r1\().4H, \r4\().4H - saddl2 v2.4S, \r1\().8H, \r4\().8H - saddl v0.4S, \r0\().4H, \r5\().4H - saddl2 v4.4S, \r0\().8H, \r5\().8H - - shl v3.4S, v5.4S, #4 - shl v5.4S, v5.4S, #2 - shl v7.4S, v6.4S, #2 - add v5.4S, v5.4S, v3.4S - add v6.4S, v6.4S, v7.4S - - shl v3.4S, v1.4S, #4 - shl v1.4S, v1.4S, #2 - shl v7.4S, v2.4S, #2 - add v1.4S, v1.4S, v3.4S - add v2.4S, v2.4S, v7.4S - - add v5.4S, v5.4S, v0.4S - sub v5.4S, v5.4S, v6.4S - - add v1.4S, v1.4S, v4.4S - sub v1.4S, v1.4S, v2.4S - - rshrn v5.4H, v5.4S, #10 - rshrn2 v5.8H, v1.4S, #10 - - sqxtun \r0\().8B, v5.8H + saddl v5.4s, \r2\().4h, \r3\().4h + saddl2 v1.4s, \r2\().8h, \r3\().8h + saddl v6.4s, \r1\().4h, \r4\().4h + saddl2 v2.4s, \r1\().8h, \r4\().8h + saddl v0.4s, \r0\().4h, \r5\().4h + saddl2 v4.4s, \r0\().8h, \r5\().8h + + shl v3.4s, v5.4s, #4 + shl v5.4s, v5.4s, #2 + shl v7.4s, v6.4s, #2 + add v5.4s, v5.4s, v3.4s + add v6.4s, v6.4s, v7.4s + + shl v3.4s, v1.4s, #4 + shl v1.4s, v1.4s, #2 + shl v7.4s, v2.4s, #2 + add v1.4s, v1.4s, v3.4s + add v2.4s, v2.4s, v7.4s + + add v5.4s, v5.4s, v0.4s + sub v5.4s, v5.4s, 
v6.4s + + add v1.4s, v1.4s, v4.4s + sub v1.4s, v1.4s, v2.4s + + rshrn v5.4h, v5.4s, #10 + rshrn2 v5.8h, v1.4s, #10 + + sqxtun \r0\().8b, v5.8h .endm function put_h264_qpel16_h_lowpass_neon_packed @@ -176,19 +176,19 @@ function \type\()_h264_qpel16_h_lowpass_neon endfunc function \type\()_h264_qpel8_h_lowpass_neon -1: ld1 {v28.8B, v29.8B}, [x1], x2 - ld1 {v16.8B, v17.8B}, [x1], x2 +1: ld1 {v28.8b, v29.8b}, [x1], x2 + ld1 {v16.8b, v17.8b}, [x1], x2 subs x12, x12, #2 lowpass_8 v28, v29, v16, v17, v28, v16 .ifc \type,avg - ld1 {v2.8B}, [x0], x3 - ld1 {v3.8B}, [x0] - urhadd v28.8B, v28.8B, v2.8B - urhadd v16.8B, v16.8B, v3.8B + ld1 {v2.8b}, [x0], x3 + ld1 {v3.8b}, [x0] + urhadd v28.8b, v28.8b, v2.8b + urhadd v16.8b, v16.8b, v3.8b sub x0, x0, x3 .endif - st1 {v28.8B}, [x0], x3 - st1 {v16.8B}, [x0], x3 + st1 {v28.8b}, [x0], x3 + st1 {v16.8b}, [x0], x3 b.ne 1b ret endfunc @@ -213,23 +213,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon endfunc function \type\()_h264_qpel8_h_lowpass_l2_neon -1: ld1 {v26.8B, v27.8B}, [x1], x2 - ld1 {v16.8B, v17.8B}, [x1], x2 - ld1 {v28.8B}, [x3], x2 - ld1 {v29.8B}, [x3], x2 +1: ld1 {v26.8b, v27.8b}, [x1], x2 + ld1 {v16.8b, v17.8b}, [x1], x2 + ld1 {v28.8b}, [x3], x2 + ld1 {v29.8b}, [x3], x2 subs x12, x12, #2 lowpass_8 v26, v27, v16, v17, v26, v27 - urhadd v26.8B, v26.8B, v28.8B - urhadd v27.8B, v27.8B, v29.8B + urhadd v26.8b, v26.8b, v28.8b + urhadd v27.8b, v27.8b, v29.8b .ifc \type,avg - ld1 {v2.8B}, [x0], x2 - ld1 {v3.8B}, [x0] - urhadd v26.8B, v26.8B, v2.8B - urhadd v27.8B, v27.8B, v3.8B + ld1 {v2.8b}, [x0], x2 + ld1 {v3.8b}, [x0] + urhadd v26.8b, v26.8b, v2.8b + urhadd v27.8b, v27.8b, v3.8b sub x0, x0, x2 .endif - st1 {v26.8B}, [x0], x2 - st1 {v27.8B}, [x0], x2 + st1 {v26.8b}, [x0], x2 + st1 {v27.8b}, [x0], x2 b.ne 1b ret endfunc @@ -270,52 +270,52 @@ function \type\()_h264_qpel16_v_lowpass_neon endfunc function \type\()_h264_qpel8_v_lowpass_neon - ld1 {v16.8B}, [x1], x3 - ld1 {v17.8B}, [x1], x3 - ld1 {v18.8B}, [x1], x3 - ld1 
{v19.8B}, [x1], x3 - ld1 {v20.8B}, [x1], x3 - ld1 {v21.8B}, [x1], x3 - ld1 {v22.8B}, [x1], x3 - ld1 {v23.8B}, [x1], x3 - ld1 {v24.8B}, [x1], x3 - ld1 {v25.8B}, [x1], x3 - ld1 {v26.8B}, [x1], x3 - ld1 {v27.8B}, [x1], x3 - ld1 {v28.8B}, [x1] + ld1 {v16.8b}, [x1], x3 + ld1 {v17.8b}, [x1], x3 + ld1 {v18.8b}, [x1], x3 + ld1 {v19.8b}, [x1], x3 + ld1 {v20.8b}, [x1], x3 + ld1 {v21.8b}, [x1], x3 + ld1 {v22.8b}, [x1], x3 + ld1 {v23.8b}, [x1], x3 + ld1 {v24.8b}, [x1], x3 + ld1 {v25.8b}, [x1], x3 + ld1 {v26.8b}, [x1], x3 + ld1 {v27.8b}, [x1], x3 + ld1 {v28.8b}, [x1] lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17 lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 .ifc \type,avg - ld1 {v24.8B}, [x0], x2 - ld1 {v25.8B}, [x0], x2 - ld1 {v26.8B}, [x0], x2 - urhadd v16.8B, v16.8B, v24.8B - ld1 {v27.8B}, [x0], x2 - urhadd v17.8B, v17.8B, v25.8B - ld1 {v28.8B}, [x0], x2 - urhadd v18.8B, v18.8B, v26.8B - ld1 {v29.8B}, [x0], x2 - urhadd v19.8B, v19.8B, v27.8B - ld1 {v30.8B}, [x0], x2 - urhadd v20.8B, v20.8B, v28.8B - ld1 {v31.8B}, [x0], x2 - urhadd v21.8B, v21.8B, v29.8B - urhadd v22.8B, v22.8B, v30.8B - urhadd v23.8B, v23.8B, v31.8B + ld1 {v24.8b}, [x0], x2 + ld1 {v25.8b}, [x0], x2 + ld1 {v26.8b}, [x0], x2 + urhadd v16.8b, v16.8b, v24.8b + ld1 {v27.8b}, [x0], x2 + urhadd v17.8b, v17.8b, v25.8b + ld1 {v28.8b}, [x0], x2 + urhadd v18.8b, v18.8b, v26.8b + ld1 {v29.8b}, [x0], x2 + urhadd v19.8b, v19.8b, v27.8b + ld1 {v30.8b}, [x0], x2 + urhadd v20.8b, v20.8b, v28.8b + ld1 {v31.8b}, [x0], x2 + urhadd v21.8b, v21.8b, v29.8b + urhadd v22.8b, v22.8b, v30.8b + urhadd v23.8b, v23.8b, v31.8b sub x0, x0, x2, lsl #3 .endif - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 - st1 {v18.8B}, [x0], x2 - st1 {v19.8B}, [x0], x2 - st1 {v20.8B}, [x0], x2 - st1 {v21.8B}, [x0], x2 - st1 {v22.8B}, [x0], x2 - st1 {v23.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 + st1 
{v18.8b}, [x0], x2 + st1 {v19.8b}, [x0], x2 + st1 {v20.8b}, [x0], x2 + st1 {v21.8b}, [x0], x2 + st1 {v22.8b}, [x0], x2 + st1 {v23.8b}, [x0], x2 ret endfunc @@ -343,70 +343,70 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon endfunc function \type\()_h264_qpel8_v_lowpass_l2_neon - ld1 {v16.8B}, [x1], x3 - ld1 {v17.8B}, [x1], x3 - ld1 {v18.8B}, [x1], x3 - ld1 {v19.8B}, [x1], x3 - ld1 {v20.8B}, [x1], x3 - ld1 {v21.8B}, [x1], x3 - ld1 {v22.8B}, [x1], x3 - ld1 {v23.8B}, [x1], x3 - ld1 {v24.8B}, [x1], x3 - ld1 {v25.8B}, [x1], x3 - ld1 {v26.8B}, [x1], x3 - ld1 {v27.8B}, [x1], x3 - ld1 {v28.8B}, [x1] + ld1 {v16.8b}, [x1], x3 + ld1 {v17.8b}, [x1], x3 + ld1 {v18.8b}, [x1], x3 + ld1 {v19.8b}, [x1], x3 + ld1 {v20.8b}, [x1], x3 + ld1 {v21.8b}, [x1], x3 + ld1 {v22.8b}, [x1], x3 + ld1 {v23.8b}, [x1], x3 + ld1 {v24.8b}, [x1], x3 + ld1 {v25.8b}, [x1], x3 + ld1 {v26.8b}, [x1], x3 + ld1 {v27.8b}, [x1], x3 + ld1 {v28.8b}, [x1] lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17 lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 - ld1 {v24.8B}, [x12], x2 - ld1 {v25.8B}, [x12], x2 - ld1 {v26.8B}, [x12], x2 - ld1 {v27.8B}, [x12], x2 - ld1 {v28.8B}, [x12], x2 - urhadd v16.8B, v24.8B, v16.8B - urhadd v17.8B, v25.8B, v17.8B - ld1 {v29.8B}, [x12], x2 - urhadd v18.8B, v26.8B, v18.8B - urhadd v19.8B, v27.8B, v19.8B - ld1 {v30.8B}, [x12], x2 - urhadd v20.8B, v28.8B, v20.8B - urhadd v21.8B, v29.8B, v21.8B - ld1 {v31.8B}, [x12], x2 - urhadd v22.8B, v30.8B, v22.8B - urhadd v23.8B, v31.8B, v23.8B + ld1 {v24.8b}, [x12], x2 + ld1 {v25.8b}, [x12], x2 + ld1 {v26.8b}, [x12], x2 + ld1 {v27.8b}, [x12], x2 + ld1 {v28.8b}, [x12], x2 + urhadd v16.8b, v24.8b, v16.8b + urhadd v17.8b, v25.8b, v17.8b + ld1 {v29.8b}, [x12], x2 + urhadd v18.8b, v26.8b, v18.8b + urhadd v19.8b, v27.8b, v19.8b + ld1 {v30.8b}, [x12], x2 + urhadd v20.8b, v28.8b, v20.8b + urhadd v21.8b, v29.8b, v21.8b + ld1 
{v31.8b}, [x12], x2 + urhadd v22.8b, v30.8b, v22.8b + urhadd v23.8b, v31.8b, v23.8b .ifc \type,avg - ld1 {v24.8B}, [x0], x3 - ld1 {v25.8B}, [x0], x3 - ld1 {v26.8B}, [x0], x3 - urhadd v16.8B, v16.8B, v24.8B - ld1 {v27.8B}, [x0], x3 - urhadd v17.8B, v17.8B, v25.8B - ld1 {v28.8B}, [x0], x3 - urhadd v18.8B, v18.8B, v26.8B - ld1 {v29.8B}, [x0], x3 - urhadd v19.8B, v19.8B, v27.8B - ld1 {v30.8B}, [x0], x3 - urhadd v20.8B, v20.8B, v28.8B - ld1 {v31.8B}, [x0], x3 - urhadd v21.8B, v21.8B, v29.8B - urhadd v22.8B, v22.8B, v30.8B - urhadd v23.8B, v23.8B, v31.8B + ld1 {v24.8b}, [x0], x3 + ld1 {v25.8b}, [x0], x3 + ld1 {v26.8b}, [x0], x3 + urhadd v16.8b, v16.8b, v24.8b + ld1 {v27.8b}, [x0], x3 + urhadd v17.8b, v17.8b, v25.8b + ld1 {v28.8b}, [x0], x3 + urhadd v18.8b, v18.8b, v26.8b + ld1 {v29.8b}, [x0], x3 + urhadd v19.8b, v19.8b, v27.8b + ld1 {v30.8b}, [x0], x3 + urhadd v20.8b, v20.8b, v28.8b + ld1 {v31.8b}, [x0], x3 + urhadd v21.8b, v21.8b, v29.8b + urhadd v22.8b, v22.8b, v30.8b + urhadd v23.8b, v23.8b, v31.8b sub x0, x0, x3, lsl #3 .endif - st1 {v16.8B}, [x0], x3 - st1 {v17.8B}, [x0], x3 - st1 {v18.8B}, [x0], x3 - st1 {v19.8B}, [x0], x3 - st1 {v20.8B}, [x0], x3 - st1 {v21.8B}, [x0], x3 - st1 {v22.8B}, [x0], x3 - st1 {v23.8B}, [x0], x3 + st1 {v16.8b}, [x0], x3 + st1 {v17.8b}, [x0], x3 + st1 {v18.8b}, [x0], x3 + st1 {v19.8b}, [x0], x3 + st1 {v20.8b}, [x0], x3 + st1 {v21.8b}, [x0], x3 + st1 {v22.8b}, [x0], x3 + st1 {v23.8b}, [x0], x3 ret endfunc @@ -417,19 +417,19 @@ endfunc function put_h264_qpel8_hv_lowpass_neon_top lowpass_const w12 - ld1 {v16.8H}, [x1], x3 - ld1 {v17.8H}, [x1], x3 - ld1 {v18.8H}, [x1], x3 - ld1 {v19.8H}, [x1], x3 - ld1 {v20.8H}, [x1], x3 - ld1 {v21.8H}, [x1], x3 - ld1 {v22.8H}, [x1], x3 - ld1 {v23.8H}, [x1], x3 - ld1 {v24.8H}, [x1], x3 - ld1 {v25.8H}, [x1], x3 - ld1 {v26.8H}, [x1], x3 - ld1 {v27.8H}, [x1], x3 - ld1 {v28.8H}, [x1] + ld1 {v16.8h}, [x1], x3 + ld1 {v17.8h}, [x1], x3 + ld1 {v18.8h}, [x1], x3 + ld1 {v19.8h}, [x1], x3 + ld1 {v20.8h}, [x1], x3 + ld1 
{v21.8h}, [x1], x3 + ld1 {v22.8h}, [x1], x3 + ld1 {v23.8h}, [x1], x3 + ld1 {v24.8h}, [x1], x3 + ld1 {v25.8h}, [x1], x3 + ld1 {v26.8h}, [x1], x3 + ld1 {v27.8h}, [x1], x3 + ld1 {v28.8h}, [x1] lowpass_8H v16, v17 lowpass_8H v18, v19 lowpass_8H v20, v21 @@ -458,33 +458,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon mov x10, x30 bl put_h264_qpel8_hv_lowpass_neon_top .ifc \type,avg - ld1 {v0.8B}, [x0], x2 - ld1 {v1.8B}, [x0], x2 - ld1 {v2.8B}, [x0], x2 - urhadd v16.8B, v16.8B, v0.8B - ld1 {v3.8B}, [x0], x2 - urhadd v17.8B, v17.8B, v1.8B - ld1 {v4.8B}, [x0], x2 - urhadd v18.8B, v18.8B, v2.8B - ld1 {v5.8B}, [x0], x2 - urhadd v19.8B, v19.8B, v3.8B - ld1 {v6.8B}, [x0], x2 - urhadd v20.8B, v20.8B, v4.8B - ld1 {v7.8B}, [x0], x2 - urhadd v21.8B, v21.8B, v5.8B - urhadd v22.8B, v22.8B, v6.8B - urhadd v23.8B, v23.8B, v7.8B + ld1 {v0.8b}, [x0], x2 + ld1 {v1.8b}, [x0], x2 + ld1 {v2.8b}, [x0], x2 + urhadd v16.8b, v16.8b, v0.8b + ld1 {v3.8b}, [x0], x2 + urhadd v17.8b, v17.8b, v1.8b + ld1 {v4.8b}, [x0], x2 + urhadd v18.8b, v18.8b, v2.8b + ld1 {v5.8b}, [x0], x2 + urhadd v19.8b, v19.8b, v3.8b + ld1 {v6.8b}, [x0], x2 + urhadd v20.8b, v20.8b, v4.8b + ld1 {v7.8b}, [x0], x2 + urhadd v21.8b, v21.8b, v5.8b + urhadd v22.8b, v22.8b, v6.8b + urhadd v23.8b, v23.8b, v7.8b sub x0, x0, x2, lsl #3 .endif - st1 {v16.8B}, [x0], x2 - st1 {v17.8B}, [x0], x2 - st1 {v18.8B}, [x0], x2 - st1 {v19.8B}, [x0], x2 - st1 {v20.8B}, [x0], x2 - st1 {v21.8B}, [x0], x2 - st1 {v22.8B}, [x0], x2 - st1 {v23.8B}, [x0], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x0], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x0], x2 + st1 {v20.8b}, [x0], x2 + st1 {v21.8b}, [x0], x2 + st1 {v22.8b}, [x0], x2 + st1 {v23.8b}, [x0], x2 ret x10 endfunc @@ -498,45 +498,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon mov x10, x30 bl put_h264_qpel8_hv_lowpass_neon_top - ld1 {v0.8B, v1.8B}, [x2], #16 - ld1 {v2.8B, v3.8B}, [x2], #16 - urhadd v0.8B, v0.8B, v16.8B - urhadd v1.8B, v1.8B, v17.8B - ld1 {v4.8B, v5.8B}, [x2], #16 - urhadd 
v2.8B, v2.8B, v18.8B - urhadd v3.8B, v3.8B, v19.8B - ld1 {v6.8B, v7.8B}, [x2], #16 - urhadd v4.8B, v4.8B, v20.8B - urhadd v5.8B, v5.8B, v21.8B - urhadd v6.8B, v6.8B, v22.8B - urhadd v7.8B, v7.8B, v23.8B + ld1 {v0.8b, v1.8b}, [x2], #16 + ld1 {v2.8b, v3.8b}, [x2], #16 + urhadd v0.8b, v0.8b, v16.8b + urhadd v1.8b, v1.8b, v17.8b + ld1 {v4.8b, v5.8b}, [x2], #16 + urhadd v2.8b, v2.8b, v18.8b + urhadd v3.8b, v3.8b, v19.8b + ld1 {v6.8b, v7.8b}, [x2], #16 + urhadd v4.8b, v4.8b, v20.8b + urhadd v5.8b, v5.8b, v21.8b + urhadd v6.8b, v6.8b, v22.8b + urhadd v7.8b, v7.8b, v23.8b .ifc \type,avg - ld1 {v16.8B}, [x0], x3 - ld1 {v17.8B}, [x0], x3 - ld1 {v18.8B}, [x0], x3 - urhadd v0.8B, v0.8B, v16.8B - ld1 {v19.8B}, [x0], x3 - urhadd v1.8B, v1.8B, v17.8B - ld1 {v20.8B}, [x0], x3 - urhadd v2.8B, v2.8B, v18.8B - ld1 {v21.8B}, [x0], x3 - urhadd v3.8B, v3.8B, v19.8B - ld1 {v22.8B}, [x0], x3 - urhadd v4.8B, v4.8B, v20.8B - ld1 {v23.8B}, [x0], x3 - urhadd v5.8B, v5.8B, v21.8B - urhadd v6.8B, v6.8B, v22.8B - urhadd v7.8B, v7.8B, v23.8B + ld1 {v16.8b}, [x0], x3 + ld1 {v17.8b}, [x0], x3 + ld1 {v18.8b}, [x0], x3 + urhadd v0.8b, v0.8b, v16.8b + ld1 {v19.8b}, [x0], x3 + urhadd v1.8b, v1.8b, v17.8b + ld1 {v20.8b}, [x0], x3 + urhadd v2.8b, v2.8b, v18.8b + ld1 {v21.8b}, [x0], x3 + urhadd v3.8b, v3.8b, v19.8b + ld1 {v22.8b}, [x0], x3 + urhadd v4.8b, v4.8b, v20.8b + ld1 {v23.8b}, [x0], x3 + urhadd v5.8b, v5.8b, v21.8b + urhadd v6.8b, v6.8b, v22.8b + urhadd v7.8b, v7.8b, v23.8b sub x0, x0, x3, lsl #3 .endif - st1 {v0.8B}, [x0], x3 - st1 {v1.8B}, [x0], x3 - st1 {v2.8B}, [x0], x3 - st1 {v3.8B}, [x0], x3 - st1 {v4.8B}, [x0], x3 - st1 {v5.8B}, [x0], x3 - st1 {v6.8B}, [x0], x3 - st1 {v7.8B}, [x0], x3 + st1 {v0.8b}, [x0], x3 + st1 {v1.8b}, [x0], x3 + st1 {v2.8b}, [x0], x3 + st1 {v3.8b}, [x0], x3 + st1 {v4.8b}, [x0], x3 + st1 {v5.8b}, [x0], x3 + st1 {v6.8b}, [x0], x3 + st1 {v7.8b}, [x0], x3 ret x10 endfunc diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S index 
a491c173bb..e7c1549c40 100644 --- a/libavcodec/aarch64/hpeldsp_neon.S +++ b/libavcodec/aarch64/hpeldsp_neon.S @@ -26,295 +26,295 @@ .if \avg mov x12, x0 .endif -1: ld1 {v0.16B}, [x1], x2 - ld1 {v1.16B}, [x1], x2 - ld1 {v2.16B}, [x1], x2 - ld1 {v3.16B}, [x1], x2 +1: ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x1], x2 + ld1 {v2.16b}, [x1], x2 + ld1 {v3.16b}, [x1], x2 .if \avg - ld1 {v4.16B}, [x12], x2 - urhadd v0.16B, v0.16B, v4.16B - ld1 {v5.16B}, [x12], x2 - urhadd v1.16B, v1.16B, v5.16B - ld1 {v6.16B}, [x12], x2 - urhadd v2.16B, v2.16B, v6.16B - ld1 {v7.16B}, [x12], x2 - urhadd v3.16B, v3.16B, v7.16B + ld1 {v4.16b}, [x12], x2 + urhadd v0.16b, v0.16b, v4.16b + ld1 {v5.16b}, [x12], x2 + urhadd v1.16b, v1.16b, v5.16b + ld1 {v6.16b}, [x12], x2 + urhadd v2.16b, v2.16b, v6.16b + ld1 {v7.16b}, [x12], x2 + urhadd v3.16b, v3.16b, v7.16b .endif subs w3, w3, #4 - st1 {v0.16B}, [x0], x2 - st1 {v1.16B}, [x0], x2 - st1 {v2.16B}, [x0], x2 - st1 {v3.16B}, [x0], x2 + st1 {v0.16b}, [x0], x2 + st1 {v1.16b}, [x0], x2 + st1 {v2.16b}, [x0], x2 + st1 {v3.16b}, [x0], x2 b.ne 1b ret .endm .macro pixels16_x2 rnd=1, avg=0 -1: ld1 {v0.16B, v1.16B}, [x1], x2 - ld1 {v2.16B, v3.16B}, [x1], x2 +1: ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v2.16b, v3.16b}, [x1], x2 subs w3, w3, #2 - ext v1.16B, v0.16B, v1.16B, #1 - avg v0.16B, v0.16B, v1.16B - ext v3.16B, v2.16B, v3.16B, #1 - avg v2.16B, v2.16B, v3.16B + ext v1.16b, v0.16b, v1.16b, #1 + avg v0.16b, v0.16b, v1.16b + ext v3.16b, v2.16b, v3.16b, #1 + avg v2.16b, v2.16b, v3.16b .if \avg - ld1 {v1.16B}, [x0], x2 - ld1 {v3.16B}, [x0] - urhadd v0.16B, v0.16B, v1.16B - urhadd v2.16B, v2.16B, v3.16B + ld1 {v1.16b}, [x0], x2 + ld1 {v3.16b}, [x0] + urhadd v0.16b, v0.16b, v1.16b + urhadd v2.16b, v2.16b, v3.16b sub x0, x0, x2 .endif - st1 {v0.16B}, [x0], x2 - st1 {v2.16B}, [x0], x2 + st1 {v0.16b}, [x0], x2 + st1 {v2.16b}, [x0], x2 b.ne 1b ret .endm .macro pixels16_y2 rnd=1, avg=0 sub w3, w3, #2 - ld1 {v0.16B}, [x1], x2 - ld1 {v1.16B}, [x1], x2 + ld1 {v0.16b}, [x1], 
x2 + ld1 {v1.16b}, [x1], x2 1: subs w3, w3, #2 - avg v2.16B, v0.16B, v1.16B - ld1 {v0.16B}, [x1], x2 - avg v3.16B, v0.16B, v1.16B - ld1 {v1.16B}, [x1], x2 + avg v2.16b, v0.16b, v1.16b + ld1 {v0.16b}, [x1], x2 + avg v3.16b, v0.16b, v1.16b + ld1 {v1.16b}, [x1], x2 .if \avg - ld1 {v4.16B}, [x0], x2 - ld1 {v5.16B}, [x0] - urhadd v2.16B, v2.16B, v4.16B - urhadd v3.16B, v3.16B, v5.16B + ld1 {v4.16b}, [x0], x2 + ld1 {v5.16b}, [x0] + urhadd v2.16b, v2.16b, v4.16b + urhadd v3.16b, v3.16b, v5.16b sub x0, x0, x2 .endif - st1 {v2.16B}, [x0], x2 - st1 {v3.16B}, [x0], x2 + st1 {v2.16b}, [x0], x2 + st1 {v3.16b}, [x0], x2 b.ne 1b - avg v2.16B, v0.16B, v1.16B - ld1 {v0.16B}, [x1], x2 - avg v3.16B, v0.16B, v1.16B + avg v2.16b, v0.16b, v1.16b + ld1 {v0.16b}, [x1], x2 + avg v3.16b, v0.16b, v1.16b .if \avg - ld1 {v4.16B}, [x0], x2 - ld1 {v5.16B}, [x0] - urhadd v2.16B, v2.16B, v4.16B - urhadd v3.16B, v3.16B, v5.16B + ld1 {v4.16b}, [x0], x2 + ld1 {v5.16b}, [x0] + urhadd v2.16b, v2.16b, v4.16b + urhadd v3.16b, v3.16b, v5.16b sub x0, x0, x2 .endif - st1 {v2.16B}, [x0], x2 - st1 {v3.16B}, [x0], x2 + st1 {v2.16b}, [x0], x2 + st1 {v3.16b}, [x0], x2 ret .endm .macro pixels16_xy2 rnd=1, avg=0 sub w3, w3, #2 - ld1 {v0.16B, v1.16B}, [x1], x2 - ld1 {v4.16B, v5.16B}, [x1], x2 + ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v4.16b, v5.16b}, [x1], x2 NRND movi v26.8H, #1 - ext v1.16B, v0.16B, v1.16B, #1 - ext v5.16B, v4.16B, v5.16B, #1 - uaddl v16.8H, v0.8B, v1.8B - uaddl2 v20.8H, v0.16B, v1.16B - uaddl v18.8H, v4.8B, v5.8B - uaddl2 v22.8H, v4.16B, v5.16B + ext v1.16b, v0.16b, v1.16b, #1 + ext v5.16b, v4.16b, v5.16b, #1 + uaddl v16.8h, v0.8b, v1.8b + uaddl2 v20.8h, v0.16b, v1.16b + uaddl v18.8h, v4.8b, v5.8b + uaddl2 v22.8h, v4.16b, v5.16b 1: subs w3, w3, #2 - ld1 {v0.16B, v1.16B}, [x1], x2 - add v24.8H, v16.8H, v18.8H + ld1 {v0.16b, v1.16b}, [x1], x2 + add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v30.16B, v0.16B, v1.16B, #1 - add v1.8H, v20.8H, v22.8H - mshrn v28.8B, v24.8H, #2 + ext 
v30.16b, v0.16b, v1.16b, #1 + add v1.8h, v20.8h, v22.8h + mshrn v28.8b, v24.8h, #2 NRND add v1.8H, v1.8H, v26.8H - mshrn2 v28.16B, v1.8H, #2 + mshrn2 v28.16b, v1.8h, #2 .if \avg - ld1 {v16.16B}, [x0] - urhadd v28.16B, v28.16B, v16.16B + ld1 {v16.16b}, [x0] + urhadd v28.16b, v28.16b, v16.16b .endif - uaddl v16.8H, v0.8B, v30.8B - ld1 {v2.16B, v3.16B}, [x1], x2 - uaddl2 v20.8H, v0.16B, v30.16B - st1 {v28.16B}, [x0], x2 - add v24.8H, v16.8H, v18.8H + uaddl v16.8h, v0.8b, v30.8b + ld1 {v2.16b, v3.16b}, [x1], x2 + uaddl2 v20.8h, v0.16b, v30.16b + st1 {v28.16b}, [x0], x2 + add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v3.16B, v2.16B, v3.16B, #1 - add v0.8H, v20.8H, v22.8H - mshrn v30.8B, v24.8H, #2 + ext v3.16b, v2.16b, v3.16b, #1 + add v0.8h, v20.8h, v22.8h + mshrn v30.8b, v24.8h, #2 NRND add v0.8H, v0.8H, v26.8H - mshrn2 v30.16B, v0.8H, #2 + mshrn2 v30.16b, v0.8h, #2 .if \avg - ld1 {v18.16B}, [x0] - urhadd v30.16B, v30.16B, v18.16B + ld1 {v18.16b}, [x0] + urhadd v30.16b, v30.16b, v18.16b .endif - uaddl v18.8H, v2.8B, v3.8B - uaddl2 v22.8H, v2.16B, v3.16B - st1 {v30.16B}, [x0], x2 + uaddl v18.8h, v2.8b, v3.8b + uaddl2 v22.8h, v2.16b, v3.16b + st1 {v30.16b}, [x0], x2 b.gt 1b - ld1 {v0.16B, v1.16B}, [x1], x2 - add v24.8H, v16.8H, v18.8H + ld1 {v0.16b, v1.16b}, [x1], x2 + add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v30.16B, v0.16B, v1.16B, #1 - add v1.8H, v20.8H, v22.8H - mshrn v28.8B, v24.8H, #2 + ext v30.16b, v0.16b, v1.16b, #1 + add v1.8h, v20.8h, v22.8h + mshrn v28.8b, v24.8h, #2 NRND add v1.8H, v1.8H, v26.8H - mshrn2 v28.16B, v1.8H, #2 + mshrn2 v28.16b, v1.8h, #2 .if \avg - ld1 {v16.16B}, [x0] - urhadd v28.16B, v28.16B, v16.16B + ld1 {v16.16b}, [x0] + urhadd v28.16b, v28.16b, v16.16b .endif - uaddl v16.8H, v0.8B, v30.8B - uaddl2 v20.8H, v0.16B, v30.16B - st1 {v28.16B}, [x0], x2 - add v24.8H, v16.8H, v18.8H + uaddl v16.8h, v0.8b, v30.8b + uaddl2 v20.8h, v0.16b, v30.16b + st1 {v28.16b}, [x0], x2 + add v24.8h, v16.8h, v18.8h NRND 
add v24.8H, v24.8H, v26.8H - add v0.8H, v20.8H, v22.8H - mshrn v30.8B, v24.8H, #2 + add v0.8h, v20.8h, v22.8h + mshrn v30.8b, v24.8h, #2 NRND add v0.8H, v0.8H, v26.8H - mshrn2 v30.16B, v0.8H, #2 + mshrn2 v30.16b, v0.8h, #2 .if \avg - ld1 {v18.16B}, [x0] - urhadd v30.16B, v30.16B, v18.16B + ld1 {v18.16b}, [x0] + urhadd v30.16b, v30.16b, v18.16b .endif - st1 {v30.16B}, [x0], x2 + st1 {v30.16b}, [x0], x2 ret .endm .macro pixels8 rnd=1, avg=0 -1: ld1 {v0.8B}, [x1], x2 - ld1 {v1.8B}, [x1], x2 - ld1 {v2.8B}, [x1], x2 - ld1 {v3.8B}, [x1], x2 +1: ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x1], x2 + ld1 {v2.8b}, [x1], x2 + ld1 {v3.8b}, [x1], x2 .if \avg - ld1 {v4.8B}, [x0], x2 - urhadd v0.8B, v0.8B, v4.8B - ld1 {v5.8B}, [x0], x2 - urhadd v1.8B, v1.8B, v5.8B - ld1 {v6.8B}, [x0], x2 - urhadd v2.8B, v2.8B, v6.8B - ld1 {v7.8B}, [x0], x2 - urhadd v3.8B, v3.8B, v7.8B + ld1 {v4.8b}, [x0], x2 + urhadd v0.8b, v0.8b, v4.8b + ld1 {v5.8b}, [x0], x2 + urhadd v1.8b, v1.8b, v5.8b + ld1 {v6.8b}, [x0], x2 + urhadd v2.8b, v2.8b, v6.8b + ld1 {v7.8b}, [x0], x2 + urhadd v3.8b, v3.8b, v7.8b sub x0, x0, x2, lsl #2 .endif subs w3, w3, #4 - st1 {v0.8B}, [x0], x2 - st1 {v1.8B}, [x0], x2 - st1 {v2.8B}, [x0], x2 - st1 {v3.8B}, [x0], x2 + st1 {v0.8b}, [x0], x2 + st1 {v1.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 b.ne 1b ret .endm .macro pixels8_x2 rnd=1, avg=0 -1: ld1 {v0.8B, v1.8B}, [x1], x2 - ext v1.8B, v0.8B, v1.8B, #1 - ld1 {v2.8B, v3.8B}, [x1], x2 - ext v3.8B, v2.8B, v3.8B, #1 +1: ld1 {v0.8b, v1.8b}, [x1], x2 + ext v1.8b, v0.8b, v1.8b, #1 + ld1 {v2.8b, v3.8b}, [x1], x2 + ext v3.8b, v2.8b, v3.8b, #1 subs w3, w3, #2 - avg v0.8B, v0.8B, v1.8B - avg v2.8B, v2.8B, v3.8B + avg v0.8b, v0.8b, v1.8b + avg v2.8b, v2.8b, v3.8b .if \avg - ld1 {v4.8B}, [x0], x2 - ld1 {v5.8B}, [x0] - urhadd v0.8B, v0.8B, v4.8B - urhadd v2.8B, v2.8B, v5.8B + ld1 {v4.8b}, [x0], x2 + ld1 {v5.8b}, [x0] + urhadd v0.8b, v0.8b, v4.8b + urhadd v2.8b, v2.8b, v5.8b sub x0, x0, x2 .endif - st1 {v0.8B}, [x0], x2 - st1 
{v2.8B}, [x0], x2 + st1 {v0.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 b.ne 1b ret .endm .macro pixels8_y2 rnd=1, avg=0 sub w3, w3, #2 - ld1 {v0.8B}, [x1], x2 - ld1 {v1.8B}, [x1], x2 + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x1], x2 1: subs w3, w3, #2 - avg v4.8B, v0.8B, v1.8B - ld1 {v0.8B}, [x1], x2 - avg v5.8B, v0.8B, v1.8B - ld1 {v1.8B}, [x1], x2 + avg v4.8b, v0.8b, v1.8b + ld1 {v0.8b}, [x1], x2 + avg v5.8b, v0.8b, v1.8b + ld1 {v1.8b}, [x1], x2 .if \avg - ld1 {v2.8B}, [x0], x2 - ld1 {v3.8B}, [x0] - urhadd v4.8B, v4.8B, v2.8B - urhadd v5.8B, v5.8B, v3.8B + ld1 {v2.8b}, [x0], x2 + ld1 {v3.8b}, [x0] + urhadd v4.8b, v4.8b, v2.8b + urhadd v5.8b, v5.8b, v3.8b sub x0, x0, x2 .endif - st1 {v4.8B}, [x0], x2 - st1 {v5.8B}, [x0], x2 + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 b.ne 1b - avg v4.8B, v0.8B, v1.8B - ld1 {v0.8B}, [x1], x2 - avg v5.8B, v0.8B, v1.8B + avg v4.8b, v0.8b, v1.8b + ld1 {v0.8b}, [x1], x2 + avg v5.8b, v0.8b, v1.8b .if \avg - ld1 {v2.8B}, [x0], x2 - ld1 {v3.8B}, [x0] - urhadd v4.8B, v4.8B, v2.8B - urhadd v5.8B, v5.8B, v3.8B + ld1 {v2.8b}, [x0], x2 + ld1 {v3.8b}, [x0] + urhadd v4.8b, v4.8b, v2.8b + urhadd v5.8b, v5.8b, v3.8b sub x0, x0, x2 .endif - st1 {v4.8B}, [x0], x2 - st1 {v5.8B}, [x0], x2 + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 ret .endm .macro pixels8_xy2 rnd=1, avg=0 sub w3, w3, #2 - ld1 {v0.16B}, [x1], x2 - ld1 {v1.16B}, [x1], x2 + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x1], x2 NRND movi v19.8H, #1 - ext v4.16B, v0.16B, v4.16B, #1 - ext v6.16B, v1.16B, v6.16B, #1 - uaddl v16.8H, v0.8B, v4.8B - uaddl v17.8H, v1.8B, v6.8B + ext v4.16b, v0.16b, v4.16b, #1 + ext v6.16b, v1.16b, v6.16b, #1 + uaddl v16.8h, v0.8b, v4.8b + uaddl v17.8h, v1.8b, v6.8b 1: subs w3, w3, #2 - ld1 {v0.16B}, [x1], x2 - add v18.8H, v16.8H, v17.8H - ext v4.16B, v0.16B, v4.16B, #1 + ld1 {v0.16b}, [x1], x2 + add v18.8h, v16.8h, v17.8h + ext v4.16b, v0.16b, v4.16b, #1 NRND add v18.8H, v18.8H, v19.8H - uaddl v16.8H, v0.8B, v4.8B - mshrn v5.8B, v18.8H, #2 - ld1 {v1.16B}, [x1], x2 - 
add v18.8H, v16.8H, v17.8H + uaddl v16.8h, v0.8b, v4.8b + mshrn v5.8b, v18.8h, #2 + ld1 {v1.16b}, [x1], x2 + add v18.8h, v16.8h, v17.8h .if \avg - ld1 {v7.8B}, [x0] - urhadd v5.8B, v5.8B, v7.8B + ld1 {v7.8b}, [x0] + urhadd v5.8b, v5.8b, v7.8b .endif NRND add v18.8H, v18.8H, v19.8H - st1 {v5.8B}, [x0], x2 - mshrn v7.8B, v18.8H, #2 + st1 {v5.8b}, [x0], x2 + mshrn v7.8b, v18.8h, #2 .if \avg - ld1 {v5.8B}, [x0] - urhadd v7.8B, v7.8B, v5.8B + ld1 {v5.8b}, [x0] + urhadd v7.8b, v7.8b, v5.8b .endif - ext v6.16B, v1.16B, v6.16B, #1 - uaddl v17.8H, v1.8B, v6.8B - st1 {v7.8B}, [x0], x2 + ext v6.16b, v1.16b, v6.16b, #1 + uaddl v17.8h, v1.8b, v6.8b + st1 {v7.8b}, [x0], x2 b.gt 1b - ld1 {v0.16B}, [x1], x2 - add v18.8H, v16.8H, v17.8H - ext v4.16B, v0.16B, v4.16B, #1 + ld1 {v0.16b}, [x1], x2 + add v18.8h, v16.8h, v17.8h + ext v4.16b, v0.16b, v4.16b, #1 NRND add v18.8H, v18.8H, v19.8H - uaddl v16.8H, v0.8B, v4.8B - mshrn v5.8B, v18.8H, #2 - add v18.8H, v16.8H, v17.8H + uaddl v16.8h, v0.8b, v4.8b + mshrn v5.8b, v18.8h, #2 + add v18.8h, v16.8h, v17.8h .if \avg - ld1 {v7.8B}, [x0] - urhadd v5.8B, v5.8B, v7.8B + ld1 {v7.8b}, [x0] + urhadd v5.8b, v5.8b, v7.8b .endif NRND add v18.8H, v18.8H, v19.8H - st1 {v5.8B}, [x0], x2 - mshrn v7.8B, v18.8H, #2 + st1 {v5.8b}, [x0], x2 + mshrn v7.8b, v18.8h, #2 .if \avg - ld1 {v5.8B}, [x0] - urhadd v7.8B, v7.8B, v5.8B + ld1 {v5.8b}, [x0] + urhadd v7.8b, v7.8b, v5.8b .endif - st1 {v7.8B}, [x0], x2 + st1 {v7.8b}, [x0], x2 ret .endm diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index cf86e5081d..7500c324bd 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -1099,7 +1099,7 @@ function vsse_intra16_neon, export=1 cbnz w4, 2b 3: - add v16.4s, v16.4s, v17.4S + add v16.4s, v16.4s, v17.4s uaddlv d17, v16.4s fmov w0, s17 diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S index bc105e4861..f6fb13bea0 100644 --- a/libavcodec/aarch64/neon.S +++ b/libavcodec/aarch64/neon.S @@ 
-28,146 +28,146 @@ .endm .macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 - trn1 \r8\().8B, \r0\().8B, \r1\().8B - trn2 \r9\().8B, \r0\().8B, \r1\().8B - trn1 \r1\().8B, \r2\().8B, \r3\().8B - trn2 \r3\().8B, \r2\().8B, \r3\().8B - trn1 \r0\().8B, \r4\().8B, \r5\().8B - trn2 \r5\().8B, \r4\().8B, \r5\().8B - trn1 \r2\().8B, \r6\().8B, \r7\().8B - trn2 \r7\().8B, \r6\().8B, \r7\().8B - - trn1 \r4\().4H, \r0\().4H, \r2\().4H - trn2 \r2\().4H, \r0\().4H, \r2\().4H - trn1 \r6\().4H, \r5\().4H, \r7\().4H - trn2 \r7\().4H, \r5\().4H, \r7\().4H - trn1 \r5\().4H, \r9\().4H, \r3\().4H - trn2 \r9\().4H, \r9\().4H, \r3\().4H - trn1 \r3\().4H, \r8\().4H, \r1\().4H - trn2 \r8\().4H, \r8\().4H, \r1\().4H - - trn1 \r0\().2S, \r3\().2S, \r4\().2S - trn2 \r4\().2S, \r3\().2S, \r4\().2S - - trn1 \r1\().2S, \r5\().2S, \r6\().2S - trn2 \r5\().2S, \r5\().2S, \r6\().2S - - trn2 \r6\().2S, \r8\().2S, \r2\().2S - trn1 \r2\().2S, \r8\().2S, \r2\().2S - - trn1 \r3\().2S, \r9\().2S, \r7\().2S - trn2 \r7\().2S, \r9\().2S, \r7\().2S + trn1 \r8\().8b, \r0\().8b, \r1\().8b + trn2 \r9\().8b, \r0\().8b, \r1\().8b + trn1 \r1\().8b, \r2\().8b, \r3\().8b + trn2 \r3\().8b, \r2\().8b, \r3\().8b + trn1 \r0\().8b, \r4\().8b, \r5\().8b + trn2 \r5\().8b, \r4\().8b, \r5\().8b + trn1 \r2\().8b, \r6\().8b, \r7\().8b + trn2 \r7\().8b, \r6\().8b, \r7\().8b + + trn1 \r4\().4h, \r0\().4h, \r2\().4h + trn2 \r2\().4h, \r0\().4h, \r2\().4h + trn1 \r6\().4h, \r5\().4h, \r7\().4h + trn2 \r7\().4h, \r5\().4h, \r7\().4h + trn1 \r5\().4h, \r9\().4h, \r3\().4h + trn2 \r9\().4h, \r9\().4h, \r3\().4h + trn1 \r3\().4h, \r8\().4h, \r1\().4h + trn2 \r8\().4h, \r8\().4h, \r1\().4h + + trn1 \r0\().2s, \r3\().2s, \r4\().2s + trn2 \r4\().2s, \r3\().2s, \r4\().2s + + trn1 \r1\().2s, \r5\().2s, \r6\().2s + trn2 \r5\().2s, \r5\().2s, \r6\().2s + + trn2 \r6\().2s, \r8\().2s, \r2\().2s + trn1 \r2\().2s, \r8\().2s, \r2\().2s + + trn1 \r3\().2s, \r9\().2s, \r7\().2s + trn2 \r7\().2s, \r9\().2s, \r7\().2s .endm .macro 
transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 - trn1 \t0\().16B, \r0\().16B, \r1\().16B - trn2 \t1\().16B, \r0\().16B, \r1\().16B - trn1 \r1\().16B, \r2\().16B, \r3\().16B - trn2 \r3\().16B, \r2\().16B, \r3\().16B - trn1 \r0\().16B, \r4\().16B, \r5\().16B - trn2 \r5\().16B, \r4\().16B, \r5\().16B - trn1 \r2\().16B, \r6\().16B, \r7\().16B - trn2 \r7\().16B, \r6\().16B, \r7\().16B - - trn1 \r4\().8H, \r0\().8H, \r2\().8H - trn2 \r2\().8H, \r0\().8H, \r2\().8H - trn1 \r6\().8H, \r5\().8H, \r7\().8H - trn2 \r7\().8H, \r5\().8H, \r7\().8H - trn1 \r5\().8H, \t1\().8H, \r3\().8H - trn2 \t1\().8H, \t1\().8H, \r3\().8H - trn1 \r3\().8H, \t0\().8H, \r1\().8H - trn2 \t0\().8H, \t0\().8H, \r1\().8H - - trn1 \r0\().4S, \r3\().4S, \r4\().4S - trn2 \r4\().4S, \r3\().4S, \r4\().4S - - trn1 \r1\().4S, \r5\().4S, \r6\().4S - trn2 \r5\().4S, \r5\().4S, \r6\().4S - - trn2 \r6\().4S, \t0\().4S, \r2\().4S - trn1 \r2\().4S, \t0\().4S, \r2\().4S - - trn1 \r3\().4S, \t1\().4S, \r7\().4S - trn2 \r7\().4S, \t1\().4S, \r7\().4S + trn1 \t0\().16b, \r0\().16b, \r1\().16b + trn2 \t1\().16b, \r0\().16b, \r1\().16b + trn1 \r1\().16b, \r2\().16b, \r3\().16b + trn2 \r3\().16b, \r2\().16b, \r3\().16b + trn1 \r0\().16b, \r4\().16b, \r5\().16b + trn2 \r5\().16b, \r4\().16b, \r5\().16b + trn1 \r2\().16b, \r6\().16b, \r7\().16b + trn2 \r7\().16b, \r6\().16b, \r7\().16b + + trn1 \r4\().8h, \r0\().8h, \r2\().8h + trn2 \r2\().8h, \r0\().8h, \r2\().8h + trn1 \r6\().8h, \r5\().8h, \r7\().8h + trn2 \r7\().8h, \r5\().8h, \r7\().8h + trn1 \r5\().8h, \t1\().8h, \r3\().8h + trn2 \t1\().8h, \t1\().8h, \r3\().8h + trn1 \r3\().8h, \t0\().8h, \r1\().8h + trn2 \t0\().8h, \t0\().8h, \r1\().8h + + trn1 \r0\().4s, \r3\().4s, \r4\().4s + trn2 \r4\().4s, \r3\().4s, \r4\().4s + + trn1 \r1\().4s, \r5\().4s, \r6\().4s + trn2 \r5\().4s, \r5\().4s, \r6\().4s + + trn2 \r6\().4s, \t0\().4s, \r2\().4s + trn1 \r2\().4s, \t0\().4s, \r2\().4s + + trn1 \r3\().4s, \t1\().4s, \r7\().4s + trn2 \r7\().4s, \t1\().4s, \r7\().4s 
.endm .macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7 - trn1 \t4\().16B, \r0\().16B, \r1\().16B - trn2 \t5\().16B, \r0\().16B, \r1\().16B - trn1 \t6\().16B, \r2\().16B, \r3\().16B - trn2 \t7\().16B, \r2\().16B, \r3\().16B - - trn1 \r0\().8H, \t4\().8H, \t6\().8H - trn2 \r2\().8H, \t4\().8H, \t6\().8H - trn1 \r1\().8H, \t5\().8H, \t7\().8H - trn2 \r3\().8H, \t5\().8H, \t7\().8H + trn1 \t4\().16b, \r0\().16b, \r1\().16b + trn2 \t5\().16b, \r0\().16b, \r1\().16b + trn1 \t6\().16b, \r2\().16b, \r3\().16b + trn2 \t7\().16b, \r2\().16b, \r3\().16b + + trn1 \r0\().8h, \t4\().8h, \t6\().8h + trn2 \r2\().8h, \t4\().8h, \t6\().8h + trn1 \r1\().8h, \t5\().8h, \t7\().8h + trn2 \r3\().8h, \t5\().8h, \t7\().8h .endm .macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7 - trn1 \t4\().8B, \r0\().8B, \r1\().8B - trn2 \t5\().8B, \r0\().8B, \r1\().8B - trn1 \t6\().8B, \r2\().8B, \r3\().8B - trn2 \t7\().8B, \r2\().8B, \r3\().8B - - trn1 \r0\().4H, \t4\().4H, \t6\().4H - trn2 \r2\().4H, \t4\().4H, \t6\().4H - trn1 \r1\().4H, \t5\().4H, \t7\().4H - trn2 \r3\().4H, \t5\().4H, \t7\().4H + trn1 \t4\().8b, \r0\().8b, \r1\().8b + trn2 \t5\().8b, \r0\().8b, \r1\().8b + trn1 \t6\().8b, \r2\().8b, \r3\().8b + trn2 \t7\().8b, \r2\().8b, \r3\().8b + + trn1 \r0\().4h, \t4\().4h, \t6\().4h + trn2 \r2\().4h, \t4\().4h, \t6\().4h + trn1 \r1\().4h, \t5\().4h, \t7\().4h + trn2 \r3\().4h, \t5\().4h, \t7\().4h .endm .macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 - trn1 \r4\().4H, \r0\().4H, \r1\().4H - trn2 \r5\().4H, \r0\().4H, \r1\().4H - trn1 \r6\().4H, \r2\().4H, \r3\().4H - trn2 \r7\().4H, \r2\().4H, \r3\().4H - - trn1 \r0\().2S, \r4\().2S, \r6\().2S - trn2 \r2\().2S, \r4\().2S, \r6\().2S - trn1 \r1\().2S, \r5\().2S, \r7\().2S - trn2 \r3\().2S, \r5\().2S, \r7\().2S + trn1 \r4\().4h, \r0\().4h, \r1\().4h + trn2 \r5\().4h, \r0\().4h, \r1\().4h + trn1 \r6\().4h, \r2\().4h, \r3\().4h + trn2 \r7\().4h, \r2\().4h, \r3\().4h + + trn1 \r0\().2s, \r4\().2s, \r6\().2s + trn2 \r2\().2s, \r4\().2s, 
\r6\().2s + trn1 \r1\().2s, \r5\().2s, \r7\().2s + trn2 \r3\().2s, \r5\().2s, \r7\().2s .endm .macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 - trn1 \t4\().8H, \r0\().8H, \r1\().8H - trn2 \t5\().8H, \r0\().8H, \r1\().8H - trn1 \t6\().8H, \r2\().8H, \r3\().8H - trn2 \t7\().8H, \r2\().8H, \r3\().8H - - trn1 \r0\().4S, \t4\().4S, \t6\().4S - trn2 \r2\().4S, \t4\().4S, \t6\().4S - trn1 \r1\().4S, \t5\().4S, \t7\().4S - trn2 \r3\().4S, \t5\().4S, \t7\().4S + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \r0\().4s, \t4\().4s, \t6\().4s + trn2 \r2\().4s, \t4\().4s, \t6\().4s + trn1 \r1\().4s, \t5\().4s, \t7\().4s + trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm .macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 - trn1 \r8\().8H, \r0\().8H, \r1\().8H - trn2 \r9\().8H, \r0\().8H, \r1\().8H - trn1 \r1\().8H, \r2\().8H, \r3\().8H - trn2 \r3\().8H, \r2\().8H, \r3\().8H - trn1 \r0\().8H, \r4\().8H, \r5\().8H - trn2 \r5\().8H, \r4\().8H, \r5\().8H - trn1 \r2\().8H, \r6\().8H, \r7\().8H - trn2 \r7\().8H, \r6\().8H, \r7\().8H - - trn1 \r4\().4S, \r0\().4S, \r2\().4S - trn2 \r2\().4S, \r0\().4S, \r2\().4S - trn1 \r6\().4S, \r5\().4S, \r7\().4S - trn2 \r7\().4S, \r5\().4S, \r7\().4S - trn1 \r5\().4S, \r9\().4S, \r3\().4S - trn2 \r9\().4S, \r9\().4S, \r3\().4S - trn1 \r3\().4S, \r8\().4S, \r1\().4S - trn2 \r8\().4S, \r8\().4S, \r1\().4S - - trn1 \r0\().2D, \r3\().2D, \r4\().2D - trn2 \r4\().2D, \r3\().2D, \r4\().2D - - trn1 \r1\().2D, \r5\().2D, \r6\().2D - trn2 \r5\().2D, \r5\().2D, \r6\().2D - - trn2 \r6\().2D, \r8\().2D, \r2\().2D - trn1 \r2\().2D, \r8\().2D, \r2\().2D - - trn1 \r3\().2D, \r9\().2D, \r7\().2D - trn2 \r7\().2D, \r9\().2D, \r7\().2D + trn1 \r8\().8h, \r0\().8h, \r1\().8h + trn2 \r9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 
\r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \r9\().4s, \r3\().4s + trn2 \r9\().4s, \r9\().4s, \r3\().4s + trn1 \r3\().4s, \r8\().4s, \r1\().4s + trn2 \r8\().4s, \r8\().4s, \r1\().4s + + trn1 \r0\().2d, \r3\().2d, \r4\().2d + trn2 \r4\().2d, \r3\().2d, \r4\().2d + + trn1 \r1\().2d, \r5\().2d, \r6\().2d + trn2 \r5\().2d, \r5\().2d, \r6\().2d + + trn2 \r6\().2d, \r8\().2d, \r2\().2d + trn1 \r2\().2d, \r8\().2d, \r2\().2d + + trn1 \r3\().2d, \r9\().2d, \r7\().2d + trn2 \r7\().2d, \r9\().2d, \r7\().2d .endm diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S index d23717e760..1fdde6ccb6 100644 --- a/libavcodec/aarch64/sbrdsp_neon.S +++ b/libavcodec/aarch64/sbrdsp_neon.S @@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1 add x3, x0, #192*4 add x4, x0, #256*4 mov x5, #64 -1: ld1 {v0.4S}, [x0] - ld1 {v1.4S}, [x1], #16 - fadd v0.4S, v0.4S, v1.4S - ld1 {v2.4S}, [x2], #16 - fadd v0.4S, v0.4S, v2.4S - ld1 {v3.4S}, [x3], #16 - fadd v0.4S, v0.4S, v3.4S - ld1 {v4.4S}, [x4], #16 - fadd v0.4S, v0.4S, v4.4S - st1 {v0.4S}, [x0], #16 +1: ld1 {v0.4s}, [x0] + ld1 {v1.4s}, [x1], #16 + fadd v0.4s, v0.4s, v1.4s + ld1 {v2.4s}, [x2], #16 + fadd v0.4s, v0.4s, v2.4s + ld1 {v3.4s}, [x3], #16 + fadd v0.4s, v0.4s, v3.4s + ld1 {v4.4s}, [x4], #16 + fadd v0.4s, v0.4s, v4.4s + st1 {v0.4s}, [x0], #16 subs x5, x5, #4 b.gt 1b ret endfunc function ff_sbr_sum_square_neon, export=1 - movi v0.4S, #0 -1: ld1 {v1.4S}, [x0], #16 - fmla v0.4S, v1.4S, v1.4S + movi v0.4s, #0 +1: ld1 {v1.4s}, [x0], #16 + fmla v0.4s, v1.4s, v1.4s subs w1, w1, #2 b.gt 1b - faddp v0.4S, v0.4S, v0.4S - faddp v0.4S, v0.4S, v0.4S + faddp v0.4s, v0.4s, v0.4s + faddp v0.4s, v0.4s, v0.4s ret endfunc function ff_sbr_neg_odd_64_neon, export=1 mov x1, x0 - movi v5.4S, 
#1<<7, lsl #24 - ld2 {v0.4S, v1.4S}, [x0], #32 - eor v1.16B, v1.16B, v5.16B - ld2 {v2.4S, v3.4S}, [x0], #32 + movi v5.4s, #1<<7, lsl #24 + ld2 {v0.4s, v1.4s}, [x0], #32 + eor v1.16b, v1.16b, v5.16b + ld2 {v2.4s, v3.4s}, [x0], #32 .rept 3 - st2 {v0.4S, v1.4S}, [x1], #32 - eor v3.16B, v3.16B, v5.16B - ld2 {v0.4S, v1.4S}, [x0], #32 - st2 {v2.4S, v3.4S}, [x1], #32 - eor v1.16B, v1.16B, v5.16B - ld2 {v2.4S, v3.4S}, [x0], #32 + st2 {v0.4s, v1.4s}, [x1], #32 + eor v3.16b, v3.16b, v5.16b + ld2 {v0.4s, v1.4s}, [x0], #32 + st2 {v2.4s, v3.4s}, [x1], #32 + eor v1.16b, v1.16b, v5.16b + ld2 {v2.4s, v3.4s}, [x0], #32 .endr - eor v3.16B, v3.16B, v5.16B - st2 {v0.4S, v1.4S}, [x1], #32 - st2 {v2.4S, v3.4S}, [x1], #32 + eor v3.16b, v3.16b, v5.16b + st2 {v0.4s, v1.4s}, [x1], #32 + st2 {v2.4s, v3.4s}, [x1], #32 ret endfunc @@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1 add x2, x0, #64*4 mov x3, #-16 mov x4, #-4 - movi v6.4S, #1<<7, lsl #24 - ld1 {v0.2S}, [x0], #8 - st1 {v0.2S}, [x2], #8 + movi v6.4s, #1<<7, lsl #24 + ld1 {v0.2s}, [x0], #8 + st1 {v0.2s}, [x2], #8 .rept 7 - ld1 {v1.4S}, [x1], x3 - ld1 {v2.4S}, [x0], #16 - eor v1.16B, v1.16B, v6.16B - rev64 v1.4S, v1.4S - ext v1.16B, v1.16B, v1.16B, #8 - st2 {v1.4S, v2.4S}, [x2], #32 + ld1 {v1.4s}, [x1], x3 + ld1 {v2.4s}, [x0], #16 + eor v1.16b, v1.16b, v6.16b + rev64 v1.4s, v1.4s + ext v1.16b, v1.16b, v1.16b, #8 + st2 {v1.4s, v2.4s}, [x2], #32 .endr add x1, x1, #8 - ld1 {v1.2S}, [x1], x4 - ld1 {v2.2S}, [x0], #8 - ld1 {v1.S}[3], [x1] - ld1 {v2.S}[2], [x0] - eor v1.16B, v1.16B, v6.16B - rev64 v1.4S, v1.4S - st2 {v1.2S, v2.2S}, [x2], #16 - st2 {v1.S, v2.S}[2], [x2] + ld1 {v1.2s}, [x1], x4 + ld1 {v2.2s}, [x0], #8 + ld1 {v1.s}[3], [x1] + ld1 {v2.s}[2], [x0] + eor v1.16b, v1.16b, v6.16b + rev64 v1.4s, v1.4s + st2 {v1.2s, v2.2s}, [x2], #16 + st2 {v1.s, v2.s}[2], [x2] ret endfunc @@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1 add x2, x1, #60*4 mov x3, #-16 mov x4, #32 - movi v6.4S, #1<<7, lsl #24 -1: 
ld1 {v0.4S}, [x2], x3 - ld1 {v1.4S}, [x1], #16 - eor v0.16B, v0.16B, v6.16B - rev64 v0.4S, v0.4S - ext v0.16B, v0.16B, v0.16B, #8 - st2 {v0.4S, v1.4S}, [x0], #32 + movi v6.4s, #1<<7, lsl #24 +1: ld1 {v0.4s}, [x2], x3 + ld1 {v1.4s}, [x1], #16 + eor v0.16b, v0.16b, v6.16b + rev64 v0.4s, v0.4s + ext v0.16b, v0.16b, v0.16b, #8 + st2 {v0.4s, v1.4s}, [x0], #32 subs x4, x4, #4 b.gt 1b ret @@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1 add x2, x0, #60*4 mov x3, #-32 mov x4, #32 - movi v2.4S, #1<<7, lsl #24 -1: ld2 {v0.4S, v1.4S}, [x1], x3 - eor v0.16B, v0.16B, v2.16B - rev64 v1.4S, v1.4S - ext v1.16B, v1.16B, v1.16B, #8 - st1 {v0.4S}, [x2] - st1 {v1.4S}, [x0], #16 + movi v2.4s, #1<<7, lsl #24 +1: ld2 {v0.4s, v1.4s}, [x1], x3 + eor v0.16b, v0.16b, v2.16b + rev64 v1.4s, v1.4s + ext v1.16b, v1.16b, v1.16b, #8 + st1 {v0.4s}, [x2] + st1 {v1.4s}, [x0], #16 sub x2, x2, #16 subs x4, x4, #4 b.gt 1b @@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1 add x3, x0, #124*4 mov x4, #64 mov x5, #-16 -1: ld1 {v0.4S}, [x1], #16 - ld1 {v1.4S}, [x2], x5 - rev64 v2.4S, v0.4S - ext v2.16B, v2.16B, v2.16B, #8 - rev64 v3.4S, v1.4S - ext v3.16B, v3.16B, v3.16B, #8 - fadd v1.4S, v1.4S, v2.4S - fsub v0.4S, v0.4S, v3.4S - st1 {v0.4S}, [x0], #16 - st1 {v1.4S}, [x3], x5 +1: ld1 {v0.4s}, [x1], #16 + ld1 {v1.4s}, [x2], x5 + rev64 v2.4s, v0.4s + ext v2.16b, v2.16b, v2.16b, #8 + rev64 v3.4s, v1.4s + ext v3.16b, v3.16b, v3.16b, #8 + fadd v1.4s, v1.4s, v2.4s + fsub v0.4s, v0.4s, v3.4s + st1 {v0.4s}, [x0], #16 + st1 {v1.4s}, [x3], x5 subs x4, x4, #4 b.gt 1b ret @@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1 sxtw x4, w4 sxtw x5, w5 movrel x6, factors - ld1 {v7.4S}, [x6] - dup v1.4S, v0.S[0] - mov v2.8B, v1.8B - mov v2.S[2], v7.S[0] - mov v2.S[3], v7.S[0] - fmul v1.4S, v1.4S, v2.4S - ld1 {v0.D}[0], [x3] - ld1 {v0.D}[1], [x2] - fmul v0.4S, v0.4S, v1.4S - fmul v1.4S, v0.4S, v7.4S - rev64 v0.4S, v0.4S + ld1 {v7.4s}, [x6] + dup v1.4s, v0.s[0] + mov v2.8b, v1.8b + mov 
v2.s[2], v7.s[0] + mov v2.s[3], v7.s[0] + fmul v1.4s, v1.4s, v2.4s + ld1 {v0.d}[0], [x3] + ld1 {v0.d}[1], [x2] + fmul v0.4s, v0.4s, v1.4s + fmul v1.4s, v0.4s, v7.4s + rev64 v0.4s, v0.4s sub x7, x5, x4 add x0, x0, x4, lsl #3 add x1, x1, x4, lsl #3 sub x1, x1, #16 -1: ld1 {v2.4S}, [x1], #16 - ld1 {v3.2S}, [x1] - fmul v4.4S, v2.4S, v1.4S - fmul v5.4S, v2.4S, v0.4S - faddp v4.4S, v4.4S, v4.4S - faddp v5.4S, v5.4S, v5.4S - faddp v4.4S, v4.4S, v4.4S - faddp v5.4S, v5.4S, v5.4S - mov v4.S[1], v5.S[0] - fadd v4.2S, v4.2S, v3.2S - st1 {v4.2S}, [x0], #8 +1: ld1 {v2.4s}, [x1], #16 + ld1 {v3.2s}, [x1] + fmul v4.4s, v2.4s, v1.4s + fmul v5.4s, v2.4s, v0.4s + faddp v4.4s, v4.4s, v4.4s + faddp v5.4s, v5.4s, v5.4s + faddp v4.4s, v4.4s, v4.4s + faddp v5.4s, v5.4s, v5.4s + mov v4.s[1], v5.s[0] + fadd v4.2s, v4.2s, v3.2s + st1 {v4.2s}, [x0], #8 sub x1, x1, #8 subs x7, x7, #1 b.gt 1b @@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1 sxtw x4, w4 mov x5, #40*2*4 add x1, x1, x4, lsl #3 -1: ld1 {v0.2S}, [x1], x5 - ld1 {v1.S}[0], [x2], #4 - fmul v2.4S, v0.4S, v1.S[0] - st1 {v2.2S}, [x0], #8 +1: ld1 {v0.2s}, [x1], x5 + ld1 {v1.s}[0], [x2], #4 + fmul v2.4s, v0.4s, v1.s[0] + st1 {v2.2s}, [x0], #8 subs x3, x3, #1 b.gt 1b ret @@ -227,46 +227,46 @@ endfunc function ff_sbr_autocorrelate_neon, export=1 mov x2, #38 movrel x3, factors - ld1 {v0.4S}, [x3] - movi v1.4S, #0 - movi v2.4S, #0 - movi v3.4S, #0 - ld1 {v4.2S}, [x0], #8 - ld1 {v5.2S}, [x0], #8 - fmul v16.2S, v4.2S, v4.2S - fmul v17.2S, v5.2S, v4.S[0] - fmul v18.2S, v5.2S, v4.S[1] -1: ld1 {v5.D}[1], [x0], #8 - fmla v1.2S, v4.2S, v4.2S - fmla v2.4S, v5.4S, v4.S[0] - fmla v3.4S, v5.4S, v4.S[1] - mov v4.D[0], v5.D[0] - mov v5.D[0], v5.D[1] + ld1 {v0.4s}, [x3] + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + ld1 {v4.2s}, [x0], #8 + ld1 {v5.2s}, [x0], #8 + fmul v16.2s, v4.2s, v4.2s + fmul v17.2s, v5.2s, v4.s[0] + fmul v18.2s, v5.2s, v4.s[1] +1: ld1 {v5.d}[1], [x0], #8 + fmla v1.2s, v4.2s, v4.2s + fmla v2.4s, v5.4s, v4.s[0] + fmla 
v3.4s, v5.4s, v4.s[1] + mov v4.d[0], v5.d[0] + mov v5.d[0], v5.d[1] subs x2, x2, #1 b.gt 1b - fmul v19.2S, v4.2S, v4.2S - fmul v20.2S, v5.2S, v4.S[0] - fmul v21.2S, v5.2S, v4.S[1] - fadd v22.4S, v2.4S, v20.4S - fsub v22.4S, v22.4S, v17.4S - fadd v23.4S, v3.4S, v21.4S - fsub v23.4S, v23.4S, v18.4S - rev64 v23.4S, v23.4S - fmul v23.4S, v23.4S, v0.4S - fadd v22.4S, v22.4S, v23.4S - st1 {v22.4S}, [x1], #16 - fadd v23.2S, v1.2S, v19.2S - fsub v23.2S, v23.2S, v16.2S - faddp v23.2S, v23.2S, v23.2S - st1 {v23.S}[0], [x1] + fmul v19.2s, v4.2s, v4.2s + fmul v20.2s, v5.2s, v4.s[0] + fmul v21.2s, v5.2s, v4.s[1] + fadd v22.4s, v2.4s, v20.4s + fsub v22.4s, v22.4s, v17.4s + fadd v23.4s, v3.4s, v21.4s + fsub v23.4s, v23.4s, v18.4s + rev64 v23.4s, v23.4s + fmul v23.4s, v23.4s, v0.4s + fadd v22.4s, v22.4s, v23.4s + st1 {v22.4s}, [x1], #16 + fadd v23.2s, v1.2s, v19.2s + fsub v23.2s, v23.2s, v16.2s + faddp v23.2s, v23.2s, v23.2s + st1 {v23.s}[0], [x1] add x1, x1, #8 - rev64 v3.2S, v3.2S - fmul v3.2S, v3.2S, v0.2S - fadd v2.2S, v2.2S, v3.2S - st1 {v2.2S}, [x1] + rev64 v3.2s, v3.2s + fmul v3.2s, v3.2s, v0.2s + fadd v2.2s, v2.2s, v3.2s + st1 {v2.2s}, [x1] add x1, x1, #16 - faddp v1.2S, v1.2S, v1.2S - st1 {v1.S}[0], [x1] + faddp v1.2s, v1.2s, v1.2s + st1 {v1.s}[0], [x1] ret endfunc @@ -278,25 +278,25 @@ endfunc 1: and x3, x3, #0x1ff add x8, x7, x3, lsl #3 add x3, x3, #2 - ld1 {v2.4S}, [x0] - ld1 {v3.2S}, [x1], #8 - ld1 {v4.2S}, [x2], #8 - ld1 {v5.4S}, [x8] - mov v6.16B, v2.16B - zip1 v3.4S, v3.4S, v3.4S - zip1 v4.4S, v4.4S, v4.4S - fmla v6.4S, v1.4S, v3.4S - fmla v2.4S, v5.4S, v4.4S - fcmeq v7.4S, v3.4S, #0 - bif v2.16B, v6.16B, v7.16B - st1 {v2.4S}, [x0], #16 + ld1 {v2.4s}, [x0] + ld1 {v3.2s}, [x1], #8 + ld1 {v4.2s}, [x2], #8 + ld1 {v5.4s}, [x8] + mov v6.16b, v2.16b + zip1 v3.4s, v3.4s, v3.4s + zip1 v4.4s, v4.4s, v4.4s + fmla v6.4s, v1.4s, v3.4s + fmla v2.4s, v5.4s, v4.4s + fcmeq v7.4s, v3.4s, #0 + bif v2.16b, v6.16b, v7.16b + st1 {v2.4s}, [x0], #16 subs x5, x5, #2 b.gt 1b .endm function 
ff_sbr_hf_apply_noise_0_neon, export=1 movrel x9, phi_noise_0 - ld1 {v1.4S}, [x9] + ld1 {v1.4s}, [x9] apply_noise_common ret endfunc @@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1 movrel x9, phi_noise_1 and x4, x4, #1 add x9, x9, x4, lsl #4 - ld1 {v1.4S}, [x9] + ld1 {v1.4s}, [x9] apply_noise_common ret endfunc function ff_sbr_hf_apply_noise_2_neon, export=1 movrel x9, phi_noise_2 - ld1 {v1.4S}, [x9] + ld1 {v1.4s}, [x9] apply_noise_common ret endfunc @@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1 movrel x9, phi_noise_3 and x4, x4, #1 add x9, x9, x4, lsl #4 - ld1 {v1.4S}, [x9] + ld1 {v1.4s}, [x9] apply_noise_common ret endfunc diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S index 210182ff21..a4438e9922 100644 --- a/libavcodec/aarch64/simple_idct_neon.S +++ b/libavcodec/aarch64/simple_idct_neon.S @@ -54,7 +54,7 @@ endconst prfm pldl1keep, [\data] mov x10, x30 movrel x3, idct_coeff_neon - ld1 {v0.2D}, [x3] + ld1 {v0.2d}, [x3] .endm .macro idct_end @@ -74,146 +74,146 @@ endconst .endm .macro idct_col4_top y1, y2, y3, y4, i, l - smull\i v7.4S, \y3\l, z2 - smull\i v16.4S, \y3\l, z6 - smull\i v17.4S, \y2\l, z1 - add v19.4S, v23.4S, v7.4S - smull\i v18.4S, \y2\l, z3 - add v20.4S, v23.4S, v16.4S - smull\i v5.4S, \y2\l, z5 - sub v21.4S, v23.4S, v16.4S - smull\i v6.4S, \y2\l, z7 - sub v22.4S, v23.4S, v7.4S - - smlal\i v17.4S, \y4\l, z3 - smlsl\i v18.4S, \y4\l, z7 - smlsl\i v5.4S, \y4\l, z1 - smlsl\i v6.4S, \y4\l, z5 + smull\i v7.4s, \y3\l, z2 + smull\i v16.4s, \y3\l, z6 + smull\i v17.4s, \y2\l, z1 + add v19.4s, v23.4s, v7.4s + smull\i v18.4s, \y2\l, z3 + add v20.4s, v23.4s, v16.4s + smull\i v5.4s, \y2\l, z5 + sub v21.4s, v23.4s, v16.4s + smull\i v6.4s, \y2\l, z7 + sub v22.4s, v23.4s, v7.4s + + smlal\i v17.4s, \y4\l, z3 + smlsl\i v18.4s, \y4\l, z7 + smlsl\i v5.4s, \y4\l, z1 + smlsl\i v6.4s, \y4\l, z5 .endm .macro idct_row4_neon y1, y2, y3, y4, pass - ld1 {\y1\().2D,\y2\().2D}, [x2], #32 - 
movi v23.4S, #1<<2, lsl #8 - orr v5.16B, \y1\().16B, \y2\().16B - ld1 {\y3\().2D,\y4\().2D}, [x2], #32 - orr v6.16B, \y3\().16B, \y4\().16B - orr v5.16B, v5.16B, v6.16B - mov x3, v5.D[1] - smlal v23.4S, \y1\().4H, z4 + ld1 {\y1\().2d,\y2\().2d}, [x2], #32 + movi v23.4s, #1<<2, lsl #8 + orr v5.16b, \y1\().16b, \y2\().16b + ld1 {\y3\().2d,\y4\().2d}, [x2], #32 + orr v6.16b, \y3\().16b, \y4\().16b + orr v5.16b, v5.16b, v6.16b + mov x3, v5.d[1] + smlal v23.4s, \y1\().4h, z4 - idct_col4_top \y1, \y2, \y3, \y4, 1, .4H + idct_col4_top \y1, \y2, \y3, \y4, 1, .4h cmp x3, #0 b.eq \pass\()f - smull2 v7.4S, \y1\().8H, z4 - smlal2 v17.4S, \y2\().8H, z5 - smlsl2 v18.4S, \y2\().8H, z1 - smull2 v16.4S, \y3\().8H, z2 - smlal2 v5.4S, \y2\().8H, z7 - add v19.4S, v19.4S, v7.4S - sub v20.4S, v20.4S, v7.4S - sub v21.4S, v21.4S, v7.4S - add v22.4S, v22.4S, v7.4S - smlal2 v6.4S, \y2\().8H, z3 - smull2 v7.4S, \y3\().8H, z6 - smlal2 v17.4S, \y4\().8H, z7 - smlsl2 v18.4S, \y4\().8H, z5 - smlal2 v5.4S, \y4\().8H, z3 - smlsl2 v6.4S, \y4\().8H, z1 - add v19.4S, v19.4S, v7.4S - sub v20.4S, v20.4S, v16.4S - add v21.4S, v21.4S, v16.4S - sub v22.4S, v22.4S, v7.4S + smull2 v7.4s, \y1\().8h, z4 + smlal2 v17.4s, \y2\().8h, z5 + smlsl2 v18.4s, \y2\().8h, z1 + smull2 v16.4s, \y3\().8h, z2 + smlal2 v5.4s, \y2\().8h, z7 + add v19.4s, v19.4s, v7.4s + sub v20.4s, v20.4s, v7.4s + sub v21.4s, v21.4s, v7.4s + add v22.4s, v22.4s, v7.4s + smlal2 v6.4s, \y2\().8h, z3 + smull2 v7.4s, \y3\().8h, z6 + smlal2 v17.4s, \y4\().8h, z7 + smlsl2 v18.4s, \y4\().8h, z5 + smlal2 v5.4s, \y4\().8h, z3 + smlsl2 v6.4s, \y4\().8h, z1 + add v19.4s, v19.4s, v7.4s + sub v20.4s, v20.4s, v16.4s + add v21.4s, v21.4s, v16.4s + sub v22.4s, v22.4s, v7.4s \pass: add \y3\().4S, v19.4S, v17.4S - add \y4\().4S, v20.4S, v18.4S - shrn \y1\().4H, \y3\().4S, #ROW_SHIFT - shrn \y2\().4H, \y4\().4S, #ROW_SHIFT - add v7.4S, v21.4S, v5.4S - add v16.4S, v22.4S, v6.4S - shrn \y3\().4H, v7.4S, #ROW_SHIFT - shrn \y4\().4H, v16.4S, #ROW_SHIFT - sub v22.4S, 
v22.4S, v6.4S - sub v19.4S, v19.4S, v17.4S - sub v21.4S, v21.4S, v5.4S - shrn2 \y1\().8H, v22.4S, #ROW_SHIFT - sub v20.4S, v20.4S, v18.4S - shrn2 \y2\().8H, v21.4S, #ROW_SHIFT - shrn2 \y3\().8H, v20.4S, #ROW_SHIFT - shrn2 \y4\().8H, v19.4S, #ROW_SHIFT - - trn1 v16.8H, \y1\().8H, \y2\().8H - trn2 v17.8H, \y1\().8H, \y2\().8H - trn1 v18.8H, \y3\().8H, \y4\().8H - trn2 v19.8H, \y3\().8H, \y4\().8H - trn1 \y1\().4S, v16.4S, v18.4S - trn1 \y2\().4S, v17.4S, v19.4S - trn2 \y3\().4S, v16.4S, v18.4S - trn2 \y4\().4S, v17.4S, v19.4S + add \y4\().4s, v20.4s, v18.4s + shrn \y1\().4h, \y3\().4s, #ROW_SHIFT + shrn \y2\().4h, \y4\().4s, #ROW_SHIFT + add v7.4s, v21.4s, v5.4s + add v16.4s, v22.4s, v6.4s + shrn \y3\().4h, v7.4s, #ROW_SHIFT + shrn \y4\().4h, v16.4s, #ROW_SHIFT + sub v22.4s, v22.4s, v6.4s + sub v19.4s, v19.4s, v17.4s + sub v21.4s, v21.4s, v5.4s + shrn2 \y1\().8h, v22.4s, #ROW_SHIFT + sub v20.4s, v20.4s, v18.4s + shrn2 \y2\().8h, v21.4s, #ROW_SHIFT + shrn2 \y3\().8h, v20.4s, #ROW_SHIFT + shrn2 \y4\().8h, v19.4s, #ROW_SHIFT + + trn1 v16.8h, \y1\().8h, \y2\().8h + trn2 v17.8h, \y1\().8h, \y2\().8h + trn1 v18.8h, \y3\().8h, \y4\().8h + trn2 v19.8h, \y3\().8h, \y4\().8h + trn1 \y1\().4s, v16.4s, v18.4s + trn1 \y2\().4s, v17.4s, v19.4s + trn2 \y3\().4s, v16.4s, v18.4s + trn2 \y4\().4s, v17.4s, v19.4s .endm .macro declare_idct_col4_neon i, l function idct_col4_neon\i - dup v23.4H, z4c + dup v23.4h, z4c .if \i == 1 - add v23.4H, v23.4H, v24.4H + add v23.4h, v23.4h, v24.4h .else - mov v5.D[0], v24.D[1] - add v23.4H, v23.4H, v5.4H + mov v5.d[0], v24.d[1] + add v23.4h, v23.4h, v5.4h .endif - smull v23.4S, v23.4H, z4 + smull v23.4s, v23.4h, z4 idct_col4_top v24, v25, v26, v27, \i, \l - mov x4, v28.D[\i - 1] - mov x5, v29.D[\i - 1] + mov x4, v28.d[\i - 1] + mov x5, v29.d[\i - 1] cmp x4, #0 b.eq 1f - smull\i v7.4S, v28\l, z4 - add v19.4S, v19.4S, v7.4S - sub v20.4S, v20.4S, v7.4S - sub v21.4S, v21.4S, v7.4S - add v22.4S, v22.4S, v7.4S + smull\i v7.4s, v28\l, z4 + add v19.4s, 
v19.4s, v7.4s + sub v20.4s, v20.4s, v7.4s + sub v21.4s, v21.4s, v7.4s + add v22.4s, v22.4s, v7.4s -1: mov x4, v30.D[\i - 1] +1: mov x4, v30.d[\i - 1] cmp x5, #0 b.eq 2f - smlal\i v17.4S, v29\l, z5 - smlsl\i v18.4S, v29\l, z1 - smlal\i v5.4S, v29\l, z7 - smlal\i v6.4S, v29\l, z3 + smlal\i v17.4s, v29\l, z5 + smlsl\i v18.4s, v29\l, z1 + smlal\i v5.4s, v29\l, z7 + smlal\i v6.4s, v29\l, z3 -2: mov x5, v31.D[\i - 1] +2: mov x5, v31.d[\i - 1] cmp x4, #0 b.eq 3f - smull\i v7.4S, v30\l, z6 - smull\i v16.4S, v30\l, z2 - add v19.4S, v19.4S, v7.4S - sub v22.4S, v22.4S, v7.4S - sub v20.4S, v20.4S, v16.4S - add v21.4S, v21.4S, v16.4S + smull\i v7.4s, v30\l, z6 + smull\i v16.4s, v30\l, z2 + add v19.4s, v19.4s, v7.4s + sub v22.4s, v22.4s, v7.4s + sub v20.4s, v20.4s, v16.4s + add v21.4s, v21.4s, v16.4s 3: cmp x5, #0 b.eq 4f - smlal\i v17.4S, v31\l, z7 - smlsl\i v18.4S, v31\l, z5 - smlal\i v5.4S, v31\l, z3 - smlsl\i v6.4S, v31\l, z1 + smlal\i v17.4s, v31\l, z7 + smlsl\i v18.4s, v31\l, z5 + smlal\i v5.4s, v31\l, z3 + smlsl\i v6.4s, v31\l, z1 -4: addhn v7.4H, v19.4S, v17.4S - addhn2 v7.8H, v20.4S, v18.4S - subhn v18.4H, v20.4S, v18.4S - subhn2 v18.8H, v19.4S, v17.4S +4: addhn v7.4h, v19.4s, v17.4s + addhn2 v7.8h, v20.4s, v18.4s + subhn v18.4h, v20.4s, v18.4s + subhn2 v18.8h, v19.4s, v17.4s - addhn v16.4H, v21.4S, v5.4S - addhn2 v16.8H, v22.4S, v6.4S - subhn v17.4H, v22.4S, v6.4S - subhn2 v17.8H, v21.4S, v5.4S + addhn v16.4h, v21.4s, v5.4s + addhn2 v16.8h, v22.4s, v6.4s + subhn v17.4h, v22.4s, v6.4s + subhn2 v17.8h, v21.4s, v5.4s ret endfunc @@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1 idct_row4_neon v28, v29, v30, v31, 2 bl idct_col4_neon1 - sqshrun v1.8B, v7.8H, #COL_SHIFT-16 - sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16 - sqshrun v3.8B, v17.8H, #COL_SHIFT-16 - sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16 + sqshrun v1.8b, v7.8h, #COL_SHIFT-16 + sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16 + sqshrun v3.8b, v17.8h, #COL_SHIFT-16 + sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16 bl 
idct_col4_neon2 - sqshrun v2.8B, v7.8H, #COL_SHIFT-16 - sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16 - sqshrun v4.8B, v17.8H, #COL_SHIFT-16 - sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16 + sqshrun v2.8b, v7.8h, #COL_SHIFT-16 + sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16 + sqshrun v4.8b, v17.8h, #COL_SHIFT-16 + sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16 - zip1 v16.4S, v1.4S, v2.4S - zip2 v17.4S, v1.4S, v2.4S + zip1 v16.4s, v1.4s, v2.4s + zip2 v17.4s, v1.4s, v2.4s - st1 {v16.D}[0], [x0], x1 - st1 {v16.D}[1], [x0], x1 + st1 {v16.d}[0], [x0], x1 + st1 {v16.d}[1], [x0], x1 - zip1 v18.4S, v3.4S, v4.4S - zip2 v19.4S, v3.4S, v4.4S + zip1 v18.4s, v3.4s, v4.4s + zip2 v19.4s, v3.4s, v4.4s - st1 {v17.D}[0], [x0], x1 - st1 {v17.D}[1], [x0], x1 - st1 {v18.D}[0], [x0], x1 - st1 {v18.D}[1], [x0], x1 - st1 {v19.D}[0], [x0], x1 - st1 {v19.D}[1], [x0], x1 + st1 {v17.d}[0], [x0], x1 + st1 {v17.d}[1], [x0], x1 + st1 {v18.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + st1 {v19.d}[0], [x0], x1 + st1 {v19.d}[1], [x0], x1 idct_end endfunc @@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1 idct_row4_neon v28, v29, v30, v31, 2 bl idct_col4_neon1 - sshr v1.8H, v7.8H, #COL_SHIFT-16 - sshr v2.8H, v16.8H, #COL_SHIFT-16 - sshr v3.8H, v17.8H, #COL_SHIFT-16 - sshr v4.8H, v18.8H, #COL_SHIFT-16 + sshr v1.8h, v7.8h, #COL_SHIFT-16 + sshr v2.8h, v16.8h, #COL_SHIFT-16 + sshr v3.8h, v17.8h, #COL_SHIFT-16 + sshr v4.8h, v18.8h, #COL_SHIFT-16 bl idct_col4_neon2 - sshr v7.8H, v7.8H, #COL_SHIFT-16 - sshr v16.8H, v16.8H, #COL_SHIFT-16 - sshr v17.8H, v17.8H, #COL_SHIFT-16 - sshr v18.8H, v18.8H, #COL_SHIFT-16 + sshr v7.8h, v7.8h, #COL_SHIFT-16 + sshr v16.8h, v16.8h, #COL_SHIFT-16 + sshr v17.8h, v17.8h, #COL_SHIFT-16 + sshr v18.8h, v18.8h, #COL_SHIFT-16 mov x9, x0 - ld1 {v19.D}[0], [x0], x1 - zip1 v23.2D, v1.2D, v7.2D - zip2 v24.2D, v1.2D, v7.2D - ld1 {v19.D}[1], [x0], x1 - zip1 v25.2D, v2.2D, v16.2D - zip2 v26.2D, v2.2D, v16.2D - ld1 {v20.D}[0], [x0], x1 - zip1 v27.2D, v3.2D, v17.2D - zip2 v28.2D, v3.2D, v17.2D - ld1 
{v20.D}[1], [x0], x1 - zip1 v29.2D, v4.2D, v18.2D - zip2 v30.2D, v4.2D, v18.2D - ld1 {v21.D}[0], [x0], x1 - uaddw v23.8H, v23.8H, v19.8B - uaddw2 v24.8H, v24.8H, v19.16B - ld1 {v21.D}[1], [x0], x1 - sqxtun v23.8B, v23.8H - sqxtun2 v23.16B, v24.8H - ld1 {v22.D}[0], [x0], x1 - uaddw v24.8H, v25.8H, v20.8B - uaddw2 v25.8H, v26.8H, v20.16B - ld1 {v22.D}[1], [x0], x1 - sqxtun v24.8B, v24.8H - sqxtun2 v24.16B, v25.8H - st1 {v23.D}[0], [x9], x1 - uaddw v25.8H, v27.8H, v21.8B - uaddw2 v26.8H, v28.8H, v21.16B - st1 {v23.D}[1], [x9], x1 - sqxtun v25.8B, v25.8H - sqxtun2 v25.16B, v26.8H - st1 {v24.D}[0], [x9], x1 - uaddw v26.8H, v29.8H, v22.8B - uaddw2 v27.8H, v30.8H, v22.16B - st1 {v24.D}[1], [x9], x1 - sqxtun v26.8B, v26.8H - sqxtun2 v26.16B, v27.8H - st1 {v25.D}[0], [x9], x1 - st1 {v25.D}[1], [x9], x1 - st1 {v26.D}[0], [x9], x1 - st1 {v26.D}[1], [x9], x1 + ld1 {v19.d}[0], [x0], x1 + zip1 v23.2d, v1.2d, v7.2d + zip2 v24.2d, v1.2d, v7.2d + ld1 {v19.d}[1], [x0], x1 + zip1 v25.2d, v2.2d, v16.2d + zip2 v26.2d, v2.2d, v16.2d + ld1 {v20.d}[0], [x0], x1 + zip1 v27.2d, v3.2d, v17.2d + zip2 v28.2d, v3.2d, v17.2d + ld1 {v20.d}[1], [x0], x1 + zip1 v29.2d, v4.2d, v18.2d + zip2 v30.2d, v4.2d, v18.2d + ld1 {v21.d}[0], [x0], x1 + uaddw v23.8h, v23.8h, v19.8b + uaddw2 v24.8h, v24.8h, v19.16b + ld1 {v21.d}[1], [x0], x1 + sqxtun v23.8b, v23.8h + sqxtun2 v23.16b, v24.8h + ld1 {v22.d}[0], [x0], x1 + uaddw v24.8h, v25.8h, v20.8b + uaddw2 v25.8h, v26.8h, v20.16b + ld1 {v22.d}[1], [x0], x1 + sqxtun v24.8b, v24.8h + sqxtun2 v24.16b, v25.8h + st1 {v23.d}[0], [x9], x1 + uaddw v25.8h, v27.8h, v21.8b + uaddw2 v26.8h, v28.8h, v21.16b + st1 {v23.d}[1], [x9], x1 + sqxtun v25.8b, v25.8h + sqxtun2 v25.16b, v26.8h + st1 {v24.d}[0], [x9], x1 + uaddw v26.8h, v29.8h, v22.8b + uaddw2 v27.8h, v30.8h, v22.16b + st1 {v24.d}[1], [x9], x1 + sqxtun v26.8b, v26.8h + sqxtun2 v26.16b, v27.8h + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x9], x1 + st1 {v26.d}[0], [x9], x1 + st1 {v26.d}[1], [x9], x1 idct_end endfunc @@ 
-333,30 +333,30 @@ function ff_simple_idct_neon, export=1 sub x2, x2, #128 bl idct_col4_neon1 - sshr v1.8H, v7.8H, #COL_SHIFT-16 - sshr v2.8H, v16.8H, #COL_SHIFT-16 - sshr v3.8H, v17.8H, #COL_SHIFT-16 - sshr v4.8H, v18.8H, #COL_SHIFT-16 + sshr v1.8h, v7.8h, #COL_SHIFT-16 + sshr v2.8h, v16.8h, #COL_SHIFT-16 + sshr v3.8h, v17.8h, #COL_SHIFT-16 + sshr v4.8h, v18.8h, #COL_SHIFT-16 bl idct_col4_neon2 - sshr v7.8H, v7.8H, #COL_SHIFT-16 - sshr v16.8H, v16.8H, #COL_SHIFT-16 - sshr v17.8H, v17.8H, #COL_SHIFT-16 - sshr v18.8H, v18.8H, #COL_SHIFT-16 - - zip1 v23.2D, v1.2D, v7.2D - zip2 v24.2D, v1.2D, v7.2D - st1 {v23.2D,v24.2D}, [x2], #32 - zip1 v25.2D, v2.2D, v16.2D - zip2 v26.2D, v2.2D, v16.2D - st1 {v25.2D,v26.2D}, [x2], #32 - zip1 v27.2D, v3.2D, v17.2D - zip2 v28.2D, v3.2D, v17.2D - st1 {v27.2D,v28.2D}, [x2], #32 - zip1 v29.2D, v4.2D, v18.2D - zip2 v30.2D, v4.2D, v18.2D - st1 {v29.2D,v30.2D}, [x2], #32 + sshr v7.8h, v7.8h, #COL_SHIFT-16 + sshr v16.8h, v16.8h, #COL_SHIFT-16 + sshr v17.8h, v17.8h, #COL_SHIFT-16 + sshr v18.8h, v18.8h, #COL_SHIFT-16 + + zip1 v23.2d, v1.2d, v7.2d + zip2 v24.2d, v1.2d, v7.2d + st1 {v23.2d,v24.2d}, [x2], #32 + zip1 v25.2d, v2.2d, v16.2d + zip2 v26.2d, v2.2d, v16.2d + st1 {v25.2d,v26.2d}, [x2], #32 + zip1 v27.2d, v3.2d, v17.2d + zip2 v28.2d, v3.2d, v17.2d + st1 {v27.2d,v28.2d}, [x2], #32 + zip1 v29.2d, v4.2d, v18.2d + zip2 v30.2d, v4.2d, v18.2d + st1 {v29.2d,v30.2d}, [x2], #32 idct_end endfunc |