author    Martin Storsjö <martin@martin.st>  2023-10-17 14:16:24 +0300
committer Martin Storsjö <martin@martin.st>  2023-10-21 23:25:18 +0300
commit    184103b3105f02f1189fa0047af4269e027dfbd6 (patch)
tree      3e50ad549ed68292f91594c4e6fb26551de90369 /libswscale
parent    393d1ee541b143633bfba2ff0e821d734fd511c2 (diff)
download  ffmpeg-184103b3105f02f1189fa0047af4269e027dfbd6.tar.gz
aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libswscale')
-rw-r--r--  libswscale/aarch64/hscale.S        432
-rw-r--r--  libswscale/aarch64/output.S        150
-rw-r--r--  libswscale/aarch64/yuv2rgb_neon.S  116
3 files changed, 349 insertions, 349 deletions
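
The change is purely cosmetic: the assemblers FFmpeg is commonly built with (GNU as and LLVM's integrated assembler) accept AArch64 vector element specifiers in either case, so ".2D" and ".2d" encode the same instruction. A minimal standalone sketch of that equivalence (hypothetical file, not part of the patch):

        // case_demo.S (hypothetical): both spellings assemble to identical
        // encodings; only the lowercase form matches the style this patch
        // settles on.
                .text
                .global case_demo
        case_demo:
                movi    v0.2D, #0              // uppercase element specifier
                movi    v0.2d, #0              // lowercase equivalent, same opcode
                ret
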
diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index 8d4dcb2541..f3c404eb5f 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -50,43 +50,43 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2D, #0 // val sum part 1 (for dst[0])
- movi v1.2D, #0 // val sum part 2 (for dst[1])
- movi v2.2D, #0 // val sum part 3 (for dst[2])
- movi v3.2D, #0 // val sum part 4 (for dst[3])
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
-2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
- smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
- smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
- ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
- ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
- smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
- smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
- smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
+ smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+ smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+ ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
+ smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
+ smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
+ smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
- smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
+ smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
- addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
- addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
- sqshrn v0.4H, v0.4S, #7 // shift and clip the 4x16-bit final values
- st1 {v0.4H}, [x1], #8 // write to destination part0123
+ sqshrn v0.4h, v0.4s, #7 // shift and clip the 4x16-bit final values
+ st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
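
The core idiom of this function, isolated for reference: unsigned 8-bit source pixels are zero-extended to 16 bits, then smlal/smlal2 accumulate the 16x16->32-bit products against the signed filter coefficients. A minimal sketch, assuming x0 points at 8 source bytes and x1 at 8 coefficients:

        ld1     {v4.8b}, [x0]           // 8 unsigned 8-bit source samples
        ld1     {v5.8h}, [x1]           // 8 signed 16-bit filter coefficients
        movi    v0.2d, #0               // clear the 4x32-bit accumulator
        uxtl    v4.8h, v4.8b            // zero-extend the samples to 16 bits
        smlal   v0.4s, v4.4h, v5.4h     // acc += products of the lower 4 lanes
        smlal2  v0.4s, v4.8h, v5.8h     // acc += products of the upper 4 lanes
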
@@ -245,7 +245,7 @@ function ff_hscale8to15_4_neon, export=1
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
1:
- ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers
+ ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
// load 8 values from filterPos to be used as offsets into src
ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
@@ -253,74 +253,74 @@ function ff_hscale8to15_4_neon, export=1
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
- movi v0.2D, #0 // Clear madd accumulator for idx 0..3
- movi v5.2D, #0 // Clear madd accumulator for idx 4..7
+ movi v0.2d, #0 // Clear madd accumulator for idx 0..3
+ movi v5.2d, #0 // Clear madd accumulator for idx 4..7
- ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
+ ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
add x5, x5, #32 // advance filterPos
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
- uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit
- uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit
+ uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
- uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit
- uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit
+ uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
- smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
- smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
+ smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
+ smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
- smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
- smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
+ smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
+ smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
- smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
- smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
+ smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
+ smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
- smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
- smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
+ smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
+ smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
sub w2, w2, #8 // dstW -= 8
- sqshrn v0.4H, v0.4S, #7 // shift and clip the 4x16-bit final values
- sqshrn v1.4H, v5.4S, #7 // shift and clip the 4x16-bit final values
- st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
+ sqshrn v0.4h, v0.4s, #7 // shift and clip the 4x16-bit final values
+ sqshrn v1.4h, v5.4s, #7 // shift and clip the 4x16-bit final values
+ st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cmp w2, #16 // continue on main loop if there are at least 16 iterations left
b.ge 1b
// last full iteration
- ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp]
- ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
+ ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
+ ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
- movi v0.2D, #0 // Clear madd accumulator for idx 0..3
- movi v5.2D, #0 // Clear madd accumulator for idx 4..7
+ movi v0.2d, #0 // Clear madd accumulator for idx 0..3
+ movi v5.2d, #0 // Clear madd accumulator for idx 4..7
- uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit
- uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit
- uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit
- uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit
+ uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
- smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
- smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
- smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
- smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
+ smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
+ smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
+ smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
+ smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
- smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
- smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
- smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
- smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
+ smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
+ smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
+ smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
+ smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
subs w2, w2, #8 // dstW -= 8
- sqshrn v0.4H, v0.4S, #7 // shift and clip the 4x16-bit final values
- sqshrn v1.4H, v5.4S, #7 // shift and clip the 4x16-bit final values
- st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
+ sqshrn v0.4h, v0.4s, #7 // shift and clip the 4x16-bit final values
+ sqshrn v1.4h, v5.4s, #7 // shift and clip the 4x16-bit final values
+ st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
@@ -332,15 +332,15 @@ function ff_hscale8to15_4_neon, export=1
// load src
ldr w8, [x5], #4 // filterPos[i]
add x9, x3, w8, UXTW // calculate the address for src load
- ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3]
+ ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
// load filter
- ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3]
+ ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
- uxtl v5.8H, v5.8B // unsigned extend long, convert src data to 16-bit
- smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...]
- addv s0, v0.4S // add up products of src and filter values
+ uxtl v5.8h, v5.8b // unsigned extend long, convert src data to 16-bit
+ smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
+ addv s0, v0.4s // add up products of src and filter values
sqshrn h0, s0, #7 // shift and clip the 16-bit final value
- st1 {v0.H}[0], [x1], #2 // dst[i] = ...
+ st1 {v0.h}[0], [x1], #2 // dst[i] = ...
sub w2, w2, #1 // dstW--
cbnz w2, 2b
@@ -445,12 +445,12 @@ function ff_hscale8to19_4_neon, export=1
smull v5.4s, v0.4h, v28.4h
smull2 v6.4s, v0.8h, v28.8h
uxtl v2.8h, v2.8b
- smlal v5.4s, v1.4h, v29.4H
- smlal2 v6.4s, v1.8h, v29.8H
+ smlal v5.4s, v1.4h, v29.4h
+ smlal2 v6.4s, v1.8h, v29.8h
uxtl v3.8h, v3.8b
- smlal v5.4s, v2.4h, v30.4H
- smlal2 v6.4s, v2.8h, v30.8H
- smlal v5.4s, v3.4h, v31.4H
+ smlal v5.4s, v2.4h, v30.4h
+ smlal2 v6.4s, v2.8h, v30.8h
+ smlal v5.4s, v3.4h, v31.4h
smlal2 v6.4s, v3.8h, v31.8h
sshr v5.4s, v5.4s, #3
@@ -472,8 +472,8 @@ function ff_hscale8to19_4_neon, export=1
ld1 {v0.s}[0], [x9] // load 4 * uint8_t into a single 32-bit lane
ld1 {v31.4h}, [x4], #8
uxtl v0.8h, v0.8b
- smull v5.4s, v0.4h, v31.4H
- saddlv d0, v5.4S
+ smull v5.4s, v0.4h, v31.4h
+ saddlv d0, v5.4s
sqshrn s0, d0, #3
smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4
@@ -499,42 +499,42 @@ function ff_hscale8to19_X8_neon, export=1
ldr w11, [x5], #4 // filterPos[idx + 2]
add x4, x13, x7 // filter3 = filter2 + filterSize*2
ldr w9, [x5], #4 // filterPos[idx + 3]
- movi v0.2D, #0 // val sum part 1 (for dst[0])
- movi v1.2D, #0 // val sum part 2 (for dst[1])
- movi v2.2D, #0 // val sum part 3 (for dst[2])
- movi v3.2D, #0 // val sum part 4 (for dst[3])
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
-2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
- uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
- smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
- ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
- smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
- ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
- ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
- uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
- ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
- smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
- smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
- smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
- smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
+ smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+ ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
+ smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
+ uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
+ smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
+ smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
+ smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
subs w15, w15, #8 // j -= 8: processed 8/filterSize
- smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
- addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
- addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
- sshr v0.4s, v0.4S, #3 // shift the 4x32-bit final values; clamping follows via smin
+ sshr v0.4s, v0.4s, #3 // shift the 4x32-bit final values; clamping follows via smin
smin v0.4s, v0.4s, v20.4s
st1 {v0.4s}, [x1], #16 // write to destination part0123
b.gt 1b // loop until end of line
@@ -588,16 +588,16 @@ function ff_hscale8to19_X4_neon, export=1
smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
ldr d6, [x10], #8 // load src values for idx 2
ldr q29, [x14, x16] // load filter values for idx 2
- smlal v17.4s, v5.4h, v30.4H // multiplication of lower half for idx 1
+ smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
ldr d7, [x11], #8 // load src values for idx 3
- smlal2 v17.4s, v5.8h, v30.8H // multiplication of upper half for idx 1
- uxtl v6.8h, v6.8B // extend type to match the filter's size
+ smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
+ uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr q28, [x15, x16] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
- uxtl v7.8h, v7.8B
- smlal2 v18.4s, v6.8h, v29.8H // multiplication of upper half for idx 2
+ uxtl v7.8h, v7.8b
+ smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
sub w0, w0, #8
- smlal v19.4s, v7.4h, v28.4H // multiplication of lower half for idx 3
+ smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
cmp w0, #8
smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
add x16, x16, #16 // advance filter values indexing
@@ -618,11 +618,11 @@ function ff_hscale8to19_X4_neon, export=1
uxtl v5.8h, v5.8b // extend type to match the filter's size
ldr s6, [x10] // load src values for idx 2
smlal v17.4s, v5.4h, v30.4h
- uxtl v6.8h, v6.8B // extend type to match the filter's size
+ uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr d29, [x14, x17] // load filter values for idx 2
ldr s7, [x11] // load src values for idx 3
addp v16.4s, v16.4s, v17.4s
- uxtl v7.8h, v7.8B
+ uxtl v7.8h, v7.8b
ldr d28, [x15, x17] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h
smlal v19.4s, v7.4h, v28.4h
@@ -700,31 +700,31 @@ function ff_hscale16to15_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4H
+ sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8H
+ sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v29.4H
+ sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v29.8H
+ sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s
- sxtl v27.4s, v30.4H
+ sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v30.8H
+ sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s
- sxtl v27.4s, v31.4H
+ sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v31.8H
+ sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s
@@ -775,31 +775,31 @@ function ff_hscale16to15_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4H
+ sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8H
+ sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v29.4H
+ sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v29.8H
+ sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v30.4H
+ sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v30.8H
+ sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v31.4H
+ sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v31.8H
+ sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s
@@ -807,7 +807,7 @@ function ff_hscale16to15_4_neon_asm, export=1
sshl v6.4s, v6.4s, v17.4s
smin v5.4s, v5.4s, v18.4s
smin v6.4s, v6.4s, v18.4s
- xtn v5.4h, v5.4S
+ xtn v5.4h, v5.4s
xtn2 v5.8h, v6.4s
st1 {v5.8h}, [x1], #16
@@ -826,7 +826,7 @@ function ff_hscale16to15_4_neon_asm, export=1
uxtl v0.4s, v0.4h
sxtl v31.4s, v31.4h
mul v5.4s, v0.4s, v31.4s
- addv s0, v5.4S
+ addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.h}[0], [x1], #2
@@ -865,58 +865,58 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2D, #0 // val sum part 1 (for dst[0])
- movi v1.2D, #0 // val sum part 2 (for dst[1])
- movi v2.2D, #0 // val sum part 3 (for dst[2])
- movi v3.2D, #0 // val sum part 4 (for dst[3])
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
-2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
- sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
+2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
+ sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
- mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
- sxtl v27.4s, v7.4H // extend filter lower half
- uxtl2 v6.4s, v6.8H // extend srcp upper half
+ mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
+ sxtl v27.4s, v7.4h // extend filter lower half
+ uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
- ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
- mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v22.4s, v16.4H // extend srcp lower half
- sxtl v23.4s, v17.4H // extend filter lower half
- uxtl2 v16.4s, v16.8H // extend srcp upper half
+ ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
+ mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v22.4s, v16.4h // extend srcp lower half
+ sxtl v23.4s, v17.4h // extend filter lower half
+ uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
- mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
- mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
+ mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v28.4s, v18.4H // extend srcp lower half
- sxtl v29.4s, v19.4H // extend filter lower half
- uxtl2 v18.4s, v18.8H // extend srcp upper half
+ uxtl v28.4s, v18.4h // extend srcp lower half
+ sxtl v29.4s, v19.4h // extend filter lower half
+ uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
- mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
- addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
- addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
xtn v0.4h, v0.4s // narrow down to 16 bits
- st1 {v0.4H}, [x1], #8 // write to destination part0123
+ st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
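
The 16-bit input variants cannot use smlal directly: it multiplies signed 16-bit lanes, and uint16_t samples above 32767 would be misread as negative. Both operands are therefore widened to 32 bits first, uxtl for the unsigned samples and sxtl for the signed coefficients, and mla accumulates plain 32-bit products. A minimal sketch, assuming x0 points at 8 source samples and x1 at 8 coefficients:

        ld1     {v4.8h}, [x0]           // 8 unsigned 16-bit source samples
        ld1     {v5.8h}, [x1]           // 8 signed 16-bit filter coefficients
        uxtl    v24.4s, v4.4h           // zero-extend the lower samples
        sxtl    v25.4s, v5.4h           // sign-extend the lower coefficients
        mla     v0.4s, v24.4s, v25.4s   // acc += lower 4 products
        uxtl2   v4.4s, v4.8h            // zero-extend the upper samples
        sxtl2   v5.4s, v5.8h            // sign-extend the upper coefficients
        mla     v0.4s, v4.4s, v5.4s     // acc += upper 4 products
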
@@ -1108,31 +1108,31 @@ function ff_hscale16to19_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4H
+ sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8H
+ sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v29.4H
+ sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v29.8H
+ sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s
- sxtl v27.4s, v30.4H
+ sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v30.8H
+ sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s
- sxtl v27.4s, v31.4H
+ sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v31.8H
+ sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s
@@ -1181,31 +1181,31 @@ function ff_hscale16to19_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4H
+ sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8H
+ sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v29.4H
+ sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v29.8H
+ sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v30.4H
+ sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v30.8H
+ sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s
- sxtl v27.4s, v31.4H
+ sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v31.8H
+ sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s
@@ -1232,7 +1232,7 @@ function ff_hscale16to19_4_neon_asm, export=1
sxtl v31.4s, v31.4h
subs w2, w2, #1
mul v5.4s, v0.4s, v31.4s
- addv s0, v5.4S
+ addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4
@@ -1270,52 +1270,52 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2D, #0 // val sum part 1 (for dst[0])
- movi v1.2D, #0 // val sum part 2 (for dst[1])
- movi v2.2D, #0 // val sum part 3 (for dst[2])
- movi v3.2D, #0 // val sum part 4 (for dst[3])
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
-2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
- sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
+2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
+ sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
- mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
- sxtl v27.4s, v7.4H // extend filter lower half
- uxtl2 v6.4s, v6.8H // extend srcp upper half
+ mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
+ sxtl v27.4s, v7.4h // extend filter lower half
+ uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
- ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
- mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v22.4s, v16.4H // extend srcp lower half
- sxtl v23.4s, v17.4H // extend filter lower half
- uxtl2 v16.4s, v16.8H // extend srcp upper half
+ ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
+ mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v22.4s, v16.4h // extend srcp lower half
+ sxtl v23.4s, v17.4h // extend filter lower half
+ uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
- mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
- mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
+ mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v28.4s, v18.4H // extend srcp lower half
- sxtl v29.4s, v19.4H // extend filter lower half
- uxtl2 v18.4s, v18.8H // extend srcp upper half
+ uxtl v28.4s, v18.4h // extend srcp lower half
+ sxtl v29.4s, v19.4h // extend filter lower half
+ uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
- mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
- addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
- addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
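
The tail of these functions shifts with sshl rather than an immediate-form shift: a negative per-lane count in the shift register turns the left shift into an arithmetic right shift, and the clamp to the destination range is then applied separately with smin (the in-line comment warns against sqshl, since intermediate overflow is expected). A minimal sketch, assuming a right shift of 3 and v20 preloaded with the format maximum:

        mvni    v21.4s, #2              // per-lane count of -3 (~2 == -3)
        sshl    v0.4s, v0.4s, v21.4s    // shift left by -3, i.e. arithmetic >> 3
        smin    v0.4s, v0.4s, v20.4s    // clamp to the destination maximum
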
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index b8a2818c9b..344d0659ea 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -29,13 +29,13 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset
- ld1 {v0.8B}, [x5] // load 8x8-bit dither
+ ld1 {v0.8b}, [x5] // load 8x8-bit dither
and w6, w6, #7
cbz w6, 1f // check if offsetting present
- ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
-1: uxtl v0.8H, v0.8B // extend dither to 16-bit
- ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1)
- ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2)
+ ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
+1: uxtl v0.8h, v0.8b // extend dither to 16-bit
+ ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
+ ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
cmp w1, #8 // if filterSize == 8, branch to specialized version
b.eq 6f
cmp w1, #4 // if filterSize == 4, branch to specialized version
@@ -48,8 +48,8 @@ function ff_yuv2planeX_8_neon, export=1
mov x7, #0 // i = 0
tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
-2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
- mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
+2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
@@ -57,12 +57,12 @@ function ff_yuv2planeX_8_neon, export=1
ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
add x11, x11, x7, lsl #1 // &src[j ][i]
add x12, x12, x7, lsl #1 // &src[j+1][i]
- ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
- smlal v3.4S, v5.4H, v7.H[0] // val0 += {A,B,C,D} * X
- smlal2 v4.4S, v5.8H, v7.H[0] // val1 += {E,F,G,H} * X
- smlal v3.4S, v6.4H, v7.H[1] // val0 += {I,J,K,L} * Y
- smlal2 v4.4S, v6.8H, v7.H[1] // val1 += {M,N,O,P} * Y
+ ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
+ ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
+ smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
+ smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
+ smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
+ smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
subs w8, w8, #2 // tmpfilterSize -= 2
b.gt 3b // loop until filterSize consumed
@@ -77,17 +77,17 @@ function ff_yuv2planeX_8_neon, export=1
// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
-4: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
- mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
+4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
5: ldr x11, [x9], #8 // get 1 pointer: src[j]
ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j]
add x11, x11, x7, lsl #1 // &src[j ][i]
- ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- smlal v3.4S, v5.4H, v6.H[0] // val0 += {A,B,C,D} * X
- smlal2 v4.4S, v5.8H, v6.H[0] // val1 += {E,F,G,H} * X
+ ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
+ smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
+ smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
subs w8, w8, #1 // tmpfilterSize -= 1
b.gt 5b // loop until filterSize consumed
@@ -107,36 +107,36 @@ function ff_yuv2planeX_8_neon, export=1
ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
// load 8x16-bit values for filter[j], where j=0..7
- ld1 {v6.8H}, [x0]
+ ld1 {v6.8h}, [x0]
7:
- mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
- mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
-
- ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
- ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
- ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
- ld1 {v28.8H}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
- ld1 {v29.8H}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
- ld1 {v30.8H}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
- ld1 {v31.8H}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
-
- smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
- smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
- smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
- smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
- smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
- smlal v3.4S, v28.4H, v6.H[4] // val0 += src[4][i + {0..3}] * filter[4]
- smlal2 v4.4S, v28.8H, v6.H[4] // val1 += src[4][i + {4..7}] * filter[4]
- smlal v3.4S, v29.4H, v6.H[5] // val0 += src[5][i + {0..3}] * filter[5]
- smlal2 v4.4S, v29.8H, v6.H[5] // val1 += src[5][i + {4..7}] * filter[5]
- smlal v3.4S, v30.4H, v6.H[6] // val0 += src[6][i + {0..3}] * filter[6]
- smlal2 v4.4S, v30.8H, v6.H[6] // val1 += src[6][i + {4..7}] * filter[6]
- smlal v3.4S, v31.4H, v6.H[7] // val0 += src[7][i + {0..3}] * filter[7]
- smlal2 v4.4S, v31.8H, v6.H[7] // val1 += src[7][i + {4..7}] * filter[7]
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+ ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
+ ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
+ ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
+ ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
+ ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
+ ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
+
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
+ smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
+ smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
+ smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
+ smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
+ smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
+ smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
+ smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
+ smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
+ smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
+ smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
+ smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
+ smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@@ -151,24 +151,24 @@ function ff_yuv2planeX_8_neon, export=1
ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
// load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
- ld1 {v6.4H}, [x0]
+ ld1 {v6.4h}, [x0]
9:
- mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
- mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
-
- ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
- ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
- ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
-
- smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
- smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
- smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
- smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
- smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+ ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
+ ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
+
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
+ smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
+ smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
+ smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
+ smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@@ -184,16 +184,16 @@ function ff_yuv2planeX_8_neon, export=1
// load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
ldr s6, [x0]
11:
- mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
- mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
- ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
- smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@@ -210,11 +210,11 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
- ld1 {v0.8B}, [x3] // load 8x8-bit dither
+ ld1 {v0.8b}, [x3] // load 8x8-bit dither
and w4, w4, #7
cbz w4, 1f // check if offsetting present
- ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
-1: uxtl v0.8H, v0.8B // extend dither to 16-bit
+ ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
+1: uxtl v0.8h, v0.8b // extend dither to 16-bit
uxtl v1.4s, v0.4h
uxtl2 v2.4s, v0.8h
2:
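
ff_yuv2planeX_8_neon leans on the by-lane form of smlal: a single 16-bit coefficient is taken straight from a vector lane, the 32-bit accumulators are pre-seeded with the widened dither, and sqshrun performs the final >>16 with unsigned saturation. A minimal sketch of one filter tap, assuming v1/v2 hold the widened dither, v6 the coefficients and x5 a source row:

        mov     v3.16b, v1.16b          // val0 = dither, lower 4 lanes
        mov     v4.16b, v2.16b          // val1 = dither, upper 4 lanes
        ld1     {v24.8h}, [x5]          // src[j][i + {0..7}]
        smlal   v3.4s, v24.4h, v6.h[0]  // val0 += src[{0..3}] * filter[0]
        smlal2  v4.4s, v24.8h, v6.h[0]  // val1 += src[{4..7}] * filter[0]
        sqshrun v3.4h, v3.4s, #16       // clip16(val0 >> 16)
        sqshrun2 v3.8h, v4.4s, #16      // clip16(val1 >> 16), packed on top
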
diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index f341268c5d..3fc91530b6 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -33,9 +33,9 @@
.macro load_args_nv12
ldr x8, [sp] // table
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
- ld1 {v1.1D}, [x8]
- dup v0.8H, w10
- dup v3.8H, w9
+ ld1 {v1.1d}, [x8]
+ dup v0.8h, w10
+ dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@@ -51,9 +51,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
- ld1 {v1.1D}, [x8]
- dup v0.8H, w10
- dup v3.8H, w9
+ ld1 {v1.1d}, [x8]
+ dup v0.8h, w10
+ dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -67,9 +67,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
- ld1 {v1.1D}, [x8]
- dup v0.8H, w10
- dup v3.8H, w9
+ ld1 {v1.1d}, [x8]
+ dup v0.8h, w10
+ dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@@ -77,22 +77,22 @@
.endm
.macro load_chroma_nv12
- ld2 {v16.8B, v17.8B}, [x6], #16
- ushll v18.8H, v16.8B, #3
- ushll v19.8H, v17.8B, #3
+ ld2 {v16.8b, v17.8b}, [x6], #16
+ ushll v18.8h, v16.8b, #3
+ ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_nv21
- ld2 {v16.8B, v17.8B}, [x6], #16
- ushll v19.8H, v16.8B, #3
- ushll v18.8H, v17.8B, #3
+ ld2 {v16.8b, v17.8b}, [x6], #16
+ ushll v19.8h, v16.8b, #3
+ ushll v18.8h, v17.8b, #3
.endm
.macro load_chroma_yuv420p
- ld1 {v16.8B}, [ x6], #8
- ld1 {v17.8B}, [x13], #8
- ushll v18.8H, v16.8B, #3
- ushll v19.8H, v17.8B, #3
+ ld1 {v16.8b}, [ x6], #8
+ ld1 {v17.8b}, [x13], #8
+ ushll v18.8h, v16.8b, #3
+ ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_yuv422p
@@ -123,18 +123,18 @@
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
- add v20.8H, v26.8H, v20.8H // Y1 + R1
- add v21.8H, v27.8H, v21.8H // Y2 + R2
- add v22.8H, v26.8H, v22.8H // Y1 + G1
- add v23.8H, v27.8H, v23.8H // Y2 + G2
- add v24.8H, v26.8H, v24.8H // Y1 + B1
- add v25.8H, v27.8H, v25.8H // Y2 + B2
- sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1)
- sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R2) >> 1)
- sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1)
- sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G2) >> 1)
- sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1)
- sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B2) >> 1)
+ add v20.8h, v26.8h, v20.8h // Y1 + R1
+ add v21.8h, v27.8h, v21.8h // Y2 + R2
+ add v22.8h, v26.8h, v22.8h // Y1 + G1
+ add v23.8h, v27.8h, v23.8h // Y2 + G2
+ add v24.8h, v26.8h, v24.8h // Y1 + B1
+ add v25.8h, v27.8h, v25.8h // Y2 + B2
+ sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
+ sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R2) >> 1)
+ sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
+ sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G2) >> 1)
+ sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
+ sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B2) >> 1)
movi \a1, #255
movi \a2, #255
.endm
@@ -146,47 +146,47 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
1:
mov w8, w0 // w8 = width
2:
- movi v5.8H, #4, lsl #8 // 128 * (1<<3)
+ movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
- sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3)
- sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3)
- sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R)
- sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g
- sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g
- add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G)
- sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B)
- zip2 v21.8H, v20.8H, v20.8H // R2
- zip1 v20.8H, v20.8H, v20.8H // R1
- zip2 v23.8H, v22.8H, v22.8H // G2
- zip1 v22.8H, v22.8H, v22.8H // G1
- zip2 v25.8H, v24.8H, v24.8H // B2
- zip1 v24.8H, v24.8H, v24.8H // B1
- ld1 {v2.16B}, [x4], #16 // load luma
- ushll v26.8H, v2.8B, #3 // Y1*(1<<3)
- ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3)
- sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset
- sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset
- sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
- sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
+ sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
+ sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
+ sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
+ sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
+ sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
+ add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
+ sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
+ zip2 v21.8h, v20.8h, v20.8h // R2
+ zip1 v20.8h, v20.8h, v20.8h // R1
+ zip2 v23.8h, v22.8h, v22.8h // G2
+ zip1 v22.8h, v22.8h, v22.8h // G1
+ zip2 v25.8h, v24.8h, v24.8h // B2
+ zip1 v24.8h, v24.8h, v24.8h // B1
+ ld1 {v2.16b}, [x4], #16 // load luma
+ ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
+ ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
+ sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
+ sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
+ sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
+ sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
- compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
+ compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.ifc \ofmt,rgba // 0 1 2 3
- compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
+ compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.ifc \ofmt,abgr // 3 2 1 0
- compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
+ compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.ifc \ofmt,bgra // 2 1 0 3
- compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
+ compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
- st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
- st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
+ st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
+ st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
subs w8, w8, #16 // width -= 16
b.gt 2b
add x2, x2, w3, SXTW // dst += padding
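
A final detail of the yuv2rgb kernels: chroma is sampled at half the horizontal resolution of luma, so every chroma contribution must be paired with two adjacent luma samples. zip1/zip2 with the same register as both operands duplicates each 16-bit element in place; zip2 runs first so the source is not overwritten. A minimal sketch:

        // v20 = {c0, c1, c2, c3, c4, c5, c6, c7}, one term per chroma sample
        zip2    v21.8h, v20.8h, v20.8h  // v21 = {c4,c4,c5,c5,c6,c6,c7,c7}
        zip1    v20.8h, v20.8h, v20.8h  // v20 = {c0,c0,c1,c1,c2,c2,c3,c3}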