author     Martin Storsjö <martin@martin.st>    2023-10-17 14:16:24 +0300
committer  Martin Storsjö <martin@martin.st>    2023-10-21 23:25:18 +0300
commit     184103b3105f02f1189fa0047af4269e027dfbd6
tree       3e50ad549ed68292f91594c4e6fb26551de90369 /libswscale
parent     393d1ee541b143633bfba2ff0e821d734fd511c2
download   ffmpeg-184103b3105f02f1189fa0047af4269e027dfbd6.tar.gz
aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libswscale')
-rw-r--r--  libswscale/aarch64/hscale.S       | 432
-rw-r--r--  libswscale/aarch64/output.S       | 150
-rw-r--r--  libswscale/aarch64/yuv2rgb_neon.S | 116
3 files changed, 349 insertions, 349 deletions
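The change below is mechanical: AArch64 assemblers (GNU as and LLVM's integrated assembler) accept vector arrangement suffixes in either case, so spelling .8H/.2D as .8h/.2d changes only the source text, not the generated machine code. A minimal illustration of the two spellings (not taken from the patch):

    movi    v0.2D, #0        // uppercase arrangement suffix (old spelling)
    movi    v0.2d, #0        // lowercase suffix; assembles to the same encoding
    uxtl    v4.8H, v4.8B     // old spelling
    uxtl    v4.8h, v4.8b     // lowercase spelling used after this commit

Standardizing on one case keeps the three files internally consistent and easier to grep.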
diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S index 8d4dcb2541..f3c404eb5f 100644 --- a/libswscale/aarch64/hscale.S +++ b/libswscale/aarch64/hscale.S @@ -50,43 +50,43 @@ function ff_hscale8to15_X8_neon, export=1 add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2D, #0 // val sum part 1 (for dst[0]) - movi v1.2D, #0 // val sum part 2 (for dst[1]) - movi v2.2D, #0 // val sum part 3 (for dst[2]) - movi v3.2D, #0 // val sum part 4 (for dst[3]) + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) add x17, x3, w8, UXTW // srcp + filterPos[0] add x8, x3, w0, UXTW // srcp + filterPos[1] add x0, x3, w11, UXTW // srcp + filterPos[2] add x11, x3, w9, UXTW // srcp + filterPos[3] mov w15, w6 // filterSize counter -2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize - uxtl v4.8H, v4.8B // unpack part 1 to 16-bit - smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] - smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] - ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}] - ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v6.8H, v6.8B // unpack part 2 to 16-bit - smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - uxtl v16.8H, v16.8B // unpack part 3 to 16-bit - smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}] - smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize +2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v4.8h, v4.8b // unpack part 1 to 16-bit + smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] + smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] + ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v6.8h, v6.8b // unpack part 2 to 16-bit + smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + uxtl v16.8h, v16.8b // unpack part 3 to 16-bit + smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] + smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v18.8H, v18.8B // unpack part 4 to 16-bit - smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] 
- smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + uxtl v18.8h, v18.8b // unpack part 4 to 16-bit + smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding - addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding - addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding subs w2, w2, #4 // dstW -= 4 - sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values - st1 {v0.4H}, [x1], #8 // write to destination part0123 + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h}, [x1], #8 // write to destination part0123 b.gt 1b // loop until end of line ret endfunc @@ -245,7 +245,7 @@ function ff_hscale8to15_4_neon, export=1 stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } 1: - ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers + ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers // load 8 values from filterPos to be used as offsets into src ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration @@ -253,74 +253,74 @@ function ff_hscale8to15_4_neon, export=1 ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration - movi v0.2D, #0 // Clear madd accumulator for idx 0..3 - movi v5.2D, #0 // Clear madd accumulator for idx 4..7 + movi v0.2d, #0 // Clear madd accumulator for idx 0..3 + movi v5.2d, #0 // Clear madd accumulator for idx 4..7 - ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 add x5, x5, #32 // advance filterPos // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy - uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit - uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit + uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit + uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration - uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit - uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit + uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit + uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration - smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3 - smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3 + smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3 + smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next
iteration ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration - smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3 - smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3 + smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 + smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration - smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7 - smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7 + smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 + smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } - smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7 - smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7 + smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 + smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } sub w2, w2, #8 // dstW -= 8 - sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values - sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values - st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7] + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] cmp w2, #16 // continue on main loop if there are at least 16 iterations left b.ge 1b // last full iteration - ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] - ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7 + ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 - movi v0.2D, #0 // Clear madd accumulator for idx 0..3 - movi v5.2D, #0 // Clear madd accumulator for idx 4..7 + movi v0.2d, #0 // Clear madd accumulator for idx 0..3 + movi v5.2d, #0 // Clear madd accumulator for idx 4..7 - uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit - uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit - uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit - uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit + uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit + uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit + uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit + uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit - smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3 - smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3 - smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3 - smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3 + smlal v0.4s, v1.4h, v16.4h //
multiply accumulate inner loop j = 0, idx = 0..3 + smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 + smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 + smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 - smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7 - smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7 - smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7 - smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7 + smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 + smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 + smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 + smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 subs w2, w2, #8 // dstW -= 8 - sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values - sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values - st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7] + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section @@ -332,15 +332,15 @@ function ff_hscale8to15_4_neon, export=1 // load src ldr w8, [x5], #4 // filterPos[i] add x9, x3, w8, UXTW // calculate the address for src load - ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3] + ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3] // load filter - ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3] + ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3] - uxtl v5.8H, v5.8B // unsigned extend long, convert src data to 16-bit - smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...] - addv s0, v0.4S // add up products of src and filter values + uxtl v5.8h, v5.8b // unsigned extend long, convert src data to 16-bit + smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...] + addv s0, v0.4s // add up products of src and filter values sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value - st1 {v0.H}[0], [x1], #2 // dst[i] = ... + st1 {v0.h}[0], [x1], #2 // dst[i] = ...
sub w2, w2, #1 // dstW-- cbnz w2, 2b @@ -445,12 +445,12 @@ function ff_hscale8to19_4_neon, export=1 smull v5.4s, v0.4h, v28.4h smull2 v6.4s, v0.8h, v28.8h uxtl v2.8h, v2.8b - smlal v5.4s, v1.4h, v29.4H - smlal2 v6.4s, v1.8h, v29.8H + smlal v5.4s, v1.4h, v29.4h + smlal2 v6.4s, v1.8h, v29.8h uxtl v3.8h, v3.8b - smlal v5.4s, v2.4h, v30.4H - smlal2 v6.4s, v2.8h, v30.8H - smlal v5.4s, v3.4h, v31.4H + smlal v5.4s, v2.4h, v30.4h + smlal2 v6.4s, v2.8h, v30.8h + smlal v5.4s, v3.4h, v31.4h smlal2 v6.4s, v3.8h, v31.8h sshr v5.4s, v5.4s, #3 @@ -472,8 +472,8 @@ function ff_hscale8to19_4_neon, export=1 ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single ld1 {v31.4h}, [x4], #8 uxtl v0.8h, v0.8b - smull v5.4s, v0.4h, v31.4H - saddlv d0, v5.4S + smull v5.4s, v0.4h, v31.4h + saddlv d0, v5.4s sqshrn s0, d0, #3 smin v0.4s, v0.4s, v18.4s st1 {v0.s}[0], [x1], #4 @@ -499,42 +499,42 @@ function ff_hscale8to19_X8_neon, export=1 ldr w11, [x5], #4 // filterPos[idx + 2] add x4, x13, x7 // filter3 = filter2 + filterSize*2 ldr w9, [x5], #4 // filterPos[idx + 3] - movi v0.2D, #0 // val sum part 1 (for dst[0]) - movi v1.2D, #0 // val sum part 2 (for dst[1]) - movi v2.2D, #0 // val sum part 3 (for dst[2]) - movi v3.2D, #0 // val sum part 4 (for dst[3]) + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) add x17, x3, w8, UXTW // srcp + filterPos[0] add x8, x3, w0, UXTW // srcp + filterPos[1] add x0, x3, w11, UXTW // srcp + filterPos[2] add x11, x3, w9, UXTW // srcp + filterPos[3] mov w15, w6 // filterSize counter -2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 - uxtl v4.8H, v4.8B // unpack part 1 to 16-bit - smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] - ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] - smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] - ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize - ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}] - uxtl v6.8H, v6.8B // unpack part 2 to 16-bit - ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v16.8H, v16.8B // unpack part 3 to 16-bit - smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}] - smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize - smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - uxtl v18.8H, v18.8B // unpack part 4 to 16-bit - smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] +2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + uxtl v4.8h, v4.8b // unpack part 1 to 16-bit + smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] + ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] + smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] + uxtl v6.8h, v6.8b // unpack part 2 to 16-bit + 
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v16.8h, v16.8b // unpack part 3 to 16-bit + smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] + smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize + smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + uxtl v18.8h, v18.8b // unpack part 4 to 16-bit + smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] subs w15, w15, #8 // j -= 8: processed 8/filterSize - smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding - addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding - addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding subs w2, w2, #4 // dstW -= 4 - sshr v0.4s, v0.4S, #3 // shift and clip the 2x16-bit final values + sshr v0.4s, v0.4s, #3 // shift and clip the 2x16-bit final values smin v0.4s, v0.4s, v20.4s st1 {v0.4s}, [x1], #16 // write to destination part0123 b.gt 1b // loop until end of line @@ -588,16 +588,16 @@ function ff_hscale8to19_X4_neon, export=1 smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0 ldr d6, [x10], #8 // load src values for idx 2 ldr q29, [x14, x16] // load filter values for idx 2 - smlal v17.4s, v5.4h, v30.4H // multiplication of lower half for idx 1 + smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1 ldr d7, [x11], #8 // load src values for idx 3 - smlal2 v17.4s, v5.8h, v30.8H // multiplication of upper half for idx 1 - uxtl v6.8h, v6.8B // extend type to match the filter's size + smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1 + uxtl v6.8h, v6.8b // extend type to match the filter's size ldr q28, [x15, x16] // load filter values for idx 3 smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2 - uxtl v7.8h, v7.8B - smlal2 v18.4s, v6.8h, v29.8H // multiplication of upper half for idx 2 + uxtl v7.8h, v7.8b + smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2 sub w0, w0, #8 - smlal v19.4s, v7.4h, v28.4H // multiplication of lower half for idx 3 + smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3 cmp w0, #8 smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3 add x16, x16, #16 // advance filter values indexing @@ -618,11 +618,11 @@ function ff_hscale8to19_X4_neon, export=1 uxtl v5.8h, v5.8b // extend type to match the filter's size ldr s6, [x10] // load src values for idx 2 smlal v17.4s, v5.4h, v30.4h - uxtl v6.8h, v6.8B // extend type to match the filter's size + uxtl v6.8h, v6.8b // extend type to match the filter's size ldr d29, [x14, x17] // load filter values for idx 2 ldr s7, [x11] // load src values for idx 3 addp v16.4s, v16.4s, v17.4s - uxtl v7.8h, v7.8B + uxtl v7.8h, v7.8b ldr d28, [x15, x17] // load filter values for idx 3 smlal v18.4s, v6.4h, v29.4h
smlal v19.4s, v7.4h, v28.4h @@ -700,31 +700,31 @@ function ff_hscale16to15_4_neon_asm, export=1 // Extending to 32 bits is necessary, as uint16_t values can't // be represented as int16_t without type promotion. uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4H + sxtl v27.4s, v28.4h uxtl2 v0.4s, v0.8h mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8H + sxtl2 v28.4s, v28.8h uxtl v26.4s, v1.4h mul v6.4s, v0.4s, v28.4s - sxtl v27.4s, v29.4H + sxtl v27.4s, v29.4h uxtl2 v0.4s, v1.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v29.8H + sxtl2 v28.4s, v29.8h uxtl v26.4s, v2.4h mla v6.4s, v28.4s, v0.4s - sxtl v27.4s, v30.4H + sxtl v27.4s, v30.4h uxtl2 v0.4s, v2.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v30.8H + sxtl2 v28.4s, v30.8h uxtl v26.4s, v3.4h mla v6.4s, v28.4s, v0.4s - sxtl v27.4s, v31.4H + sxtl v27.4s, v31.4h uxtl2 v0.4s, v3.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v31.8H + sxtl2 v28.4s, v31.8h sub w2, w2, #8 mla v6.4s, v28.4s, v0.4s @@ -775,31 +775,31 @@ function ff_hscale16to15_4_neon_asm, export=1 ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4H + sxtl v27.4s, v28.4h uxtl2 v0.4s, v0.8h mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8H + sxtl2 v28.4s, v28.8h uxtl v26.4s, v1.4h mul v6.4s, v0.4s, v28.4s - sxtl v27.4s, v29.4H + sxtl v27.4s, v29.4h uxtl2 v0.4s, v1.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v29.8H + sxtl2 v28.4s, v29.8h uxtl v26.4s, v2.4h mla v6.4s, v0.4s, v28.4s - sxtl v27.4s, v30.4H + sxtl v27.4s, v30.4h uxtl2 v0.4s, v2.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v30.8H + sxtl2 v28.4s, v30.8h uxtl v26.4s, v3.4h mla v6.4s, v0.4s, v28.4s - sxtl v27.4s, v31.4H + sxtl v27.4s, v31.4h uxtl2 v0.4s, v3.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v31.8H + sxtl2 v28.4s, v31.8h subs w2, w2, #8 mla v6.4s, v0.4s, v28.4s @@ -807,7 +807,7 @@ function ff_hscale16to15_4_neon_asm, export=1 sshl v6.4s, v6.4s, v17.4s smin v5.4s, v5.4s, v18.4s smin v6.4s, v6.4s, v18.4s - xtn v5.4h, v5.4S + xtn v5.4h, v5.4s xtn2 v5.8h, v6.4s st1 {v5.8h}, [x1], #16 @@ -826,7 +826,7 @@ function ff_hscale16to15_4_neon_asm, export=1 uxtl v0.4s, v0.4h sxtl v31.4s, v31.4h mul v5.4s, v0.4s, v31.4s - addv s0, v5.4S + addv s0, v5.4s sshl v0.4s, v0.4s, v17.4s smin v0.4s, v0.4s, v18.4s st1 {v0.h}[0], [x1], #2 @@ -865,58 +865,58 @@ function ff_hscale16to15_X8_neon_asm, export=1 add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2D, #0 // val sum part 1 (for dst[0]) - movi v1.2D, #0 // val sum part 2 (for dst[1]) - movi v2.2D, #0 // val sum part 3 (for dst[2]) - movi v3.2D, #0 // val sum part 4 (for dst[3]) + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) add x17, x3, w8, UXTW // srcp + filterPos[0] add x8, x3, w10, UXTW // srcp + filterPos[1] add x10, x3, w11, UXTW // srcp + filterPos[2] add x11, x3, w9, UXTW // srcp + filterPos[3] mov w15, w6 // filterSize counter -2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize - uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign - sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size +2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h},
[x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign + sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits - mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 - sxtl v27.4s, v7.4H // extend filter lower half - uxtl2 v6.4s, v6.8H // extend srcp upper half + mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 + sxtl v27.4s, v7.4h // extend filter lower half + uxtl2 v6.4s, v6.8h // extend srcp upper half sxtl2 v7.4s, v7.8h // extend filter upper half - ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}] - mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v22.4s, v16.4H // extend srcp lower half - sxtl v23.4s, v17.4H // extend filter lower half - uxtl2 v16.4s, v16.8H // extend srcp upper half + ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] + mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v22.4s, v16.4h // extend srcp lower half + sxtl v23.4s, v17.4h // extend filter lower half + uxtl2 v16.4s, v16.8h // extend srcp upper half sxtl2 v17.4s, v17.8h // extend filter upper half - mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}] - mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize + mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] + mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v28.4s, v18.4H // extend srcp lower half - sxtl v29.4s, v19.4H // extend filter lower half - uxtl2 v18.4s, v18.8H // extend srcp upper half + uxtl v28.4s, v18.4h // extend srcp lower half + sxtl v29.4s, v19.4h // extend filter lower half + uxtl2 v18.4s, v18.8h // extend srcp upper half sxtl2 v19.4s, v19.8h // extend filter upper half - mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding - addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding - addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding + addp v0.4s, v0.4s, v1.4s // part01
horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding subs w2, w2, #4 // dstW -= 4 sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) xtn v0.4h, v0.4s // narrow down to 16 bits - st1 {v0.4H}, [x1], #8 // write to destination part0123 + st1 {v0.4h}, [x1], #8 // write to destination part0123 b.gt 1b // loop until end of line ret endfunc @@ -1108,31 +1108,31 @@ function ff_hscale16to19_4_neon_asm, export=1 // Extending to 32 bits is necessary, as uint16_t values can't // be represented as int16_t without type promotion. uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4H + sxtl v27.4s, v28.4h uxtl2 v0.4s, v0.8h mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8H + sxtl2 v28.4s, v28.8h uxtl v26.4s, v1.4h mul v6.4s, v0.4s, v28.4s - sxtl v27.4s, v29.4H + sxtl v27.4s, v29.4h uxtl2 v0.4s, v1.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v29.8H + sxtl2 v28.4s, v29.8h uxtl v26.4s, v2.4h mla v6.4s, v28.4s, v0.4s - sxtl v27.4s, v30.4H + sxtl v27.4s, v30.4h uxtl2 v0.4s, v2.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v30.8H + sxtl2 v28.4s, v30.8h uxtl v26.4s, v3.4h mla v6.4s, v28.4s, v0.4s - sxtl v27.4s, v31.4H + sxtl v27.4s, v31.4h uxtl2 v0.4s, v3.8h mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v31.8H + sxtl2 v28.4s, v31.8h sub w2, w2, #8 mla v6.4s, v28.4s, v0.4s @@ -1181,31 +1181,31 @@ function ff_hscale16to19_4_neon_asm, export=1 ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4H + sxtl v27.4s, v28.4h uxtl2 v0.4s, v0.8h mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8H + sxtl2 v28.4s, v28.8h uxtl v26.4s, v1.4h mul v6.4s, v0.4s, v28.4s - sxtl v27.4s, v29.4H + sxtl v27.4s, v29.4h uxtl2 v0.4s, v1.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v29.8H + sxtl2 v28.4s, v29.8h uxtl v26.4s, v2.4h mla v6.4s, v0.4s, v28.4s - sxtl v27.4s, v30.4H + sxtl v27.4s, v30.4h uxtl2 v0.4s, v2.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v30.8H + sxtl2 v28.4s, v30.8h uxtl v26.4s, v3.4h mla v6.4s, v0.4s, v28.4s - sxtl v27.4s, v31.4H + sxtl v27.4s, v31.4h uxtl2 v0.4s, v3.8h mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v31.8H + sxtl2 v28.4s, v31.8h subs w2, w2, #8 mla v6.4s, v0.4s, v28.4s @@ -1232,7 +1232,7 @@ function ff_hscale16to19_4_neon_asm, export=1 sxtl v31.4s, v31.4h subs w2, w2, #1 mul v5.4s, v0.4s, v31.4s - addv s0, v5.4S + addv s0, v5.4s sshl v0.4s, v0.4s, v17.4s smin v0.4s, v0.4s, v18.4s st1 {v0.s}[0], [x1], #4 @@ -1270,52 +1270,52 @@ function ff_hscale16to19_X8_neon_asm, export=1 add x13, x12, x7 // filter2 = filter1 + filterSize*2 lsl w10, w10, #1 add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2D, #0 // val sum part 1 (for dst[0]) - movi v1.2D, #0 // val sum part 2 (for dst[1]) - movi v2.2D, #0 // val sum part 3 (for dst[2]) - movi v3.2D, #0 // val sum part 4 (for dst[3]) + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) add x17, x3, w8, UXTW // srcp + filterPos[0] add x8, x3, w10, UXTW // srcp + filterPos[1] add x10, x3, w11, UXTW // srcp + filterPos[2] add x11, x3, w9, UXTW // srcp + filterPos[3] mov w15, w6 // filterSize counter -2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8H}, [x12], #16 // load
8x16-bit at filter+filterSize - uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign - sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size +2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign + sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits - mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 - sxtl v27.4s, v7.4H // extend filter lower half - uxtl2 v6.4s, v6.8H // extend srcp upper half + mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 + sxtl v27.4s, v7.4h // extend filter lower half + uxtl2 v6.4s, v6.8h // extend srcp upper half sxtl2 v7.4s, v7.8h // extend filter upper half - ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}] - mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v22.4s, v16.4H // extend srcp lower half - sxtl v23.4s, v17.4H // extend filter lower half - uxtl2 v16.4s, v16.8H // extend srcp upper half + ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] + mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v22.4s, v16.4h // extend srcp lower half + sxtl v23.4s, v17.4h // extend filter lower half + uxtl2 v16.4s, v16.8h // extend srcp upper half sxtl2 v17.4s, v17.8h // extend filter upper half - mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}] - mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize + mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] + mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v28.4s, v18.4H // extend srcp lower half - sxtl v29.4s, v19.4H // extend filter lower half - uxtl2 v18.4s, v18.8H // extend srcp upper half + uxtl v28.4s, v18.4h // extend srcp lower half + sxtl v29.4s, v19.4h // extend filter lower half + uxtl2 v18.4s, v18.8h // extend srcp upper half sxtl2 v19.4s, v19.8h // extend filter upper half - mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] b.gt 2b // inner loop if
filterSize not consumed completely - addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding - addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding - addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding subs w2, w2, #4 // dstW -= 4 sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S index b8a2818c9b..344d0659ea 100644 --- a/libswscale/aarch64/output.S +++ b/libswscale/aarch64/output.S @@ -29,13 +29,13 @@ function ff_yuv2planeX_8_neon, export=1 // x5 - const uint8_t *dither, // w6 - int offset - ld1 {v0.8B}, [x5] // load 8x8-bit dither + ld1 {v0.8b}, [x5] // load 8x8-bit dither and w6, w6, #7 cbz w6, 1f // check if offsetting present - ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only -1: uxtl v0.8H, v0.8B // extend dither to 16-bit - ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1) - ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2) + ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only +1: uxtl v0.8h, v0.8b // extend dither to 16-bit + ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1) + ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2) cmp w1, #8 // if filterSize == 8, branch to specialized version b.eq 6f cmp w1, #4 // if filterSize == 4, branch to specialized version @@ -48,8 +48,8 @@ function ff_yuv2planeX_8_neon, export=1 mov x7, #0 // i = 0 tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version // fs % 2 == 0 -2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value - mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value +2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value mov w8, w1 // tmpfilterSize = filterSize mov x9, x2 // srcp = src mov x10, x0 // filterp = filter 3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1] ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] add x11, x11, x7, lsl #1 // &src[j ][i] add x12, x12, x7, lsl #1 // &src[j+1][i] - ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P - smlal v3.4S, v5.4H, v7.H[0] // val0 += {A,B,C,D} * X - smlal2 v4.4S, v5.8H, v7.H[0] // val1 += {E,F,G,H} * X - smlal v3.4S, v6.4H, v7.H[1] // val0 += {I,J,K,L} * Y - smlal2 v4.4S, v6.8H, v7.H[1] // val1 += {M,N,O,P} * Y + ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P + smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X + smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X + smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y + smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y subs w8, w8, #2 // tmpfilterSize -= 2 b.gt 3b // loop until filterSize consumed @@ -77,17 +77,17 @@ function ff_yuv2planeX_8_neon, export=1 // If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0 -4: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value - mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value +4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value mov w8, w1 // tmpfilterSize = filterSize mov x9, x2 // srcp = src mov x10, x0 // filterp = filter 5: ldr x11, [x9], #8 // get 1 pointer: src[j] ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j] add x11, x11, x7, lsl #1 // &src[j ][i] - ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - smlal v3.4S, v5.4H, v6.H[0] // val0 += {A,B,C,D} * X - smlal2 v4.4S, v5.8H, v6.H[0] // val1 += {E,F,G,H} * X + ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X + smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X subs w8, w8, #1 // tmpfilterSize -= 1 b.gt 5b // loop until filterSize consumed @@ -107,36 +107,36 @@ function ff_yuv2planeX_8_neon, export=1 ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7] // load 8x16-bit values for filter[j], where j=0..7 - ld1 {v6.8H}, [x0] + ld1 {v6.8h}, [x0] 7: - mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value - mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value - - ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] - ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] - ld1 {v28.8H}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}] - ld1 {v29.8H}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}] - ld1 {v30.8H}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}] - ld1 {v31.8H}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}] - - smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] - smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2] - smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2] - smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3] - smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3] - smlal v3.4S, v28.4H, v6.H[4] // val0 += src[4][i + {0..3}] * filter[4] - smlal2 v4.4S, v28.8H, v6.H[4] // val1 += src[4][i + {4..7}] * filter[4] - smlal v3.4S, v29.4H, v6.H[5] // val0 += src[5][i + {0..3}] * filter[5] - smlal2 v4.4S, v29.8H, v6.H[5] // val1 += src[5][i + {4..7}] * filter[5] - smlal v3.4S, v30.4H, v6.H[6] // val0 += src[6][i + {0..3}] * filter[6] - smlal2 v4.4S, v30.8H, v6.H[6] // val1 += src[6][i + {4..7}] * filter[6] - smlal v3.4S, v31.4H, v6.H[7] // val0 += src[7][i + {0..3}] * filter[7] - smlal2 v4.4S, v31.8H, v6.H[7] // val1 += src[7][i + {4..7}] * filter[7] + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j
+ 2][i + {0..7}] + ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] + ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}] + ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}] + ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}] + ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}] + + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] + smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] + smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] + smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] + smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] + smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4] + smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4] + smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5] + smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5] + smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6] + smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6] + smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7] + smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7] sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) @@ -151,24 +151,24 @@ function ff_yuv2planeX_8_neon, export=1 ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes - ld1 {v6.4H}, [x0] + ld1 {v6.4h}, [x0] 9: - mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value - mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value - - ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] - ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] - - smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] - smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2] - smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2] - smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3] - smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3] + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] + ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] + + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += 
src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] + smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] + smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] + smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] + smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) @@ -184,16 +184,16 @@ function ff_yuv2planeX_8_neon, export=1 // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes ldr s6, [x0] 11: - mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value - mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) @@ -210,11 +210,11 @@ function ff_yuv2plane1_8_neon, export=1 // w2 - int dstW, // x3 - const uint8_t *dither, // w4 - int offset - ld1 {v0.8B}, [x3] // load 8x8-bit dither + ld1 {v0.8b}, [x3] // load 8x8-bit dither and w4, w4, #7 cbz w4, 1f // check if offsetting present - ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only -1: uxtl v0.8H, v0.8B // extend dither to 32-bit + ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only +1: uxtl v0.8h, v0.8b // extend dither to 32-bit uxtl v1.4s, v0.4h uxtl2 v2.4s, v0.8h 2: diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index f341268c5d..3fc91530b6 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -33,9 +33,9 @@ .macro load_args_nv12 ldr x8, [sp] // table load_yoff_ycoeff 8, 16 // y_offset, y_coeff - ld1 {v1.1D}, [x8] - dup v0.8H, w10 - dup v3.8H, w9 + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w7, w7, w0 // w7 = linesizeC - width (paddingC) @@ -51,9 +51,9 @@ ldr w14, [sp, #8] // linesizeV ldr x8, [sp, #16] // table load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1D}, [x8] - dup v0.8H, w10 - dup v3.8H, w9 + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 
(paddingU) @@ -67,9 +67,9 @@ ldr w14, [sp, #8] // linesizeV ldr x8, [sp, #16] // table load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1D}, [x8] - dup v0.8H, w10 - dup v3.8H, w9 + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) @@ -77,22 +77,22 @@ .endm .macro load_chroma_nv12 - ld2 {v16.8B, v17.8B}, [x6], #16 - ushll v18.8H, v16.8B, #3 - ushll v19.8H, v17.8B, #3 + ld2 {v16.8b, v17.8b}, [x6], #16 + ushll v18.8h, v16.8b, #3 + ushll v19.8h, v17.8b, #3 .endm .macro load_chroma_nv21 - ld2 {v16.8B, v17.8B}, [x6], #16 - ushll v19.8H, v16.8B, #3 - ushll v18.8H, v17.8B, #3 + ld2 {v16.8b, v17.8b}, [x6], #16 + ushll v19.8h, v16.8b, #3 + ushll v18.8h, v17.8b, #3 .endm .macro load_chroma_yuv420p - ld1 {v16.8B}, [ x6], #8 - ld1 {v17.8B}, [x13], #8 - ushll v18.8H, v16.8B, #3 - ushll v19.8H, v17.8B, #3 + ld1 {v16.8b}, [ x6], #8 + ld1 {v17.8b}, [x13], #8 + ushll v18.8h, v16.8b, #3 + ushll v19.8h, v17.8b, #3 .endm .macro load_chroma_yuv422p @@ -123,18 +123,18 @@ .endm .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 - add v20.8H, v26.8H, v20.8H // Y1 + R1 - add v21.8H, v27.8H, v21.8H // Y2 + R2 - add v22.8H, v26.8H, v22.8H // Y1 + G1 - add v23.8H, v27.8H, v23.8H // Y2 + G2 - add v24.8H, v26.8H, v24.8H // Y1 + B1 - add v25.8H, v27.8H, v25.8H // Y2 + B2 - sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1) - sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R1) >> 1) - sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1) - sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G1) >> 1) - sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1) - sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B1) >> 1) + add v20.8h, v26.8h, v20.8h // Y1 + R1 + add v21.8h, v27.8h, v21.8h // Y2 + R2 + add v22.8h, v26.8h, v22.8h // Y1 + G1 + add v23.8h, v27.8h, v23.8h // Y2 + G2 + add v24.8h, v26.8h, v24.8h // Y1 + B1 + add v25.8h, v27.8h, v25.8h // Y2 + B2 + sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1) + sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1) + sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1) + sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1) + sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1) + sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1) movi \a1, #255 movi \a2, #255 .endm @@ -146,47 +146,47 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 1: mov w8, w0 // w8 = width 2: - movi v5.8H, #4, lsl #8 // 128 * (1<<3) + movi v5.8h, #4, lsl #8 // 128 * (1<<3) load_chroma_\ifmt - sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3) - sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3) - sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R) - sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g - sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g - add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G) - sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B) - zip2 v21.8H, v20.8H, v20.8H // R2 - zip1 v20.8H, v20.8H, v20.8H // R1 - zip2 v23.8H, v22.8H, v22.8H // G2 - zip1 v22.8H, v22.8H, v22.8H // G1 - zip2 v25.8H, v24.8H, v24.8H // B2 - zip1 v24.8H, v24.8H, v24.8H // B1 - ld1 {v2.16B}, [x4], #16 // load luma - ushll v26.8H, v2.8B, #3 // Y1*(1<<3) - ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3) - sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset - sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset - sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 - sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 + sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3) 
+ sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3) + sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R) + sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g + sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g + add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G) + sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B) + zip2 v21.8h, v20.8h, v20.8h // R2 + zip1 v20.8h, v20.8h, v20.8h // R1 + zip2 v23.8h, v22.8h, v22.8h // G2 + zip1 v22.8h, v22.8h, v22.8h // G1 + zip2 v25.8h, v24.8h, v24.8h // B2 + zip1 v24.8h, v24.8h, v24.8h // B1 + ld1 {v2.16b}, [x4], #16 // load luma + ushll v26.8h, v2.8b, #3 // Y1*(1<<3) + ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3) + sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset + sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset + sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 + sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 .ifc \ofmt,argb // 1 2 3 0 - compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B + compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b .endif .ifc \ofmt,rgba // 0 1 2 3 - compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B + compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b .endif .ifc \ofmt,abgr // 3 2 1 0 - compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B + compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b .endif .ifc \ofmt,bgra // 2 1 0 3 - compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B + compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b .endif - st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32 - st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32 + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 subs w8, w8, #16 // width -= 16 b.gt 2b add x2, x2, w3, SXTW // dst += padding