author    | Krzysztof Pyrkosz <ffmpeg@szaka.eu> | 2025-03-01 13:59:00 +0100
committer | Martin Storsjö <martin@martin.st>   | 2025-03-02 01:17:29 +0200
commit    | 38929b824bcc4b3307af3e0711c5c03b823a83e3
tree      | 15b70b7fbc7f912ee6c2dd8eb8a3986c55f384af
parent    | 76b18100177843f443276e02a9592bbae9cd09c8
swscale/aarch64: Refactor hscale_16_to_15__fs_4
This patch removes the use of the stack for temporary state and replaces
the interleaved ld4 loads with ld1 loads.
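For context, the routine being optimized computes, per output sample, a 4-tap
weighted sum of 16-bit source samples, scales it down and clips it to 15 bits.
The sketch below is a simplified scalar model of that operation (my own
illustration, not FFmpeg's C reference; the function and parameter names are
mine), with the shift and clip mirroring the sshl/smin pair in the assembly:

    #include <stdint.h>

    /* Sketch: 16-bit -> 15-bit horizontal scale, filterSize == 4.
     * Each output sample is a 4-tap weighted sum of contiguous source
     * samples, scaled down and clipped to the 15-bit range. */
    static void hscale16to15_4_model(int16_t *dst, int dstW,
                                     const uint16_t *src, const int16_t *filter,
                                     const int32_t *filterPos, int shift)
    {
        for (int i = 0; i < dstW; i++) {
            int64_t acc = 0;                   /* the NEON code accumulates in 32 bits */
            for (int j = 0; j < 4; j++)
                acc += (int64_t)src[filterPos[i] + j] * filter[4 * i + j];
            acc >>= shift;                     /* sshl by a negative count in the asm */
            dst[i] = acc > 0x7FFF ? 0x7FFF : (int16_t)acc;  /* smin against (1<<15)-1 */
        }
    }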
Before/after:

A78, before:
hscale_16_to_15__fs_4_dstW_8_neon:     86.8 ( 1.72x)
hscale_16_to_15__fs_4_dstW_24_neon:   147.5 ( 2.73x)
hscale_16_to_15__fs_4_dstW_128_neon:  614.0 ( 3.14x)
hscale_16_to_15__fs_4_dstW_144_neon:  680.5 ( 3.18x)
hscale_16_to_15__fs_4_dstW_256_neon: 1193.2 ( 3.19x)
hscale_16_to_15__fs_4_dstW_512_neon: 2305.0 ( 3.27x)

A78, after:
hscale_16_to_15__fs_4_dstW_8_neon:     86.0 ( 1.74x)
hscale_16_to_15__fs_4_dstW_24_neon:   106.8 ( 3.78x)
hscale_16_to_15__fs_4_dstW_128_neon:  404.0 ( 4.81x)
hscale_16_to_15__fs_4_dstW_144_neon:  451.8 ( 4.80x)
hscale_16_to_15__fs_4_dstW_256_neon:  760.5 ( 5.06x)
hscale_16_to_15__fs_4_dstW_512_neon: 1520.0 ( 5.01x)

A72, before:
hscale_16_to_15__fs_4_dstW_8_neon:    156.8 ( 1.52x)
hscale_16_to_15__fs_4_dstW_24_neon:   217.8 ( 2.52x)
hscale_16_to_15__fs_4_dstW_128_neon:  906.8 ( 2.90x)
hscale_16_to_15__fs_4_dstW_144_neon: 1014.5 ( 2.91x)
hscale_16_to_15__fs_4_dstW_256_neon: 1751.5 ( 2.96x)
hscale_16_to_15__fs_4_dstW_512_neon: 3469.3 ( 2.97x)

A72, after:
hscale_16_to_15__fs_4_dstW_8_neon:    151.2 ( 1.54x)
hscale_16_to_15__fs_4_dstW_24_neon:   173.4 ( 3.15x)
hscale_16_to_15__fs_4_dstW_128_neon:  660.0 ( 3.98x)
hscale_16_to_15__fs_4_dstW_144_neon:  735.7 ( 4.00x)
hscale_16_to_15__fs_4_dstW_256_neon: 1273.5 ( 4.09x)
hscale_16_to_15__fs_4_dstW_512_neon: 2488.2 ( 4.16x)
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libswscale/aarch64/hscale.S | 183
1 file changed, 70 insertions(+), 113 deletions(-)
diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index 435460c1af..4140fa9c60 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -638,6 +638,16 @@ function ff_hscale8to19_X4_neon, export=1
         ret
 endfunc
+
+.macro hscale_iter src, src2, filter, dst1, dst2
+        uxtl            \src\().4s, \src\().4h
+        sxtl            v19.4s, \filter\().4h
+        mul             \dst1\().4s, \src\().4s, v19.4s
+        uxtl            \src2\().4s, \src2\().4h
+        sxtl2           \filter\().4s, \filter\().8h
+        mul             \dst2\().4s, \src2\().4s, \filter\().4s
+.endm
+
 
 function ff_hscale16to15_4_neon_asm, export=1
         // w0              int shift
         // x1              int32_t *dst
@@ -664,6 +674,7 @@ function ff_hscale16to15_4_neon_asm, export=1
         add             x5, x5, #32
 
         // shift all filterPos left by one, as uint16_t will be read
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
         lsl             x8, x8, #1
         lsl             x9, x9, #1
         lsl             x10, x10, #1
@@ -674,154 +685,101 @@ function ff_hscale16to15_4_neon_asm, export=1
         lsl             x15, x15, #1
 
         // load src with given offset
-        ldr             x8, [x3, w8, uxtw]
-        ldr             x9, [x3, w9, uxtw]
-        ldr             x10, [x3, w10, uxtw]
-        ldr             x11, [x3, w11, uxtw]
-        ldr             x12, [x3, w12, uxtw]
-        ldr             x13, [x3, w13, uxtw]
-        ldr             x14, [x3, w14, uxtw]
-        ldr             x15, [x3, w15, uxtw]
-
-        sub             sp, sp, #64
-        // push src on stack so it can be loaded into vectors later
-        stp             x8, x9, [sp]
-        stp             x10, x11, [sp, #16]
-        stp             x12, x13, [sp, #32]
-        stp             x14, x15, [sp, #48]
+        ldr             d0, [x3, w8, uxtw]
+        ldr             d1, [x3, w9, uxtw]
+        ldr             d2, [x3, w10, uxtw]
+        ldr             d3, [x3, w11, uxtw]
+        ldr             d4, [x3, w12, uxtw]
+        ldr             d5, [x3, w13, uxtw]
+        ldr             d6, [x3, w14, uxtw]
+        ldr             d7, [x3, w15, uxtw]
 
 1:
-        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
-        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
-
-        // Each of blocks does the following:
-        // Extend src and filter to 32 bits with uxtl and sxtl
-        // multiply or multiply and accumulate results
-        // Extending to 32 bits is necessary, as unit16_t values can't
-        // be represented as int16_t without type promotion.
-        uxtl            v26.4s, v0.4h
-        sxtl            v27.4s, v28.4h
-        uxtl2           v0.4s, v0.8h
-        mul             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v28.8h
-        uxtl            v26.4s, v1.4h
-        mul             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v29.4h
-        uxtl2           v0.4s, v1.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v29.8h
-        uxtl            v26.4s, v2.4h
-        mla             v6.4s, v28.4s, v0.4s
-
-        sxtl            v27.4s, v30.4h
-        uxtl2           v0.4s, v2.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v30.8h
-        uxtl            v26.4s, v3.4h
-        mla             v6.4s, v28.4s, v0.4s
-
-        sxtl            v27.4s, v31.4h
-        uxtl2           v0.4s, v3.8h
-        mla             v5.4s, v27.4s, v26.4s
-        sxtl2           v28.4s, v31.8h
-        sub             w2, w2, #8
-        mla             v6.4s, v28.4s, v0.4s
-
-        sshl            v5.4s, v5.4s, v17.4s
-        sshl            v6.4s, v6.4s, v17.4s
-        smin            v5.4s, v5.4s, v18.4s
-        smin            v6.4s, v6.4s, v18.4s
-        xtn             v5.4h, v5.4s
-        xtn2            v5.8h, v6.4s
-
-        st1             {v5.8h}, [x1], #16
-        cmp             w2, #16
         // load filterPositions into registers for next iteration
+
+        hscale_iter     v0, v1, v28, v20, v21
         ldp             w8, w9, [x5]            // filterPos[0], filterPos[1]
+        hscale_iter     v2, v3, v29, v22, v23
         ldp             w10, w11, [x5, #8]      // filterPos[2], filterPos[3]
+        hscale_iter     v4, v5, v30, v24, v25
         ldp             w12, w13, [x5, #16]     // filterPos[4], filterPos[5]
+        hscale_iter     v6, v7, v31, v26, v27
         ldp             w14, w15, [x5, #24]     // filterPos[6], filterPos[7]
+        subs            w2, w2, #8
         add             x5, x5, #32
+        ldp             q28, q29, [x4], #32     // filter[0..7]
         lsl             x8, x8, #1
         lsl             x9, x9, #1
         lsl             x10, x10, #1
         lsl             x11, x11, #1
+        ldp             q30, q31, [x4], #32     // filter[0..7]
         lsl             x12, x12, #1
         lsl             x13, x13, #1
         lsl             x14, x14, #1
         lsl             x15, x15, #1
-        ldr             x8, [x3, w8, uxtw]
-        ldr             x9, [x3, w9, uxtw]
-        ldr             x10, [x3, w10, uxtw]
-        ldr             x11, [x3, w11, uxtw]
-        ldr             x12, [x3, w12, uxtw]
-        ldr             x13, [x3, w13, uxtw]
-        ldr             x14, [x3, w14, uxtw]
-        ldr             x15, [x3, w15, uxtw]
+        addp            v20.4s, v20.4s, v21.4s
+        ldr             d0, [x3, w8, uxtw]
+        addp            v22.4s, v22.4s, v23.4s
+        ldr             d1, [x3, w9, uxtw]
+        addp            v24.4s, v24.4s, v25.4s
+        ldr             d2, [x3, w10, uxtw]
+        addp            v26.4s, v26.4s, v27.4s
+        ldr             d3, [x3, w11, uxtw]
+        addp            v20.4s, v20.4s, v22.4s
+        ldr             d4, [x3, w12, uxtw]
+        addp            v21.4s, v24.4s, v26.4s
+        ldr             d5, [x3, w13, uxtw]
+        cmp             w2, #16
-        stp             x8, x9, [sp]
-        stp             x10, x11, [sp, #16]
-        stp             x12, x13, [sp, #32]
-        stp             x14, x15, [sp, #48]
+        sshl            v20.4s, v20.4s, v17.4s
+        ldr             d6, [x3, w14, uxtw]
+        sshl            v21.4s, v21.4s, v17.4s
+        ldr             d7, [x3, w15, uxtw]
+        smin            v20.4s, v20.4s, v18.4s
+        smin            v21.4s, v21.4s, v18.4s
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v21.4s
+
+        st1             {v20.8h}, [x1], #16
         b.ge            1b
 
         // here we make last iteration, without updating the registers
-        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
-        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
-
-        uxtl            v26.4s, v0.4h
-        sxtl            v27.4s, v28.4h
-        uxtl2           v0.4s, v0.8h
-        mul             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v28.8h
-        uxtl            v26.4s, v1.4h
-        mul             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v29.4h
-        uxtl2           v0.4s, v1.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v29.8h
-        uxtl            v26.4s, v2.4h
-        mla             v6.4s, v0.4s, v28.4s
-        sxtl            v27.4s, v30.4h
-        uxtl2           v0.4s, v2.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v30.8h
-        uxtl            v26.4s, v3.4h
-        mla             v6.4s, v0.4s, v28.4s
-
-        sxtl            v27.4s, v31.4h
-        uxtl2           v0.4s, v3.8h
-        mla             v5.4s, v26.4s, v27.4s
-        sxtl2           v28.4s, v31.8h
+        hscale_iter     v0, v1, v28, v20, v21
+        hscale_iter     v2, v3, v29, v22, v23
+        hscale_iter     v4, v5, v30, v24, v25
+        hscale_iter     v6, v7, v31, v26, v27
         subs            w2, w2, #8
-        mla             v6.4s, v0.4s, v28.4s
-        sshl            v5.4s, v5.4s, v17.4s
-        sshl            v6.4s, v6.4s, v17.4s
-        smin            v5.4s, v5.4s, v18.4s
-        smin            v6.4s, v6.4s, v18.4s
-        xtn             v5.4h, v5.4s
-        xtn2            v5.8h, v6.4s
+        addp            v20.4s, v20.4s, v21.4s
+        addp            v22.4s, v22.4s, v23.4s
+        addp            v24.4s, v24.4s, v25.4s
+        addp            v26.4s, v26.4s, v27.4s
+        addp            v0.4s, v20.4s, v22.4s
+        addp            v1.4s, v24.4s, v26.4s
-        st1             {v5.8h}, [x1], #16
-        add             sp, sp, #64             // restore stack
+        sshl            v0.4s, v0.4s, v17.4s
+        sshl            v1.4s, v1.4s, v17.4s
+        smin            v0.4s, v0.4s, v18.4s
+        smin            v1.4s, v1.4s, v18.4s
+        xtn             v0.4h, v0.4s
+        xtn2            v0.8h, v1.4s
+
+        st1             {v0.8h}, [x1], #16
         cbnz            w2, 2f
         ret
 2:
         ldr             w8, [x5], #4            // load filterPos
-        lsl             w8, w8, #1
-        add             x9, x3, w8, uxtw        // src + filterPos
+        add             x9, x3, w8, uxtw #1     // src + filterPos
         ld1             {v0.4h}, [x9]           // load 4 * uint16_t
         ld1             {v31.4h}, [x4], #8
+        sub             w2, w2, #1
 
         uxtl            v0.4s, v0.4h
         sxtl            v31.4s, v31.4h
@@ -830,7 +788,6 @@ function ff_hscale16to15_4_neon_asm, export=1
         sshl            v0.4s, v0.4s, v17.4s
         smin            v0.4s, v0.4s, v18.4s
         st1             {v0.h}[0], [x1], #2
-        sub             w2, w2, #1
         cbnz            w2, 2b                  // if iterations remain jump to beginning
         ret
 endfunc
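A note on the reduction idiom the new loop relies on: each hscale_iter leaves
four 32-bit products per output sample in one vector register, and two rounds
of addp then collapse four such vectors into a single vector holding the four
horizontal sums. Below is a minimal arm_neon.h sketch of that step
(illustrative only; the function name is mine):

    #include <arm_neon.h>

    /* Two rounds of pairwise adds: p0..p3 each hold the four 32-bit products
     * of one output sample; the result holds {sum(p0), sum(p1), sum(p2), sum(p3)},
     * matching the addp chains in the assembly above. */
    static int32x4_t reduce4_sums(int32x4_t p0, int32x4_t p1,
                                  int32x4_t p2, int32x4_t p3)
    {
        int32x4_t a = vpaddq_s32(p0, p1);   /* {p0[0]+p0[1], p0[2]+p0[3], p1[0]+p1[1], p1[2]+p1[3]} */
        int32x4_t b = vpaddq_s32(p2, p3);
        return vpaddq_s32(a, b);            /* one lane per output sample */
    }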