diff options
author | Zhao Zhili <zhilizhao@tencent.com> | 2024-09-11 17:13:38 +0800 |
---|---|---|
committer | Nuo Mi <nuomi2021@gmail.com> | 2024-09-14 16:36:34 +0800 |
commit | 1be5a2374f224a1fc5e429aafe2ac8fc4efb548c (patch) | |
tree | 90fc6b8638b86a10ddb606c227140e34377dc5ed | |
parent | 0dcf204e5d7df388e477fd40c6d4eb4609c7c671 (diff) | |
download | ffmpeg-1be5a2374f224a1fc5e429aafe2ac8fc4efb548c.tar.gz |
aarch64/vvc: Add put_epel_hv
On Apple M1:
put_chroma_hv_8_4x4_c: 1.7 ( 1.00x)
put_chroma_hv_8_4x4_neon: 0.2 ( 7.67x)
put_chroma_hv_8_8x8_c: 5.5 ( 1.00x)
put_chroma_hv_8_8x8_neon: 0.5 (11.53x)
put_chroma_hv_8_16x16_c: 18.5 ( 1.00x)
put_chroma_hv_8_16x16_neon: 1.5 (12.53x)
put_chroma_hv_8_32x32_c: 72.5 ( 1.00x)
put_chroma_hv_8_32x32_neon: 4.7 (15.34x)
put_chroma_hv_8_64x64_c: 274.0 ( 1.00x)
put_chroma_hv_8_64x64_neon: 18.5 (14.83x)
put_chroma_hv_8_128x128_c: 1058.7 ( 1.00x)
put_chroma_hv_8_128x128_neon: 75.2 (14.07x)
On Android Pixel 8 Pro:
put_chroma_hv_8_4x4_c: 1.2 ( 1.00x)
put_chroma_hv_8_4x4_neon: 0.0 ( 0.00x)
put_chroma_hv_8_4x4_i8mm: 0.2 ( 5.00x)
put_chroma_hv_8_8x8_c: 4.0 ( 1.00x)
put_chroma_hv_8_8x8_neon: 0.5 ( 8.00x)
put_chroma_hv_8_8x8_i8mm: 0.5 ( 8.00x)
put_chroma_hv_8_16x16_c: 15.2 ( 1.00x)
put_chroma_hv_8_16x16_neon: 2.5 ( 6.10x)
put_chroma_hv_8_16x16_i8mm: 2.2 ( 6.78x)
put_chroma_hv_8_32x32_c: 61.0 ( 1.00x)
put_chroma_hv_8_32x32_neon: 9.8 ( 6.26x)
put_chroma_hv_8_32x32_i8mm: 8.5 ( 7.18x)
put_chroma_hv_8_64x64_c: 229.5 ( 1.00x)
put_chroma_hv_8_64x64_neon: 38.5 ( 5.96x)
put_chroma_hv_8_64x64_i8mm: 34.0 ( 6.75x)
put_chroma_hv_8_128x128_c: 919.8 ( 1.00x)
put_chroma_hv_8_128x128_neon: 154.5 ( 5.95x)
put_chroma_hv_8_128x128_i8mm: 140.0 ( 6.57x)
-rw-r--r-- | libavcodec/aarch64/h26x/dsp.h | 8 | ||||
-rw-r--r-- | libavcodec/aarch64/h26x/epel_neon.S | 125 | ||||
-rw-r--r-- | libavcodec/aarch64/vvc/dsp_init.c | 14 |
3 files changed, 147 insertions, 0 deletions
```diff
diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 90a42d7108..0fefb4d70f 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride, int height,
         const int8_t *hf, const int8_t *vf, int width), _i8mm);
 
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm);
+
 #endif
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index cad8f2a5f4..e44a448b1f 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -72,6 +72,11 @@ endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro vvc_load_epel_filterh freg
+        ld1             {v0.8b}, [\freg]
+        sxtl            v0.8h, v0.8b
+.endm
+
 .macro calc_epelh dst, src0, src1, src2, src3
         smull           \dst\().4s, \src0\().4h, v0.h[0]
         smlal           \dst\().4s, \src1\().4h, v0.h[1]
@@ -2299,10 +2304,16 @@ endfunc
 DISABLE_I8MM
 #endif
 
+function vvc_put_epel_hv4_8_end_neon
+        vvc_load_epel_filterh x5
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)
+        b               0f
+endfunc
 
 function hevc_put_hevc_epel_hv4_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ldr             d16, [sp]
         ldr             d17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv8_8_end_neon
+        vvc_load_epel_filterh x5
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)
+        b               0f
+endfunc
+
 function hevc_put_hevc_epel_hv8_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ldr             q16, [sp]
         ldr             q17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv16_8_end_neon
+        vvc_load_epel_filterh x5
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)
+        b               0f
+endfunc
+
 function hevc_put_hevc_epel_hv16_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv4_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv4_8_\suffix, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #8
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_vvc_put_epel_h4_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32
+        b               vvc_put_epel_hv4_8_end_neon
+endfunc
+
 function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv8_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv8_8_\suffix, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #8
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_vvc_put_epel_h8_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32
+        b               vvc_put_epel_hv8_8_end_neon
+endfunc
+
 function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv16_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv16_8_\suffix, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #8
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_vvc_put_epel_h16_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32
+        b               vvc_put_epel_hv16_8_end_neon
+endfunc
+
 function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_hv32_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #16
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48
+        add             x0, x0, #32
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
         stp             x4, x5, [sp, #-64]!
         stp             x2, x3, [sp, #16]
@@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
         ldr             x30, [sp], #16
         ret
 endfunc
+
+function ff_vvc_put_epel_hv64_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #32
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48
+        add             x0, x0, #64
+        add             x1, x1, #32
+        mov             x6, #32
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+function ff_vvc_put_epel_hv128_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #64
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48
+        add             x0, x0, #128
+        add             x1, x1, #64
+        mov             x6, #64
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
 .endm
 
 epel_hv neon
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c947885145..4867491620 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][5][0][1] =
             c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
 
+            c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
+            c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
+            c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
+            c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
+            c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
+            c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
+
             c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
             c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
             c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
@@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
                 c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
                 c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
                 c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
+
+                c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
+                c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
+                c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
+                c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
+                c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
+                c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
             }
         } else if (bd == 10) {
             c->alf.filter[LUMA] = alf_filter_luma_10_neon;
```