diff options
author | Zhao Zhili <zhilizhao@tencent.com> | 2024-09-29 20:02:48 +0800 |
---|---|---|
committer | Nuo Mi <nuomi2021@gmail.com> | 2024-10-01 10:28:54 +0800 |
commit | 5988a2729bc21ade237a9d0f9bb1112f39870f56 (patch) | |
tree | a046a875dcb4e9374ac99cf22890d2c34b6f242b | |
parent | bcd65ebd8fcdc4bf81db8a36bac0d7e4ddab0449 (diff) | |
download | ffmpeg-5988a2729bc21ade237a9d0f9bb1112f39870f56.tar.gz |
aarch64/vvc: Add dmvr
dmvr_8_12x20_c: 1.5 ( 1.00x)
dmvr_8_12x20_neon: 0.2 ( 6.56x)
dmvr_8_20x12_c: 1.0 ( 1.00x)
dmvr_8_20x12_neon: 0.2 ( 4.33x)
dmvr_8_20x20_c: 1.7 ( 1.00x)
dmvr_8_20x20_neon: 0.5 ( 3.63x)
dmvr_12_12x20_c: 2.2 ( 1.00x)
dmvr_12_12x20_neon: 0.5 ( 4.68x)
dmvr_12_20x12_c: 2.0 ( 1.00x)
dmvr_12_20x12_neon: 0.5 ( 4.16x)
dmvr_12_20x20_c: 3.7 ( 1.00x)
dmvr_12_20x20_neon: 0.7 ( 5.14x)
Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
-rw-r--r-- | libavcodec/aarch64/vvc/dsp_init.c | 4 | ||||
-rw-r--r-- | libavcodec/aarch64/vvc/inter.S | 87 |
2 files changed, 90 insertions, 1 deletions
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index b5f6f6da93..e47f1ab4cc 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -90,6 +90,8 @@ W_AVG_FUN(12) const uint8_t *_src, ptrdiff_t _src_stride, int height, \ intptr_t mx, intptr_t my, int width); +DMVR_FUN(, 8) +DMVR_FUN(, 12) DMVR_FUN(hv_, 8) DMVR_FUN(hv_, 10) DMVR_FUN(hv_, 12) @@ -166,6 +168,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.avg = ff_vvc_avg_8_neon; c->inter.w_avg = vvc_w_avg_8; + c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon; c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon; for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++) @@ -215,6 +218,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) } else if (bd == 12) { c->inter.avg = ff_vvc_avg_12_neon; c->inter.w_avg = vvc_w_avg_12; + c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon; c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon; c->alf.filter[LUMA] = alf_filter_luma_12_neon; diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 4fc8def133..b6b079b569 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -235,7 +235,7 @@ vvc_avg w_avg, 12 * x5: intptr_t my * w6: int width */ -function ff_vvc_dmvr_hv_8_neon, export=1 +function ff_vvc_dmvr_8_neon, export=1 dst .req x0 src .req x1 src_stride .req x2 @@ -243,6 +243,91 @@ function ff_vvc_dmvr_hv_8_neon, export=1 mx .req x4 my .req x5 width .req w6 + + sxtw x6, w6 + mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) + cmp width, #16 + sub src_stride, src_stride, x6 + cset w15, gt // width > 16 + movi v16.8h, #2 // DMVR_SHIFT + sub x7, x7, x6, lsl #1 +1: + cbz w15, 2f + ldr q0, [src], #16 + uxtl v1.8h, v0.8b + uxtl2 v2.8h, v0.16b + ushl v1.8h, v1.8h, v16.8h + ushl v2.8h, v2.8h, v16.8h + stp q1, q2, [dst], #32 + b 3f +2: + ldr d0, [src], #8 + uxtl v1.8h, v0.8b + ushl v1.8h, v1.8h, v16.8h + str q1, [dst], #16 +3: + subs height, height, #1 + ldr s3, [src], #4 + uxtl v4.8h, v3.8b + ushl v4.4h, v4.4h, v16.4h + st1 {v4.4h}, [dst], x7 + + add src, src, src_stride + b.ne 1b + + ret +endfunc + +function ff_vvc_dmvr_12_neon, export=1 + sxtw x6, w6 + mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) + cmp width, #16 + sub src_stride, src_stride, x6, lsl #1 + cset w15, gt // width > 16 + movi v16.8h, #2 // offset4 + sub x7, x7, x6, lsl #1 +1: + cbz w15, 2f + ldp q0, q1, [src], #32 + uaddl v2.4s, v0.4h, v16.4h + uaddl2 v3.4s, v0.8h, v16.8h + uaddl v4.4s, v1.4h, v16.4h + uaddl2 v5.4s, v1.8h, v16.8h + ushr v2.4s, v2.4s, #2 + ushr v3.4s, v3.4s, #2 + ushr v4.4s, v4.4s, #2 + ushr v5.4s, v5.4s, #2 + uqxtn v2.4h, v2.4s + uqxtn2 v2.8h, v3.4s + uqxtn v4.4h, v4.4s + uqxtn2 v4.8h, v5.4s + + stp q2, q4, [dst], #32 + b 3f +2: + ldr q0, [src], #16 + uaddl v2.4s, v0.4h, v16.4h + uaddl2 v3.4s, v0.8h, v16.8h + ushr v2.4s, v2.4s, #2 + ushr v3.4s, v3.4s, #2 + uqxtn v2.4h, v2.4s + uqxtn2 v2.8h, v3.4s + str q2, [dst], #16 +3: + subs height, height, #1 + ldr d0, [src], #8 + uaddl v3.4s, v0.4h, v16.4h + ushr v3.4s, v3.4s, #2 + uqxtn v3.4h, v3.4s + st1 {v3.4h}, [dst], x7 + + add src, src, src_stride + b.ne 1b + + ret +endfunc + +function ff_vvc_dmvr_hv_8_neon, export=1 tmp0 .req x7 tmp1 .req x8 |