commit     15eb10c6deea1103d5ae7c5acd36e10af511672b
author     Nuo Mi <nuomi2021@gmail.com>  2024-08-20 21:22:35 +0800
committer  Nuo Mi <nuomi2021@gmail.com>  2024-08-31 14:06:19 +0800
tree       89c8d9a9fb5b83bb42ffa5b7d8bc0aed4ffcc4f5
parent     f851abb4b352a1aa94c4d354f760851536803661
x86/vvcdec: inter, add optical flow avx2 code
BDoF used about 10%–25% of the CPU for some clips.
Here are the FPS from one run; please ignore the negative deltas, as they may be due to run-to-run variation.
clips | before | after | delta
--------------------------------------------|--------|-------|------
RitualDance_1920x1080_60_10_420_37_RA.266 | 310.0 | 363.0 | 14.60%
NovosobornayaSquare_1920x1080.bin | 322.3 | 339.7 | 5.12%
Tango2_3840x2160_60_10_420_27_LD.266 | 71.0 | 68.7 | -3.35%
RitualDance_1920x1080_60_10_420_32_LD.266 | 250.0 | 245.3 | -1.92%
Chimera_8bit_1080P_1000_frames.vvc | 359.3 | 422.7 | 15.00%
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 142.3 | 147.7 | 3.66%
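(Delta is measured against the new FPS, i.e. (after - before) / after; for example, the first row is (363.0 - 310.0) / 363.0 ≈ 14.6%.)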
Reviewed-by: James Almer <jamrial@gmail.com>
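
For background, the per-pixel BDOF correction that this patch vectorizes can be sketched in scalar C. This is an illustrative reading of the APPLY_BDOF_MIN_BLOCK_LINE macro in the diff below; the helper name and signature are hypothetical, and the asm's int16_t saturation (paddsw) is omitted:

    /* Hypothetical scalar sketch of one BDOF output pixel.
     * src0/src1: the two intermediate predictions for this pixel;
     * gh_diff:   gradient_h[0] - gradient_h[1];
     * gv_diff:   gradient_v[0] - gradient_v[1];
     * (vx, vy):  per-4x4-block motion refinement, clipped to [-15, 15]. */
    static int bdof_pixel(int src0, int src1, int gh_diff, int gv_diff,
                          int vx, int vy, int bd)
    {
        const int pixel_max = (1 << bd) - 1;
        const int shift4    = 15 - bd; /* the asm gets this rounded shift from
                                          pmulhrsw by pixel_max + 1 == 1 << bd */
        int v = src0 + src1 + vx * gh_diff + vy * gv_diff;
        v = (v + (1 << (shift4 - 1))) >> shift4;          /* rounded shift */
        return v < 0 ? 0 : v > pixel_max ? pixel_max : v; /* CLIPW */
    }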
-rw-r--r--  libavcodec/x86/vvc/Makefile      |   1
-rw-r--r--  libavcodec/x86/vvc/vvc_of.asm    | 385
-rw-r--r--  libavcodec/x86/vvc/vvcdsp_init.c |  21
3 files changed, 407 insertions, 0 deletions
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 04f16bc10c..aa59aa59cf 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -6,5 +6,6 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
 X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \
                                      x86/vvc/vvc_dmvr.o \
                                      x86/vvc/vvc_mc.o \
+                                     x86/vvc/vvc_of.o \
                                      x86/vvc/vvc_sad.o \
                                      x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_of.asm b/libavcodec/x86/vvc/vvc_of.asm
new file mode 100644
index 0000000000..5893bfb23a
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_of.asm
@@ -0,0 +1,385 @@
+; /*
+; * Provide AVX2 luma optical flow functions for VVC decoding
+; * Copyright (c) 2024 Nuo Mi
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE     128
+%define SRC_STRIDE      (MAX_PB_SIZE * 2)
+%define SRC_PS          2  ; source pixel size, sizeof(int16_t)
+%define BDOF_STACK_SIZE 10 ; (4 + 1) * 2, 4 lines + the first line, *2 for h and v
+%define bdof_stack_offset(line) ((line) * 2 % BDOF_STACK_SIZE * mmsize)
+%define SHIFT           6
+%define SHIFT2          4
+
+SECTION_RODATA 32
+pd_15          times 8 dd 15
+pd_m15         times 8 dd -15
+
+pb_shuffle_w8  times 2 db 0, 1, 0xff, 0xff, 8, 9, 0xff, 0xff, 6, 7, 0xff, 0xff, 14, 15, 0xff, 0xff
+pb_shuffle_w16 times 2 db 0, 1, 0xff, 0xff, 6, 7, 0xff, 0xff, 8, 9, 0xff, 0xff, 14, 15, 0xff, 0xff
+pd_perm_w16    dd 0, 2, 1, 4, 3, 6, 5, 7
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+
+SECTION .text
+
+INIT_YMM avx2
+
+; dst = (src0 >> shift) - (src1 >> shift)
+%macro DIFF 5 ; dst, src0, src1, shift, tmp
+    psraw           %1, %2, %4
+    psraw           %5, %3, %4
+    psubw           %1, %5
+%endmacro
+
+%macro LOAD_GRAD_H 4 ; dst, src, off, tmp
+    movu            %1, [%2 + %3 + 2 * SRC_PS]
+    movu            %4, [%2 + %3]
+
+    DIFF            %1, %1, %4, SHIFT, %4
+%endmacro
+
+%macro SUM_GRAD 2 ; dst/grad0, grad1
+    paddw           %1, %2
+    psraw           %1, 1 ; shift3
+%endmacro
+
+%macro APPLY_BDOF_MIN_BLOCK_LINE 5 ; dst, vx, vy, tmp, line_num
+%define off bdof_stack_offset(%5)
+    pmullw          %1, %2, [rsp + off + 0 * mmsize] ; vx * (gradient_h[0] - gradient_h[1])
+    pmullw          %4, %3, [rsp + off + 1 * mmsize] ; vy * (gradient_v[0] - gradient_v[1])
+    paddw           %1, [src0q + (%5 + 1) * SRC_STRIDE + SRC_PS]
+    paddw           %4, [src1q + (%5 + 1) * SRC_STRIDE + SRC_PS]
+    paddsw          %1, %4 ; src0[x] + src1[x] + bdof_offset
+    pmulhrsw        %1, m11
+    CLIPW           %1, m9, m10
+%endmacro
+
+%macro SAVE_8BPC 2 ; dst, src
+    packuswb        m%2, m%2
+    vpermq          m%2, m%2, q0020
+
+    cmp             wd, 16
+    je              %%w16
+    movq            %1, xm%2
+    jmp             %%wend
+%%w16:
+    movu            %1, xm%2
+%%wend:
+%endmacro
+
+%macro SAVE_16BPC 2 ; dst, src
+    cmp             wd, 16
+    je              %%w16
+    movu            %1, xm%2
+    jmp             %%wend
+%%w16:
+    movu            %1, m%2
+%%wend:
+%endmacro
+
+%macro SAVE 2 ; dst, src
+    cmp             pixel_maxd, (1 << 8) - 1
+    jne             %%save_16bpc
+    SAVE_8BPC       %1, %2
+    jmp             %%end
+%%save_16bpc:
+    SAVE_16BPC      %1, %2
+%%end:
+%endmacro
+
+; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1]
+; [rsp + odd * mmsize] are gradient_v[0] - gradient_v[1]
+%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd
+    pxor            m9, m9
+
+    movd            xm10, pixel_maxd
+    vpbroadcastw    m10, xm10
+
+    lea             tmp0d, [pixel_maxd + 1]
+    movd            xm11, tmp0d
+    vpbroadcastw    m11, xm11 ; pmulhrsw by (pixel_max + 1) == 1 << bd is a
+                              ; rounded right shift by shift_4 = 15 - bd
+
+    APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 0
+    SAVE            [dstq + 0 * dsq], 6
+
+    APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 1
+    SAVE            [dstq + 1 * dsq], 6
+
+    APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 2
+    SAVE            [dstq + 2 * dsq], 6
+
+    APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 3
+    SAVE            [dstq + ds3q], 6
+%endmacro
+
+%macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp
+    pshufb          %4, %1, %2
+    vpermd          %4, %3, %4
+    paddw           %1, %4
+%endmacro
+
+%macro SUM_MIN_BLOCK_W8 3 ; src/dst, shuffle, tmp
+    pshufb          %3, %1, %2
+    paddw           %1, %3
+%endmacro
+
+%macro BDOF_PROF_GRAD 2 ; line_no, last_line
+%assign i0 (%1 + 0) % 3
+%assign j0 (%1 + 1) % 3
+%assign k0 (%1 + 2) % 3
+%assign i1 3 + (%1 + 0) % 3
+%assign j1 3 + (%1 + 1) % 3
+%assign k1 3 + (%1 + 2) % 3
+
+; we cached src0 in m0 to m2
+%define t0 m %+ i0
+%define c0 m %+ j0
+%define b0 m %+ k0
+
+; we cached src1 in m3 to m5
+%define t1 m %+ i1
+%define c1 m %+ j1
+%define b1 m %+ k1
+%define ndiff t1
+%define off bdof_stack_offset(%1)
+
+    movu            b0, [src0q + (%1 + 2) * SRC_STRIDE + SRC_PS]
+    movu            b1, [src1q + (%1 + 2) * SRC_STRIDE + SRC_PS]
+
+    ; gradient_v[0], gradient_v[1]
+    DIFF            m6, b0, t0, SHIFT, t0
+    DIFF            m7, b1, t1, SHIFT, t1
+
+    ; save gradient_v[0] - gradient_v[1]
+    psubw           m10, m6, m7
+    mova            [rsp + off + mmsize], m10
+
+    ; gradient_h[0], gradient_h[1]
+    LOAD_GRAD_H     m8, src0q, (%1 + 1) * SRC_STRIDE, t0
+    LOAD_GRAD_H     m9, src1q, (%1 + 1) * SRC_STRIDE, t1
+
+    ; save gradient_h[0] - gradient_h[1]
+    psubw           m11, m8, m9
+    mova            [rsp + off], m11
+
+    SUM_GRAD        m8, m9 ; temph
+    SUM_GRAD        m6, m7 ; tempv
+
+    DIFF            ndiff, c1, c0, SHIFT2, t0 ; -diff
+
+    psignw          m7, ndiff, m8 ; sgxdi
+    psignw          m9, ndiff, m6 ; sgydi
+    psignw          m10, m8, m6   ; sgxgy
+
+    pabsw           m6, m6        ; sgy2
+    pabsw           m8, m8        ; sgx2
+
+    ; use t0, t1 as temporary buffers
+    cmp             wd, 16
+    je              %%w16
+    mova            t0, [pb_shuffle_w8]
+    SUM_MIN_BLOCK_W8 m6, t0, m11
+    SUM_MIN_BLOCK_W8 m7, t0, m11
+    SUM_MIN_BLOCK_W8 m8, t0, m11
+    SUM_MIN_BLOCK_W8 m9, t0, m11
+    SUM_MIN_BLOCK_W8 m10, t0, m11
+    jmp             %%wend
+
+%%w16:
+    mova            t0, [pb_shuffle_w16]
+    mova            t1, [pd_perm_w16]
+    SUM_MIN_BLOCK_W16 m6, t0, t1, m11
+    SUM_MIN_BLOCK_W16 m7, t0, t1, m11
+    SUM_MIN_BLOCK_W16 m8, t0, t1, m11
+    SUM_MIN_BLOCK_W16 m9, t0, t1, m11
+    SUM_MIN_BLOCK_W16 m10, t0, t1, m11
+
+%%wend:
+    vpblendd        m11, m8, m7, 10101010b
+    vpblendd        m7, m8, m7, 01010101b
+    pshufd          m7, m7, q2301
+    paddw           m8, m7, m11 ; 4 x (2sgx2, 2sgxdi)
+
+    vpblendd        m11, m6, m9, 10101010b
+    vpblendd        m9, m6, m9, 01010101b
+    pshufd          m9, m9, q2301
+    paddw           m6, m9, m11 ; 4 x (2sgy2, 2sgydi)
+
+    vpblendw        m11, m8, m6, 10101010b
+    vpblendw        m6, m8, m6, 01010101b
+    pshuflw         m6, m6, q2301
+    pshufhw         m6, m6, q2301
+    paddw           m8, m6, m11 ; 4 x (4sgx2, 4sgy2, 4sgxdi, 4sgydi)
+
+%if (%1) == 0 || (%2)
+    ; pad for top and bottom
+    paddw           m8, m8
+    paddw           m10, m10
+%endif
+
+    paddw           m12, m8
+    paddw           m13, m10
+%endmacro
+
+; unused vectorized variant; only the bsr-based overloads below are invoked
+; (NASM allows same-name macros with different parameter counts)
+%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
+    pcmpgtw         %5, %2, %3
+    pand            %5, %4
+    paddw           %1, %5
+
+    psrlw           %2, %5
+    psrlw           %4, 1
+    psrlw           %3, %4
+%endmacro
+
+%macro LOG2 2 ; dst/src, offset
+    pextrw          tmp0d, xm%1, %2
+    bsr             tmp0d, tmp0d
+    pinsrw          xm%1, tmp0d, %2
+%endmacro
+
+%macro LOG2 1 ; dst/src
+    LOG2            %1, 0
+    LOG2            %1, 1
+    LOG2            %1, 2
+    LOG2            %1, 3
+    LOG2            %1, 4
+    LOG2            %1, 5
+    LOG2            %1, 6
+    LOG2            %1, 7
+%endmacro
+
+; %1: 4 x (sgx2, sgy2, sgxdi, sgydi)
+; %2: 4 x (4sgxgy)
+%macro BDOF_VX_VY 2
+    pshufd          m6, m%1, q0032
+    punpckldq       m%1, m6
+    vextracti128    xm7, m%1, 1
+
+    punpcklqdq      m8, m%1, m7 ; 4 x (sgx2, sgy2)
+    punpckhqdq      m9, m%1, m7 ; 4 x (sgxdi, sgydi)
+    mova            m10, m8
+    LOG2            10          ; 4 x (log2(sgx2), log2(sgy2))
+
+    ; Promote to dword since vpsravw is AVX-512 only
+    pmovsxwd        m8, xm8
+    pmovsxwd        m9, xm9
+    pmovsxwd        m10, xm10
+
+    pslld           m9, 2       ; 4 x (sgxdi << 2, sgydi << 2)
+
+    psignd          m11, m9, m8
+    vpsravd         m11, m11, m10
+    CLIPD           m11, [pd_m15], [pd_15] ; 4 x (vx, junk)
+
+    pshuflw         m%1, m11, q0000
+    pshufhw         m%1, m%1, q0000 ; 4 x (2junk, 2vx)
+
+    psllq           m6, m%2, 32
+    paddw           m%2, m6
+
+    pmaddwd         m%2, m%1    ; 4 x (junk, vx * sgxgy)
+    psrad           m%2, 1
+    psubd           m9, m%2     ; 4 x (junk, (sgydi << 2) - (vx * sgxgy >> 1))
+
+    psignd          m9, m8
+    vpsravd         m%2, m9, m10
+    CLIPD           m%2, [pd_m15], [pd_15] ; 4 x (junk, vy)
+
+    pshuflw         m%2, m%2, q2222
+    pshufhw         m%2, m%2, q2222 ; 4 x (4vy)
+%endmacro
+
+%macro BDOF_MINI_BLOCKS 2 ; block_num, last_block
+%if (%1) == 0
+    movu            m0, [src0q + 0 * SRC_STRIDE + SRC_PS]
+    movu            m1, [src0q + 1 * SRC_STRIDE + SRC_PS]
+    movu            m3, [src1q + 0 * SRC_STRIDE + SRC_PS]
+    movu            m4, [src1q + 1 * SRC_STRIDE + SRC_PS]
+
+    pxor            m12, m12
+    pxor            m13, m13
+
+    BDOF_PROF_GRAD  0, 0
+%endif
+
+    mova            m14, m12
+    mova            m15, m13
+
+    pxor            m12, m12
+    pxor            m13, m13
+    BDOF_PROF_GRAD  %1 * 4 + 1, 0
+    BDOF_PROF_GRAD  %1 * 4 + 2, 0
+    paddw           m14, m12
+    paddw           m15, m13
+
+    pxor            m12, m12
+    pxor            m13, m13
+    BDOF_PROF_GRAD  %1 * 4 + 3, %2
+%if (%2) == 0
+    BDOF_PROF_GRAD  %1 * 4 + 4, 0
+%endif
+    paddw           m14, m12
+    paddw           m15, m13
+
+    BDOF_VX_VY      14, 15
+    APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd
+    lea             dstq, [dstq + 4 * dsq]
+%endmacro
+
+;void ff_vvc_apply_bdof_avx2(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1,
+;                            const int w, const int h, const int pixel_max)
+%macro BDOF_AVX2 0
+cglobal vvc_apply_bdof, 7, 10, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0, tmp1
+    lea             ds3q, [dsq * 3]
+    sub             src0q, SRC_STRIDE + SRC_PS
+    sub             src1q, SRC_STRIDE + SRC_PS
+
+    BDOF_MINI_BLOCKS 0, 0
+
+    cmp             hd, 16
+    je              .h16
+    BDOF_MINI_BLOCKS 1, 1
+    jmp             .end
+
+.h16:
+    BDOF_MINI_BLOCKS 1, 0
+    BDOF_MINI_BLOCKS 2, 0
+    BDOF_MINI_BLOCKS 3, 1
+
+.end:
+    RET
+%endmacro
+
+%macro VVC_OF_AVX2 0
+    BDOF_AVX2
+%endmacro
+
+VVC_OF_AVX2
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index d5b4f4f8a5..f3e2e3a27b 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -102,6 +102,20 @@ DMVR_PROTOTYPES( 8, avx2)
 DMVR_PROTOTYPES(10, avx2)
 DMVR_PROTOTYPES(12, avx2)
 
+void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+    const int16_t *src0, const int16_t *src1, int w, int h, int pixel_max);
+
+#define OF_PROTOTYPES(bd, opt)                                                         \
+static void ff_vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride,         \
+    const int16_t *src0, const int16_t *src1, int w, int h)                            \
+{                                                                                      \
+    ff_vvc_apply_bdof_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd) - 1);         \
+}
+
+OF_PROTOTYPES( 8, avx2)
+OF_PROTOTYPES(10, avx2)
+OF_PROTOTYPES(12, avx2)
+
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                                   \
 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,          \
     const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height,       \
@@ -328,6 +342,10 @@ ALF_FUNCS(16, 12, avx2)
     c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2;                                  \
 } while (0)
 
+#define OF_INIT(bd) do {                                                               \
+    c->inter.apply_bdof = ff_vvc_apply_bdof_##bd##_avx2;                               \
+} while (0)
+
 #define ALF_INIT(bd) do {                                                              \
     c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2;                          \
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;                      \
@@ -352,6 +370,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
         ALF_INIT(8);
         AVG_INIT(8, avx2);
         MC_LINKS_AVX2(8);
+        OF_INIT(8);
         DMVR_INIT(8);
         SAD_INIT();
     }
@@ -365,6 +384,7 @@
         AVG_INIT(10, avx2);
         MC_LINKS_AVX2(10);
         MC_LINKS_16BPC_AVX2(10);
+        OF_INIT(10);
         DMVR_INIT(10);
         SAD_INIT();
     }
@@ -378,6 +398,7 @@
         AVG_INIT(12, avx2);
         MC_LINKS_AVX2(12);
         MC_LINKS_16BPC_AVX2(12);
+        OF_INIT(12);
        DMVR_INIT(12);
         SAD_INIT();
     }
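
The densest part of the new asm is BDOF_VX_VY. A hedged scalar reading of its dataflow, using the names from the macro comments (sgx2, sgy2, sgxdi, sgydi, sgxgy), is sketched below; the helpers are hypothetical and assume arithmetic right shifts of negative ints, matching psrad/vpsravd:

    /* floor(log2(v)); bsr in the asm. Callers guard against v == 0. */
    static int floor_log2(unsigned v)
    {
        int n = 0;
        while (v >>= 1)
            n++;
        return n;
    }

    static int clip(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* psignd by sgx2/sgy2 (which are non-negative after pabsw) zeroes the
     * result when the corresponding sum is 0; vpsravd shifts right by
     * floor(log2(sum)); CLIPD bounds the result via pd_m15/pd_15. */
    static void bdof_vx_vy(int sgx2, int sgy2, int sgxdi, int sgydi, int sgxgy,
                           int *vx, int *vy)
    {
        *vx = sgx2 ? clip((sgxdi * 4) >> floor_log2(sgx2), -15, 15) : 0;
        *vy = sgy2 ? clip((sgydi * 4 - ((*vx * sgxgy) >> 1)) >> floor_log2(sgy2),
                          -15, 15) : 0;
    }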