aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNuo Mi <nuomi2021@gmail.com>2024-08-20 21:22:35 +0800
committerNuo Mi <nuomi2021@gmail.com>2024-08-31 14:06:19 +0800
commit15eb10c6deea1103d5ae7c5acd36e10af511672b (patch)
tree89c8d9a9fb5b83bb42ffa5b7d8bc0aed4ffcc4f5
parentf851abb4b352a1aa94c4d354f760851536803661 (diff)
downloadffmpeg-15eb10c6deea1103d5ae7c5acd36e10af511672b.tar.gz
x86/vvcdec: inter, add optical flow avx2 code
BDoF used about 10%–25% of the CPU for some clips. Here are the FPS figures for one run; please ignore the negative deltas, as they are probably due to run-to-run variation.

clips                                       | before | after | delta
--------------------------------------------|--------|-------|-------
RitualDance_1920x1080_60_10_420_37_RA.266   | 310.0  | 363.0 | 14.60%
NovosobornayaSquare_1920x1080.bin           | 322.3  | 339.7 | 5.12%
Tango2_3840x2160_60_10_420_27_LD.266        | 71.0   | 68.7  | -3.35%
RitualDance_1920x1080_60_10_420_32_LD.266   | 250.0  | 245.3 | -1.92%
Chimera_8bit_1080P_1000_frames.vvc          | 359.3  | 422.7 | 15.00%
BQTerrace_1920x1080_60_10_420_22_RA.vvc     | 142.3  | 147.7 | 3.66%

Reviewed-by: James Almer <jamrial@gmail.com>
-rw-r--r--libavcodec/x86/vvc/Makefile1
-rw-r--r--libavcodec/x86/vvc/vvc_of.asm385
-rw-r--r--libavcodec/x86/vvc/vvcdsp_init.c21
3 files changed, 407 insertions, 0 deletions
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 04f16bc10c..aa59aa59cf 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -6,5 +6,6 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \
x86/vvc/vvc_dmvr.o \
x86/vvc/vvc_mc.o \
+ x86/vvc/vvc_of.o \
x86/vvc/vvc_sad.o \
x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_of.asm b/libavcodec/x86/vvc/vvc_of.asm
new file mode 100644
index 0000000000..5893bfb23a
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_of.asm
@@ -0,0 +1,385 @@
+; /*
+; * Provide AVX2 luma optical flow functions for VVC decoding
+; * Copyright (c) 2024 Nuo Mi
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+%define SRC_STRIDE (MAX_PB_SIZE * 2) ; bytes between consecutive source rows (MAX_PB_SIZE 16-bit samples)
+%define SRC_PS 2 ; source pixel size, sizeof(int16_t)
+%define BDOF_STACK_SIZE 10 ; (4 + 1) * 2, 4 lines + the first line, *2 for h and v
+; byte offset of the stack slot pair belonging to a line: the first slot holds the
+; horizontal, the second the vertical gradient difference; slots are reused
+; modulo BDOF_STACK_SIZE
+%define bdof_stack_offset(line) ((line) * 2 % BDOF_STACK_SIZE * mmsize)
+%define SHIFT 6 ; right shift applied to samples before forming gradients
+%define SHIFT2 4 ; right shift applied to samples before forming the src1 - src0 difference
+
+SECTION_RODATA 32
+; clamp bounds used with CLIPD when deriving vx / vy
+pd_15 times 8 dd 15
+pd_m15 times 8 dd -15
+
+; pshufb masks for SUM_MIN_BLOCK_W8 / SUM_MIN_BLOCK_W16: within each 128-bit lane
+; they move source words 0, 4, 3, 7 (w8) or 0, 3, 4, 7 (w16) to the even word
+; positions and zero the odd ones (index 0xff)
+pb_shuffle_w8 times 2 db 0, 1, 0xff, 0xff, 8, 9, 0xff, 0xff, 6, 7, 0xff, 0xff, 14, 15, 0xff, 0xff
+pb_shuffle_w16 times 2 db 0, 1, 0xff, 0xff, 6, 7, 0xff, 0xff, 8, 9, 0xff, 0xff, 14, 15, 0xff, 0xff
+; vpermd dword indices applied after pb_shuffle_w16 in SUM_MIN_BLOCK_W16
+pd_perm_w16 dd 0, 2, 1, 4, 3, 6, 5, 7
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+
+SECTION .text
+
+INIT_YMM avx2
+
+; dst = (src0 >> shift) - (src1 >> shift), per 16-bit word, arithmetic shifts
+; tmp is clobbered and may be the same register as src1; dst must not alias src1
+; because dst is written before src1 is read
+%macro DIFF 5 ; dst, src0, src1, shift, tmp
+ psraw %1, %2, %4
+ psraw %5, %3, %4
+ psubw %1, %5
+%endmacro
+
+; Horizontal gradient of one row:
+; dst = (row[x + 2] >> SHIFT) - (row[x] >> SHIFT), with row = src + off
+; Both loads are unaligned; tmp is clobbered.
+%macro LOAD_GRAD_H 4 ; dst, src, off, tmp
+ movu %1, [%2 + %3 + 2 * SRC_PS]
+ movu %4, [%2 + %3]
+
+ DIFF %1, %1, %4, SHIFT, %4
+%endmacro
+
+; dst = (grad0 + grad1) >> 1, per 16-bit word (arithmetic shift), in place in %1
+%macro SUM_GRAD 2 ;(dst/grad0, grad1)
+ paddw %1, %2
+ psraw %1, 1 ; shift3
+%endmacro
+
+; Produce one output row in dst:
+; dst = clip(round(src0 + src1 + vx * dgh + vy * dgv), m9, m10)
+; where dgh / dgv are the gradient differences stored on the stack for line_num.
+; Expects m9 = lower clip bound, m10 = upper clip bound and m11 = pmulhrsw
+; multiplier, as set up by APPLY_BDOF_MIN_BLOCK. tmp is clobbered.
+; The "+ 1" row and "+ SRC_PS" column offsets compensate for src0q / src1q having
+; been moved back by one row and one sample in BDOF_AVX2.
+%macro APPLY_BDOF_MIN_BLOCK_LINE 5 ; dst, vx, vy, tmp, line_num
+%define off bdof_stack_offset(%5)
+ pmullw %1, %2, [rsp + off + 0 * mmsize] ; vx * (gradient_h[0] - gradient_h[1])
+ pmullw %4, %3, [rsp + off + 1 * mmsize] ; vy * (gradient_v[0] - gradient_v[1])
+ paddw %1, [src0q + (%5 + 1) * SRC_STRIDE + SRC_PS]
+ paddw %4, [src1q + (%5 + 1) * SRC_STRIDE + SRC_PS]
+ paddsw %1, %4 ; src0[x] + src1[x] + bdof_offset
+ pmulhrsw %1, m11 ; rounding right shift: (x * m11 + 0x4000) >> 15
+ CLIPW %1, m9, m10
+%endmacro
+
+; Store one row as 8-bit pixels. src is a register NUMBER (used as m%2 / xm%2)
+; and is clobbered. Writes 16 bytes when w == 16, otherwise 8 bytes.
+%macro SAVE_8BPC 2 ; dst, src
+ packuswb m%2, m%2 ; words -> unsigned bytes, saturating, per 128-bit lane
+ vpermq m%2, m%2, q0020 ; bring the high lane's bytes next to the low lane's
+
+ cmp wd, 16
+ je %%w16
+ movq %1, xm%2
+ jmp %%wend
+%%w16:
+ movu %1, xm%2
+%%wend:
+%endmacro
+
+; Store one row as 16-bit pixels. src is a register NUMBER (used as m%2 / xm%2).
+; Writes the full ymm register (32 bytes) when w == 16, otherwise the low
+; xmm half (16 bytes).
+%macro SAVE_16BPC 2 ; dst, src
+ cmp wd, 16
+ je %%w16
+ movu %1, xm%2
+ jmp %%wend
+%%w16:
+ movu %1, m%2
+%%wend:
+%endmacro
+
+; Store one row, choosing the pixel size at run time:
+; pixel_max == 255 selects the 8-bit store, any other value the 16-bit store.
+%macro SAVE 2 ; dst, src
+ cmp pixel_maxd, (1 << 8) - 1
+ jne %%save_16bpc
+ SAVE_8BPC %1, %2
+ jmp %%end
+%%save_16bpc:
+ SAVE_16BPC %1, %2
+%%end:
+%endmacro
+
+; Write the four output rows of one min-block row using the given vx / vy.
+; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1]
+; [rsp + odd * mmsize] are gradient_v[0] - gradient_v[1]
+; Sets m9 = 0, m10 = pixel_max and m11 = pixel_max + 1 (broadcast to words) for
+; APPLY_BDOF_MIN_BLOCK_LINE; clobbers m6, m7 and tmp0.
+; NOTE(review): the fourth parameter (bd) is never referenced in the body.
+%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd
+ pxor m9, m9
+
+ movd xm10, pixel_maxd
+ vpbroadcastw m10, xm10
+
+ lea tmp0d, [pixel_maxd + 1]
+ movd xm11, tmp0d
+ VPBROADCASTW m11, xm11 ;shift_4 for pmulhrsw
+
+ APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 0
+ SAVE [dstq + 0 * dsq], 6
+
+ APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 1
+ SAVE [dstq + 1 * dsq], 6
+
+ APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 2
+ SAVE [dstq + 2 * dsq], 6
+
+ APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 3
+ SAVE [dstq + ds3q], 6
+%endmacro
+
+; Partial horizontal sum for the 16-sample-wide case: add to src/dst a copy of
+; itself rearranged by pshufb (within lanes) followed by vpermd (across lanes).
+; shuffle and perm are registers holding pb_shuffle_w16 / pd_perm_w16 at the
+; call sites; tmp is clobbered.
+%macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp
+ pshufb %4, %1, %2
+ vpermd %4, %3, %4
+ paddw %1, %4
+%endmacro
+
+; Partial horizontal sum for the 8-sample-wide case: add to src/dst a copy of
+; itself rearranged by pshufb. tmp is clobbered.
+%macro SUM_MIN_BLOCK_W8 3 ; src/dst, shuffle, tmp
+ pshufb %3, %1, %2
+ paddw %1, %3
+%endmacro
+
+; Process one line: compute the gradients of both sources, save the gradient
+; differences for that line on the stack, and add the line's contribution to the
+; accumulators m12 (sgx2 / sgy2 / sgxdi / sgydi) and m13 (sgxgy).
+; line_no selects the rows and the stack slot; a non-zero last_line (or line 0)
+; doubles the contribution to stand in for the missing row above / below.
+; m0-m2 (src0) and m3-m5 (src1) hold three consecutive rows; their roles
+; (t = top, c = centre, b = bottom) rotate with line_no so that only the bottom row
+; is loaded here. The top registers are used as scratch afterwards, which is safe
+; because they become the bottom registers of the next line and are reloaded.
+; Clobbers m6-m11.
+%macro BDOF_PROF_GRAD 2 ; line_no, last_line
+%assign i0 (%1 + 0) % 3
+%assign j0 (%1 + 1) % 3
+%assign k0 (%1 + 2) % 3
+%assign i1 3 + (%1 + 0) % 3
+%assign j1 3 + (%1 + 1) % 3
+%assign k1 3 + (%1 + 2) % 3
+
+; we cached src0 in m0 to m2
+%define t0 m %+ i0
+%define c0 m %+ j0
+%define b0 m %+ k0
+
+; we cached src1 in m3 to m5
+%define t1 m %+ i1
+%define c1 m %+ j1
+%define b1 m %+ k1
+%define ndiff t1
+%define off bdof_stack_offset(%1)
+
+ ; load the new bottom rows
+ movu b0, [src0q + (%1 + 2) * SRC_STRIDE + SRC_PS]
+ movu b1, [src1q + (%1 + 2) * SRC_STRIDE + SRC_PS]
+
+ ; gradient_v[0], gradient_v[1]
+ DIFF m6, b0, t0, SHIFT, t0
+ DIFF m7, b1, t1, SHIFT, t1
+
+ ; save gradient_v[0] - gradient_v[1]
+ psubw m10, m6, m7
+ mova [rsp + off + mmsize], m10
+
+ ; gradient_h[0], gradient_h[1]
+ LOAD_GRAD_H m8, src0q, (%1 + 1) * SRC_STRIDE, t0
+ LOAD_GRAD_H m9, src1q, (%1 + 1) * SRC_STRIDE, t1
+
+ ; save gradient_h[0] - gradient_h[1]
+ psubw m11, m8, m9
+ mova [rsp + off], m11
+
+ SUM_GRAD m8, m9 ; temph
+ SUM_GRAD m6, m7 ; tempv
+
+ DIFF ndiff, c1, c0, SHIFT2, t0 ; -diff
+
+ psignw m7, ndiff, m8 ; sgxdi
+ psignw m9, ndiff, m6 ; sgydi
+ psignw m10, m8, m6 ; sgxgy
+
+ pabsw m6, m6 ; sgy2
+ pabsw m8, m8 ; sgx2
+
+ ; use t0, t1 as temporary buffers
+ cmp wd, 16
+
+ je %%w16
+ mova t0, [pb_shuffle_w8]
+ SUM_MIN_BLOCK_W8 m6, t0, m11
+ SUM_MIN_BLOCK_W8 m7, t0, m11
+ SUM_MIN_BLOCK_W8 m8, t0, m11
+ SUM_MIN_BLOCK_W8 m9, t0, m11
+ SUM_MIN_BLOCK_W8 m10, t0, m11
+ jmp %%wend
+
+%%w16:
+ mova t0, [pb_shuffle_w16]
+ mova t1, [pd_perm_w16]
+ SUM_MIN_BLOCK_W16 m6, t0, t1, m11
+ SUM_MIN_BLOCK_W16 m7, t0, t1, m11
+ SUM_MIN_BLOCK_W16 m8, t0, t1, m11
+ SUM_MIN_BLOCK_W16 m9, t0, t1, m11
+ SUM_MIN_BLOCK_W16 m10, t0, t1, m11
+
+%%wend:
+ ; interleave and pairwise-add the partial sums so that one register ends up
+ ; holding all four statistics
+ vpblendd m11, m8, m7, 10101010b
+ vpblendd m7, m8, m7, 01010101b
+ pshufd m7, m7, q2301
+ paddw m8, m7, m11 ;4 x (2sgx2, 2sgxdi)
+
+ vpblendd m11, m6, m9, 10101010b
+ vpblendd m9, m6, m9, 01010101b
+ pshufd m9, m9, q2301
+ paddw m6, m9, m11 ;4 x (2sgy2, 2sgydi)
+
+ vpblendw m11, m8, m6, 10101010b
+ vpblendw m6, m8, m6, 01010101b
+ pshuflw m6, m6, q2301
+ pshufhw m6, m6, q2301
+ paddw m8, m6, m11 ; 4 x (4sgx2, 4sgy2, 4sgxdi, 4sgydi)
+
+%if (%1) == 0 || (%2)
+ ; pad for top and bottom
+ paddw m8, m8
+ paddw m10, m10
+%endif
+
+ ; accumulate this line into the running sums
+ paddw m12, m8
+ paddw m13, m10
+%endmacro
+
+
+; Five-argument overload of LOG2 (NASM selects macro overloads by argument count).
+; NOTE(review): no five-argument LOG2 invocation appears in this file; only the
+; one- and two-argument overloads are used, so this one looks like dead code.
+%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
+ pcmpgtw %5, %2, %3
+ pandd %5, %4
+ paddw %1, %5
+
+ psrlw %2, %5
+ psrlw %4, 1
+ psrlw %3, %4
+%endmacro
+
+; Replace word number `offset` of xmm register number %1 with the index of its
+; highest set bit (bsr), i.e. floor(log2(value)). Clobbers tmp0.
+; NOTE(review): bsr leaves its destination undefined when the source is zero, so
+; the result for a zero word must not be relied upon by the caller.
+%macro LOG2 2 ; dst/src, offset
+ pextrw tmp0d, xm%1, %2
+ bsr tmp0d, tmp0d
+ pinsrw xm%1, tmp0d, %2
+%endmacro
+
+; Apply the two-argument LOG2 to each of the eight words of xmm register number %1
+; (only the low 128 bits of the register are processed).
+%macro LOG2 1 ; dst/src
+ LOG2 %1, 0
+ LOG2 %1, 1
+ LOG2 %1, 2
+ LOG2 %1, 3
+ LOG2 %1, 4
+ LOG2 %1, 5
+ LOG2 %1, 6
+ LOG2 %1, 7
+%endmacro
+
+; Derive the motion refinement (vx, vy) of each min block from the accumulated sums.
+; %1: 4 (sgx2, sgy2, sgxdi, sgydi), register number; receives vx on return
+; %2: 4 (4sgxgy), register number; receives vy on return
+; Both results are clamped to [-15, 15]. Clobbers m6-m11 and tmp0 (via LOG2).
+%macro BDOF_VX_VY 2 ;
+ pshufd m6, m%1, q0032
+ punpckldq m%1, m6
+ vextracti128 xm7, m%1, 1
+
+ punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
+ punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
+ mova m10, m8
+ LOG2 10 ; 4 (log2(sgx2), log2(sgy2))
+
+ ; Promote to dword since vpsrlvw is AVX-512 only
+ pmovsxwd m8, xm8
+ pmovsxwd m9, xm9
+ pmovsxwd m10, xm10
+
+ pslld m9, 2 ; 4 (sgxdi << 2, sgydi << 2)
+
+ ; psignd zeroes the lanes whose sgx2 / sgy2 is zero, so the shift count taken
+ ; from m10 does not matter for those lanes
+ psignd m11, m9, m8
+ vpsravd m11, m11, m10
+ CLIPD m11, [pd_m15], [pd_15] ; 4 (vx, junk)
+
+ pshuflw m%1, m11, q0000
+ pshufhw m%1, m%1, q0000 ; 4 (2junk, 2vx)
+
+ psllq m6, m%2, 32
+ paddw m%2, m6
+
+ pmaddwd m%2, m%1 ; 4 (junk, vx * sgxgy)
+ psrad m%2, 1
+ psubd m9, m%2 ; 4 (junk, (sgydi << 2) - (vx * sgxgy >> 1))
+
+ psignd m9, m8
+ vpsravd m%2, m9, m10
+ CLIPD m%2, [pd_m15], [pd_15] ; 4 (junk, vy)
+
+ pshuflw m%2, m%2, q2222
+ pshufhw m%2, m%2, q2222 ; 4 (4vy)
+%endmacro
+
+
+; Process one group of four output rows: gather the gradient sums of the lines the
+; group needs into m14 / m15, derive vx / vy, write the four rows and advance dstq.
+; block_num is the index of the group; a non-zero last_block marks the final group,
+; which doubles its last line instead of processing a line below it.
+; On entry (block_num > 0) m12 / m13 hold the sums left over by the previous group;
+; for block 0 they are initialised here from line 0.
+%macro BDOF_MINI_BLOCKS 2 ; (block_num, last_block)
+
+%if (%1) == 0
+ ; prime the row cache with the first two rows of each source
+ movu m0, [src0q + 0 * SRC_STRIDE + SRC_PS]
+ movu m1, [src0q + 1 * SRC_STRIDE + SRC_PS]
+ movu m3, [src1q + 0 * SRC_STRIDE + SRC_PS]
+ movu m4, [src1q + 1 * SRC_STRIDE + SRC_PS]
+
+ pxor m12, m12
+ pxor m13, m13
+
+ BDOF_PROF_GRAD 0, 0
+%endif
+
+ ; start from the sums carried over in m12 / m13
+ mova m14, m12
+ mova m15, m13
+
+ pxor m12, m12
+ pxor m13, m13
+ BDOF_PROF_GRAD %1 * 4 + 1, 0
+ BDOF_PROF_GRAD %1 * 4 + 2, 0
+ paddw m14, m12
+ paddw m15, m13
+
+ pxor m12, m12
+ pxor m13, m13
+ BDOF_PROF_GRAD %1 * 4 + 3, %2
+%if (%2) == 0
+ BDOF_PROF_GRAD %1 * 4 + 4, 0
+%endif
+ ; m12 / m13 are left untouched after this point so the next group can reuse them
+ paddw m14, m12
+ paddw m15, m13
+
+ BDOF_VX_VY 14, 15
+ APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd
+ lea dstq, [dstq + 4 * dsq]
+%endmacro
+
+;void ff_vvc_apply_bdof_avx2(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1,
+; const int w, const int h, const int pixel_max)
+;
+; w == 16 selects the 16-sample-wide code paths; any other value takes the
+; 8-sample paths. h == 16 processes four groups of four rows; any other value
+; processes two groups (8 rows).
+; Reserves BDOF_STACK_SIZE 32-byte stack slots for the saved gradient differences.
+; NOTE(review): tmp1 is declared but not referenced anywhere in this file.
+%macro BDOF_AVX2 0
+cglobal vvc_apply_bdof, 7, 10, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0, tmp1
+
+ lea ds3q, [dsq * 3]
+ ; bias the source pointers back by one row and one sample; the row / column
+ ; offsets used by the macros above are relative to these biased pointers
+ sub src0q, SRC_STRIDE + SRC_PS
+ sub src1q, SRC_STRIDE + SRC_PS
+
+ BDOF_MINI_BLOCKS 0, 0
+
+ cmp hd, 16
+ je .h16
+ BDOF_MINI_BLOCKS 1, 1
+ jmp .end
+
+.h16:
+ BDOF_MINI_BLOCKS 1, 0
+ BDOF_MINI_BLOCKS 2, 0
+ BDOF_MINI_BLOCKS 3, 1
+
+.end:
+ RET
+%endmacro
+
+; Emit every optical-flow function of this file for the current instruction set
+; (currently only BDOF), then instantiate them.
+%macro VVC_OF_AVX2 0
+ BDOF_AVX2
+%endmacro
+
+VVC_OF_AVX2
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index d5b4f4f8a5..f3e2e3a27b 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -102,6 +102,20 @@ DMVR_PROTOTYPES( 8, avx2)
DMVR_PROTOTYPES(10, avx2)
DMVR_PROTOTYPES(12, avx2)
+/*
+ * AVX2 BDOF kernel implemented in vvc_of.asm. pixel_max is the largest sample
+ * value, (1 << bit_depth) - 1; the per-bit-depth wrappers generated by
+ * OF_PROTOTYPES supply it.
+ */
+void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *src0, const int16_t *src1, int w, int h, int pixel_max);
+
+/*
+ * Define the static wrapper ff_vvc_apply_bdof_<bd>_<opt>(), which takes the six
+ * arguments of the function pointer installed by OF_INIT and forwards them to
+ * the shared assembly routine together with pixel_max = (1 << bd) - 1.
+ */
+#define OF_PROTOTYPES(bd, opt) \
+static void ff_vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int w, int h) \
+{ \
+ ff_vvc_apply_bdof_##opt(dst, dst_stride, src0, src1, w, h, (1 << (bd)) - 1); \
+}
+
+OF_PROTOTYPES( 8, avx2)
+OF_PROTOTYPES(10, avx2)
+OF_PROTOTYPES(12, avx2)
+
#define ALF_BPC_PROTOTYPES(bpc, opt) \
void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
@@ -328,6 +342,10 @@ ALF_FUNCS(16, 12, avx2)
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \
} while (0)
+/* Install the AVX2 BDOF wrapper for bit depth bd into the DSP context c. */
+#define OF_INIT(bd) do { \
+ c->inter.apply_bdof = ff_vvc_apply_bdof_##bd##_avx2; \
+} while (0)
+
#define ALF_INIT(bd) do { \
c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2; \
c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
@@ -352,6 +370,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
+ OF_INIT(8);
DMVR_INIT(8);
SAD_INIT();
}
@@ -365,6 +384,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
+ OF_INIT(10);
DMVR_INIT(10);
SAD_INIT();
}
@@ -378,6 +398,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
+ OF_INIT(12);
DMVR_INIT(12);
SAD_INIT();
}