aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/hevc_mc.asm
diff options
context:
space:
mode:
authorWu Jianhua <toqsxw@outlook.com>2024-01-24 02:17:06 +0800
committerNuo Mi <nuomi2021@gmail.com>2024-02-01 19:54:28 +0800
commit7d9f1f5485f5ceabc943b08f16fb9eb10ca6bbd5 (patch)
tree60693050955b208d0c38823c53ad622f7c9d598d /libavcodec/x86/hevc_mc.asm
parent04c2e246a375851fb83ba7244efefbf07caf9c65 (diff)
downloadffmpeg-7d9f1f5485f5ceabc943b08f16fb9eb10ca6bbd5.tar.gz
avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
This enables the asm optimizations to be reused by VVC. Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
Diffstat (limited to 'libavcodec/x86/hevc_mc.asm')
-rw-r--r--libavcodec/x86/hevc_mc.asm462
1 files changed, 17 insertions, 445 deletions
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index eb267453fe..5489701e44 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -715,35 +715,6 @@ SECTION .text
; int height, int mx, int my)
; ******************************
-%macro HEVC_PUT_HEVC_PEL_PIXELS 2
-HEVC_PEL_PIXELS %1, %2
-HEVC_UNI_PEL_PIXELS %1, %2
-HEVC_BI_PEL_PIXELS %1, %2
-%endmacro
-
-%macro HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
- pxor m2, m2
-.loop:
- SIMPLE_LOAD %1, %2, srcq, m0
- MC_PIXEL_COMPUTE %1, %2, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
- %endmacro
-
-%macro HEVC_UNI_PEL_PIXELS 2
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
-.loop:
- SIMPLE_LOAD %1, %2, srcq, m0
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-%endmacro
-
%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
@@ -777,32 +748,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
%define XMM_REGS 8
%endif
-cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
-%assign %%stride ((%2 + 7)/8)
- EPEL_FILTER %2, mx, m4, m5, rfilter
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
-%assign %%stride ((%2 + 7)/8)
- movdqa m6, [pw_%2]
- EPEL_FILTER %2, mx, m4, m5, rfilter
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
movdqa m6, [pw_bi_%2]
EPEL_FILTER %2, mx, m4, m5, rfilter
.loop:
@@ -824,36 +771,6 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcst
; int height, int mx, int my, int width)
; ******************************
-cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
- movifnidn myd, mym
- sub srcq, srcstrideq
- EPEL_FILTER %2, my, m4, m5, r3src
- lea r3srcq, [srcstrideq*3]
-.loop:
- EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
- movifnidn myd, mym
- movdqa m6, [pw_%2]
- sub srcq, srcstrideq
- EPEL_FILTER %2, my, m4, m5, r3src
- lea r3srcq, [srcstrideq*3]
-.loop:
- EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-
cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
movifnidn myd, mym
movdqa m6, [pw_bi_%2]
@@ -882,135 +799,6 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcst
; ******************************
%macro HEVC_PUT_HEVC_EPEL_HV 2
-cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
-%assign %%stride ((%2 + 7)/8)
- sub srcq, srcstrideq
- EPEL_HV_FILTER %2
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m8, m1
-%endif
- SWAP m4, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m9, m1
-%endif
- SWAP m5, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m10, m1
-%endif
- SWAP m6, m0
- add srcq, srcstrideq
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m11, m1
-%endif
- SWAP m7, m0
- punpcklwd m0, m4, m5
- punpcklwd m2, m6, m7
-%if %1 > 4
- punpckhwd m1, m4, m5
- punpckhwd m3, m6, m7
-%endif
- EPEL_COMPUTE 14, %1, m12, m13
-%if (%1 > 8 && (%2 == 8))
- punpcklwd m4, m8, m9
- punpcklwd m2, m10, m11
- punpckhwd m8, m8, m9
- punpckhwd m3, m10, m11
- EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
-%if cpuflag(avx2)
- vinserti128 m2, m0, xm4, 1
- vperm2i128 m3, m0, m4, q0301
- PEL_10STORE%1 dstq, m2, m3
-%else
- PEL_10STORE%1 dstq, m0, m4
-%endif
-%else
- PEL_10STORE%1 dstq, m0, m1
-%endif
- movdqa m4, m5
- movdqa m5, m6
- movdqa m6, m7
-%if (%1 > 8 && (%2 == 8))
- mova m8, m9
- mova m9, m10
- mova m10, m11
-%endif
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
-%assign %%stride ((%2 + 7)/8)
- sub srcq, srcstrideq
- EPEL_HV_FILTER %2
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m8, m1
-%endif
- SWAP m4, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m9, m1
-%endif
- SWAP m5, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m10, m1
-%endif
- SWAP m6, m0
- add srcq, srcstrideq
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m11, m1
-%endif
- mova m7, m0
- punpcklwd m0, m4, m5
- punpcklwd m2, m6, m7
-%if %1 > 4
- punpckhwd m1, m4, m5
- punpckhwd m3, m6, m7
-%endif
- EPEL_COMPUTE 14, %1, m12, m13
-%if (%1 > 8 && (%2 == 8))
- punpcklwd m4, m8, m9
- punpcklwd m2, m10, m11
- punpckhwd m8, m8, m9
- punpckhwd m3, m10, m11
- EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
- UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
-%else
- UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
-%endif
- PEL_%2STORE%1 dstq, m0, m1
- mova m4, m5
- mova m5, m6
- mova m6, m7
-%if (%1 > 8 && (%2 == 8))
- mova m8, m9
- mova m9, m10
- mova m10, m11
-%endif
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
@@ -1093,34 +881,6 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride,
; ******************************
%macro HEVC_PUT_HEVC_QPEL 2
-cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
- QPEL_FILTER %2, mx
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2, 1
-%if %2 > 8
- packssdw m0, m1
-%endif
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
- mova m9, [pw_%2]
- QPEL_FILTER %2, mx
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2
-%if %2 > 8
- packssdw m0, m1
-%endif
- UNI_COMPUTE %1, %2, m0, m1, m9
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
movdqa m9, [pw_bi_%2]
@@ -1148,38 +908,6 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
; int height, int mx, int my, int width)
; ******************************
-cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
- movifnidn myd, mym
- lea r3srcq, [srcstrideq*3]
- QPEL_FILTER %2, my
-.loop:
- QPEL_V_LOAD %2, srcq, srcstride, %1, r7
- QPEL_COMPUTE %1, %2, 1
-%if %2 > 8
- packssdw m0, m1
-%endif
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
- movifnidn myd, mym
- movdqa m9, [pw_%2]
- lea r3srcq, [srcstrideq*3]
- QPEL_FILTER %2, my
-.loop:
- QPEL_V_LOAD %2, srcq, srcstride, %1, r8
- QPEL_COMPUTE %1, %2
-%if %2 > 8
- packssdw m0, m1
-%endif
- UNI_COMPUTE %1, %2, m0, m1, m9
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
movifnidn myd, mym
@@ -1210,162 +938,6 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride,
; int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
-cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
-%if cpuflag(avx2)
-%assign %%shift 4
-%else
-%assign %%shift 3
-%endif
- sub mxq, 1
- sub myq, 1
- shl mxq, %%shift ; multiply by 32
- shl myq, %%shift ; multiply by 32
- lea r3srcq, [srcstrideq*3]
- sub srcq, r3srcq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m8, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m9, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m10, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m11, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m12, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m13, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m14, m0
- add srcq, srcstrideq
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m15, m0
- punpcklwd m0, m8, m9
- punpcklwd m2, m10, m11
- punpcklwd m4, m12, m13
- punpcklwd m6, m14, m15
-%if %1 > 4
- punpckhwd m1, m8, m9
- punpckhwd m3, m10, m11
- punpckhwd m5, m12, m13
- punpckhwd m7, m14, m15
-%endif
- QPEL_HV_COMPUTE %1, 14, my, ackssdw
- PEL_10STORE%1 dstq, m0, m1
-%if %1 <= 4
- movq m8, m9
- movq m9, m10
- movq m10, m11
- movq m11, m12
- movq m12, m13
- movq m13, m14
- movq m14, m15
-%else
- movdqa m8, m9
- movdqa m9, m10
- movdqa m10, m11
- movdqa m11, m12
- movdqa m12, m13
- movdqa m13, m14
- movdqa m14, m15
-%endif
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
-%if cpuflag(avx2)
-%assign %%shift 4
-%else
-%assign %%shift 3
-%endif
- sub mxq, 1
- sub myq, 1
- shl mxq, %%shift ; multiply by 32
- shl myq, %%shift ; multiply by 32
- lea r3srcq, [srcstrideq*3]
- sub srcq, r3srcq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m8, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m9, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m10, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m11, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m12, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m13, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m14, m0
- add srcq, srcstrideq
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m15, m0
- punpcklwd m0, m8, m9
- punpcklwd m2, m10, m11
- punpcklwd m4, m12, m13
- punpcklwd m6, m14, m15
-%if %1 > 4
- punpckhwd m1, m8, m9
- punpckhwd m3, m10, m11
- punpckhwd m5, m12, m13
- punpckhwd m7, m14, m15
-%endif
- QPEL_HV_COMPUTE %1, 14, my, ackusdw
- UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
- PEL_%2STORE%1 dstq, m0, m1
-
-%if %1 <= 4
- movq m8, m9
- movq m9, m10
- movq m10, m11
- movq m11, m12
- movq m12, m13
- movq m13, m14
- movq m14, m15
-%else
- mova m8, m9
- mova m9, m10
- mova m10, m11
- mova m11, m12
- mova m12, m13
- mova m13, m14
- mova m14, m15
-%endif
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
@@ -1613,22 +1185,22 @@ WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12
-HEVC_PUT_HEVC_PEL_PIXELS 2, 8
-HEVC_PUT_HEVC_PEL_PIXELS 4, 8
-HEVC_PUT_HEVC_PEL_PIXELS 6, 8
-HEVC_PUT_HEVC_PEL_PIXELS 8, 8
-HEVC_PUT_HEVC_PEL_PIXELS 12, 8
-HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+HEVC_BI_PEL_PIXELS 2, 8
+HEVC_BI_PEL_PIXELS 4, 8
+HEVC_BI_PEL_PIXELS 6, 8
+HEVC_BI_PEL_PIXELS 8, 8
+HEVC_BI_PEL_PIXELS 12, 8
+HEVC_BI_PEL_PIXELS 16, 8
-HEVC_PUT_HEVC_PEL_PIXELS 2, 10
-HEVC_PUT_HEVC_PEL_PIXELS 4, 10
-HEVC_PUT_HEVC_PEL_PIXELS 6, 10
-HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+HEVC_BI_PEL_PIXELS 2, 10
+HEVC_BI_PEL_PIXELS 4, 10
+HEVC_BI_PEL_PIXELS 6, 10
+HEVC_BI_PEL_PIXELS 8, 10
-HEVC_PUT_HEVC_PEL_PIXELS 2, 12
-HEVC_PUT_HEVC_PEL_PIXELS 4, 12
-HEVC_PUT_HEVC_PEL_PIXELS 6, 12
-HEVC_PUT_HEVC_PEL_PIXELS 8, 12
+HEVC_BI_PEL_PIXELS 2, 12
+HEVC_BI_PEL_PIXELS 4, 12
+HEVC_BI_PEL_PIXELS 6, 12
+HEVC_BI_PEL_PIXELS 8, 12
HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
@@ -1693,8 +1265,8 @@ HEVC_PUT_HEVC_QPEL_HV 8, 12
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
-HEVC_PUT_HEVC_PEL_PIXELS 32, 8
-HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+HEVC_BI_PEL_PIXELS 32, 8
+HEVC_BI_PEL_PIXELS 16, 10
HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10