| author | jinbo <jinbo@loongson.cn> | 2023-12-28 16:21:01 +0800 |
|---|---|---|
| committer | Michael Niedermayer <michael@niedermayer.cc> | 2024-01-12 23:35:40 +0100 |
| commit | a28eea2a277bb58004dc7ecccd543fa4baf69170 (patch) | |
| tree | 57dcb211a8d8a00175e28fda770ed9dc7f7c7b56 /libavcodec | |
| parent | cfbdda607d02f9e23ead8252243643e167d38414 (diff) | |
| download | ffmpeg-a28eea2a277bb58004dc7ecccd543fa4baf69170.tar.gz | |
avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
tests/checkasm/checkasm:              C     LSX    LASX
put_hevc_pel_uni_w_pixels4_8_c:     2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:     6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7     2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0     5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0     8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:  161.7    32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:  354.5    73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:  641.5   130.0    64.2
Decoding a 4K 30 FPS 30 Mbps H.265 stream on a 3A6000 with
8 threads speeds up by 1 fps (47 fps --> 48 fps).
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
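For reference, every routine in this patch vectorizes the same scalar operation: the 8-bit weighted uni-prediction copy, where each source pixel is widened to 32 bits, scaled by 64 (`<< 6`), multiplied by the weight `wx`, rounded with `offset = 1 << (shift - 1)`, shifted right by `shift = denom + 6`, offset by `ox`, and clipped back to 8 bits. Below is a minimal C model of that per-pixel computation, matching the `LOAD_VAR` setup in hevc_mc.S; the function and helper names here are illustrative sketches, not FFmpeg's internal reference implementation:

```c
#include <stddef.h>
#include <stdint.h>

/* Clamp an int to the 0..255 range of an 8-bit pixel. */
static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Illustrative scalar model of put_hevc_pel_uni_w_pixelsN for 8-bit input.
 * shift = denom + 6 and offset = 1 << (shift - 1), as computed by LOAD_VAR. */
static void pel_uni_w_pixels_c(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int denom, int wx, int ox, int width)
{
    const int shift  = denom + 6;
    const int offset = 1 << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = clip_uint8((((src[x] << 6) * wx + offset) >> shift) + ox);
        src += src_stride;
        dst += dst_stride;
    }
}
```

The LSX/LASX variants below compute exactly this inner loop 4 to 64 pixels at a time, using widening unpacks (vsllwil/vexth), vector multiply and add, and saturating narrowing stores (vssrani), which replace the explicit clip.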
Diffstat (limited to 'libavcodec')
| -rw-r--r-- | libavcodec/loongarch/Makefile | 3 |
|---|---|---|
| -rw-r--r-- | libavcodec/loongarch/hevc_mc.S | 471 |
| -rw-r--r-- | libavcodec/loongarch/hevcdsp_init_loongarch.c | 43 |
| -rw-r--r-- | libavcodec/loongarch/hevcdsp_lasx.h | 53 |
| -rw-r--r-- | libavcodec/loongarch/hevcdsp_lsx.h | 27 |

5 files changed, 596 insertions(+), 1 deletion(-)
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
                                    loongarch/hevc_mc_bi_lsx.o \
                                    loongarch/hevc_mc_uni_lsx.o \
                                    loongarch/hevc_mc_uniw_lsx.o \
-                                   loongarch/hevc_add_res.o
+                                   loongarch/hevc_add_res.o \
+                                   loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
                               loongarch/h264idct_loongarch.o \
                               loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 0000000000..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit
+    addi.w t1, a5, 6 //shift
+    addi.w t3, zero, 1 //one
+    sub.w t4, t1, t3
+    sll.w t3, t3, t4 //offset
+.if \bit == 128
+    vreplgr2vr.w vr1, a6 //wx
+    vreplgr2vr.w vr2, t3 //offset
+    vreplgr2vr.w vr3, t1 //shift
+    vreplgr2vr.w vr4, a7 //ox
+.else
+    xvreplgr2vr.w xr1, a6
+    xvreplgr2vr.w xr2, t3
+    xvreplgr2vr.w xr3, t1
+    xvreplgr2vr.w xr4, a7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
+    vldrepl.d vr0, \src0, 0
+    vsllwil.hu.bu vr0, vr0, 0
+    vexth.wu.hu vr5, vr0
+    vsllwil.wu.hu vr0, vr0, 0
+    vslli.w vr0, vr0, 6
+    vslli.w vr5, vr5, 6
+    vmul.w vr0, vr0, vr1
+    vmul.w vr5, vr5, vr1
+    vadd.w vr0, vr0, vr2
+    vadd.w vr5, vr5, vr2
+    vsra.w vr0, vr0, vr3
+    vsra.w vr5, vr5, vr3
+    vadd.w vr0, vr0, vr4
+    vadd.w vr5, vr5, vr4
+    vssrani.h.w vr5, vr0, 0
+    vssrani.bu.h vr5, vr5, 0
+.if \w == 6
+    fst.s f5, \dst0, 0
+    vstelm.h vr5, \dst0, 4, 2
+.else
+    fst.d f5, \dst0, 0
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
+    vldrepl.d vr0, \src0, 0
+    add.d t2, \src0, a3
+    vldrepl.d vr5, t2, 0
+    xvpermi.q xr0, xr5, 0x02
+    xvsllwil.hu.bu xr0, xr0, 0
+    xvexth.wu.hu xr5, xr0
+    xvsllwil.wu.hu xr0, xr0, 0
+    xvslli.w xr0, xr0, 6
+    xvslli.w xr5, xr5, 6
+    xvmul.w xr0, xr0, xr1
+    xvmul.w xr5, xr5, xr1
+    xvadd.w xr0, xr0, xr2
+    xvadd.w xr5, xr5, xr2
+    xvsra.w xr0, xr0, xr3
+    xvsra.w xr5, xr5, xr3
+    xvadd.w xr0, xr0, xr4
+    xvadd.w xr5, xr5, xr4
+    xvssrani.h.w xr5, xr0, 0
+    xvpermi.q xr0, xr5, 0x01
+    xvssrani.bu.h xr0, xr5, 0
+    add.d t3, \dst0, a1
+.if \w == 6
+    vstelm.w vr0, \dst0, 0, 0
+    vstelm.h vr0, \dst0, 4, 2
+    vstelm.w vr0, t3, 0, 2
+    vstelm.h vr0, t3, 4, 6
+.else
+    vstelm.d vr0, \dst0, 0, 0
+    vstelm.d vr0, t3, 0, 1
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
+    vld vr0, \src0, 0
+    vexth.hu.bu vr7, vr0
+    vexth.wu.hu vr8, vr7
+    vsllwil.wu.hu vr7, vr7, 0
+    vsllwil.hu.bu vr5, vr0, 0
+    vexth.wu.hu vr6, vr5
+    vsllwil.wu.hu vr5, vr5, 0
+    vslli.w vr5, vr5, 6
+    vslli.w vr6, vr6, 6
+    vslli.w vr7, vr7, 6
+    vslli.w vr8, vr8, 6
+    vmul.w vr5, vr5, vr1
+    vmul.w vr6, vr6, vr1
+    vmul.w vr7, vr7, vr1
+    vmul.w vr8, vr8, vr1
+    vadd.w vr5, vr5, vr2
+    vadd.w vr6, vr6, vr2
+    vadd.w vr7, vr7, vr2
+    vadd.w vr8, vr8, vr2
+    vsra.w vr5, vr5, vr3
+    vsra.w vr6, vr6, vr3
+    vsra.w vr7, vr7, vr3
+    vsra.w vr8, vr8, vr3
+    vadd.w vr5, vr5, vr4
+    vadd.w vr6, vr6, vr4
+    vadd.w vr7, vr7, vr4
+    vadd.w vr8, vr8, vr4
+    vssrani.h.w vr6, vr5, 0
+    vssrani.h.w vr8, vr7, 0
+    vssrani.bu.h vr8, vr6, 0
+    vst vr8, \dst0, 0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
+    vld vr0, \src0, 0
+    xvpermi.d xr0, xr0, 0xd8
+    xvsllwil.hu.bu xr0, xr0, 0
+    xvexth.wu.hu xr6, xr0
+    xvsllwil.wu.hu xr5, xr0, 0
+    xvslli.w xr5, xr5, 6
+    xvslli.w xr6, xr6, 6
+    xvmul.w xr5, xr5, xr1
+    xvmul.w xr6, xr6, xr1
+    xvadd.w xr5, xr5, xr2
+    xvadd.w xr6, xr6, xr2
+    xvsra.w xr5, xr5, xr3
+    xvsra.w xr6, xr6, xr3
+    xvadd.w xr5, xr5, xr4
+    xvadd.w xr6, xr6, xr4
+    xvssrani.h.w xr6, xr5, 0
+    xvpermi.q xr7, xr6, 0x01
+    xvssrani.bu.h xr7, xr6, 0
+    vst vr7, \dst0, 0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
+.if \w == 16
+    vld vr0, \src0, 0
+    add.d t2, \src0, a3
+    vld vr5, t2, 0
+    xvpermi.q xr0, xr5, 0x02
+.else //w=24/32
+    xvld xr0, \src0, 0
+.endif
+    xvexth.hu.bu xr7, xr0
+    xvexth.wu.hu xr8, xr7
+    xvsllwil.wu.hu xr7, xr7, 0
+    xvsllwil.hu.bu xr5, xr0, 0
+    xvexth.wu.hu xr6, xr5
+    xvsllwil.wu.hu xr5, xr5, 0
+    xvslli.w xr5, xr5, 6
+    xvslli.w xr6, xr6, 6
+    xvslli.w xr7, xr7, 6
+    xvslli.w xr8, xr8, 6
+    xvmul.w xr5, xr5, xr1
+    xvmul.w xr6, xr6, xr1
+    xvmul.w xr7, xr7, xr1
+    xvmul.w xr8, xr8, xr1
+    xvadd.w xr5, xr5, xr2
+    xvadd.w xr6, xr6, xr2
+    xvadd.w xr7, xr7, xr2
+    xvadd.w xr8, xr8, xr2
+    xvsra.w xr5, xr5, xr3
+    xvsra.w xr6, xr6, xr3
+    xvsra.w xr7, xr7, xr3
+    xvsra.w xr8, xr8, xr3
+    xvadd.w xr5, xr5, xr4
+    xvadd.w xr6, xr6, xr4
+    xvadd.w xr7, xr7, xr4
+    xvadd.w xr8, xr8, xr4
+    xvssrani.h.w xr6, xr5, 0
+    xvssrani.h.w xr8, xr7, 0
+    xvssrani.bu.h xr8, xr6, 0
+.if \w == 16
+    vst vr8, \dst0, 0
+    add.d t2, \dst0, a1
+    xvpermi.q xr8, xr8, 0x01
+    vst vr8, t2, 0
+.elseif \w == 24
+    vst vr8, \dst0, 0
+    xvstelm.d xr8, \dst0, 16, 2
+.else
+    xvst xr8, \dst0, 0
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
+    LOAD_VAR 128
+    srli.w t0, a4, 1
+.LOOP_PIXELS4:
+    vldrepl.w vr0, a2, 0
+    add.d t1, a2, a3
+    vldrepl.w vr5, t1, 0
+    vsllwil.hu.bu vr0, vr0, 0
+    vsllwil.wu.hu vr0, vr0, 0
+    vsllwil.hu.bu vr5, vr5, 0
+    vsllwil.wu.hu vr5, vr5, 0
+    vslli.w vr0, vr0, 6
+    vslli.w vr5, vr5, 6
+    vmul.w vr0, vr0, vr1
+    vmul.w vr5, vr5, vr1
+    vadd.w vr0, vr0, vr2
+    vadd.w vr5, vr5, vr2
+    vsra.w vr0, vr0, vr3
+    vsra.w vr5, vr5, vr3
+    vadd.w vr0, vr0, vr4
+    vadd.w vr5, vr5, vr4
+    vssrani.h.w vr5, vr0, 0
+    vssrani.bu.h vr5, vr5, 0
+    fst.s f5, a0, 0
+    add.d t2, a0, a1
+    vstelm.w vr5, t2, 0, 1
+    alsl.d a2, a3, a2, 1
+    alsl.d a0, a1, a0, 1
+    addi.w t0, t0, -1
+    bnez t0, .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS6:
+    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
+    LOAD_VAR 256
+    srli.w t0, a4, 1
+.LOOP_PIXELS6_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
+    alsl.d a2, a3, a2, 1
+    alsl.d a0, a1, a0, 1
+    addi.w t0, t0, -1
+    bnez t0, .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS8:
+    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
+    LOAD_VAR 256
+    srli.w t0, a4, 1
+.LOOP_PIXELS8_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
+    alsl.d a2, a3, a2, 1
+    alsl.d a0, a1, a0, 1
+    addi.w t0, t0, -1
+    bnez t0, .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS12:
+    vld vr0, a2, 0
+    vexth.hu.bu vr7, vr0
+    vsllwil.wu.hu vr7, vr7, 0
+    vsllwil.hu.bu vr5, vr0, 0
+    vexth.wu.hu vr6, vr5
+    vsllwil.wu.hu vr5, vr5, 0
+    vslli.w vr5, vr5, 6
+    vslli.w vr6, vr6, 6
+    vslli.w vr7, vr7, 6
+    vmul.w vr5, vr5, vr1
+    vmul.w vr6, vr6, vr1
+    vmul.w vr7, vr7, vr1
+    vadd.w vr5, vr5, vr2
+    vadd.w vr6, vr6, vr2
+    vadd.w vr7, vr7, vr2
+    vsra.w vr5, vr5, vr3
+    vsra.w vr6, vr6, vr3
+    vsra.w vr7, vr7, vr3
+    vadd.w vr5, vr5, vr4
+    vadd.w vr6, vr6, vr4
+    vadd.w vr7, vr7, vr4
+    vssrani.h.w vr6, vr5, 0
+    vssrani.h.w vr7, vr7, 0
+    vssrani.bu.h vr7, vr6, 0
+    fst.d f7, a0, 0
+    vstelm.w vr7, a0, 8, 2
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS12_LASX:
+    vld vr0, a2, 0
+    xvpermi.d xr0, xr0, 0xd8
+    xvsllwil.hu.bu xr0, xr0, 0
+    xvexth.wu.hu xr6, xr0
+    xvsllwil.wu.hu xr5, xr0, 0
+    xvslli.w xr5, xr5, 6
+    xvslli.w xr6, xr6, 6
+    xvmul.w xr5, xr5, xr1
+    xvmul.w xr6, xr6, xr1
+    xvadd.w xr5, xr5, xr2
+    xvadd.w xr6, xr6, xr2
+    xvsra.w xr5, xr5, xr3
+    xvsra.w xr6, xr6, xr3
+    xvadd.w xr5, xr5, xr4
+    xvadd.w xr6, xr6, xr4
+    xvssrani.h.w xr6, xr5, 0
+    xvpermi.q xr7, xr6, 0x01
+    xvssrani.bu.h xr7, xr6, 0
+    fst.d f7, a0, 0
+    vstelm.w vr7, a0, 8, 2
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS16:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
+    LOAD_VAR 256
+    srli.w t0, a4, 1
+.LOOP_PIXELS16_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
+    alsl.d a2, a3, a2, 1
+    alsl.d a0, a1, a0, 1
+    addi.w t0, t0, -1
+    bnez t0, .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS24:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d t0, a2, 16
+    addi.d t1, a0, 16
+    HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS24_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS32:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d t0, a2, 16
+    addi.d t1, a0, 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS32_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS48:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d t0, a2, 16
+    addi.d t1, a0, 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d t0, a2, 32
+    addi.d t1, a0, 32
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS48_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    addi.d t0, a2, 32
+    addi.d t1, a0, 32
+    HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS64:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d t0, a2, 16
+    addi.d t1, a0, 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d t0, a2, 32
+    addi.d t1, a0, 32
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d t0, a2, 48
+    addi.d t1, a0, 48
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS64_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    addi.d t0, a2, 32
+    addi.d t1, a0, 32
+    HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32
+    add.d a2, a2, a3
+    add.d a0, a0, a1
+    addi.w a4, a4, -1
+    bnez a4, .LOOP_PIXELS64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@
 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
 
 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@@ -160,6 +161,26 @@
             c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
             c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
             c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
             c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
             c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
new file mode 100644
index 0000000000..819c3c3ecf
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,  \
+                                                          ptrdiff_t      \
+                                                          dst_stride,    \
+                                                          const uint8_t *src, \
+                                                          ptrdiff_t      \
+                                                          src_stride,    \
+                                                          int height,    \
+                                                          int denom,     \
+                                                          int wx,        \
+                                                          int ox,        \
+                                                          intptr_t mx,   \
+                                                          intptr_t my,   \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
+#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index ac509984fd..0d724a90ef 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                      \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src, \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int wx,        \
+                                                         int ox,        \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
 #endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
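As a usage note, each PEL_UNI_W(pel, pixels, N) line above expands to one prototype whose parameter list matches the put_hevc_qpel_uni_w/put_hevc_epel_uni_w function-pointer slots filled in hevcdsp_init_loongarch.c. For example, PEL_UNI_W(pel, pixels, 4) in hevcdsp_lsx.h expands to:

```c
void ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                              const uint8_t *src, ptrdiff_t src_stride,
                                              int height, int denom, int wx, int ox,
                                              intptr_t mx, intptr_t my, int width);
```

The assembly only reads dst, dst_stride, src, src_stride, height, denom, wx, and ox (registers a0 through a7); the trailing mx, my, and width parameters exist so the pixel-copy variants fit the same shared signature as the filtered qpel/epel cases.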