diff options
author | jinbo <jinbo@loongson.cn> | 2023-12-28 16:21:00 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2024-01-12 23:35:40 +0100 |
commit | cfbdda607d02f9e23ead8252243643e167d38414 (patch) | |
tree | c22dd53666503b00cc79521b6f7e2293e6858e84 | |
parent | c915dc4c5059730a5263ac8d4c99e47d13db87da (diff) | |
download | ffmpeg-cfbdda607d02f9e23ead8252243643e167d38414.tar.gz |
avcodec/hevc: Add add_residual_4/8/16/32 asm opt
After this patch, the peformance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves 2fps (45fps-->47fsp).
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- | libavcodec/loongarch/Makefile | 3 | ||||
-rw-r--r-- | libavcodec/loongarch/hevc_add_res.S | 162 | ||||
-rw-r--r-- | libavcodec/loongarch/hevcdsp_init_loongarch.c | 5 | ||||
-rw-r--r-- | libavcodec/loongarch/hevcdsp_lsx.h | 5 |
4 files changed, 174 insertions, 1 deletions
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index 06cfab5c20..07ea97f803 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -27,7 +27,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ loongarch/hevc_lpf_sao_lsx.o \ loongarch/hevc_mc_bi_lsx.o \ loongarch/hevc_mc_uni_lsx.o \ - loongarch/hevc_mc_uniw_lsx.o + loongarch/hevc_mc_uniw_lsx.o \ + loongarch/hevc_add_res.o LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \ loongarch/h264idct_loongarch.o \ loongarch/h264dsp.o diff --git a/libavcodec/loongarch/hevc_add_res.S b/libavcodec/loongarch/hevc_add_res.S new file mode 100644 index 0000000000..dd2d820af8 --- /dev/null +++ b/libavcodec/loongarch/hevc_add_res.S @@ -0,0 +1,162 @@ +/* + * Loongson LSX optimized add_residual functions for HEVC decoding + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by jinbo <jinbo@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "loongson_asm.S" + +/* + * void ff_hevc_add_residual4x4_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +.macro ADD_RES_LSX_4x4_8 + vldrepl.w vr0, a0, 0 + add.d t0, a0, a2 + vldrepl.w vr1, t0, 0 + vld vr2, a1, 0 + + vilvl.w vr1, vr1, vr0 + vsllwil.hu.bu vr1, vr1, 0 + vadd.h vr1, vr1, vr2 + vssrani.bu.h vr1, vr1, 0 + + vstelm.w vr1, a0, 0, 0 + vstelm.w vr1, t0, 0, 1 +.endm + +function ff_hevc_add_residual4x4_8_lsx + ADD_RES_LSX_4x4_8 + alsl.d a0, a2, a0, 1 + addi.d a1, a1, 16 + ADD_RES_LSX_4x4_8 +endfunc + +/* + * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +.macro ADD_RES_LSX_8x8_8 + vldrepl.d vr0, a0, 0 + add.d t0, a0, a2 + vldrepl.d vr1, t0, 0 + add.d t1, t0, a2 + vldrepl.d vr2, t1, 0 + add.d t2, t1, a2 + vldrepl.d vr3, t2, 0 + + vld vr4, a1, 0 + addi.d t3, zero, 16 + vldx vr5, a1, t3 + addi.d t4, a1, 32 + vld vr6, t4, 0 + vldx vr7, t4, t3 + + vsllwil.hu.bu vr0, vr0, 0 + vsllwil.hu.bu vr1, vr1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.hu.bu vr3, vr3, 0 + vadd.h vr0, vr0, vr4 + vadd.h vr1, vr1, vr5 + vadd.h vr2, vr2, vr6 + vadd.h vr3, vr3, vr7 + vssrani.bu.h vr1, vr0, 0 + vssrani.bu.h vr3, vr2, 0 + + vstelm.d vr1, a0, 0, 0 + vstelm.d vr1, t0, 0, 1 + vstelm.d vr3, t1, 0, 0 + vstelm.d vr3, t2, 0, 1 +.endm + +function ff_hevc_add_residual8x8_8_lsx + ADD_RES_LSX_8x8_8 + alsl.d a0, a2, a0, 2 + addi.d a1, a1, 64 + ADD_RES_LSX_8x8_8 +endfunc + +/* + * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +function ff_hevc_add_residual16x16_8_lsx +.rept 8 + vld vr0, a0, 0 + vldx vr2, a0, a2 + + vld vr4, a1, 0 + addi.d t0, zero, 16 + vldx vr5, a1, t0 + addi.d t1, a1, 32 + vld vr6, t1, 0 + vldx vr7, t1, t0 + + vexth.hu.bu vr1, vr0 + vsllwil.hu.bu vr0, vr0, 0 + vexth.hu.bu vr3, vr2 + vsllwil.hu.bu vr2, vr2, 0 + vadd.h vr0, vr0, vr4 + vadd.h vr1, vr1, vr5 + vadd.h vr2, vr2, vr6 + vadd.h vr3, vr3, vr7 + + vssrani.bu.h vr1, vr0, 0 + vssrani.bu.h vr3, vr2, 0 + + vst vr1, a0, 0 + vstx vr3, a0, a2 + + alsl.d a0, a2, a0, 1 + addi.d a1, a1, 64 +.endr +endfunc + +/* + * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +function ff_hevc_add_residual32x32_8_lsx +.rept 32 + vld vr0, a0, 0 + addi.w t0, zero, 16 + vldx vr2, a0, t0 + + vld vr4, a1, 0 + vldx vr5, a1, t0 + addi.d t1, a1, 32 + vld vr6, t1, 0 + vldx vr7, t1, t0 + + vexth.hu.bu vr1, vr0 + vsllwil.hu.bu vr0, vr0, 0 + vexth.hu.bu vr3, vr2 + vsllwil.hu.bu vr2, vr2, 0 + vadd.h vr0, vr0, vr4 + vadd.h vr1, vr1, vr5 + vadd.h vr2, vr2, vr6 + vadd.h vr3, vr3, vr7 + + vssrani.bu.h vr1, vr0, 0 + vssrani.bu.h vr3, vr2, 0 + + vst vr1, a0, 0 + vstx vr3, a0, t0 + + add.d a0, a0, a2 + addi.d a1, a1, 64 +.endr +endfunc diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c index 5a96f3a4c9..a8f753dc86 100644 --- a/libavcodec/loongarch/hevcdsp_init_loongarch.c +++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c @@ -189,6 +189,11 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth) c->idct[1] = ff_hevc_idct_8x8_lsx; c->idct[2] = ff_hevc_idct_16x16_lsx; c->idct[3] = ff_hevc_idct_32x32_lsx; + + c->add_residual[0] = ff_hevc_add_residual4x4_8_lsx; + c->add_residual[1] = ff_hevc_add_residual8x8_8_lsx; + c->add_residual[2] = ff_hevc_add_residual16x16_8_lsx; + c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx; } } } diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h index 0d54196caf..ac509984fd 100644 --- a/libavcodec/loongarch/hevcdsp_lsx.h +++ b/libavcodec/loongarch/hevcdsp_lsx.h @@ -227,4 +227,9 @@ void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit); void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit); void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit); +void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); +void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); +void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); +void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); + #endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H |