aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/loongarch/hevcdsp_init_loongarch.c
diff options
context:
space:
mode:
authorjinbo <jinbo@loongson.cn>2023-12-28 16:21:01 +0800
committerMichael Niedermayer <michael@niedermayer.cc>2024-01-12 23:35:40 +0100
commita28eea2a277bb58004dc7ecccd543fa4baf69170 (patch)
tree57dcb211a8d8a00175e28fda770ed9dc7f7c7b56 /libavcodec/loongarch/hevcdsp_init_loongarch.c
parentcfbdda607d02f9e23ead8252243643e167d38414 (diff)
downloadffmpeg-a28eea2a277bb58004dc7ecccd543fa4baf69170.tar.gz
avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
tests/checkasm/checkasm: C LSX LASX put_hevc_pel_uni_w_pixels4_8_c: 2.7 1.0 put_hevc_pel_uni_w_pixels6_8_c: 6.2 2.0 1.5 put_hevc_pel_uni_w_pixels8_8_c: 10.7 2.5 1.7 put_hevc_pel_uni_w_pixels12_8_c: 23.0 5.5 5.0 put_hevc_pel_uni_w_pixels16_8_c: 41.0 8.2 5.0 put_hevc_pel_uni_w_pixels24_8_c: 91.0 19.7 13.2 put_hevc_pel_uni_w_pixels32_8_c: 161.7 32.5 16.2 put_hevc_pel_uni_w_pixels48_8_c: 354.5 73.7 43.0 put_hevc_pel_uni_w_pixels64_8_c: 641.5 130.0 64.2 Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads is 1fps(47fps-->48fps). Reviewed-by: yinshiyou-hf@loongson.cn Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec/loongarch/hevcdsp_init_loongarch.c')
-rw-r--r--libavcodec/loongarch/hevcdsp_init_loongarch.c43
1 files changed, 43 insertions, 0 deletions
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@
#include "libavutil/loongarch/cpu.h"
#include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
{
@@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+ c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+ c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+ c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+ c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+ c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+ c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+ c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+ c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+ c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+ c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+ c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+ c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+ c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+ c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+ c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+ c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+ c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+ c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
}
}
+
+ if (have_lasx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+ c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+ c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+ c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+ c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+ c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+ c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+ c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+ c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+ c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+ c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+ c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+ c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+ c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+ c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+ c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+ }
+ }
}