diff options
author | jinbo <jinbo@loongson.cn> | 2023-12-28 16:21:01 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2024-01-12 23:35:40 +0100 |
commit | a28eea2a277bb58004dc7ecccd543fa4baf69170 (patch) | |
tree | 57dcb211a8d8a00175e28fda770ed9dc7f7c7b56 /libavcodec/loongarch/hevcdsp_init_loongarch.c | |
parent | cfbdda607d02f9e23ead8252243643e167d38414 (diff) | |
download | ffmpeg-a28eea2a277bb58004dc7ecccd543fa4baf69170.tar.gz |
avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
tests/checkasm/checkasm: C LSX LASX
put_hevc_pel_uni_w_pixels4_8_c: 2.7 1.0
put_hevc_pel_uni_w_pixels6_8_c: 6.2 2.0 1.5
put_hevc_pel_uni_w_pixels8_8_c: 10.7 2.5 1.7
put_hevc_pel_uni_w_pixels12_8_c: 23.0 5.5 5.0
put_hevc_pel_uni_w_pixels16_8_c: 41.0 8.2 5.0
put_hevc_pel_uni_w_pixels24_8_c: 91.0 19.7 13.2
put_hevc_pel_uni_w_pixels32_8_c: 161.7 32.5 16.2
put_hevc_pel_uni_w_pixels48_8_c: 354.5 73.7 43.0
put_hevc_pel_uni_w_pixels64_8_c: 641.5 130.0 64.2
Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with
8 threads is 1fps(47fps-->48fps).
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec/loongarch/hevcdsp_init_loongarch.c')
-rw-r--r-- | libavcodec/loongarch/hevcdsp_init_loongarch.c | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c index a8f753dc86..d0ee99d6b5 100644 --- a/libavcodec/loongarch/hevcdsp_init_loongarch.c +++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c @@ -22,6 +22,7 @@ #include "libavutil/loongarch/cpu.h" #include "hevcdsp_lsx.h" +#include "hevcdsp_lasx.h" void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth) { @@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth) c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx; c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx; + c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx; + c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx; + c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx; + c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx; + c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx; + c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx; + c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx; + c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx; + c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx; + + c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx; + c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx; + c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx; + c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx; + c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx; + c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx; + c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx; + c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx; + c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx; + c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx; c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx; c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx; @@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth) c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx; } } + + if (have_lasx(cpu_flags)) { + if (bit_depth == 8) { + c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx; + c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx; + c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx; + c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx; + c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx; + c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx; + c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx; + c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx; + + c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx; + c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx; + c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx; + c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx; + c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx; + c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx; + c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx; + c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx; + } + } } |