author    | Lu Wang <wanglu@loongson.cn>                 | 2022-02-17 19:11:49 +0800
committer | Michael Niedermayer <michael@niedermayer.cc> | 2022-03-01 23:53:40 +0100
commit    | b6ceeee16bebab698321cd03f7010701e92294b4 (patch)
tree      | bc22b515dbd2fc08f1c9d029498e3279f213d650 /libavcodec
parent    | 20194d573d75f637f45181e8a3a88f71fea2c97e (diff)
download  | ffmpeg-b6ceeee16bebab698321cd03f7010701e92294b4.tar.gz
avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 110fps
after : 124fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
Reviewed-by: 殷时友 <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/loongarch/Makefile                 |    2
-rw-r--r-- | libavcodec/loongarch/hevc_idct_lsx.c          |  842
-rw-r--r-- | libavcodec/loongarch/hevc_lpf_sao_lsx.c       | 2485
-rw-r--r-- | libavcodec/loongarch/hevcdsp_init_loongarch.c |   19
-rw-r--r-- | libavcodec/loongarch/hevcdsp_lsx.h            |   26
5 files changed, 3374 insertions, 0 deletions
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index 9f416d0c6e..cfc8e3aaff 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -26,3 +26,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \ loongarch/vp9_lpf_lsx.o \ loongarch/vp9_idct_lsx.o LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ + loongarch/hevc_idct_lsx.o \ + loongarch/hevc_lpf_sao_lsx.o diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c new file mode 100644 index 0000000000..2193b27546 --- /dev/null +++ b/libavcodec/loongarch/hevc_idct_lsx.c @@ -0,0 +1,842 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> + * Hao Chen <chenhao@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/loongarch/loongson_intrinsics.h" +#include "hevcdsp_lsx.h" + +static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = { + 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18 +}; + +static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = { + 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43, + 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90, + 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57, + 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25 +}; + +static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = { + 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, + 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, + 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, + 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, + 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, + 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, + 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, + 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, + 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, + 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, + 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, + 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, + 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, + 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, + 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, + 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 +}; + +static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = { + 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, 
-90, -70, -25, + 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, + 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, + 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90 +}; + +static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = { + 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89 +}; + +#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \ + sum0, sum1, sum2, sum3, shift) \ +{ \ + __m128i vec0, vec1, vec2, vec3, vec4, vec5; \ + __m128i cnst64 = __lsx_vldi(0x0840); \ + __m128i cnst83 = __lsx_vldi(0x0853); \ + __m128i cnst36 = __lsx_vldi(0x0824); \ + \ + vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \ + vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \ + vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \ + vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \ + vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \ + vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \ + \ + sum0 = __lsx_vadd_w(vec0, vec2); \ + sum1 = __lsx_vsub_w(vec0, vec2); \ + vec1 = __lsx_vadd_w(vec1, vec3); \ + vec4 = __lsx_vsub_w(vec4, vec5); \ + sum2 = __lsx_vsub_w(sum1, vec4); \ + sum3 = __lsx_vsub_w(sum0, vec1); \ + sum0 = __lsx_vadd_w(sum0, vec1); \ + sum1 = __lsx_vadd_w(sum1, vec4); \ + \ + sum0 = __lsx_vsrari_w(sum0, shift); \ + sum1 = __lsx_vsrari_w(sum1, shift); \ + sum2 = __lsx_vsrari_w(sum2, shift); \ + sum3 = __lsx_vsrari_w(sum3, shift); \ + sum0 = __lsx_vsat_w(sum0, 15); \ + sum1 = __lsx_vsat_w(sum1, 15); \ + sum2 = __lsx_vsat_w(sum2, 15); \ + sum3 = __lsx_vsat_w(sum3, 15); \ +} + +#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \ +{ \ + __m128i src0_r, src1_r, src2_r, src3_r; \ + __m128i src0_l, src1_l, src2_l, src3_l; \ + __m128i filter0, filter1, filter2, filter3; \ + __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \ + __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \ + __m128i sum0_r, sum1_r, sum2_r, sum3_r; \ + __m128i sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \ + src0_r, src1_r, src2_r, src3_r); \ + DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \ + src0_l, src1_l, src2_l, src3_l); \ + \ + DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \ + filter, 12, filter0, filter1, filter2, filter3); \ + DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \ + src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \ + temp1_r, temp1_l); \ + \ + LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\ + sum1_l, sum1_r); \ + sum2_r = sum1_r; \ + sum2_l = sum1_l; \ + sum3_r = sum0_r; \ + sum3_l = sum0_l; \ + \ + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \ + src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \ + temp3_r, temp3_l); \ + temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \ + temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \ + sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \ + sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \ + sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \ + sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \ + \ + in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \ + in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \ + \ + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \ + src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \ + temp5_r, temp5_l); \ + temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \ + temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \ + sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \ + sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \ + sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \ + sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \ 
+ \ + in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \ + in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \ + \ + DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \ + filter, 28, filter0, filter1, filter2, filter3); \ + DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \ + src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \ + temp1_r, temp1_l); \ + \ + LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\ + sum1_l, sum1_r); \ + sum2_r = sum1_r; \ + sum2_l = sum1_l; \ + sum3_r = sum0_r; \ + sum3_l = sum0_l; \ + \ + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \ + src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \ + temp3_r, temp3_l); \ + temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \ + temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \ + sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \ + sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \ + sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \ + sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \ + \ + in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \ + in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \ + \ + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \ + src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \ + temp5_r, temp5_l); \ + temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \ + temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \ + sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \ + sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \ + sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \ + sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \ + \ + in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \ + in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \ +} + +#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \ + src4_r, src5_r, src6_r, src7_r, \ + src0_l, src1_l, src2_l, src3_l, \ + src4_l, src5_l, src6_l, src7_l, shift) \ +{ \ + int16_t *ptr0, *ptr1; \ + __m128i dst0, dst1; \ + __m128i filter0, filter1, filter2, filter3; \ + __m128i temp0_r, temp1_r, temp0_l, temp1_l; \ + __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \ + __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \ + \ + ptr0 = (buf_ptr + 112); \ + ptr1 = (buf_ptr + 128); \ + k = -1; \ + \ + for (j = 0; j < 4; j++) \ + { \ + DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \ + filter, 20, filter0, filter1, filter2, filter3); \ + DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \ + src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \ + sum2_r, sum2_l); \ + DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \ + sum3_r, sum3_l); \ + DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \ + src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \ + src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \ + DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \ + src6_l, filter3, sum3_r, sum3_l); \ + \ + sum1_r = sum0_r; \ + sum1_l = sum0_l; \ + \ + DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \ + filter, 28, filter0, filter1, filter2, filter3); \ + filter += 16; \ + DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \ + temp0_r, temp0_l); \ + DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \ + src6_l, filter2, sum2_r, sum2_l); \ + DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \ + temp1_r, temp1_l); \ + \ + sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \ + sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \ + sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \ + sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \ + sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \ + sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \ + \ + 
DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \ + temp0_r, temp0_l); \ + DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \ + src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \ + src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \ + \ + sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \ + sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \ + sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \ + sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \ + \ + LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \ + res1_l, res1_r); \ + dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \ + dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \ + __lsx_vst(dst0, buf_ptr, 0); \ + __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \ + \ + LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \ + res1_l, res1_r); \ + \ + dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \ + dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \ + __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \ + __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \ + \ + k *= -1; \ + buf_ptr += 16; \ + } \ +} + +#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \ +{ \ + tmp0_r = __lsx_vld(input + load_idx * 8, 0); \ + tmp0_l = __lsx_vld(input + load_idx * 8, 16); \ + tmp1_r = sum0_r; \ + tmp1_l = sum0_l; \ + sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \ + sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \ + __lsx_vst(sum0_r, (input + load_idx * 8), 0); \ + __lsx_vst(sum0_l, (input + load_idx * 8), 16); \ + tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \ + tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \ + __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \ + __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \ +} + +#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \ + res0, res1, res2, res3, shift) \ +{ \ + __m128i vec0, vec1, vec2, vec3; \ + __m128i cnst74 = __lsx_vldi(0x84a); \ + __m128i cnst55 = __lsx_vldi(0x837); \ + __m128i cnst29 = __lsx_vldi(0x81d); \ + \ + vec0 = __lsx_vadd_w(in_r0, in_r1); \ + vec2 = __lsx_vsub_w(in_r0, in_l1); \ + res0 = __lsx_vmul_w(vec0, cnst29); \ + res1 = __lsx_vmul_w(vec2, cnst55); \ + res2 = __lsx_vsub_w(in_r0, in_r1); \ + vec1 = __lsx_vadd_w(in_r1, in_l1); \ + res2 = __lsx_vadd_w(res2, in_l1); \ + vec3 = __lsx_vmul_w(in_l0, cnst74); \ + res3 = __lsx_vmul_w(vec0, cnst55); \ + \ + res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \ + res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \ + res2 = __lsx_vmul_w(res2, cnst74); \ + res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \ + \ + res0 = __lsx_vadd_w(res0, vec3); \ + res1 = __lsx_vadd_w(res1, vec3); \ + res3 = __lsx_vsub_w(res3, vec3); \ + \ + res0 = __lsx_vsrari_w(res0, shift); \ + res1 = __lsx_vsrari_w(res1, shift); \ + res2 = __lsx_vsrari_w(res2, shift); \ + res3 = __lsx_vsrari_w(res3, shift); \ + res0 = __lsx_vsat_w(res0, 15); \ + res1 = __lsx_vsat_w(res1, 15); \ + res2 = __lsx_vsat_w(res2, 15); \ + res3 = __lsx_vsat_w(res3, 15); \ +} + +void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit) +{ + __m128i in0, in1; + __m128i in_r0, in_l0, in_r1, in_l1; + __m128i sum0, sum1, sum2, sum3; + __m128i zero = __lsx_vldi(0x00); + + in0 = __lsx_vld(coeffs, 0); + in1 = __lsx_vld(coeffs, 16); + in_r0 = __lsx_vilvl_h(zero, in0); + in_l0 = __lsx_vilvh_h(zero, in0); + in_r1 = __lsx_vilvl_h(zero, in1); + in_l1 = __lsx_vilvh_h(zero, in1); + + HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7); + LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1); + HEVC_IDCT4x4_COL(in_r0, in_l0, 
in_r1, in_l1, sum0, sum1, sum2, sum3, 12); + + /* Pack and transpose */ + in0 = __lsx_vpickev_h(sum2, sum0); + in1 = __lsx_vpickev_h(sum3, sum1); + sum0 = __lsx_vilvl_h(in1, in0); + sum1 = __lsx_vilvh_h(in1, in0); + in0 = __lsx_vilvl_w(sum1, sum0); + in1 = __lsx_vilvh_w(sum1, sum0); + + __lsx_vst(in0, coeffs, 0); + __lsx_vst(in1, coeffs, 16); +} + +void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit) +{ + const int16_t *filter = >8x8_cnst[0]; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32, + coeffs, 48, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96, + coeffs, 112, in4, in5, in6, in7); + HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, coeffs, 0); + __lsx_vst(in1, coeffs, 16); + __lsx_vst(in2, coeffs, 32); + __lsx_vst(in3, coeffs, 48); + __lsx_vst(in4, coeffs, 64); + __lsx_vst(in5, coeffs, 80); + __lsx_vst(in6, coeffs, 96); + __lsx_vst(in7, coeffs, 112); +} + +void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit) +{ + int16_t i, j, k; + int16_t buf[256]; + int16_t *buf_ptr = &buf[0]; + int16_t *src = coeffs; + const int16_t *filter = >16x16_cnst[0]; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; + + for (i = 2; i--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, + in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224, + in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352, + in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480, + in12, in13, in14, in15); + + DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10, + src0_r, src1_r, src2_r, src3_r); + DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15, + src4_r, src5_r, src6_r, src7_r); + DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10, + src0_l, src1_l, src2_l, src3_l); + DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15, + src4_l, src5_l, src6_l, src7_l); + + HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, + src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, + src4_l, src5_l, src6_l, src7_l, 7); + + src += 8; + buf_ptr = (&buf[0] + 8); + filter = >16x16_cnst[0]; + } + + src = &buf[0]; + buf_ptr = coeffs; + filter = >16x16_cnst[0]; + + for (i = 2; i--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, + in0, in8, in1, in9); + DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112, + in2, in10, in3, in11); + DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176, + in4, in12, in5, in13); + DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240, + in6, in14, in7, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10, + src0_r, src1_r, 
src2_r, src3_r); + DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15, + src4_r, src5_r, src6_r, src7_r); + DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10, + src0_l, src1_l, src2_l, src3_l); + DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15, + src4_l, src5_l, src6_l, src7_l); + HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, + src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, + src4_l, src5_l, src6_l, src7_l, 12); + + src += 128; + buf_ptr = coeffs + 8; + filter = >16x16_cnst[0]; + } + + DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96, + in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + __lsx_vst(vec0, coeffs, 0); + __lsx_vst(vec1, coeffs, 32); + __lsx_vst(vec2, coeffs, 64); + __lsx_vst(vec3, coeffs, 96); + __lsx_vst(vec4, coeffs, 128); + __lsx_vst(vec5, coeffs, 160); + __lsx_vst(vec6, coeffs, 192); + __lsx_vst(vec7, coeffs, 224); + + src = coeffs + 8; + DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + src = coeffs + 128; + DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, + in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224, + in12, in13, in14, in15); + + __lsx_vst(vec0, src, 0); + __lsx_vst(vec1, src, 32); + __lsx_vst(vec2, src, 64); + __lsx_vst(vec3, src, 96); + __lsx_vst(vec4, src, 128); + __lsx_vst(vec5, src, 160); + __lsx_vst(vec6, src, 192); + __lsx_vst(vec7, src, 224); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + src = coeffs + 8; + __lsx_vst(vec0, src, 0); + __lsx_vst(vec1, src, 32); + __lsx_vst(vec2, src, 64); + __lsx_vst(vec3, src, 96); + __lsx_vst(vec4, src, 128); + __lsx_vst(vec5, src, 160); + __lsx_vst(vec6, src, 192); + __lsx_vst(vec7, src, 224); + + src = coeffs + 136; + DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, + in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + __lsx_vst(vec0, src, 0); + __lsx_vst(vec1, src, 32); + __lsx_vst(vec2, src, 64); + __lsx_vst(vec3, src, 96); + __lsx_vst(vec4, src, 128); + __lsx_vst(vec5, src, 160); + __lsx_vst(vec6, src, 192); + __lsx_vst(vec7, src, 224); +} + +static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch, + uint8_t round) +{ + uint8_t i; + int32_t buf_pitch_2 = buf_pitch << 1; + int32_t buf_pitch_4 = buf_pitch << 2; + int32_t buf_pitch_8 = buf_pitch << 3; + int32_t buf_pitch_16 = buf_pitch << 4; + + const int16_t *filter_ptr0 = >32x32_cnst0[0]; + const int16_t *filter_ptr1 = >32x32_cnst1[0]; + const int16_t *filter_ptr2 = >32x32_cnst2[0]; + const int16_t *filter_ptr3 = >8x8_cnst[0]; + int16_t *src0 = (coeffs + buf_pitch); + int16_t *src1 = (coeffs + buf_pitch_2); + int16_t *src2 = (coeffs + buf_pitch_4); + int16_t *src3 = (coeffs); + int32_t tmp_buf[8 * 32 + 15]; + int32_t *tmp_buf_ptr = tmp_buf + 15; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + 
__m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; + __m128i filter0, filter1, filter2, filter3; + __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l; + + /* Align pointer to 64 byte boundary */ + tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63); + + /* process coeff 4, 12, 20, 28 */ + in0 = __lsx_vld(src2, 0); + in1 = __lsx_vld(src2 + buf_pitch_8, 0); + in2 = __lsx_vld(src2 + buf_pitch_16, 0); + in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0); + in4 = __lsx_vld(src3, 0); + in5 = __lsx_vld(src3 + buf_pitch_8, 0); + in6 = __lsx_vld(src3 + buf_pitch_16, 0); + in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0); + DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5, + src0_r, src1_r, src2_r, src3_r); + DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5, + src0_l, src1_l, src2_l, src3_l); + + filter0 = __lsx_vldrepl_w(filter_ptr2, 0); + filter1 = __lsx_vldrepl_w(filter_ptr2, 4); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + __lsx_vst(sum0_r, tmp_buf_ptr, 0); + __lsx_vst(sum0_l, tmp_buf_ptr, 16); + + filter0 = __lsx_vldrepl_w(filter_ptr2, 8); + filter1 = __lsx_vldrepl_w(filter_ptr2, 12); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + __lsx_vst(sum0_r, tmp_buf_ptr, 32); + __lsx_vst(sum0_l, tmp_buf_ptr, 48); + + filter0 = __lsx_vldrepl_w(filter_ptr2, 16); + filter1 = __lsx_vldrepl_w(filter_ptr2, 20); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + __lsx_vst(sum0_r, tmp_buf_ptr, 64); + __lsx_vst(sum0_l, tmp_buf_ptr, 80); + + filter0 = __lsx_vldrepl_w(filter_ptr2, 24); + filter1 = __lsx_vldrepl_w(filter_ptr2, 28); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + __lsx_vst(sum0_r, tmp_buf_ptr, 96); + __lsx_vst(sum0_l, tmp_buf_ptr, 112); + + /* process coeff 0, 8, 16, 24 */ + filter0 = __lsx_vldrepl_w(filter_ptr3, 0); + filter1 = __lsx_vldrepl_w(filter_ptr3, 4); + + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, + src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); + sum1_r = __lsx_vsub_w(sum0_r, tmp1_r); + sum1_l = __lsx_vsub_w(sum0_l, tmp1_l); + sum0_r = __lsx_vadd_w(sum0_r, tmp1_r); + sum0_l = __lsx_vadd_w(sum0_l, tmp1_l); + + HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7); + HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4); + + filter0 = __lsx_vldrepl_w(filter_ptr3, 16); + filter1 = __lsx_vldrepl_w(filter_ptr3, 20); + + DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, + src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); + sum1_r = __lsx_vsub_w(sum0_r, tmp1_r); + sum1_l = __lsx_vsub_w(sum0_l, tmp1_l); + sum0_r = __lsx_vadd_w(sum0_r, tmp1_r); + sum0_l = __lsx_vadd_w(sum0_l, tmp1_l); + + HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6); + HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5); + + /* process coeff 2 6 10 14 18 22 26 30 */ + in0 = __lsx_vld(src1, 0); + in1 = __lsx_vld(src1 + buf_pitch_4, 0); + in2 = __lsx_vld(src1 + buf_pitch_8, 
0); + in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0); + in4 = __lsx_vld(src1 + buf_pitch_16, 0); + in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0); + in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0); + in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0); + + DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6, + src0_r, src1_r, src2_r, src3_r); + DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6, + src0_l, src1_l, src2_l, src3_l); + + /* loop for all columns of constants */ + for (i = 0; i < 8; i++) { + /* processing single column of constants */ + filter0 = __lsx_vldrepl_w(filter_ptr1, 0); + filter1 = __lsx_vldrepl_w(filter_ptr1, 4); + filter2 = __lsx_vldrepl_w(filter_ptr1, 8); + filter3 = __lsx_vldrepl_w(filter_ptr1, 12); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3); + + tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0); + tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16); + tmp1_r = tmp0_r; + tmp1_l = tmp0_l; + tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r); + tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l); + tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r); + tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l); + __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0); + __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16); + __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0); + __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16); + + filter_ptr1 += 8; + } + + /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */ + in0 = __lsx_vld(src0, 0); + in1 = __lsx_vld(src0 + buf_pitch_2, 0); + in2 = __lsx_vld(src0 + buf_pitch_4, 0); + in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0); + in4 = __lsx_vld(src0 + buf_pitch_8, 0); + in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0); + in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0); + in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0); + + src0 += 16 * buf_pitch; + DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6, + src0_r, src1_r, src2_r, src3_r); + DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6, + src0_l, src1_l, src2_l, src3_l); + in0 = __lsx_vld(src0, 0); + in1 = __lsx_vld(src0 + buf_pitch_2, 0); + in2 = __lsx_vld(src0 + buf_pitch_4, 0); + in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0); + in4 = __lsx_vld(src0 + buf_pitch_8, 0); + in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0); + in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0); + in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0); + + DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6, + src4_r, src5_r, src6_r, src7_r); + DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6, + src4_l, src5_l, src6_l, src7_l); + + /* loop for all columns of filter constants */ + for (i = 0; i < 16; i++) { + /* processing single column of constants */ + filter0 = __lsx_vldrepl_w(filter_ptr0, 0); + filter1 = __lsx_vldrepl_w(filter_ptr0, 4); + filter2 = __lsx_vldrepl_w(filter_ptr0, 8); + filter3 = __lsx_vldrepl_w(filter_ptr0, 12); + sum0_r = __lsx_vdp2_w_h(src0_r, filter0); + sum0_l = __lsx_vdp2_w_h(src0_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1); + sum0_r = 
__lsx_vdp2add_w_h(sum0_r, src2_r, filter2); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3); + tmp1_r = sum0_r; + tmp1_l = sum0_l; + + filter0 = __lsx_vldrepl_w(filter_ptr0, 16); + filter1 = __lsx_vldrepl_w(filter_ptr0, 20); + filter2 = __lsx_vldrepl_w(filter_ptr0, 24); + filter3 = __lsx_vldrepl_w(filter_ptr0, 28); + sum0_r = __lsx_vdp2_w_h(src4_r, filter0); + sum0_l = __lsx_vdp2_w_h(src4_l, filter0); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2); + sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3); + sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3); + sum0_r = __lsx_vadd_w(sum0_r, tmp1_r); + sum0_l = __lsx_vadd_w(sum0_l, tmp1_l); + + tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0); + tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16); + tmp1_r = tmp0_r; + tmp1_l = tmp0_l; + tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r); + tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l); + sum1_r = __lsx_vreplgr2vr_w(round); + tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r); + tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r); + in0 = __lsx_vpackev_d(tmp0_l, tmp0_r); + __lsx_vst(in0, (coeffs + i * buf_pitch), 0); + tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r); + tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l); + tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r); + tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r); + in0 = __lsx_vpackev_d(tmp1_l, tmp1_r); + __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0); + + filter_ptr0 += 16; + } +} + +static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf) +{ + uint8_t i; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + for (i = 0; i < 4; i++) { + DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128, + coeffs, 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384, + coeffs, 448, in4, in5, in6, in7); + coeffs += 8; + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + __lsx_vst(in0, tmp_buf, 0); + __lsx_vst(in1, tmp_buf, 16); + __lsx_vst(in2, tmp_buf, 32); + __lsx_vst(in3, tmp_buf, 48); + __lsx_vst(in4, tmp_buf, 64); + __lsx_vst(in5, tmp_buf, 80); + __lsx_vst(in6, tmp_buf, 96); + __lsx_vst(in7, tmp_buf, 112); + tmp_buf += 64; + } +} + +static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs) +{ + uint8_t i; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + for (i = 0; i < 4; i++) { + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32, + tmp_buf, 48, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96, + tmp_buf, 112, in4, in5, in6, in7); + tmp_buf += 64; + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + __lsx_vst(in0, coeffs, 0); + __lsx_vst(in1, coeffs, 64); + __lsx_vst(in2, coeffs, 128); + __lsx_vst(in3, coeffs, 192); + __lsx_vst(in4, coeffs, 256); + __lsx_vst(in5, coeffs, 320); + __lsx_vst(in6, coeffs, 384); + __lsx_vst(in7, coeffs, 448); + coeffs += 8; + } +} + +void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit) +{ + uint8_t row_cnt, col_cnt; + int16_t *src = coeffs; + int16_t tmp_buf[8 * 32 + 31]; + int16_t *tmp_buf_ptr = tmp_buf + 31; + uint8_t round; + int32_t buf_pitch; + + /* Align pointer to 64 byte boundary */ + tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63); + + /* column 
transform */ + round = 7; + buf_pitch = 32; + for (col_cnt = 0; col_cnt < 4; col_cnt++) { + /* process 8x32 blocks */ + hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round); + } + + /* row transform */ + round = 12; + buf_pitch = 8; + for (row_cnt = 0; row_cnt < 4; row_cnt++) { + /* process 32x8 blocks */ + src = (coeffs + 32 * 8 * row_cnt); + + hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr); + hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round); + hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src); + } +} diff --git a/libavcodec/loongarch/hevc_lpf_sao_lsx.c b/libavcodec/loongarch/hevc_lpf_sao_lsx.c new file mode 100644 index 0000000000..fc10e8eda8 --- /dev/null +++ b/libavcodec/loongarch/hevc_lpf_sao_lsx.c @@ -0,0 +1,2485 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Lu Wang <wanglu@loongson.cn> + * Hao Chen <chenhao@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/loongarch/loongson_intrinsics.h" +#include "hevcdsp_lsx.h" + +void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride, + int32_t beta, int32_t *tc, + uint8_t *p_is_pcm, uint8_t *q_is_pcm) +{ + ptrdiff_t stride_2x = (stride << 1); + ptrdiff_t stride_4x = (stride << 2); + ptrdiff_t stride_3x = stride_2x + stride; + uint8_t *p3 = src - stride_4x; + uint8_t *p2 = src - stride_3x; + uint8_t *p1 = src - stride_2x; + uint8_t *p0 = src - stride; + uint8_t *q0 = src; + uint8_t *q1 = src + stride; + uint8_t *q2 = src + stride_2x; + uint8_t *q3 = src + stride_3x; + uint8_t flag0, flag1; + int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434; + int32_t dp04, dq04, dp34, dq34, d04, d34; + int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; + int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; + + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec; + __m128i temp0, temp1; + __m128i temp2, tc_pos, tc_neg; + __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0; + __m128i zero = {0}; + __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; + + dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]); + dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]); + dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]); + dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]); + d00 = dp00 + dq00; + d30 = dp30 + dq30; + dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]); + dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]); + dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]); + dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]); + d04 = dp04 + dq04; + d34 = dp34 + dq34; + + p_is_pcm0 = p_is_pcm[0]; + p_is_pcm4 = p_is_pcm[1]; + q_is_pcm0 = q_is_pcm[0]; + q_is_pcm4 = q_is_pcm[1]; + + DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); 
+ d0030 = (d00 + d30) >= beta; + d0434 = (d04 + d34) >= beta; + DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1); + cmp3 = __lsx_vpackev_w(cmp1, cmp0); + cmp3 = __lsx_vseqi_w(cmp3, 0); + + if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) && + (!d0030 || !d0434)) { + DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0, + p3_src, p2_src, p1_src, p0_src); + DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + + tc0 = tc[0]; + beta30 = beta >> 3; + beta20 = beta >> 2; + tc250 = (((tc0 << 2) + tc0 + 1) >> 1); + tc4 = tc[1]; + tc254 = (((tc4 << 2) + tc4 + 1) >> 1); + + DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1); + DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero, + p0_src, p3_src, p2_src, p1_src, p0_src); + DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0, + q0_src, q1_src, q2_src, q3_src); + flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 && + abs(p0[0] - q0[0]) < tc250; + flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 && + abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 && + (d30 << 1) < beta20); + tc_pos = __lsx_vpackev_d(cmp1, cmp0); + DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, + zero, q3_src, q0_src, q1_src, q2_src, q3_src); + + flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 && + abs(p0[4] - q0[4]) < tc254; + flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 && + abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 && + (d34 << 1) < beta20); + DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1); + cmp2 = __lsx_vpackev_w(cmp1, cmp0); + cmp2 = __lsx_vseqi_w(cmp2, 0); + + if (flag0 && flag1) { /* strong only */ + /* strong filter */ + tc_pos = __lsx_vslli_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + /* p part */ + DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(p3_src, p2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst0 = __lsx_vadd_h(temp2, p2_src); + + temp1 = __lsx_vadd_h(temp0, p2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, p1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst1 = __lsx_vadd_h(temp2, p1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, + temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst2 = __lsx_vadd_h(temp2, p0_src); + + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, + p1_src, p_is_pcm_vec, dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); + + /* q part */ + DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(q3_src, q2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst5 = __lsx_vadd_h(temp2, q2_src); + + temp1 = __lsx_vadd_h(temp0, q2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, q1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst4 = __lsx_vadd_h(temp2, q1_src); + + temp0 
= __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src, + temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst3 = __lsx_vadd_h(temp2, q0_src); + + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, + q1_src, q_is_pcm_vec, dst3, dst4); + dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); + + /* pack results to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = __lsx_vpickev_b(dst5, dst4); + + /* pack src to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src, + dst3, dst4); + dst5 = __lsx_vpickev_b(q2_src, q1_src); + + cmp3 = __lsx_vnor_v(cmp3, cmp3); + DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3, + dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3); + + __lsx_vstelm_d(dst0, p2, 0, 0); + __lsx_vstelm_d(dst0, p2 + stride, 0, 1); + __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); + __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); + __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0); + __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1); + /* strong filter ends */ + } else if (flag0 == flag1) { /* weak only */ + /* weak filter */ + tc_neg = __lsx_vneg_h(tc_pos); + DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, + diff0, diff1); + DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, + __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); + delta0 = __lsx_vsub_h(diff0, diff1); + delta0 = __lsx_vsrari_h(delta0, 4); + temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), + __lsx_vslli_h(tc_pos, 1)); + abs_delta0 = __lsx_vadda_h(delta0, zero); + abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + + delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); + temp2 = __lsx_vadd_h(delta0, p0_src); + temp2 = __lsx_vclip255_h(temp2); + temp0 = __lsx_vbitsel_v(temp2, p0_src, + __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec)); + temp2 = __lsx_vsub_h(q0_src, delta0); + temp2 = __lsx_vclip255_h(temp2); + temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec, + q_is_pcm_vec)); + DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec, + q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec); + + tmp = (beta + (beta >> 1)) >> 3; + DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp, + cmp0, cmp1); + cmp0 = __lsx_vpackev_d(cmp1, cmp0); + cmp0 = __lsx_vseqi_d(cmp0, 0); + p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0); + + DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp, + cmp0, cmp1); + cmp0 = __lsx_vpackev_d(cmp1, cmp0); + cmp0 = __lsx_vseqi_d(cmp0, 0); + q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0); + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, + delta1, delta2); + DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, + delta1, delta2); + delta1 = __lsx_vadd_h(delta1, delta0); + delta2 = __lsx_vsub_h(delta2, delta0); + DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); + DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, + tc_neg, tc_pos, delta1, delta2); + DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, + delta1, delta2); + DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); + DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, + q1_src, q_is_pcm_vec, delta1, delta2); + + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + DUP4_ARG3(__lsx_vbitsel_v, delta1, 
p1_src, abs_delta0, temp0, + p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2, + q1_src, abs_delta0, dst1, dst2, dst3, dst4); + /* pack results to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1); + /* pack src to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src, + dst2, dst3); + cmp3 = __lsx_vnor_v(cmp3, cmp3); + DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3, + dst0, dst1); + + p2 += stride; + __lsx_vstelm_d(dst0, p2, 0, 0); + __lsx_vstelm_d(dst0, p2 + stride, 0, 1); + __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); + __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); + /* weak filter ends */ + } else { /* strong + weak */ + /* strong filter */ + tc_pos = __lsx_vslli_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + /* p part */ + DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(p3_src, p2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst0 = __lsx_vadd_h(temp2, p2_src); + + temp1 = __lsx_vadd_h(temp0, p2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, p1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst1 = __lsx_vadd_h(temp2, p1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst2 = __lsx_vadd_h(temp2, p0_src); + + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, + p1_src, p_is_pcm_vec, dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); + + /* q part */ + DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(q3_src, q2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst5 = __lsx_vadd_h(temp2, q2_src); + + temp1 = __lsx_vadd_h(temp0, q2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, q1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst4 = __lsx_vadd_h(temp2, q1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst3 = __lsx_vadd_h(temp2, q0_src); + + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, + q1_src, q_is_pcm_vec, dst3, dst4); + dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); + + /* pack strong results to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = __lsx_vpickev_b(dst5, dst4); + /* strong filter ends */ + + /* weak filter */ + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, + diff0, diff1); + DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, + __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); + delta0 = __lsx_vsub_h(diff0, diff1); + delta0 = __lsx_vsrari_h(delta0, 4); + temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), + __lsx_vslli_h(tc_pos, 1)); + 
abs_delta0 = __lsx_vadda_h(delta0, zero); + abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + + delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); + temp2 = __lsx_vadd_h(delta0, p0_src); + temp2 = __lsx_vclip255_h(temp2); + temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); + + temp2 = __lsx_vsub_h(q0_src, delta0); + temp2 = __lsx_vclip255_h(temp2); + temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); + + tmp = (beta + (beta >> 1)) >> 3; + DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp, + cmp0, cmp1); + cmp0 = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0)); + DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp, + cmp0, cmp1); + cmp0 = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0)); + + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, + delta1, delta2); + DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, + delta1, delta2); + delta1 = __lsx_vadd_h(delta1, delta0); + delta2 = __lsx_vsub_h(delta2, delta0); + DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); + DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, + tc_pos, delta1, delta2); + DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, + delta1, delta2); + DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); + DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, + q1_src, q_is_pcm_vec, delta1, delta2); + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2, + q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2, + q0_src, abs_delta0, delta1, delta2, temp0, temp2); + /* weak filter ends */ + + /* pack weak results to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0, + dst3, dst4); + dst5 = __lsx_vpickev_b(q2_src, delta2); + + /* select between weak or strong */ + DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2, + dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2); + + /* pack src to 8 bit */ + DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src, + dst3, dst4); + dst5 = __lsx_vpickev_b(q2_src, q1_src); + + cmp3 = __lsx_vnor_v(cmp3, cmp3); + DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3, + dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3); + + __lsx_vstelm_d(dst0, p2, 0, 0); + __lsx_vstelm_d(dst0, p2 + stride, 0, 1); + __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); + __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); + __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0); + __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1); + } + } +} + +void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride, + int32_t beta, int32_t *tc, + uint8_t *p_is_pcm, uint8_t *q_is_pcm) +{ + ptrdiff_t stride_2x = (stride << 1); + ptrdiff_t stride_4x = (stride << 2); + ptrdiff_t stride_3x = stride_2x + stride; + uint8_t *p3 = src; + uint8_t *p2 = src + stride_3x; + uint8_t *p1 = src + stride_4x; + uint8_t *p0 = src + stride_4x + stride_3x; + uint8_t flag0, flag1; + int32_t dp00, dq00, dp30, dq30, d00, d30; + int32_t d0030, d0434; + int32_t dp04, dq04, dp34, dq34, d04, d34; + int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; + int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; + + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec; + __m128i cmp3; + __m128i temp0, 
temp1; + __m128i temp2; + __m128i tc_pos, tc_neg; + __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0; + __m128i zero = {0}; + __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; + + dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]); + dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]); + dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]); + dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]); + d00 = dp00 + dq00; + d30 = dp30 + dq30; + p_is_pcm0 = p_is_pcm[0]; + q_is_pcm0 = q_is_pcm[0]; + + dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]); + dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]); + dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]); + dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]); + d04 = dp04 + dq04; + d34 = dp34 + dq34; + p_is_pcm4 = p_is_pcm[1]; + q_is_pcm4 = q_is_pcm[1]; + + DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); + + d0030 = (d00 + d30) >= beta; + d0434 = (d04 + d34) >= beta; + + DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1); + cmp3 = __lsx_vpackev_d(cmp1, cmp0); + cmp3 = __lsx_vseqi_d(cmp3, 0); + + if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) && + (!d0030 || !d0434)) { + src -= 4; + DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, + src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src); + src += stride_4x; + DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, + src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src); + src -= stride_4x; + + DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + + tc0 = tc[0]; + beta30 = beta >> 3; + beta20 = beta >> 2; + tc250 = (((tc0 << 2) + tc0 + 1) >> 1); + tc4 = tc[1]; + tc254 = (((tc4 << 2) + tc4 + 1) >> 1); + DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1); + tc_pos = __lsx_vpackev_d(cmp1, cmp0); + LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, + q2_src, q3_src, p3_src, p2_src, p1_src, p0_src, + q0_src, q1_src, q2_src, q3_src); + + flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 && + abs(p3[-1] - p3[0]) < tc250; + flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 && + abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 && + (d30 << 1) < beta20); + cmp0 = __lsx_vreplgr2vr_d(flag0); + DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero, + p0_src, p3_src, p2_src, p1_src, p0_src); + + flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 && + abs(p1[-1] - p1[0]) < tc254; + flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 && + abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 && + (d34 << 1) < beta20); + DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero, + q3_src, q0_src, q1_src, q2_src, q3_src); + + cmp1 = __lsx_vreplgr2vr_d(flag1); + cmp2 = __lsx_vpackev_d(cmp1, cmp0); + cmp2 = __lsx_vseqi_d(cmp2, 0); + + if (flag0 && flag1) { /* strong only */ + /* strong filter */ + tc_neg = __lsx_vneg_h(tc_pos); + /* p part */ + DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(p3_src, p2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst0 = __lsx_vadd_h(temp2, p2_src); + + temp1 = __lsx_vadd_h(temp0, p2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + 
temp2 = __lsx_vsub_h(temp1, p1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst1 = __lsx_vadd_h(temp2, p1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst2 = __lsx_vadd_h(temp2, p0_src); + + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src, + p_is_pcm_vec, dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); + + /* q part */ + DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, + temp0, temp0); + temp1 = __lsx_vadd_h(q3_src, q2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst5 = __lsx_vadd_h(temp2, q2_src); + + temp1 = __lsx_vadd_h(temp0, q2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, q1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst4 = __lsx_vadd_h(temp2, q1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst3 = __lsx_vadd_h(temp2, q0_src); + + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src, + q_is_pcm_vec, dst3, dst4); + dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); + /* strong filter ends */ + } else if (flag0 == flag1) { /* weak only */ + /* weak filter */ + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, + diff0, diff1); + DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, + __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); + delta0 = __lsx_vsub_h(diff0, diff1); + delta0 = __lsx_vsrari_h(delta0, 4); + temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), + __lsx_vslli_h(tc_pos, 1)); + abs_delta0 = __lsx_vadda_h(delta0, zero); + abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + + delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); + temp2 = __lsx_vadd_h(delta0, p0_src); + temp2 = __lsx_vclip255_h(temp2); + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); + + temp2 = __lsx_vsub_h(q0_src, delta0); + temp2 = __lsx_vclip255_h(temp2); + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); + temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); + + tmp = ((beta + (beta >> 1)) >> 3); + DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp), + !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); + + DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp), + (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, + delta1, delta2); + DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, + delta1, delta2); + delta1 = __lsx_vadd_h(delta1, delta0); + delta2 = __lsx_vsub_h(delta2, 
delta0); + DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); + DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, + tc_pos, delta1, delta2); + DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, + delta1, delta2); + DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); + DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, + q1_src, q_is_pcm_vec, delta1, delta2); + + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0, + p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2, + q1_src, abs_delta0, dst0, dst1, dst2, dst3); + /* weak filter ends */ + + cmp3 = __lsx_vnor_v(cmp3, cmp3); + DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src, + cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3, + dst0, dst1, dst2, dst3); + DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1); + + /* transpose */ + dst4 = __lsx_vilvl_b(dst1, dst0); + dst5 = __lsx_vilvh_b(dst1, dst0); + dst0 = __lsx_vilvl_h(dst5, dst4); + dst1 = __lsx_vilvh_h(dst5, dst4); + + src += 2; + __lsx_vstelm_w(dst0, src, 0, 0); + __lsx_vstelm_w(dst0, src + stride, 0, 1); + __lsx_vstelm_w(dst0, src + stride_2x, 0, 2); + __lsx_vstelm_w(dst0, src + stride_3x, 0, 3); + src += stride_4x; + __lsx_vstelm_w(dst1, src, 0, 0); + __lsx_vstelm_w(dst1, src + stride, 0, 1); + __lsx_vstelm_w(dst1, src + stride_2x, 0, 2); + __lsx_vstelm_w(dst1, src + stride_3x, 0, 3); + return; + } else { /* strong + weak */ + /* strong filter */ + tc_neg = __lsx_vneg_h(tc_pos); + + /* p part */ + DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, + temp0, temp0); + + temp1 = __lsx_vadd_h(p3_src, p2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst0 = __lsx_vadd_h(temp2, p2_src); + + temp1 = __lsx_vadd_h(temp0, p2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, p1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst1 = __lsx_vadd_h(temp2, p1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, p0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst2 = __lsx_vadd_h(temp2, p0_src); + + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src, + p_is_pcm_vec, dst0, dst1); + dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); + + /* q part */ + DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0); + temp1 = __lsx_vadd_h(q3_src, q2_src); + temp1 = __lsx_vslli_h(temp1, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q2_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst5 = __lsx_vadd_h(temp2, q2_src); + + temp1 = __lsx_vadd_h(temp0, q2_src); + temp1 = __lsx_vsrari_h(temp1, 2); + temp2 = __lsx_vsub_h(temp1, q1_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst4 = __lsx_vadd_h(temp2, q1_src); + + temp1 = __lsx_vslli_h(temp0, 1); + DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); + temp1 = __lsx_vsrari_h(temp1, 3); + temp2 = __lsx_vsub_h(temp1, q0_src); + temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); + dst3 = __lsx_vadd_h(temp2, q0_src); + + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, 
q_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src, + q_is_pcm_vec, dst3, dst4); + dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); + /* strong filter ends */ + + /* weak filter */ + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, + diff0, diff1); + DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, + __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); + delta0 = __lsx_vsub_h(diff0, diff1); + delta0 = __lsx_vsrari_h(delta0, 4); + + temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), + __lsx_vslli_h(tc_pos, 1)); + abs_delta0 = __lsx_vadda_h(delta0, zero); + abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); + temp2 = __lsx_vadd_h(delta0, p0_src); + temp2 = __lsx_vclip255_h(temp2); + temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); + temp2 = __lsx_vsub_h(q0_src, delta0); + temp2 = __lsx_vclip255_h(temp2); + temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); + + tmp = (beta + (beta >> 1)) >> 3; + DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp), + !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); + + DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp), + (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + tc_pos = __lsx_vsrai_h(tc_pos, 1); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, + delta1, delta2); + DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, + delta1, delta2); + delta1 = __lsx_vadd_h(delta1, delta0); + delta2 = __lsx_vsub_h(delta2, delta0); + DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); + DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, + tc_pos, delta1, delta2); + DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, + delta1, delta2); + DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); + DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, + q1_src, q_is_pcm_vec, delta1, delta2); + + abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); + DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2, + q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2, + q0_src, abs_delta0, delta1, delta2, temp0, temp2); + /* weak filter ends*/ + + /* select between weak or strong */ + DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1, + cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2, + dst0, dst1, dst2, dst3); + DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2, + dst4, dst5); + } + + cmp3 = __lsx_vnor_v(cmp3, cmp3); + DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2, + p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3); + DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3, + dst4, dst5); + + /* pack results to 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5, + dst5, dst0, dst1, dst2, dst3); + + /* transpose */ + DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6); + DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7); + DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2); + DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3); + + src += 1; + __lsx_vstelm_w(dst0, src, 0, 0); + __lsx_vstelm_h(dst2, src, 4, 0); + src += 
stride; + __lsx_vstelm_w(dst0, src, 0, 1); + __lsx_vstelm_h(dst2, src, 4, 2); + src += stride; + + __lsx_vstelm_w(dst0, src, 0, 2); + __lsx_vstelm_h(dst2, src, 4, 4); + src += stride; + __lsx_vstelm_w(dst0, src, 0, 3); + __lsx_vstelm_h(dst2, src, 4, 6); + src += stride; + + __lsx_vstelm_w(dst1, src, 0, 0); + __lsx_vstelm_h(dst3, src, 4, 0); + src += stride; + __lsx_vstelm_w(dst1, src, 0, 1); + __lsx_vstelm_h(dst3, src, 4, 2); + src += stride; + + __lsx_vstelm_w(dst1, src, 0, 2); + __lsx_vstelm_h(dst3, src, 4, 4); + src += stride; + __lsx_vstelm_w(dst1, src, 0, 3); + __lsx_vstelm_h(dst3, src, 4, 6); + } +} + +void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride, + int32_t *tc, uint8_t *p_is_pcm, + uint8_t *q_is_pcm) +{ + uint8_t *p1_ptr = src - (stride << 1); + uint8_t *p0_ptr = src - stride; + uint8_t *q0_ptr = src; + uint8_t *q1_ptr = src + stride; + __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; + __m128i p1, p0, q0, q1; + __m128i tc_pos, tc_neg; + __m128i zero = {0}; + __m128i temp0, temp1, delta; + + if (!(tc[0] <= 0) || !(tc[1] <= 0)) { + DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1); + tc_pos = __lsx_vpackev_d(cmp1, cmp0); + tc_neg = __lsx_vneg_h(tc_pos); + DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); + + DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + + DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0, + p1, p0, q0, q1); + DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1, + p1, p0, q0, q1); + DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1); + temp0 = __lsx_vslli_h(temp0, 2); + temp0 = __lsx_vadd_h(temp0, temp1); + delta = __lsx_vsrari_h(temp0, 3); + delta = __lsx_vclip_h(delta, tc_neg, tc_pos); + temp0 = __lsx_vadd_h(p0, delta); + temp0 = __lsx_vclip255_h(temp0); + p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); + temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec); + + temp1 = __lsx_vsub_h(q0, delta); + temp1 = __lsx_vclip255_h(temp1); + q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); + temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec); + + tc_pos = __lsx_vslei_d(tc_pos, 0); + DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos, + temp0, temp1); + temp0 = __lsx_vpickev_b(temp1, temp0); + __lsx_vstelm_d(temp0, p0_ptr, 0, 0); + __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1); + } +} + +void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride, + int32_t *tc, uint8_t *p_is_pcm, + uint8_t *q_is_pcm) +{ + ptrdiff_t stride_2x = (stride << 1); + ptrdiff_t stride_4x = (stride << 2); + ptrdiff_t stride_3x = stride_2x + stride; + __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i p1, p0, q0, q1; + __m128i tc_pos, tc_neg; + __m128i zero = {0}; + __m128i temp0, temp1, delta; + + if (!(tc[0] <= 0) || !(tc[1] <= 0)) { + DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1); + tc_pos = __lsx_vpackev_d(cmp1, cmp0); + tc_neg = __lsx_vneg_h(tc_pos); + + DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1); + p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); + DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1); + q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); + q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); + + src -= 2; + 
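/*
 * Both chroma deblocking functions (the horizontal one above and this
 * vertical one) apply the same per-column update to eight columns at a
 * time.  A minimal scalar sketch of that update follows; the helper names
 * are illustrative only and the block is kept under #if 0 so it does not
 * affect the build:
 */
#if 0
static int sketch_clip(int v, int lo, int hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

static void sketch_chroma_deblock_col(uint8_t *p1, uint8_t *p0,
                                      uint8_t *q0, uint8_t *q1, int tc)
{
    /* delta = clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc) */
    int delta = sketch_clip(((*q0 - *p0) * 4 + *p1 - *q1 + 4) >> 3, -tc, tc);

    *p0 = sketch_clip(*p0 + delta, 0, 255);
    *q0 = sketch_clip(*q0 - delta, 0, 255);
}
#endif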
DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, + src + stride_3x, 0, src0, src1, src2, src3); + src += stride_4x; + DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, + src + stride_3x, 0, src4, src5, src6, src7); + src -= stride_4x; + LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7, + p1, p0, q0, q1); + DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1, + p1, p0, q0, q1); + + DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1); + temp0 = __lsx_vslli_h(temp0, 2); + temp0 = __lsx_vadd_h(temp0, temp1); + delta = __lsx_vsrari_h(temp0, 3); + delta = __lsx_vclip_h(delta, tc_neg, tc_pos); + + temp0 = __lsx_vadd_h(p0, delta); + temp1 = __lsx_vsub_h(q0, delta); + DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1); + DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec, + q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec); + DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0, + q_is_pcm_vec, temp0, temp1); + + tc_pos = __lsx_vslei_d(tc_pos, 0); + DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos, + temp0, temp1); + temp0 = __lsx_vpackev_b(temp1, temp0); + + src += 1; + __lsx_vstelm_h(temp0, src, 0, 0); + __lsx_vstelm_h(temp0, src + stride, 0, 1); + __lsx_vstelm_h(temp0, src + stride_2x, 0, 2); + __lsx_vstelm_h(temp0, src + stride_3x, 0, 3); + src += stride_4x; + __lsx_vstelm_h(temp0, src, 0, 4); + __lsx_vstelm_h(temp0, src + stride, 0, 5); + __lsx_vstelm_h(temp0, src + stride_2x, 0, 6); + __lsx_vstelm_h(temp0, src + stride_3x, 0, 7); + src -= stride_4x; + } +} + +static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11; + __m128i sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0; + __m128i const1 = __lsx_vldi(1); + __m128i zero = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + src -= 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11); + + for (height -= 2; height; height -= 2) { + src += src_stride_2x; + src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10); + src0 = __lsx_vshuf_b(zero, src_minus10, shuf1); + src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2); + + DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + offset = __lsx_vadd_b(diff_minus10, diff_minus11); + offset = __lsx_vaddi_bu(offset, 2); + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, + src_minus10, src_minus11); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, + sao_offset, sao_offset, offset, offset, offset); + src0 = __lsx_vxori_b(src0, 128); + dst0 = 
__lsx_vsadd_b(src0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); + dst += dst_stride_2x; + } + + src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10); + src0 = __lsx_vshuf_b(zero, src_minus10, shuf1); + src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2); + + DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10, + cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10, + cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + offset = __lsx_vadd_b(diff_minus10, diff_minus11); + offset = __lsx_vaddi_bu(offset, 2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset, + offset, offset, offset); + src0 = __lsx_vxori_b(src0, 128); + dst0 = __lsx_vsadd_b(src0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); +} + +static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11; + __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11; + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i zeros = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + src -= 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11); + + for (height -= 2; height; height -= 2) { + src += src_stride_2x; + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, + src_minus11, shuf1, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, + src_minus11, shuf2, src_plus10, src_plus11); + DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11, + src_plus10, src_minus10, src_plus10); + src0 = __lsx_vpickev_d(src1, src0); + + DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + offset = __lsx_vadd_b(diff_minus10, diff_minus11); + offset = __lsx_vaddi_bu(offset, 2); + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, + src_minus10, src_minus11); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + src0 = __lsx_vxori_b(src0, 128); + dst0 = __lsx_vsadd_b(src0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_d(dst0, dst, 0, 0); + 
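/*
 * Every SAO edge-offset kernel in this file implements the same per-pixel
 * rule; only the positions of the two neighbours change with the EO class
 * (0 degree: left/right, 90 degree: above/below, the remaining two: the
 * diagonals).  A minimal scalar sketch of that rule, with illustrative
 * names, kept under #if 0 so it does not affect the build:
 */
#if 0
static uint8_t sketch_sao_edge_pixel(int neighbour0, int cur, int neighbour1,
                                     const int16_t *sao_offset_val)
{
    /* same values as the edge_idx vector constant above */
    static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };
    int sign0 = (cur > neighbour0) - (cur < neighbour0);
    int sign1 = (cur > neighbour1) - (cur < neighbour1);
    int v     = cur + sao_offset_val[edge_idx[2 + sign0 + sign1]];

    return v < 0 ? 0 : (v > 255 ? 255 : v);
}
#endif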
__lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); + dst += dst_stride_2x; + } + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11, + shuf1, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, + shuf2, src_plus10, src_plus11); + DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11, + src_plus10, src_minus10, src_plus10); + src0 = __lsx_vpickev_d(src1, src0); + + DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10, + cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10, + cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + offset = __lsx_vadd_b(diff_minus10, diff_minus11); + offset = __lsx_vaddi_bu(offset, 2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + src0 = __lsx_vxori_b(src0, 128); + dst0 = __lsx_vsadd_b(src0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); +} + +static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *dst_ptr, *src_minus1; + int32_t v_cnt; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + const int32_t src_stride_4x = (src_stride << 2); + const int32_t dst_stride_4x = (dst_stride << 2); + const int32_t src_stride_3x = src_stride_2x + src_stride; + const int32_t dst_stride_3x = dst_stride_2x + dst_stride; + + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i sao_offset; + __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + __m128i diff_plus13; + __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3; + __m128i src_minus10, src_minus11, src_minus12, src_minus13; + __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3; + __m128i src_zero0, src_zero1, src_zero2, src_zero3; + __m128i src_plus10, src_plus11, src_plus12, src_plus13; + + sao_offset = __lsx_vld(sao_offset_val, 0); + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + for (; height; height -= 4) { + src_minus1 = src - 1; + src_minus10 = __lsx_vld(src_minus1, 0); + DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1, + src_stride_2x, src_minus11, src_minus12); + src_minus13 = __lsx_vldx(src_minus1, src_stride_3x); + + for (v_cnt = 0; v_cnt < width; v_cnt += 16) { + src_minus1 += 16; + dst_ptr = dst + v_cnt; + src10 = __lsx_vld(src_minus1, 0); + DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1, + src_stride_2x, src11, src12); + src13 = __lsx_vldx(src_minus1, src_stride_3x); + DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11, + src_minus11, shuf1, src12, src_minus12, shuf1, src13, + src_minus13, shuf1, src_zero0, src_zero1, + src_zero2, src_zero3); + DUP4_ARG3(__lsx_vshuf_b, src10, 
src_minus10, shuf2, src11, + src_minus11, shuf2, src12, src_minus12, shuf2, src13, + src_minus13, shuf2, src_plus10, src_plus11, + src_plus12, src_plus13); + DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, src_plus11, + cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2, + src_plus12, src_zero3, src_minus13, src_zero3, src_plus13, + cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, diff_minus10, diff_plus10, diff_minus11, + diff_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, diff_minus12, diff_plus12, diff_minus13, + diff_plus13); + DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, src_plus11, + cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2, + src_plus12, src_zero3, src_minus13, src_zero3, src_plus13, + cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_plus10, const1, cmp_plus10, diff_minus11, const1, + cmp_minus11, diff_plus11, const1, cmp_plus11, + diff_minus10, diff_plus10, diff_minus11, diff_plus11); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12, + diff_plus12, const1, cmp_plus12, diff_minus13, const1, + cmp_minus13, diff_plus13, const1, cmp_plus13, + diff_minus12, diff_plus12, diff_minus13, diff_plus13); + + DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11, + diff_plus11, diff_minus12, diff_plus12, diff_minus13, + diff_plus13, offset_mask0, offset_mask1, offset_mask2, + offset_mask3); + DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, + offset_mask2, 2, offset_mask3, 2, offset_mask0, + offset_mask1, offset_mask2, offset_mask3); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0, + sao_offset, sao_offset, offset_mask0, offset_mask0, + offset_mask0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1, + sao_offset, sao_offset, offset_mask1, offset_mask1, + offset_mask1); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2, + sao_offset, sao_offset, offset_mask2, offset_mask2, + offset_mask2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3, + sao_offset, sao_offset, offset_mask3, offset_mask3, + offset_mask3); + + DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, + src_zero2, 128, src_zero3, 128, src_zero0, src_zero1, + src_zero2, src_zero3); + DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1, + offset_mask1, src_zero2, offset_mask2, src_zero3, + offset_mask3, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, + 128, dst0, dst1, dst2, dst3); + + src_minus10 = src10; + src_minus11 = src11; + src_minus12 = src12; + src_minus13 = src13; + + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr + dst_stride, 0); + __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0); + 
__lsx_vst(dst3, dst_ptr + dst_stride_3x, 0); + } + src += src_stride_4x; + dst += dst_stride_4x; + } +} + +static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i dst0; + __m128i sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + __m128i src_minus10, src_minus11, src10, src11; + __m128i src_zero0, src_zero1; + __m128i offset; + __m128i offset_mask0, offset_mask1; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + /* load in advance */ + DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0, + src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11); + + for (height -= 2; height; height -= 2) { + src += src_stride_2x; + DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, + src11, src_minus11, src10, src10, src_minus10, src_zero0, + src_minus11, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, + src10, src11); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); + dst += dst_stride_2x; + } + + DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, + src11, src_minus11, src10, src10, src_minus10, src_zero0, + src_minus11, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + 
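/*
 * Note on the 90-degree kernels: the vilvl_b calls above interleave the
 * rows above and below into one vector and duplicate the centre row into
 * another, so a single compare handles both neighbours of each pixel.  The
 * per-byte signs come out as 0x00, 0x01 or 0xff; vhaddw_hu_bu then widens
 * and adds each byte pair, the vaddi adds the bias of 2, and because
 * vpickev_b keeps only the low byte of every halfword the 0xff bytes act
 * as -1 modulo 256, leaving exactly 2 + sign(c - above) + sign(c - below)
 * as the table index.
 */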
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); +} + +static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i src_zero0, src_zero1, dst0; + __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + __m128i src_minus10, src_minus11, src10, src11; + __m128i offset_mask0, offset_mask1; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11); + + for (height -= 2; height; height -= 2) { + src += src_stride_2x; + DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, + src11, src_minus11, src10, src10, src_minus10, src_zero0, + src_minus11, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, + src10, src11); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); + dst += dst_stride_2x; + } + + DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, + src11, src_minus11, src10, src10, src_minus10, src_zero0, + src_minus11, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + 
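/*
 * The vxori_b(x, 128) / vsadd_b / vxori_b(x, 128) sequence used in this and
 * every other SAO kernel of the file performs the final clip: xoring with
 * 128 re-biases the unsigned pixels into signed bytes, vsadd_b adds the
 * signed offsets with saturation, and the second xor restores the unsigned
 * range, which is equivalent to av_clip_uint8(pixel + offset).
 */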
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); +} + +static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t * + sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig = src; + uint8_t *dst_orig = dst; + int32_t h_cnt, v_cnt; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + const int32_t src_stride_4x = (src_stride << 2); + const int32_t dst_stride_4x = (dst_stride << 2); + const int32_t src_stride_3x = src_stride_2x + src_stride; + const int32_t dst_stride_3x = dst_stride_2x + dst_stride; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + __m128i diff_plus13; + __m128i src10, src_minus10, dst0, src11, src_minus11, dst1; + __m128i src12, dst2, src13, dst3; + __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset; + + sao_offset = __lsx_vld(sao_offset_val, 0); + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + for (v_cnt = 0; v_cnt < width; v_cnt += 16) { + src = src_orig + v_cnt; + dst = dst_orig + v_cnt; + + DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, + src_minus10, src_minus11); + + for (h_cnt = (height >> 2); h_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, + src, src_stride_3x, src, src_stride_4x, + src10, src11, src12, src13); + DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11, + src10, src10, src_minus11, src10, src11, cmp_minus10, + cmp_plus10, cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11, + src12, src13, cmp_minus12, cmp_plus12, + cmp_minus13, cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, diff_minus10, diff_plus10, diff_minus11, + diff_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, diff_minus12, diff_plus12, diff_minus13, + diff_plus13); + DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11, + src10, src10, src_minus11, src10, src11, cmp_minus10, + cmp_plus10, cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11, + src12, src13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_plus10, const1, cmp_plus10, diff_minus11, const1, + cmp_minus11, diff_plus11, const1, 
cmp_plus11, + diff_minus10, diff_plus10, diff_minus11, diff_plus11); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12, + diff_plus12, const1, cmp_plus12, diff_minus13, const1, + cmp_minus13, diff_plus13, const1, cmp_plus13, + diff_minus12, diff_plus12, diff_minus13, diff_plus13); + + DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11, + diff_plus11, diff_minus12, diff_plus12, diff_minus13, + diff_plus13, offset_mask0, offset_mask1, offset_mask2, + offset_mask3); + DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, + offset_mask2, 2, offset_mask3, 2, offset_mask0, + offset_mask1, offset_mask2, offset_mask3); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0, + sao_offset, sao_offset, offset_mask0,\ + offset_mask0, offset_mask0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1, + sao_offset, sao_offset, offset_mask1, offset_mask1, + offset_mask1); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2, + sao_offset, sao_offset, offset_mask2, offset_mask2, + offset_mask2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3, + sao_offset, sao_offset, offset_mask3, offset_mask3, + offset_mask3); + + src_minus10 = src12; + DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128, + src12, 128, src_minus11, src10, src11, src12); + DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10, + offset_mask1, src11, offset_mask2, src12, + offset_mask3, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, + 128, dst0, dst1, dst2, dst3); + src_minus11 = src13; + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride_2x); + __lsx_vstx(dst3, dst, dst_stride_3x); + src += src_stride_4x; + dst += dst_stride_4x; + } + } +} + +static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11; + __m128i src_minus11, src10, src11; + __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0; + __m128i offset_mask0, offset_mask1; + __m128i zeros = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + src_orig = src - 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + for (height -= 2; height; height -= 2) { + src_orig += src_stride_2x; + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, + shuf1, src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2, + src_plus0, src_plus1); + + DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, + src_minus11, src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, + src_zero1, src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); 
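/*
 * The vseq_b/vnor_v pair above and the vsle_bu/vnor_v plus
 * vbitsel_v(..., const1, ...) below form a branch-free sign(centre -
 * neighbour) per byte: vnor of vseq_b gives 0xff (-1) where the two pixels
 * differ and 0x00 where they match, vnor of vsle_bu marks the lanes where
 * centre > neighbour, and the bitsel overwrites exactly those lanes with
 * +1, so each diff_* byte ends up holding -1, 0 or +1.
 */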
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); + dst += dst_stride_2x; + } + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1, + src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2, + src_plus0, src_plus1); + + DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0, + offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); +} + +static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + __m128i src_minus10, src10, src_minus11, src11; + __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0; + __m128i offset_mask0, offset_mask1; + __m128i zeros = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + 
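/*
 * Diagonal neighbour addressing in the kernels below: the rows are loaded
 * from src - 1, shuf1 re-aligns a loaded row back onto column x (bytes
 * 1..16 of the load) and shuf2 shifts it onto column x + 1 (bytes 2..17).
 * The kernels named 45degree pair the un-shifted row above (column x - 1)
 * with the shuf2-shifted row below (column x + 1); the 135degree kernels
 * mirror this, pairing the shuf2-shifted row above with the un-shifted row
 * below.
 */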
src_orig = src - 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10, + src_minus11); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + for (height -= 2; height; height -= 2) { + src_orig += src_stride_2x; + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, + shuf1, src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2, + src_plus10, src_plus11); + + DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, + src_minus11, src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11) + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); + dst += dst_stride_2x; + } + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1, + src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2, + src_plus10, src_plus11); + DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0, + offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + src_minus10 = src10; 
+ src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); +} + +static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t * + sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig = src; + uint8_t *dst_orig = dst; + int32_t v_cnt; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + const int32_t src_stride_4x = (src_stride << 2); + const int32_t dst_stride_4x = (dst_stride << 2); + const int32_t src_stride_3x = src_stride_2x + src_stride; + const int32_t dst_stride_3x = dst_stride_2x + dst_stride; + + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + __m128i diff_plus13, src_minus14, src_plus13; + __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3; + __m128i src10, src_minus10, dst0, src11, src_minus11, dst1; + __m128i src12, src_minus12, dst2, src13, src_minus13, dst3; + __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2; + __m128i src_zero3, sao_offset, src_plus12; + + sao_offset = __lsx_vld(sao_offset_val, 0); + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + for (; height; height -= 4) { + src_orig = src - 1; + dst_orig = dst; + src_minus11 = __lsx_vld(src_orig, 0); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src_minus12, src_minus13); + src_minus14 = __lsx_vldx(src_orig, src_stride_3x); + + for (v_cnt = 0; v_cnt < width; v_cnt += 16) { + src_minus10 = __lsx_vld(src_orig - src_stride, 0); + src_orig += 16; + src10 = __lsx_vld(src_orig, 0); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, + src_stride_2x, src11, src12); + src13 = __lsx_vldx(src_orig, src_stride_3x); + src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1); + + DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11, + src_minus12, shuf1, src12, src_minus13, shuf1, + src13, src_minus14, shuf1, src_zero0, src_zero1, + src_zero2, src_zero3); + DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12, + src_minus13, shuf2, src_plus10, src_plus11); + src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2); + + DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, + src_plus11, cmp_minus10, cmp_plus10, + cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2, + src_plus12, src_zero3, src_minus13, src_zero3, + src_plus13, cmp_minus12, cmp_plus12, + cmp_minus13, cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, diff_minus10, diff_plus10, diff_minus11, + diff_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, diff_minus12, diff_plus12, diff_minus13, + diff_plus13); + DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, + src_plus11, cmp_minus10, cmp_plus10, 
cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2, + src_plus12, src_zero3, src_minus13, src_zero3, + src_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_plus10, const1, cmp_plus10, diff_minus11, const1, + cmp_minus11, diff_plus11, const1, cmp_plus11, + diff_minus10, diff_plus10, diff_minus11, diff_plus11); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12, + diff_plus12, const1, cmp_plus12, diff_minus13, const1, + cmp_minus13, diff_plus13, const1, cmp_plus13, + diff_minus12, diff_plus12, diff_minus13, diff_plus13); + + DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11, + diff_plus11, diff_minus12, diff_plus12, diff_minus13, + diff_plus13, offset_mask0, offset_mask1, offset_mask2, + offset_mask3); + DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, + offset_mask2, 2, offset_mask3, 2, offset_mask0, + offset_mask1, offset_mask2, offset_mask3); + + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0, + sao_offset, sao_offset, offset_mask0, offset_mask0, + offset_mask0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1, + sao_offset, sao_offset, offset_mask1, offset_mask1, + offset_mask1); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2, + sao_offset, sao_offset, offset_mask2, offset_mask2, + offset_mask2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3, + sao_offset, sao_offset, offset_mask3, offset_mask3, + offset_mask3); + + DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2, + 128, src_zero3, 128, src_zero0, src_zero1, src_zero2, + src_zero3); + DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1, + offset_mask1, src_zero2, offset_mask2, src_zero3, + offset_mask3, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, + 128, dst0, dst1, dst2, dst3); + + src_minus11 = src10; + src_minus12 = src11; + src_minus13 = src12; + src_minus14 = src13; + + __lsx_vst(dst0, dst_orig, 0); + __lsx_vstx(dst1, dst_orig, dst_stride); + __lsx_vstx(dst2, dst_orig, dst_stride_2x); + __lsx_vstx(dst3, dst_orig, dst_stride_3x); + dst_orig += 16; + } + src += src_stride_4x; + dst += dst_stride_4x; + } +} + +static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i src_zero0, src_zero1, dst0; + __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + __m128i src_minus10, src10, src_minus11, src11; + __m128i offset_mask0, offset_mask1; + __m128i zeros = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + src_orig = src - 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src_orig - 
src_stride, 0, src_orig, 0, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + for (height -= 2; height; height -= 2) { + src_orig += src_stride_2x; + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, + shuf1, src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, + shuf2, src_minus10, src_minus11); + + DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); + dst += dst_stride_2x; + } + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1, + src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, + shuf2, src_minus10, src_minus11); + + DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0, + offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_w(dst0, dst, 0, 0); + __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); + dst += 
dst_stride_2x; +} + +static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); + __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + __m128i src_minus10, src10, src_minus11, src11; + __m128i src_zero0, src_zero1, dst0; + __m128i offset_mask0, offset_mask1; + __m128i zeros = {0}; + + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + src_orig = src - 1; + + /* load in advance */ + DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + for (height -= 2; height; height -= 2) { + src_orig += src_stride_2x; + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, + shuf1, src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, + shuf2, src_minus10, src_minus11); + + DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, + src_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, + cmp_minus11, cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, + offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + src_minus10 = src10; + src_minus11 = src11; + + /* load in advance */ + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src10, src11); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); + dst += dst_stride_2x; + } + + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1, + src_zero0, src_zero1); + DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, + shuf2, src_minus10, src_minus11); + + DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11, + src_minus10, src_minus11); + DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + diff_minus10, diff_minus11); + DUP2_ARG2(__lsx_vsle_bu, 
src_zero0, src_minus10, src_zero1, src_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, + cmp_minus10, cmp_minus11); + DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, + const1, cmp_minus11, diff_minus10, diff_minus11); + + DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, + diff_minus11, offset_mask0, offset_mask1); + DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0, + offset_mask1); + DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, + src_zero0, offset, dst0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, + sao_offset, offset, offset, offset); + dst0 = __lsx_vxori_b(dst0, 128); + dst0 = __lsx_vsadd_b(dst0, offset); + dst0 = __lsx_vxori_b(dst0, 128); + + __lsx_vstelm_d(dst0, dst, 0, 0); + __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); +} + +static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig, *dst_orig; + int32_t v_cnt; + const int32_t src_stride_2x = (src_stride << 1); + const int32_t dst_stride_2x = (dst_stride << 1); + const int32_t src_stride_4x = (src_stride << 2); + const int32_t dst_stride_4x = (dst_stride << 2); + const int32_t src_stride_3x = src_stride_2x + src_stride; + const int32_t dst_stride_3x = dst_stride_2x + dst_stride; + + __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; + __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; + __m128i edge_idx = {0x403000201, 0x0}; + __m128i const1 = __lsx_vldi(1); + __m128i dst0, dst1, dst2, dst3; + __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10; + __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11; + __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12; + __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11; + __m128i src_plus10, src_plus11, src_plus12, src_plus13; + __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3; + __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset; + + sao_offset = __lsx_vld(sao_offset_val, 0); + sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); + + for (; height; height -= 4) { + src_orig = src - 1; + dst_orig = dst; + + src_minus11 = __lsx_vld(src_orig, 0); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src_plus10, src_plus11); + src_plus12 = __lsx_vldx(src_orig, src_stride_3x); + + for (v_cnt = 0; v_cnt < width; v_cnt += 16) { + src_minus10 = __lsx_vld(src_orig - src_stride, 2); + src_plus13 = __lsx_vldx(src_orig, src_stride_4x); + src_orig += 16; + src10 = __lsx_vld(src_orig, 0); + DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x, + src11, src12); + src13 =__lsx_vldx(src_orig, src_stride_3x); + + DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11, + src_plus10, shuf1, src12, src_plus11, shuf1, src13, + src_plus12, shuf1, src_zero0, src_zero1, src_zero2, + src_zero3); + src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2); + DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12, + src_plus11, shuf2, src_minus12, src_minus13); + + DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, + src_plus11, cmp_minus10, cmp_plus10, cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2, + 
src_plus12, src_zero3, src_minus13, src_zero3, + src_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, diff_minus10, diff_plus10, diff_minus11, + diff_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, diff_minus12, diff_plus12, diff_minus13, + diff_plus13); + DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0, + src_plus10, src_zero1, src_minus11, src_zero1, src_plus11, + cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11); + DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2, + src_plus12, src_zero3, src_minus13, src_zero3, src_plus13, + cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13); + DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, + cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, + cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11, + cmp_plus11); + DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, + cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, + cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13, + cmp_plus13); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, + diff_plus10, const1, cmp_plus10, diff_minus11, const1, + cmp_minus11, diff_plus11, const1, cmp_plus11, + diff_minus10, diff_plus10, diff_minus11, diff_plus11); + DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12, + diff_plus12, const1, cmp_plus12, diff_minus13, const1, + cmp_minus13, diff_plus13, const1, cmp_plus13, + diff_minus12, diff_plus12, diff_minus13, diff_plus13); + + DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11, + diff_plus11, diff_minus12, diff_plus12, diff_minus13, + diff_plus13, offset_mask0, offset_mask1, offset_mask2, + offset_mask3); + DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, + offset_mask2, 2, offset_mask3, 2, offset_mask0, + offset_mask1, offset_mask2, offset_mask3); + + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0, + sao_offset, sao_offset, offset_mask0, offset_mask0, + offset_mask0); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1, + sao_offset, sao_offset, offset_mask1, offset_mask1, + offset_mask1); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2, + sao_offset, sao_offset, offset_mask2, offset_mask2, + offset_mask2); + DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3, + sao_offset, sao_offset, offset_mask3, offset_mask3, + offset_mask3); + + DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, + src_zero2, 128, src_zero3, 128, src_zero0, src_zero1, + src_zero2, src_zero3); + DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1, + offset_mask1, src_zero2, offset_mask2, src_zero3, + offset_mask3, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, + 128, dst0, dst1, dst2, dst3); + + src_minus11 = src10; + src_plus10 = src11; + src_plus11 = src12; + src_plus12 = src13; + + __lsx_vst(dst0, dst_orig, 0); + __lsx_vstx(dst1, dst_orig, dst_stride); + __lsx_vstx(dst2, dst_orig, dst_stride_2x); + __lsx_vstx(dst3, dst_orig, dst_stride_3x); + dst_orig += 16; + } + + src += src_stride_4x; + dst += dst_stride_4x; + } +} + +void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride_dst, + int16_t *sao_offset_val, + int eo, int width, int height) +{ + ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE); + + switch (eo) { + case 0: + if (width >> 4) { + 
            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        width - (width & 0x0F),
+                                                        height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1:
+        if (width >> 4) {
+            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width & 0x0F),
+                                                         height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2:
+        if (width >> 4) {
+            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width & 0x0F),
+                                                         height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3:
+        if (width >> 4) {
+            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          width - (width & 0x0F),
+                                                          height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index fc0e8fb0df..f39674be64 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -85,6 +85,25 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
             c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
             c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+            c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->idct[0] = ff_hevc_idct_4x4_lsx;
+            c->idct[1] = ff_hevc_idct_8x8_lsx;
+            c->idct[2] = ff_hevc_idct_16x16_lsx;
+            c->idct[3] = ff_hevc_idct_32x32_lsx;
         }
     }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3259e03f13..0e73fd1f8e 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,4 +85,30 @@ MC(epel, hv, 32);
 
 #undef MC
 
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm);
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
 #endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
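
For reference, every fixed-width SAO kernel in this patch implements the same per-sample rule from the HEVC edge-offset stage: the sample is compared with its two neighbours along the selected direction, the two sign results are summed and remapped through the {1, 2, 0, 3, 4} table (the edge_idx vector), and the selected sao_offset_val entry is added with saturation; the vxori 128 / vsadd_b / vxori 128 sequence carries out that unsigned saturating add with signed byte instructions. The scalar sketch below shows the rule in plain C purely for orientation; the helper names, the neighbour-offset parameters and the explicit clip are assumptions of this sketch, not code from the patch.

    #include <stdint.h>

    /* -1, 0 or +1 depending on how sample a compares with neighbour b */
    static inline int sign3(int a, int b)
    {
        return (a > b) - (a < b);
    }

    /* One row of HEVC SAO edge offset for 8-bit samples (illustrative only).
     * n0_off/n1_off select the direction; for the 135-degree class in this
     * file's naming they would be (-stride + 1) and (stride - 1). */
    static void sao_edge_row_ref(uint8_t *dst, const uint8_t *src,
                                 int n0_off, int n1_off,
                                 const int16_t *sao_offset_val, int width)
    {
        static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };

        for (int x = 0; x < width; x++) {
            int idx = 2 + sign3(src[x], src[x + n0_off])
                        + sign3(src[x], src[x + n1_off]);
            int v   = src[x] + sao_offset_val[edge_idx[idx]];
            dst[x]  = v < 0 ? 0 : v > 255 ? 255 : v;   /* clip to 8-bit range */
        }
    }

ff_hevc_sao_edge_filter_8_lsx then only has to peel the requested width into 16-sample strips, one 8-sample strip and a 4-sample remainder so that each piece goes to the matching fixed-width kernel; stride_src is the fixed stride of the SAO temporary buffer (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE), as set at the top of the dispatcher.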