author     Shiyou Yin <yinshiyou-hf@loongson.cn>         2023-05-25 15:24:26 +0800
committer  Michael Niedermayer <michael@niedermayer.cc>  2023-05-25 21:04:25 +0200
commit     e1b6ecd20a01c5d902cd89c034b960ddadea778e (patch)
tree       bbf79db79a2500f049492b4df3dfee60dbeb101c
parent     90fba27743041a896bde618d99633c418ebc2a45 (diff)
download   ffmpeg-e1b6ecd20a01c5d902cd89c034b960ddadea778e.tar.gz
avcodec/la: add LSX optimization for h264 idct.
loongson_asm.S is a LoongArch asm optimization helper.

Add functions:
    ff_h264_idct_add_8_lsx
    ff_h264_idct8_add_8_lsx
    ff_h264_idct_dc_add_8_lsx
    ff_h264_idct8_dc_add_8_lsx
    ff_h264_idct_add16_8_lsx
    ff_h264_idct8_add4_8_lsx
    ff_h264_idct_add8_8_lsx
    ff_h264_idct_add8_422_8_lsx
    ff_h264_idct_add16_intra_8_lsx
    ff_h264_luma_dc_dequant_idct_8_lsx

Replaced functions (LSX is sufficient for these functions):
    ff_h264_idct_add_lasx
    ff_h264_idct4x4_addblk_dc_lasx
    ff_h264_idct_add16_lasx
    ff_h264_idct8_add4_lasx
    ff_h264_idct_add8_lasx
    ff_h264_idct_add8_422_lasx
    ff_h264_idct_add16_intra_lasx
    ff_h264_deq_idct_luma_dc_lasx

Renamed functions:
    ff_h264_idct8_addblk_lasx    ==> ff_h264_idct8_add_8_lasx
    ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 155fps
after:  161fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r--  libavcodec/loongarch/Makefile                                                       3
-rw-r--r--  libavcodec/loongarch/h264_deblock_lasx.c                                            2
-rw-r--r--  libavcodec/loongarch/h264dsp_init_loongarch.c                                      39
-rw-r--r--  libavcodec/loongarch/h264dsp_lasx.c                                                 2
-rw-r--r--  libavcodec/loongarch/h264dsp_loongarch.h (renamed from libavcodec/loongarch/h264dsp_lasx.h)  60
-rw-r--r--  libavcodec/loongarch/h264idct.S                                                   658
-rw-r--r--  libavcodec/loongarch/h264idct_lasx.c                                              498
-rw-r--r--  libavcodec/loongarch/h264idct_loongarch.c                                         184
-rw-r--r--  libavcodec/loongarch/loongson_asm.S                                               945
9 files changed, 1848 insertions, 543 deletions
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index c1b5de5c44..34ebbbe133 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -12,7 +12,6 @@ OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
- loongarch/h264idct_lasx.o \
loongarch/h264_deblock_lasx.o
LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
@@ -31,3 +30,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_bi_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o
+LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
+ loongarch/h264idct_loongarch.o
diff --git a/libavcodec/loongarch/h264_deblock_lasx.c b/libavcodec/loongarch/h264_deblock_lasx.c
index c89bea9a84..eead931dcf 100644
--- a/libavcodec/loongarch/h264_deblock_lasx.c
+++ b/libavcodec/loongarch/h264_deblock_lasx.c
@@ -20,7 +20,7 @@
*/
#include "libavcodec/bit_depth_template.c"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index 37633c3e51..cb07deb398 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -21,13 +21,32 @@
*/
#include "libavutil/loongarch/cpu.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->h264_idct_add = ff_h264_idct_add_8_lsx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_lsx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+
+ if (chroma_format_idc <= 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
+ else
+ c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
+ c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
+ c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+ }
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
if (chroma_format_idc <= 1)
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
@@ -56,20 +75,10 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
- c->h264_idct_add = ff_h264_idct_add_lasx;
- c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
- c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_lasx;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_lasx;
- c->h264_idct_add16 = ff_h264_idct_add16_lasx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_idct_add8 = ff_h264_idct_add8_lasx;
- else
- c->h264_idct_add8 = ff_h264_idct_add8_422_lasx;
-
- c->h264_idct_add16intra = ff_h264_idct_add16_intra_lasx;
- c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_lasx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lasx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_lasx;
}
}
+#endif // #if HAVE_LASX
}
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7fd4cedf7e..7b2b8ff0f0 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -23,7 +23,7 @@
*/
#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
p1_or_q1_org_in, p2_or_q2_org_in, \
diff --git a/libavcodec/loongarch/h264dsp_lasx.h b/libavcodec/loongarch/h264dsp_loongarch.h
index 4cf813750b..28dca2b537 100644
--- a/libavcodec/loongarch/h264dsp_lasx.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
* Xiwei Gu <guxiwei-hf@loongson.cn>
*
@@ -20,11 +20,34 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
-#define AVCODEC_LOONGARCH_H264DSP_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
#include "libavcodec/h264dec.h"
+#include "config.h"
+void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+
+#if HAVE_LASX
void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -65,33 +88,16 @@ void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride);
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
+void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
int32_t dst_stride);
-void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
- int32_t de_qval);
-
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
+#endif // #if HAVE_LASX
-#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
+#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264idct.S b/libavcodec/loongarch/h264idct.S
new file mode 100644
index 0000000000..f504cfb714
--- /dev/null
+++ b/libavcodec/loongarch/h264idct.S
@@ -0,0 +1,658 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
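+/*
+ * For reference, a rough scalar sketch of the 4x4 transform performed
+ * below (a simplified model of the 8-bit case, not the exact FFmpeg C
+ * template: rounding is folded into the final shift, clip8() is a local
+ * helper clamping to [0, 255], and <stdint.h>/<string.h> are assumed):
+ *
+ *   static int clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
+ *
+ *   static void idct4x4_add_sketch(uint8_t *dst, int16_t *blk, int stride)
+ *   {
+ *       int tmp[16], i;
+ *       for (i = 0; i < 4; i++) {                // pass 1: columns
+ *           int z0 =  blk[i + 4*0]       +  blk[i + 4*2];
+ *           int z1 =  blk[i + 4*0]       -  blk[i + 4*2];
+ *           int z2 = (blk[i + 4*1] >> 1) -  blk[i + 4*3];
+ *           int z3 =  blk[i + 4*1]       + (blk[i + 4*3] >> 1);
+ *           tmp[i + 4*0] = z0 + z3;
+ *           tmp[i + 4*1] = z1 + z2;
+ *           tmp[i + 4*2] = z1 - z2;
+ *           tmp[i + 4*3] = z0 - z3;
+ *       }
+ *       for (i = 0; i < 4; i++) {                // pass 2: rows, add to dst
+ *           int z0 =  tmp[4*i + 0]       +  tmp[4*i + 2];
+ *           int z1 =  tmp[4*i + 0]       -  tmp[4*i + 2];
+ *           int z2 = (tmp[4*i + 1] >> 1) -  tmp[4*i + 3];
+ *           int z3 =  tmp[4*i + 1]       + (tmp[4*i + 3] >> 1);
+ *           dst[i*stride + 0] = clip8(dst[i*stride + 0] + ((z0 + z3 + 32) >> 6));
+ *           dst[i*stride + 1] = clip8(dst[i*stride + 1] + ((z1 + z2 + 32) >> 6));
+ *           dst[i*stride + 2] = clip8(dst[i*stride + 2] + ((z1 - z2 + 32) >> 6));
+ *           dst[i*stride + 3] = clip8(dst[i*stride + 3] + ((z0 - z3 + 32) >> 6));
+ *       }
+ *       memset(blk, 0, 16 * sizeof(*blk));       // coefficients are cleared, as in the asm
+ *   }
+ */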
+function ff_h264_idct_add_8_lsx
+ fld.d f0, a1, 0
+ fld.d f1, a1, 8
+ fld.d f2, a1, 16
+ fld.d f3, a1, 24
+ vxor.v vr7, vr7, vr7
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ vst vr7, a1, 0
+ vst vr7, a1, 16
+
+ vadd.h vr4, vr0, vr2
+ vsub.h vr5, vr0, vr2
+ vsrai.h vr6, vr1, 1
+ vsrai.h vr7, vr3, 1
+ vsub.h vr6, vr6, vr3
+ vadd.h vr7, vr1, vr7
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+ LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
+ vadd.h vr4, vr0, vr2
+ vsub.h vr5, vr0, vr2
+ vsrai.h vr6, vr1, 1
+ vsrai.h vr7, vr3, 1
+ vsub.h vr6, vr6, vr3
+ vadd.h vr7, vr1, vr7
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+
+ fld.s f4, a0, 0
+ fldx.s f5, a0, a2
+ fldx.s f6, a0, t2
+ fldx.s f7, a0, t3
+
+ vsrari.h vr0, vr0, 6
+ vsrari.h vr1, vr1, 6
+ vsrari.h vr2, vr2, 6
+ vsrari.h vr3, vr3, 6
+
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vadd.h vr0, vr0, vr4
+ vadd.h vr1, vr1, vr5
+ vadd.h vr2, vr2, vr6
+ vadd.h vr3, vr3, vr7
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ fst.s f1, a0, 0
+ fstx.s f0, a0, a2
+ fstx.s f3, a0, t2
+ fstx.s f2, a0, t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lsx
+ ld.h t0, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+ addi.w t0, t0, 32
+ st.h t0, a1, 0
+
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
+ vld vr4, a1, 64
+ vld vr5, a1, 80
+ vld vr6, a1, 96
+ vld vr7, a1, 112
+ vxor.v vr8, vr8, vr8
+ vst vr8, a1, 0
+ vst vr8, a1, 16
+ vst vr8, a1, 32
+ vst vr8, a1, 48
+ vst vr8, a1, 64
+ vst vr8, a1, 80
+ vst vr8, a1, 96
+ vst vr8, a1, 112
+
+ vadd.h vr18, vr0, vr4
+ vsub.h vr19, vr0, vr4
+ vsrai.h vr20, vr2, 1
+ vsrai.h vr21, vr6, 1
+ vsub.h vr20, vr20, vr6
+ vadd.h vr21, vr21, vr2
+ LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
+ vsrai.h vr11, vr7, 1
+ vsrai.h vr13, vr3, 1
+ vsrai.h vr15, vr5, 1
+ vsrai.h vr17, vr1, 1
+ vsub.h vr11, vr5, vr11
+ vsub.h vr13, vr7, vr13
+ vadd.h vr15, vr7, vr15
+ vadd.h vr17, vr5, vr17
+ vsub.h vr11, vr11, vr7
+ vsub.h vr13, vr13, vr3
+ vadd.h vr15, vr15, vr5
+ vadd.h vr17, vr17, vr1
+ vsub.h vr11, vr11, vr3
+ vadd.h vr13, vr13, vr1
+ vsub.h vr15, vr15, vr1
+ vadd.h vr17, vr17, vr3
+ vsrai.h vr18, vr11, 2
+ vsrai.h vr19, vr13, 2
+ vsrai.h vr20, vr15, 2
+ vsrai.h vr21, vr17, 2
+ vadd.h vr11, vr11, vr21
+ vadd.h vr13, vr13, vr20
+ vsub.h vr15, vr19, vr15
+ vsub.h vr17, vr17, vr18
+ LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+ vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+ vexth.w.h vr20, vr0
+ vexth.w.h vr21, vr1
+ vexth.w.h vr22, vr2
+ vexth.w.h vr23, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr18, vr6
+ vexth.w.h vr19, vr7
+ vsllwil.w.h vr0, vr0, 0
+ vsllwil.w.h vr1, vr1, 0
+ vsllwil.w.h vr2, vr2, 0
+ vsllwil.w.h vr3, vr3, 0
+ vsllwil.w.h vr4, vr4, 0
+ vsllwil.w.h vr5, vr5, 0
+ vsllwil.w.h vr6, vr6, 0
+ vsllwil.w.h vr7, vr7, 0
+
+ vadd.w vr11, vr0, vr4
+ vsub.w vr13, vr0, vr4
+ vsrai.w vr15, vr2, 1
+ vsrai.w vr17, vr6, 1
+ vsub.w vr15, vr15, vr6
+ vadd.w vr17, vr17, vr2
+ LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
+ vsrai.w vr11, vr7, 1
+ vsrai.w vr13, vr3, 1
+ vsrai.w vr15, vr5, 1
+ vsrai.w vr17, vr1, 1
+ vsub.w vr11, vr5, vr11
+ vsub.w vr13, vr7, vr13
+ vadd.w vr15, vr7, vr15
+ vadd.w vr17, vr5, vr17
+ vsub.w vr11, vr11, vr7
+ vsub.w vr13, vr13, vr3
+ vadd.w vr15, vr15, vr5
+ vadd.w vr17, vr17, vr1
+ vsub.w vr11, vr11, vr3
+ vadd.w vr13, vr13, vr1
+ vsub.w vr15, vr15, vr1
+ vadd.w vr17, vr17, vr3
+ vsrai.w vr0, vr11, 2
+ vsrai.w vr1, vr13, 2
+ vsrai.w vr2, vr15, 2
+ vsrai.w vr3, vr17, 2
+ vadd.w vr11, vr11, vr3
+ vadd.w vr13, vr13, vr2
+ vsub.w vr15, vr1, vr15
+ vsub.w vr17, vr17, vr0
+ LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ vadd.w vr11, vr20, vr8
+ vsub.w vr13, vr20, vr8
+ vsrai.w vr15, vr22, 1
+ vsrai.w vr17, vr18, 1
+ vsub.w vr15, vr15, vr18
+ vadd.w vr17, vr17, vr22
+ LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
+ vsrai.w vr11, vr19, 1
+ vsrai.w vr13, vr23, 1
+ vsrai.w vr15, vr9, 1
+ vsrai.w vr17, vr21, 1
+ vsub.w vr11, vr9, vr11
+ vsub.w vr13, vr19, vr13
+ vadd.w vr15, vr19, vr15
+ vadd.w vr17, vr9, vr17
+ vsub.w vr11, vr11, vr19
+ vsub.w vr13, vr13, vr23
+ vadd.w vr15, vr15, vr9
+ vadd.w vr17, vr17, vr21
+ vsub.w vr11, vr11, vr23
+ vadd.w vr13, vr13, vr21
+ vsub.w vr15, vr15, vr21
+ vadd.w vr17, vr17, vr23
+ vsrai.w vr20, vr11, 2
+ vsrai.w vr21, vr13, 2
+ vsrai.w vr22, vr15, 2
+ vsrai.w vr23, vr17, 2
+ vadd.w vr11, vr11, vr23
+ vadd.w vr13, vr13, vr22
+ vsub.w vr15, vr21, vr15
+ vsub.w vr17, vr17, vr20
+ LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+ vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
+
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t2
+ vldx vr13, a0, t3
+ vldx vr14, a0, t4
+ vldx vr15, a0, t5
+ vldx vr16, a0, t6
+ vldx vr17, a0, t7
+ vsrani.h.w vr20, vr0, 6
+ vsrani.h.w vr21, vr1, 6
+ vsrani.h.w vr22, vr2, 6
+ vsrani.h.w vr23, vr3, 6
+ vsrani.h.w vr8, vr4, 6
+ vsrani.h.w vr9, vr5, 6
+ vsrani.h.w vr18, vr6, 6
+ vsrani.h.w vr19, vr7, 6
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr11, vr11, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vsllwil.hu.bu vr13, vr13, 0
+ vsllwil.hu.bu vr14, vr14, 0
+ vsllwil.hu.bu vr15, vr15, 0
+ vsllwil.hu.bu vr16, vr16, 0
+ vsllwil.hu.bu vr17, vr17, 0
+
+ vadd.h vr0, vr20, vr10
+ vadd.h vr1, vr21, vr11
+ vadd.h vr2, vr22, vr12
+ vadd.h vr3, vr23, vr13
+ vadd.h vr4, vr8, vr14
+ vadd.h vr5, vr9, vr15
+ vadd.h vr6, vr18, vr16
+ vadd.h vr7, vr19, vr17
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lasx
+ ld.h t0, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+ addi.w t0, t0, 32
+ st.h t0, a1, 0
+
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
+ vld vr4, a1, 64
+ vld vr5, a1, 80
+ vld vr6, a1, 96
+ vld vr7, a1, 112
+ xvxor.v xr8, xr8, xr8
+ xvst xr8, a1, 0
+ xvst xr8, a1, 32
+ xvst xr8, a1, 64
+ xvst xr8, a1, 96
+
+ vadd.h vr18, vr0, vr4
+ vsub.h vr19, vr0, vr4
+ vsrai.h vr20, vr2, 1
+ vsrai.h vr21, vr6, 1
+ vsub.h vr20, vr20, vr6
+ vadd.h vr21, vr21, vr2
+ LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
+ vsrai.h vr11, vr7, 1
+ vsrai.h vr13, vr3, 1
+ vsrai.h vr15, vr5, 1
+ vsrai.h vr17, vr1, 1
+ vsub.h vr11, vr5, vr11
+ vsub.h vr13, vr7, vr13
+ vadd.h vr15, vr7, vr15
+ vadd.h vr17, vr5, vr17
+ vsub.h vr11, vr11, vr7
+ vsub.h vr13, vr13, vr3
+ vadd.h vr15, vr15, vr5
+ vadd.h vr17, vr17, vr1
+ vsub.h vr11, vr11, vr3
+ vadd.h vr13, vr13, vr1
+ vsub.h vr15, vr15, vr1
+ vadd.h vr17, vr17, vr3
+ vsrai.h vr18, vr11, 2
+ vsrai.h vr19, vr13, 2
+ vsrai.h vr20, vr15, 2
+ vsrai.h vr21, vr17, 2
+ vadd.h vr11, vr11, vr21
+ vadd.h vr13, vr13, vr20
+ vsub.h vr15, vr19, vr15
+ vsub.h vr17, vr17, vr18
+ LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+ vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+ vext2xv.w.h xr0, xr0
+ vext2xv.w.h xr1, xr1
+ vext2xv.w.h xr2, xr2
+ vext2xv.w.h xr3, xr3
+ vext2xv.w.h xr4, xr4
+ vext2xv.w.h xr5, xr5
+ vext2xv.w.h xr6, xr6
+ vext2xv.w.h xr7, xr7
+
+ xvadd.w xr11, xr0, xr4
+ xvsub.w xr13, xr0, xr4
+ xvsrai.w xr15, xr2, 1
+ xvsrai.w xr17, xr6, 1
+ xvsub.w xr15, xr15, xr6
+ xvadd.w xr17, xr17, xr2
+ LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16
+ xvsrai.w xr11, xr7, 1
+ xvsrai.w xr13, xr3, 1
+ xvsrai.w xr15, xr5, 1
+ xvsrai.w xr17, xr1, 1
+ xvsub.w xr11, xr5, xr11
+ xvsub.w xr13, xr7, xr13
+ xvadd.w xr15, xr7, xr15
+ xvadd.w xr17, xr5, xr17
+ xvsub.w xr11, xr11, xr7
+ xvsub.w xr13, xr13, xr3
+ xvadd.w xr15, xr15, xr5
+ xvadd.w xr17, xr17, xr1
+ xvsub.w xr11, xr11, xr3
+ xvadd.w xr13, xr13, xr1
+ xvsub.w xr15, xr15, xr1
+ xvadd.w xr17, xr17, xr3
+ xvsrai.w xr0, xr11, 2
+ xvsrai.w xr1, xr13, 2
+ xvsrai.w xr2, xr15, 2
+ xvsrai.w xr3, xr17, 2
+ xvadd.w xr11, xr11, xr3
+ xvadd.w xr13, xr13, xr2
+ xvsub.w xr15, xr1, xr15
+ xvsub.w xr17, xr17, xr0
+ LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
+ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7
+
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t2
+ vldx vr13, a0, t3
+ vldx vr14, a0, t4
+ vldx vr15, a0, t5
+ vldx vr16, a0, t6
+ vldx vr17, a0, t7
+ xvldi xr8, 0x806 //"xvldi.w xr8 6"
+ xvsran.h.w xr0, xr0, xr8
+ xvsran.h.w xr1, xr1, xr8
+ xvsran.h.w xr2, xr2, xr8
+ xvsran.h.w xr3, xr3, xr8
+ xvsran.h.w xr4, xr4, xr8
+ xvsran.h.w xr5, xr5, xr8
+ xvsran.h.w xr6, xr6, xr8
+ xvsran.h.w xr7, xr7, xr8
+ xvpermi.d xr0, xr0, 0x08
+ xvpermi.d xr1, xr1, 0x08
+ xvpermi.d xr2, xr2, 0x08
+ xvpermi.d xr3, xr3, 0x08
+ xvpermi.d xr4, xr4, 0x08
+ xvpermi.d xr5, xr5, 0x08
+ xvpermi.d xr6, xr6, 0x08
+ xvpermi.d xr7, xr7, 0x08
+
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr11, vr11, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vsllwil.hu.bu vr13, vr13, 0
+ vsllwil.hu.bu vr14, vr14, 0
+ vsllwil.hu.bu vr15, vr15, 0
+ vsllwil.hu.bu vr16, vr16, 0
+ vsllwil.hu.bu vr17, vr17, 0
+
+ vadd.h vr0, vr0, vr10
+ vadd.h vr1, vr1, vr11
+ vadd.h vr2, vr2, vr12
+ vadd.h vr3, vr3, vr13
+ vadd.h vr4, vr4, vr14
+ vadd.h vr5, vr5, vr15
+ vadd.h vr6, vr6, vr16
+ vadd.h vr7, vr7, vr17
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
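+/*
+ * Rough scalar equivalent (a hedged sketch, not the exact C template):
+ *
+ *   static void idct_dc_add_sketch(uint8_t *dst, int16_t *block, int stride)
+ *   {
+ *       int dc = (block[0] + 32) >> 6;
+ *       block[0] = 0;
+ *       for (int i = 0; i < 4; i++)
+ *           for (int j = 0; j < 4; j++) {
+ *               int v = dst[i*stride + j] + dc;
+ *               dst[i*stride + j] = v < 0 ? 0 : v > 255 ? 255 : v;
+ *           }
+ *   }
+ */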
+function ff_h264_idct_dc_add_8_lsx
+ vldrepl.h vr4, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ fld.s f0, a0, 0
+ fldx.s f1, a0, a2
+ fldx.s f2, a0, t2
+ fldx.s f3, a0, t3
+ st.h zero, a1, 0
+
+ vsrari.h vr4, vr4, 6
+ vilvl.w vr0, vr1, vr0
+ vilvl.w vr1, vr3, vr2
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.hu.bu vr1, vr1, 0
+ vadd.h vr0, vr0, vr4
+ vadd.h vr1, vr1, vr4
+ vssrarni.bu.h vr1, vr0, 0
+
+ vbsrl.v vr2, vr1, 4
+ vbsrl.v vr3, vr1, 8
+ vbsrl.v vr4, vr1, 12
+ fst.s f1, a0, 0
+ fstx.s f2, a0, a2
+ fstx.s f3, a0, t2
+ fstx.s f4, a0, t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_dc_add_8_lsx
+ vldrepl.h vr8, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a2
+ fldx.d f2, a0, t2
+ fldx.d f3, a0, t3
+ fldx.d f4, a0, t4
+ fldx.d f5, a0, t5
+ fldx.d f6, a0, t6
+ fldx.d f7, a0, t7
+ st.h zero, a1, 0
+
+ vsrari.h vr8, vr8, 6
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.hu.bu vr1, vr1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.hu.bu vr3, vr3, 0
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vadd.h vr0, vr0, vr8
+ vadd.h vr1, vr1, vr8
+ vadd.h vr2, vr2, vr8
+ vadd.h vr3, vr3, vr8
+ vadd.h vr4, vr4, vr8
+ vadd.h vr5, vr5, vr8
+ vadd.h vr6, vr6, vr8
+ vadd.h vr7, vr7, vr8
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+function ff_h264_idct8_dc_add_8_lasx
+ xvldrepl.h xr8, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a2
+ fldx.d f2, a0, t2
+ fldx.d f3, a0, t3
+ fldx.d f4, a0, t4
+ fldx.d f5, a0, t5
+ fldx.d f6, a0, t6
+ fldx.d f7, a0, t7
+ st.h zero, a1, 0
+
+ xvsrari.h xr8, xr8, 6
+ xvpermi.q xr1, xr0, 0x20
+ xvpermi.q xr3, xr2, 0x20
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+ xvsllwil.hu.bu xr1, xr1, 0
+ xvsllwil.hu.bu xr3, xr3, 0
+ xvsllwil.hu.bu xr5, xr5, 0
+ xvsllwil.hu.bu xr7, xr7, 0
+ xvadd.h xr1, xr1, xr8
+ xvadd.h xr3, xr3, xr8
+ xvadd.h xr5, xr5, xr8
+ xvadd.h xr7, xr7, xr8
+
+ xvssrarni.bu.h xr3, xr1, 0
+ xvssrarni.bu.h xr7, xr5, 0
+
+ xvpermi.q xr1, xr3, 0x11
+ xvpermi.q xr5, xr7, 0x11
+ xvbsrl.v xr0, xr1, 8
+ xvbsrl.v xr2, xr3, 8
+ xvbsrl.v xr4, xr5, 8
+ xvbsrl.v xr6, xr7, 8
+
+ fst.d f3, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t2
+ fstx.d f0, a0, t3
+ fstx.d f7, a0, t4
+ fstx.d f5, a0, t5
+ fstx.d f6, a0, t6
+ fstx.d f4, a0, t7
+endfunc
+
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qmul quantization parameter
+ * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul)
+ * LSX optimization is enough for this function.
+ */
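+/*
+ * A rough scalar sketch (hedged: results are written here in raster order,
+ * whereas the real function and the stores below scatter them to the DC
+ * slot of each 4x4 block in _output, 16 int16_t apart):
+ *
+ *   static void luma_dc_dequant_idct_sketch(int16_t *out, const int16_t *in, int qmul)
+ *   {
+ *       int tmp[16], i;
+ *       for (i = 0; i < 4; i++) {             // columns: 4-point Hadamard
+ *           int z0 = in[i + 4*0] + in[i + 4*2];
+ *           int z1 = in[i + 4*0] - in[i + 4*2];
+ *           int z2 = in[i + 4*1] - in[i + 4*3];
+ *           int z3 = in[i + 4*1] + in[i + 4*3];
+ *           tmp[i + 4*0] = z0 + z3;
+ *           tmp[i + 4*1] = z1 + z2;
+ *           tmp[i + 4*2] = z1 - z2;
+ *           tmp[i + 4*3] = z0 - z3;
+ *       }
+ *       for (i = 0; i < 4; i++) {             // rows, then dequantize
+ *           int z0 = tmp[4*i + 0] + tmp[4*i + 2];
+ *           int z1 = tmp[4*i + 0] - tmp[4*i + 2];
+ *           int z2 = tmp[4*i + 1] - tmp[4*i + 3];
+ *           int z3 = tmp[4*i + 1] + tmp[4*i + 3];
+ *           out[4*i + 0] = ((z0 + z3) * qmul + 128) >> 8;
+ *           out[4*i + 1] = ((z1 + z2) * qmul + 128) >> 8;
+ *           out[4*i + 2] = ((z1 - z2) * qmul + 128) >> 8;
+ *           out[4*i + 3] = ((z0 - z3) * qmul + 128) >> 8;
+ *       }
+ *   }
+ */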
+function ff_h264_luma_dc_dequant_idct_8_lsx
+ vld vr0, a1, 0
+ vld vr1, a1, 8
+ vld vr2, a1, 16
+ vld vr3, a1, 24
+ vreplgr2vr.w vr8, a2
+ LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
+ LSX_BUTTERFLY_4_H vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
+ LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
+ LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
+ LSX_BUTTERFLY_4_H vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+ vsllwil.w.h vr0, vr0, 0
+ vsllwil.w.h vr1, vr1, 0
+ vsllwil.w.h vr2, vr2, 0
+ vsllwil.w.h vr3, vr3, 0
+ vmul.w vr0, vr0, vr8
+ vmul.w vr1, vr1, vr8
+ vmul.w vr2, vr2, vr8
+ vmul.w vr3, vr3, vr8
+ vsrarni.h.w vr1, vr0, 8
+ vsrarni.h.w vr3, vr2, 8
+
+ vstelm.h vr1, a0, 0, 0
+ vstelm.h vr1, a0, 32, 4
+ vstelm.h vr1, a0, 64, 1
+ vstelm.h vr1, a0, 96, 5
+ vstelm.h vr3, a0, 128, 0
+ vstelm.h vr3, a0, 160, 4
+ vstelm.h vr3, a0, 192, 1
+ vstelm.h vr3, a0, 224, 5
+ addi.d a0, a0, 256
+ vstelm.h vr1, a0, 0, 2
+ vstelm.h vr1, a0, 32, 6
+ vstelm.h vr1, a0, 64, 3
+ vstelm.h vr1, a0, 96, 7
+ vstelm.h vr3, a0, 128, 2
+ vstelm.h vr3, a0, 160, 6
+ vstelm.h vr3, a0, 192, 3
+ vstelm.h vr3, a0, 224, 7
+endfunc
diff --git a/libavcodec/loongarch/h264idct_lasx.c b/libavcodec/loongarch/h264idct_lasx.c
deleted file mode 100644
index 46bd3b74d5..0000000000
--- a/libavcodec/loongarch/h264idct_lasx.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Loongson LASX optimized h264dsp
- *
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- * Xiwei Gu <guxiwei-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
-#include "libavcodec/bit_depth_template.c"
-
-#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \
-{ \
- __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- tmp0_m = __lasx_xvadd_h(in0, in2); \
- tmp1_m = __lasx_xvsub_h(in0, in2); \
- tmp2_m = __lasx_xvsrai_h(in1, 1); \
- tmp2_m = __lasx_xvsub_h(tmp2_m, in3); \
- tmp3_m = __lasx_xvsrai_h(in3, 1); \
- tmp3_m = __lasx_xvadd_h(in1, tmp3_m); \
- \
- LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
- out0, out1, out2, out3); \
-}
-
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
-{
- __m256i src0_m, src1_m, src2_m, src3_m;
- __m256i dst0_m, dst1_m;
- __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
- __m256i inp0_m, inp1_m, res0_m, src1, src3;
- __m256i src0 = __lasx_xvld(src, 0);
- __m256i src2 = __lasx_xvld(src, 16);
- __m256i zero = __lasx_xvldi(0);
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
- __lasx_xvst(zero, src, 0);
- DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
- AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
- LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
- AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
- DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
- 0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
- DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
- inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
- inp0_m = __lasx_xvsrari_h(inp0_m, 6);
- DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
- dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
- res0_m = __lasx_vext2xv_hu_bu(dst0_m);
- res0_m = __lasx_xvadd_h(res0_m, inp0_m);
- res0_m = __lasx_xvclip255_h(res0_m);
- dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
- __lasx_xvstelm_w(dst0_m, dst, 0, 0);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i vec0, vec1, vec2, vec3;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i res0, res1, res2, res3, res4, res5, res6, res7;
- __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- __m256i zero = __lasx_xvldi(0);
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_4x = dst_stride << 2;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
- src[0] += 32;
- DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
- src4, src5, src6, src7);
- __lasx_xvst(zero, src, 0);
- __lasx_xvst(zero, src, 32);
- __lasx_xvst(zero, src, 64);
- __lasx_xvst(zero, src, 96);
-
- vec0 = __lasx_xvadd_h(src0, src4);
- vec1 = __lasx_xvsub_h(src0, src4);
- vec2 = __lasx_xvsrai_h(src2, 1);
- vec2 = __lasx_xvsub_h(vec2, src6);
- vec3 = __lasx_xvsrai_h(src6, 1);
- vec3 = __lasx_xvadd_h(src2, vec3);
-
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
-
- vec0 = __lasx_xvsrai_h(src7, 1);
- vec0 = __lasx_xvsub_h(src5, vec0);
- vec0 = __lasx_xvsub_h(vec0, src3);
- vec0 = __lasx_xvsub_h(vec0, src7);
-
- vec1 = __lasx_xvsrai_h(src3, 1);
- vec1 = __lasx_xvsub_h(src1, vec1);
- vec1 = __lasx_xvadd_h(vec1, src7);
- vec1 = __lasx_xvsub_h(vec1, src3);
-
- vec2 = __lasx_xvsrai_h(src5, 1);
- vec2 = __lasx_xvsub_h(vec2, src1);
- vec2 = __lasx_xvadd_h(vec2, src7);
- vec2 = __lasx_xvadd_h(vec2, src5);
-
- vec3 = __lasx_xvsrai_h(src1, 1);
- vec3 = __lasx_xvadd_h(src3, vec3);
- vec3 = __lasx_xvadd_h(vec3, src5);
- vec3 = __lasx_xvadd_h(vec3, src1);
-
- tmp4 = __lasx_xvsrai_h(vec3, 2);
- tmp4 = __lasx_xvadd_h(tmp4, vec0);
- tmp5 = __lasx_xvsrai_h(vec2, 2);
- tmp5 = __lasx_xvadd_h(tmp5, vec1);
- tmp6 = __lasx_xvsrai_h(vec1, 2);
- tmp6 = __lasx_xvsub_h(tmp6, vec2);
- tmp7 = __lasx_xvsrai_h(vec0, 2);
- tmp7 = __lasx_xvsub_h(vec3, tmp7);
-
- LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
- res0, res1, res2, res3, res4, res5, res6, res7);
- LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
- res0, res1, res2, res3, res4, res5, res6, res7);
-
- DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
- tmp4, tmp5, tmp6, tmp7);
- vec0 = __lasx_xvadd_w(tmp0, tmp4);
- vec1 = __lasx_xvsub_w(tmp0, tmp4);
-
- vec2 = __lasx_xvsrai_w(tmp2, 1);
- vec2 = __lasx_xvsub_w(vec2, tmp6);
- vec3 = __lasx_xvsrai_w(tmp6, 1);
- vec3 = __lasx_xvadd_w(vec3, tmp2);
-
- tmp0 = __lasx_xvadd_w(vec0, vec3);
- tmp2 = __lasx_xvadd_w(vec1, vec2);
- tmp4 = __lasx_xvsub_w(vec1, vec2);
- tmp6 = __lasx_xvsub_w(vec0, vec3);
-
- vec0 = __lasx_xvsrai_w(tmp7, 1);
- vec0 = __lasx_xvsub_w(tmp5, vec0);
- vec0 = __lasx_xvsub_w(vec0, tmp3);
- vec0 = __lasx_xvsub_w(vec0, tmp7);
-
- vec1 = __lasx_xvsrai_w(tmp3, 1);
- vec1 = __lasx_xvsub_w(tmp1, vec1);
- vec1 = __lasx_xvadd_w(vec1, tmp7);
- vec1 = __lasx_xvsub_w(vec1, tmp3);
-
- vec2 = __lasx_xvsrai_w(tmp5, 1);
- vec2 = __lasx_xvsub_w(vec2, tmp1);
- vec2 = __lasx_xvadd_w(vec2, tmp7);
- vec2 = __lasx_xvadd_w(vec2, tmp5);
-
- vec3 = __lasx_xvsrai_w(tmp1, 1);
- vec3 = __lasx_xvadd_w(tmp3, vec3);
- vec3 = __lasx_xvadd_w(vec3, tmp5);
- vec3 = __lasx_xvadd_w(vec3, tmp1);
-
- tmp1 = __lasx_xvsrai_w(vec3, 2);
- tmp1 = __lasx_xvadd_w(tmp1, vec0);
- tmp3 = __lasx_xvsrai_w(vec2, 2);
- tmp3 = __lasx_xvadd_w(tmp3, vec1);
- tmp5 = __lasx_xvsrai_w(vec1, 2);
- tmp5 = __lasx_xvsub_w(tmp5, vec2);
- tmp7 = __lasx_xvsrai_w(vec0, 2);
- tmp7 = __lasx_xvsub_w(vec3, tmp7);
-
- LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
- LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);
-
- DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
- res0, res1, res2, res3);
- DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
- res4, res5, res6, res7);
- DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
- res6, res0, res1, res2, res3);
- DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
- res0, res1, res2, res3);
-
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst0, dst1, dst2, dst3);
- dst += dst_stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst4, dst5, dst6, dst7);
- dst -= dst_stride_4x;
- DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
- dst4, dst5, dst6, dst7);
- DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
- dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
- res0 = __lasx_xvadd_h(res0, dst0);
- res1 = __lasx_xvadd_h(res1, dst1);
- res2 = __lasx_xvadd_h(res2, dst2);
- res3 = __lasx_xvadd_h(res3, dst3);
- DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
- res2, res3);
- DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
- __lasx_xvstelm_d(res0, dst, 0, 0);
- __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
- dst += dst_stride_4x;
- __lasx_xvstelm_d(res1, dst, 0, 0);
- __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- const int16_t dc = (src[0] + 32) >> 6;
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
- __m256i pred, out;
- __m256i src0, src1, src2, src3;
- __m256i input_dc = __lasx_xvreplgr2vr_h(dc);
-
- src[0] = 0;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
-
- pred = __lasx_xvpermi_q(src0, src1, 0x02);
- pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
- pred = __lasx_xvclip255_h(pred);
- out = __lasx_xvpickev_b(pred, pred);
- __lasx_xvstelm_w(out, dst, 0, 0);
- __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
- __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
- __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- int32_t dc_val;
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_4x = dst_stride << 2;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
- __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- __m256i dc;
-
- dc_val = (src[0] + 32) >> 6;
- dc = __lasx_xvreplgr2vr_h(dc_val);
-
- src[0] = 0;
-
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst0, dst1, dst2, dst3);
- dst += dst_stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst4, dst5, dst6, dst7);
- dst -= dst_stride_4x;
- DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
- dst0, dst1, dst2, dst3);
- DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
- dst4, dst5, dst6, dst7);
- DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
- dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
- dst0 = __lasx_xvadd_h(dst0, dc);
- dst1 = __lasx_xvadd_h(dst1, dc);
- dst2 = __lasx_xvadd_h(dst2, dc);
- dst3 = __lasx_xvadd_h(dst3, dc);
- DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
- dst0, dst1, dst2, dst3);
- DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
- dst += dst_stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct_add16_lasx(uint8_t *dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 0; i < 16; i++) {
- int32_t nnz = nzc[scan8[i]];
-
- if (nnz) {
- if (nnz == 1 && ((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else
- ff_h264_idct_add_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- }
-}
-
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t cnt;
-
- for (cnt = 0; cnt < 16; cnt += 4) {
- int32_t nnz = nzc[scan8[cnt]];
-
- if (nnz) {
- if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
- ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
- block + cnt * 16 * sizeof(pixel),
- dst_stride);
- else
- ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
- block + cnt * 16 * sizeof(pixel),
- dst_stride);
- }
- }
-}
-
-
-void ff_h264_idct_add8_lasx(uint8_t **dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 16; i < 20; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 32; i < 36; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_idct_add8_422_lasx(uint8_t **dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 16; i < 20; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 32; i < 36; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 20; i < 24; i++) {
- if (nzc[scan8[i + 4]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 36; i < 40; i++) {
- if (nzc[scan8[i + 4]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
- const int32_t *blk_offset,
- int16_t *block,
- int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 0; i < 16; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel), dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
- int32_t de_qval)
-{
-#define DC_DEST_STRIDE 16
-
- __m256i src0, src1, src2, src3;
- __m256i vec0, vec1, vec2, vec3;
- __m256i tmp0, tmp1, tmp2, tmp3;
- __m256i hres0, hres1, hres2, hres3;
- __m256i vres0, vres1, vres2, vres3;
- __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
-
- DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
- src0, src1, src2, src3);
- LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
- LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
- LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
- hres0, hres1, hres2, hres3);
- LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
- DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
- vres0, vres1, vres2, vres3);
- DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
- vres0, vres1);
-
- vres0 = __lasx_xvmul_w(vres0, de_q_vec);
- vres1 = __lasx_xvmul_w(vres1, de_q_vec);
-
- vres0 = __lasx_xvsrari_w(vres0, 8);
- vres1 = __lasx_xvsrari_w(vres1, 8);
- vec0 = __lasx_xvpickev_h(vres1, vres0);
- vec0 = __lasx_xvpermi_d(vec0, 0xd8);
- __lasx_xvstelm_h(vec0, dst + 0 * DC_DEST_STRIDE, 0, 0);
- __lasx_xvstelm_h(vec0, dst + 2 * DC_DEST_STRIDE, 0, 1);
- __lasx_xvstelm_h(vec0, dst + 8 * DC_DEST_STRIDE, 0, 2);
- __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
- __lasx_xvstelm_h(vec0, dst + 1 * DC_DEST_STRIDE, 0, 4);
- __lasx_xvstelm_h(vec0, dst + 3 * DC_DEST_STRIDE, 0, 5);
- __lasx_xvstelm_h(vec0, dst + 9 * DC_DEST_STRIDE, 0, 6);
- __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
- __lasx_xvstelm_h(vec0, dst + 4 * DC_DEST_STRIDE, 0, 8);
- __lasx_xvstelm_h(vec0, dst + 6 * DC_DEST_STRIDE, 0, 9);
- __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
- __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
- __lasx_xvstelm_h(vec0, dst + 5 * DC_DEST_STRIDE, 0, 12);
- __lasx_xvstelm_h(vec0, dst + 7 * DC_DEST_STRIDE, 0, 13);
- __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
- __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
-
-#undef DC_DEST_STRIDE
-}
diff --git a/libavcodec/loongarch/h264idct_loongarch.c b/libavcodec/loongarch/h264idct_loongarch.c
new file mode 100644
index 0000000000..26af45503f
--- /dev/null
+++ b/libavcodec/loongarch/h264idct_loongarch.c
@@ -0,0 +1,184 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Xiwei Gu <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_loongarch.h"
+#include "libavcodec/bit_depth_template.c"
+
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 0; i < 16; i++) {
+ int32_t nnz = nzc[scan8[i]];
+
+ if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
+ ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t cnt;
+
+ for (cnt = 0; cnt < 16; cnt += 4) {
+ int32_t nnz = nzc[scan8[cnt]];
+
+ if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+ ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+
+#if HAVE_LASX
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t cnt;
+
+ for (cnt = 0; cnt < 16; cnt += 4) {
+ int32_t nnz = nzc[scan8[cnt]];
+
+ if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+ ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+#endif // #if HAVE_LASX
+
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 16; i < 20; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 32; i < 36; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
+
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 16; i < 20; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 20; i < 24; i++) {
+ if (nzc[scan8[i + 4]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 32; i < 36; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 36; i < 40; i++) {
+ if (nzc[scan8[i + 4]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
+
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 0; i < 16; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel), dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
diff --git a/libavcodec/loongarch/loongson_asm.S b/libavcodec/loongarch/loongson_asm.S
new file mode 100644
index 0000000000..0a649f51c7
--- /dev/null
+++ b/libavcodec/loongarch/loongson_asm.S
@@ -0,0 +1,945 @@
+/*
+ * Loongson asm helper.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ * Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 2
+#define LML_VERSION_MICRO 0
+
+/*
+ *============================================================================
+ * macros for a specific project; set them as needed.
+ * The following LoongML macros are provided for your reference.
+ *============================================================================
+ */
+#define ASM_PREF
+#define DEFAULT_ALIGN 5
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+ jirl $r0, $r1, 0x0
+ .size ASM_PREF\name, . - ASM_PREF\name
+ .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
+
+/**
+ * Attention: If align is not zero, the macro uses t7 until the end of
+ * the function (clean_stack needs it to restore sp).
+ */
+.macro alloc_stack size, align=0
+.if \align
+ .macro clean_stack
+ add.d sp, sp, t7
+ .endm
+ addi.d sp, sp, - \size
+ andi.d t7, sp, \align - 1
+ sub.d sp, sp, t7
+ addi.d t7, t7, \size
+.else
+ .macro clean_stack
+ addi.d sp, sp, \size
+ .endm
+ addi.d sp, sp, - \size
+.endif
+.endm
+
+.macro const name, align=DEFAULT_ALIGN
+ .macro endconst
+ .size \name, . - \name
+ .purgem endconst
+ .endm
+.section .rodata
+.align \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp $sp
+#define ra $ra
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - halfword
+ */
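+/*
+ * A hedged scalar model of the even/odd widening pairing used by the
+ * vdp2.h.bu variant below (16 unsigned bytes per operand give 8 halfword
+ * lanes; each lane wraps to 16 bits like the hardware result):
+ *
+ *   static void vdp2_h_bu_model(int16_t d[8], const uint8_t j[16], const uint8_t k[16])
+ *   {
+ *       for (int i = 0; i < 8; i++)
+ *           d[i] = (int16_t)(j[2*i] * k[2*i] + j[2*i + 1] * k[2*i + 1]);
+ *   }
+ */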
+.macro vdp2.h.bu vd, vj, vk
+ vmulwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+ vmulwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+ vmulwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+ xvmulwev.h.bu \xd, \xj, \xk
+ xvmaddwod.h.bu \xd, \xj, \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+ xvmulwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+ xvmulwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - twice size of input
+ */
+.macro vdp2add.h.bu vd, vj, vk
+ vmaddwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+ vmaddwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+ vmaddwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+ xvmaddwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+ xvmaddwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Range each element of vector
+ * clip: vj > vk ? vj : vk && vj < va ? vj : va
+ * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
+ */
+.macro vclip.h vd, vj, vk, va
+ vmax.h \vd, \vj, \vk
+ vmin.h \vd, \vd, \va
+.endm
+
+.macro vclip255.w vd, vj
+ vmaxi.w \vd, \vj, 0
+ vsat.wu \vd, \vd, 7
+.endm
+
+.macro vclip255.h vd, vj
+ vmaxi.h \vd, \vj, 0
+ vsat.hu \vd, \vd, 7
+.endm
+
+.macro xvclip.h xd, xj, xk, xa
+ xvmax.h \xd, \xj, \xk
+ xvmin.h \xd, \xd, \xa
+.endm
+
+.macro xvclip255.h xd, xj
+ xvmaxi.h \xd, \xj, 0
+ xvsat.hu \xd, \xd, 7
+.endm
+
+.macro xvclip255.w xd, xj
+ xvmaxi.w \xd, \xj, 0
+ xvsat.wu \xd, \xd, 7
+.endm
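+
+/*
+ * Example (illustrative registers and values):
+ *   vclip.h vr0, vr1, vr2, vr3 => vr0.h[i] = min(max(vr1.h[i], vr2.h[i]), vr3.h[i])
+ *   vclip255.w vr0, vr1        => vr0.w[i] = min(max(vr1.w[i], 0), 255)
+ * e.g. vr1.w = {-3, 77, 300, 255} gives vr0.w = {0, 77, 255, 255}.
+ */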
+
+/*
+ * Description : Store one element of a vector, advancing the address first
+ * vd : Data vector to be stored
+ * rk : Base address register (updated to rk + ra before the store)
+ * ra : Offset added to the address
+ * si : Index of the element in vd to store
+ */
+.macro vstelmx.b vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.b \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.h \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.w \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.d vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.d \vd, \rk, 0, \si
+.endm
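+
+/*
+ * Example (illustrative registers): storing one byte per row of an image
+ * whose line stride is in a2, starting one row below a0:
+ *   vstelmx.b vr0, a0, a2, 0    // a0 += a2; *a0 = vr0.b[0]
+ *   vstelmx.b vr0, a0, a2, 1    // a0 += a2; *a0 = vr0.b[1]
+ * The base register is advanced by the offset before each store.
+ */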
+
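+/*
+ * Description : Copy a full LSX/LASX register (vd = vj), implemented as
+ *               a bitwise OR of the source register with itself
+ */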
+.macro vmov xd, xj
+ vor.v \xd, \xj, \xj
+.endm
+
+.macro xmov xd, xj
+ xvor.v \xd, \xj, \xj
+.endm
+
+.macro xvstelmx.d xd, rk, ra, si
+ add.d \rk, \rk, \ra
+ xvstelm.d \xd, \rk, 0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 elements (32-bit float, 64-bit double, 128-bit LSX vector or
+ * 256-bit LASX vector) from src at offsets 0, stride, stride2 and stride3.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.s \out0, \src, 0
+ fldx.s \out1, \src, \stride
+ fldx.s \out2, \src, \stride2
+ fldx.s \out3, \src, \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.d \out0, \src, 0
+ fldx.d \out1, \src, \stride
+ fldx.d \out2, \src, \stride2
+ fldx.d \out3, \src, \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ vld \out0, \src, 0
+ vldx \out1, \src, \stride
+ vldx \out2, \src, \stride2
+ vldx \out3, \src, \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ xvld \out0, \src, 0
+ xvldx \out1, \src, \stride
+ xvldx \out2, \src, \stride2
+ xvldx \out3, \src, \stride3
+.endm
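+
+/*
+ * Example (illustrative registers): loading four 16-byte rows of a block
+ * at a0 with line stride a2 (assuming t0 = 2*a2 and t1 = 3*a2):
+ *   LSX_LOADX_4 a0, a2, t0, t1, vr0, vr1, vr2, vr3
+ * vr0..vr3 then hold rows 0..3 of the block.
+ */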
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ vilvl.h \tmp0, \in1, \in0
+ vilvl.h \tmp1, \in3, \in2
+ vilvl.w \out0, \tmp1, \tmp0
+ vilvh.w \out2, \tmp1, \tmp0
+ vilvh.d \out1, \out0, \out0
+ vilvh.d \out3, \out0, \out2
+.endm
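+
+/*
+ * Example (illustrative registers): if the low 64 bits of vr0..vr3 hold
+ * the four rows of a 4x4 halfword block,
+ *   LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9
+ * leaves column i of the block in the low 64 bits of out_i (vr4..vr7).
+ */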
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+
+ vilvl.w \_tmp0, \_in1, \_in0
+ vilvh.w \_out1, \_in1, \_in0
+ vilvl.w \_tmp1, \_in3, \_in2
+ vilvh.w \_out3, \_in3, \_in2
+
+ vilvl.d \_out0, \_tmp1, \_tmp0
+ vilvl.d \_out2, \_out3, \_out1
+ vilvh.d \_out3, \_out3, \_out1
+ vilvh.d \_out1, \_tmp1, \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+ tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.h \tmp0, \in6, \in4
+ vilvl.h \tmp1, \in7, \in5
+ vilvl.h \tmp2, \in2, \in0
+ vilvl.h \tmp3, \in3, \in1
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vilvh.h \tmp0, \in6, \in4
+ vilvh.h \tmp1, \in7, \in5
+ vilvh.h \tmp2, \in2, \in0
+ vilvh.h \tmp3, \in3, \in1
+
+ vpickev.d \out0, \tmp4, \tmp6
+ vpickod.d \out1, \tmp4, \tmp6
+ vpickev.d \out2, \tmp5, \tmp7
+ vpickod.d \out3, \tmp5, \tmp7
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vpickev.d \out4, \tmp4, \tmp6
+ vpickod.d \out5, \tmp4, \tmp6
+ vpickev.d \out6, \tmp5, \tmp7
+ vpickod.d \out7, \tmp5, \tmp7
+.endm
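+
+/*
+ * Example (illustrative layout): with in0..in7 holding rows 0..7 of an
+ * 8x8 halfword block, out0..out7 receive columns 0..7,
+ * i.e. out_i.h[j] = in_j.h[i].
+ */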
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.b \tmp0, \in2, \in0
+ xvilvl.b \tmp1, \in3, \in1
+ xvilvl.b \tmp2, \in6, \in4
+ xvilvl.b \tmp3, \in7, \in5
+ xvilvl.b \tmp4, \in10, \in8
+ xvilvl.b \tmp5, \in11, \in9
+ xvilvl.b \tmp6, \in14, \in12
+ xvilvl.b \tmp7, \in15, \in13
+ xvilvl.b \out0, \tmp1, \tmp0
+ xvilvh.b \out1, \tmp1, \tmp0
+ xvilvl.b \out2, \tmp3, \tmp2
+ xvilvh.b \out3, \tmp3, \tmp2
+ xvilvl.b \out4, \tmp5, \tmp4
+ xvilvh.b \out5, \tmp5, \tmp4
+ xvilvl.b \out6, \tmp7, \tmp6
+ xvilvh.b \out7, \tmp7, \tmp6
+ xvilvl.w \tmp0, \out2, \out0
+ xvilvh.w \tmp2, \out2, \out0
+ xvilvl.w \tmp4, \out3, \out1
+ xvilvh.w \tmp6, \out3, \out1
+ xvilvl.w \tmp1, \out6, \out4
+ xvilvh.w \tmp3, \out6, \out4
+ xvilvl.w \tmp5, \out7, \out5
+ xvilvh.w \tmp7, \out7, \out5
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvh.d \out1, \tmp1, \tmp0
+ xvilvl.d \out2, \tmp3, \tmp2
+ xvilvh.d \out3, \tmp3, \tmp2
+ xvilvl.d \out4, \tmp5, \tmp4
+ xvilvh.d \out5, \tmp5, \tmp4
+ xvilvl.d \out6, \tmp7, \tmp6
+ xvilvh.d \out7, \tmp7, \tmp6
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.b \tmp0, \in2, \in0
+ vilvl.b \tmp1, \in3, \in1
+ vilvl.b \tmp2, \in6, \in4
+ vilvl.b \tmp3, \in7, \in5
+ vilvl.b \tmp4, \in10, \in8
+ vilvl.b \tmp5, \in11, \in9
+ vilvl.b \tmp6, \in14, \in12
+ vilvl.b \tmp7, \in15, \in13
+
+ vilvl.b \out0, \tmp1, \tmp0
+ vilvh.b \out1, \tmp1, \tmp0
+ vilvl.b \out2, \tmp3, \tmp2
+ vilvh.b \out3, \tmp3, \tmp2
+ vilvl.b \out4, \tmp5, \tmp4
+ vilvh.b \out5, \tmp5, \tmp4
+ vilvl.b \out6, \tmp7, \tmp6
+ vilvh.b \out7, \tmp7, \tmp6
+ vilvl.w \tmp0, \out2, \out0
+ vilvh.w \tmp2, \out2, \out0
+ vilvl.w \tmp4, \out3, \out1
+ vilvh.w \tmp6, \out3, \out1
+ vilvl.w \tmp1, \out6, \out4
+ vilvh.w \tmp3, \out6, \out4
+ vilvl.w \tmp5, \out7, \out5
+ vilvh.w \tmp7, \out7, \out5
+ vilvl.d \out0, \tmp1, \tmp0
+ vilvh.d \out1, \tmp1, \tmp0
+ vilvl.d \out2, \tmp3, \tmp2
+ vilvh.d \out3, \tmp3, \tmp2
+ vilvl.d \out4, \tmp5, \tmp4
+ vilvh.d \out5, \tmp5, \tmp4
+ vilvl.d \out6, \tmp7, \tmp6
+ vilvh.d \out7, \tmp7, \tmp6
+.endm
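+
+/*
+ * Example (illustrative layout): with the low 8 bytes of in0..in15
+ * holding rows 0..15 of a 16x8 byte block, out0..out7 receive the eight
+ * 16-byte columns, i.e. out_i.b[j] = in_j.b[i].
+ */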
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in1, \in0
+ xvilvl.h \tmp1, \in3, \in2
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out2, \tmp1, \tmp0
+ xvilvh.d \out1, \out0, \out0
+ xvilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in2, \in0
+ xvilvl.h \tmp1, \in3, \in1
+ xvilvl.h \out2, \tmp1, \tmp0
+ xvilvh.h \out3, \tmp1, \tmp0
+
+ xvilvl.d \out0, \out2, \out2
+ xvilvh.d \out1, \out2, \out2
+ xvilvl.d \out2, \out3, \out3
+ xvilvh.d \out3, \out3, \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.h \tmp0, \in6, \in4
+ xvilvl.h \tmp1, \in7, \in5
+ xvilvl.h \tmp2, \in2, \in0
+ xvilvl.h \tmp3, \in3, \in1
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvilvh.h \tmp0, \in6, \in4
+ xvilvh.h \tmp1, \in7, \in5
+ xvilvh.h \tmp2, \in2, \in0
+ xvilvh.h \tmp3, \in3, \in1
+
+ xvpickev.d \out0, \tmp4, \tmp6
+ xvpickod.d \out1, \tmp4, \tmp6
+ xvpickev.d \out2, \tmp5, \tmp7
+ xvpickod.d \out3, \tmp5, \tmp7
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvpickev.d \out4, \tmp4, \tmp6
+ xvpickod.d \out5, \tmp4, \tmp6
+ xvpickev.d \out6, \tmp5, \tmp7
+ xvpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1, tmp2
+ xvilvh.h \tmp1, \in0, \in1
+ xvilvl.h \out1, \in0, \in1
+ xvilvh.h \tmp0, \in2, \in3
+ xvilvl.h \out3, \in2, \in3
+
+ xvilvh.w \tmp2, \out3, \out1
+ xvilvl.w \out3, \out3, \out1
+
+ xvilvl.w \out2, \tmp0, \tmp1
+ xvilvh.w \tmp1, \tmp0, \tmp1
+
+ xvilvh.d \out0, \out2, \out3
+ xvilvl.d \out2, \out2, \out3
+ xvilvh.d \out1, \tmp1, \tmp2
+ xvilvl.d \out3, \tmp1, \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
+ * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
+ * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+
+ xvilvl.w \_tmp0, \_in1, \_in0
+ xvilvh.w \_out1, \_in1, \_in0
+ xvilvl.w \_tmp1, \_in3, \_in2
+ xvilvh.w \_out3, \_in3, \_in2
+
+ xvilvl.d \_out0, \_tmp1, \_tmp0
+ xvilvl.d \_out2, \_out3, \_out1
+ xvilvh.d \_out3, \_out3, \_out1
+ xvilvh.d \_out1, \_tmp1, \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * _in0 : 1,2,3,4,5,6,7,8
+ * _in1 : 2,2,3,4,5,6,7,8
+ * _in2 : 3,2,3,4,5,6,7,8
+ * _in3 : 4,2,3,4,5,6,7,8
+ * _in4 : 5,2,3,4,5,6,7,8
+ * _in5 : 6,2,3,4,5,6,7,8
+ * _in6 : 7,2,3,4,5,6,7,8
+ * _in7 : 8,2,3,4,5,6,7,8
+ *
+ * _out0 : 1,2,3,4,5,6,7,8
+ * _out1 : 2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
+ _tmp0, _tmp1, _tmp2, _tmp3
+ xvilvl.w \_tmp0, \_in2, \_in0
+ xvilvl.w \_tmp1, \_in3, \_in1
+ xvilvh.w \_tmp2, \_in2, \_in0
+ xvilvh.w \_tmp3, \_in3, \_in1
+ xvilvl.w \_out0, \_tmp1, \_tmp0
+ xvilvh.w \_out1, \_tmp1, \_tmp0
+ xvilvl.w \_out2, \_tmp3, \_tmp2
+ xvilvh.w \_out3, \_tmp3, \_tmp2
+
+ xvilvl.w \_tmp0, \_in6, \_in4
+ xvilvl.w \_tmp1, \_in7, \_in5
+ xvilvh.w \_tmp2, \_in6, \_in4
+ xvilvh.w \_tmp3, \_in7, \_in5
+ xvilvl.w \_out4, \_tmp1, \_tmp0
+ xvilvh.w \_out5, \_tmp1, \_tmp0
+ xvilvl.w \_out6, \_tmp3, \_tmp2
+ xvilvh.w \_out7, \_tmp3, \_tmp2
+
+ xmov \_tmp0, \_out0
+ xmov \_tmp1, \_out1
+ xmov \_tmp2, \_out2
+ xmov \_tmp3, \_out3
+ xvpermi.q \_out0, \_out4, 0x02
+ xvpermi.q \_out1, \_out5, 0x02
+ xvpermi.q \_out2, \_out6, 0x02
+ xvpermi.q \_out3, \_out7, 0x02
+ xvpermi.q \_out4, \_tmp0, 0x31
+ xvpermi.q \_out5, \_tmp1, 0x31
+ xvpermi.q \_out6, \_tmp2, 0x31
+ xvpermi.q \_out7, \_tmp3, 0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * _in0 : 1,2,3,4
+ * _in1 : 1,2,3,4
+ * _in2 : 1,2,3,4
+ * _in3 : 1,2,3,4
+ *
+ * _out0 : 1,1,1,1
+ * _out1 : 2,2,2,2
+ * _out2 : 3,3,3,3
+ * _out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+ xvilvl.d \_tmp0, \_in1, \_in0
+ xvilvh.d \_out1, \_in1, \_in0
+ xvilvh.d \_tmp1, \_in3, \_in2
+ xvilvl.d \_out2, \_in3, \_in2
+
+ xvor.v \_out0, \_tmp0, \_tmp0
+ xvor.v \_out3, \_tmp1, \_tmp1
+
+ xvpermi.q \_out0, \_out2, 0x02
+ xvpermi.q \_out2, \_tmp0, 0x31
+ xvpermi.q \_out3, \_out1, 0x31
+ xvpermi.q \_out1, \_tmp1, 0x02
+.endm
+
+/*
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example : LSX_BUTTERFLY_4
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ */
+.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.b \_out0, \_in0, \_in3
+ vadd.b \_out1, \_in1, \_in2
+ vsub.b \_out2, \_in1, \_in2
+ vsub.b \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.h \_out0, \_in0, \_in3
+ vadd.h \_out1, \_in1, \_in2
+ vsub.h \_out2, \_in1, \_in2
+ vsub.h \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.w \_out0, \_in0, \_in3
+ vadd.w \_out1, \_in1, \_in2
+ vsub.w \_out2, \_in1, \_in2
+ vsub.w \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.d \_out0, \_in0, \_in3
+ vadd.d \_out1, \_in1, \_in2
+ vsub.d \_out2, \_in1, \_in2
+ vsub.d \_out3, \_in0, \_in3
+.endm
+
+.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.b \_out0, \_in0, \_in3
+ xvadd.b \_out1, \_in1, \_in2
+ xvsub.b \_out2, \_in1, \_in2
+ xvsub.b \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.h \_out0, \_in0, \_in3
+ xvadd.h \_out1, \_in1, \_in2
+ xvsub.h \_out2, \_in1, \_in2
+ xvsub.h \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.w \_out0, \_in0, \_in3
+ xvadd.w \_out1, \_in1, \_in2
+ xvsub.w \_out2, \_in1, \_in2
+ xvsub.w \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.d \_out0, \_in0, \_in3
+ xvadd.d \_out1, \_in1, \_in2
+ xvsub.d \_out2, \_in1, \_in2
+ xvsub.d \_out3, \_in0, \_in3
+.endm
+
+/*
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_8
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ */
+.macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.b \_out0, \_in0, \_in7
+ vadd.b \_out1, \_in1, \_in6
+ vadd.b \_out2, \_in2, \_in5
+ vadd.b \_out3, \_in3, \_in4
+ vsub.b \_out4, \_in3, \_in4
+ vsub.b \_out5, \_in2, \_in5
+ vsub.b \_out6, \_in1, \_in6
+ vsub.b \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.h \_out0, \_in0, \_in7
+ vadd.h \_out1, \_in1, \_in6
+ vadd.h \_out2, \_in2, \_in5
+ vadd.h \_out3, \_in3, \_in4
+ vsub.h \_out4, \_in3, \_in4
+ vsub.h \_out5, \_in2, \_in5
+ vsub.h \_out6, \_in1, \_in6
+ vsub.h \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.w \_out0, \_in0, \_in7
+ vadd.w \_out1, \_in1, \_in6
+ vadd.w \_out2, \_in2, \_in5
+ vadd.w \_out3, \_in3, \_in4
+ vsub.w \_out4, \_in3, \_in4
+ vsub.w \_out5, \_in2, \_in5
+ vsub.w \_out6, \_in1, \_in6
+ vsub.w \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.d \_out0, \_in0, \_in7
+ vadd.d \_out1, \_in1, \_in6
+ vadd.d \_out2, \_in2, \_in5
+ vadd.d \_out3, \_in3, \_in4
+ vsub.d \_out4, \_in3, \_in4
+ vsub.d \_out5, \_in2, \_in5
+ vsub.d \_out6, \_in1, \_in6
+ vsub.d \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.b \_out0, \_in0, \_in7
+ xvadd.b \_out1, \_in1, \_in6
+ xvadd.b \_out2, \_in2, \_in5
+ xvadd.b \_out3, \_in3, \_in4
+ xvsub.b \_out4, \_in3, \_in4
+ xvsub.b \_out5, \_in2, \_in5
+ xvsub.b \_out6, \_in1, \_in6
+ xvsub.b \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.h \_out0, \_in0, \_in7
+ xvadd.h \_out1, \_in1, \_in6
+ xvadd.h \_out2, \_in2, \_in5
+ xvadd.h \_out3, \_in3, \_in4
+ xvsub.h \_out4, \_in3, \_in4
+ xvsub.h \_out5, \_in2, \_in5
+ xvsub.h \_out6, \_in1, \_in6
+ xvsub.h \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.w \_out0, \_in0, \_in7
+ xvadd.w \_out1, \_in1, \_in6
+ xvadd.w \_out2, \_in2, \_in5
+ xvadd.w \_out3, \_in3, \_in4
+ xvsub.w \_out4, \_in3, \_in4
+ xvsub.w \_out5, \_in2, \_in5
+ xvsub.w \_out6, \_in1, \_in6
+ xvsub.w \_out7, \_in0, \_in7
+.endm