aboutsummaryrefslogtreecommitdiffstats
path: root/libswscale/loongarch/output.S
diff options
context:
space:
mode:
authorShiyou Yin <yinshiyou-hf@loongson.cn>2024-03-16 11:03:32 +0800
committerMichael Niedermayer <michael@niedermayer.cc>2024-04-11 23:53:59 +0200
commit8b76df914285b1e10460c16134715531050e7a74 (patch)
tree66465c4e0cd6a2fac5819854f2a959b50ade54aa /libswscale/loongarch/output.S
parentf3fe2cb5f72a669bd737203f6f82ed7f2fa60ded (diff)
downloadffmpeg-8b76df914285b1e10460c16134715531050e7a74.tar.gz
swscale: [LA] Optimize yuv2plane1_8_c.
Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libswscale/loongarch/output.S')
-rw-r--r--libswscale/loongarch/output.S254
1 files changed, 252 insertions, 2 deletions
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
index b44bac502a..d71667e38a 100644
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@@ -23,11 +23,11 @@
#include "libavcodec/loongarch/loongson_asm.S"
-/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
-function ff_yuv2planeX_8_lsx
+function yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
@@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
blt zero, a4, .DEST
.END:
endfunc
+
+/*
+ * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ *                       const uint8_t *dither, int offset)
+ *
+ * For every i in [0, dstW):
+ *     dest[i] = clip_to_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
+ *
+ * In:      a0 = src, a1 = dest, a2 = dstW, a3 = dither, a4 = offset
+ * Scratch: t0-t8, vr0-vr5 (f1 is stored as the low 64 bits of vr1)
+ *
+ * Rows are processed 8 pixels per iteration. A remainder is handled by
+ * redoing the last 8 pixels of the row, which is only valid when the row
+ * holds at least 8 pixels; shorter rows take the scalar loop at label 5 so
+ * that nothing before src/dest is ever touched.
+ * The return is left to endfunc, as in the rest of this file.
+ */
+function yuv2plane1_8_lsx
+    /* dstW < 8: no full vector fits, use the scalar loop. */
+    addi.w          t0, zero, 8
+    blt             a2, t0, 5f
+
+    /* t0..t7 = dither[(offset + k) & 7] for k = 0..7 */
+    addi.w          t1, a4, 1
+    addi.w          t2, a4, 2
+    addi.w          t3, a4, 3
+    addi.w          t4, a4, 4
+    addi.w          t5, a4, 5
+    addi.w          t6, a4, 6
+    addi.w          t7, a4, 7
+    andi            t0, a4, 7
+    andi            t1, t1, 7
+    andi            t2, t2, 7
+    andi            t3, t3, 7
+    andi            t4, t4, 7
+    andi            t5, t5, 7
+    andi            t6, t6, 7
+    andi            t7, t7, 7
+    ldx.bu          t0, a3, t0
+    ldx.bu          t1, a3, t1
+    ldx.bu          t2, a3, t2
+    ldx.bu          t3, a3, t3
+    ldx.bu          t4, a3, t4
+    ldx.bu          t5, a3, t5
+    ldx.bu          t6, a3, t6
+    ldx.bu          t7, a3, t7
+    vinsgr2vr.h     vr1, t0, 0
+    vinsgr2vr.h     vr1, t1, 1
+    vinsgr2vr.h     vr1, t2, 2
+    vinsgr2vr.h     vr1, t3, 3
+    vinsgr2vr.h     vr1, t4, 4
+    vinsgr2vr.h     vr1, t5, 5
+    vinsgr2vr.h     vr1, t6, 6
+    vinsgr2vr.h     vr1, t7, 7
+    /* Interleave with zero: vr2/vr3 = dither 0..3 / 4..7 as 32-bit words. */
+    vsub.h          vr0, vr0, vr0
+    vilvl.h         vr2, vr0, vr1
+    vilvh.h         vr3, vr0, vr1
+
+    andi            t8, a2, 7           /* t8 = leftover pixels (dstW % 8) */
+    srli.d          a2, a2, 3           /* a2 = full 8-pixel groups, >= 1 here */
+    beqz            a2, 2f
+1:
+    vld             vr1, a0, 0          /* 8 x int16 source samples */
+    addi.d          a0, a0, 16
+    /* Put the low 4 samples into the high half of vr0 so vexth can widen them. */
+    vshuf4i.d       vr0, vr1, 8
+    vexth.w.h       vr4, vr0            /* samples 0..3 as int32 */
+    vexth.w.h       vr5, vr1            /* samples 4..7 as int32 */
+
+    vadd.w          vr4, vr2, vr4
+    vadd.w          vr5, vr3, vr5
+    vsrai.w         vr4, vr4, 7
+    vsrai.w         vr5, vr5, 7
+    vclip255.w      vr4, vr4
+    vclip255.w      vr5, vr5
+    /* Narrow int32 -> int16 -> int8; the 8 result bytes end up in f1. */
+    vpickev.h       vr1, vr5, vr4
+    vpickev.b       vr1, vr1, vr1
+    fst.d           f1, a1, 0
+    addi.d          a1, a1, 8
+    addi.d          a2, a2, -1
+    bnez            a2, 1b
+2:
+    beqz            t8, 4f              /* no remainder: done */
+3:
+    /*
+     * Remainder: redo the final 8 pixels of the row. The window starts t8
+     * pixels after the last full group, so the dither phase advances by t8.
+     */
+    add.w           a4, a4, t8
+    addi.w          t1, a4, 1
+    addi.w          t2, a4, 2
+    addi.w          t3, a4, 3
+    addi.w          t4, a4, 4
+    addi.w          t5, a4, 5
+    addi.w          t6, a4, 6
+    addi.w          t7, a4, 7
+    andi            t0, a4, 7
+    andi            t1, t1, 7
+    andi            t2, t2, 7
+    andi            t3, t3, 7
+    andi            t4, t4, 7
+    andi            t5, t5, 7
+    andi            t6, t6, 7
+    andi            t7, t7, 7
+    ldx.bu          t0, a3, t0
+    ldx.bu          t1, a3, t1
+    ldx.bu          t2, a3, t2
+    ldx.bu          t3, a3, t3
+    ldx.bu          t4, a3, t4
+    ldx.bu          t5, a3, t5
+    ldx.bu          t6, a3, t6
+    ldx.bu          t7, a3, t7
+    vinsgr2vr.h     vr1, t0, 0
+    vinsgr2vr.h     vr1, t1, 1
+    vinsgr2vr.h     vr1, t2, 2
+    vinsgr2vr.h     vr1, t3, 3
+    vinsgr2vr.h     vr1, t4, 4
+    vinsgr2vr.h     vr1, t5, 5
+    vinsgr2vr.h     vr1, t6, 6
+    vinsgr2vr.h     vr1, t7, 7
+    vsub.h          vr0, vr0, vr0
+    vilvl.h         vr2, vr0, vr1
+    vilvh.h         vr3, vr0, vr1
+
+    /* Step back one group and forward t8 pixels (2 bytes each in src). */
+    addi.d          a0, a0, -16
+    add.d           a0, a0, t8
+    add.d           a0, a0, t8
+    addi.d          a1, a1, -8
+    add.d           a1, a1, t8
+
+    vld             vr1, a0, 0
+    vshuf4i.d       vr0, vr1, 8
+    vexth.w.h       vr4, vr0
+    vexth.w.h       vr5, vr1
+
+    vadd.w          vr4, vr2, vr4
+    vadd.w          vr5, vr3, vr5
+    vsrai.w         vr4, vr4, 7
+    vsrai.w         vr5, vr5, 7
+    vclip255.w      vr4, vr4
+    vclip255.w      vr5, vr5
+    vpickev.h       vr1, vr5, vr4
+    vpickev.b       vr1, vr1, vr1
+    fst.d           f1, a1, 0
+    b               4f
+
+5:
+    /* Scalar path for dstW < 8; a0/a1/a4 still hold their entry values. */
+    bge             zero, a2, 4f        /* dstW <= 0: nothing to write */
+    ori             t2, zero, 255       /* upper clip bound */
+6:
+    andi            t0, a4, 7
+    ldx.bu          t0, a3, t0          /* dither[(i + offset) & 7] */
+    ld.h            t1, a0, 0           /* src[i], sign-extended */
+    add.w           t0, t0, t1
+    srai.w          t0, t0, 7
+    slt             t1, t0, zero        /* t1 = (val < 0) */
+    masknez         t0, t0, t1          /* negative -> 0 */
+    slt             t1, t2, t0          /* t1 = (val > 255) */
+    masknez         t0, t0, t1          /* above range -> 0 ... */
+    maskeqz         t1, t2, t1          /* ... and 255 here */
+    or              t0, t0, t1
+    st.b            t0, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 1
+    addi.w          a4, a4, 1
+    addi.w          a2, a2, -1
+    blt             zero, a2, 6b
+4:
+endfunc
+
+/*
+ * void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+ *                        const uint8_t *dither, int offset)
+ *
+ * For every i in [0, dstW):
+ *     dest[i] = clip_to_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
+ *
+ * In:      a0 = src, a1 = dest, a2 = dstW, a3 = dither, a4 = offset
+ * Scratch: t0-t8, xr0-xr5 (f0/f1 are stored as the low 64 bits of xr0/xr1)
+ *
+ * Rows are processed 16 pixels per iteration. A remainder is handled by
+ * redoing the last 16 pixels of the row, which is only valid when the row
+ * holds at least 16 pixels; shorter rows take the scalar loop at label 5 so
+ * that nothing before src/dest is ever touched.
+ * The return is left to endfunc, as in the rest of this file.
+ */
+function yuv2plane1_8_lasx
+    /* dstW < 16: no full vector fits, use the scalar loop. */
+    addi.w          t0, zero, 16
+    blt             a2, t0, 5f
+
+    /* t0..t7 = dither[(offset + k) & 7] for k = 0..7 */
+    addi.w          t1, a4, 1
+    addi.w          t2, a4, 2
+    addi.w          t3, a4, 3
+    addi.w          t4, a4, 4
+    addi.w          t5, a4, 5
+    addi.w          t6, a4, 6
+    addi.w          t7, a4, 7
+    andi            t0, a4, 7
+    andi            t1, t1, 7
+    andi            t2, t2, 7
+    andi            t3, t3, 7
+    andi            t4, t4, 7
+    andi            t5, t5, 7
+    andi            t6, t6, 7
+    andi            t7, t7, 7
+    ldx.bu          t0, a3, t0
+    ldx.bu          t1, a3, t1
+    ldx.bu          t2, a3, t2
+    ldx.bu          t3, a3, t3
+    ldx.bu          t4, a3, t4
+    ldx.bu          t5, a3, t5
+    ldx.bu          t6, a3, t6
+    ldx.bu          t7, a3, t7
+    vinsgr2vr.h     vr1, t0, 0
+    vinsgr2vr.h     vr1, t1, 1
+    vinsgr2vr.h     vr1, t2, 2
+    vinsgr2vr.h     vr1, t3, 3
+    vinsgr2vr.h     vr1, t4, 4
+    vinsgr2vr.h     vr1, t5, 5
+    vinsgr2vr.h     vr1, t6, 6
+    vinsgr2vr.h     vr1, t7, 7
+    /* Copy the 8 dither values into both 128-bit lanes (period is 8). */
+    xvpermi.q       xr1, xr1, 0
+    /* Interleave with zero: xr2/xr3 = dither 0..3 / 4..7 as 32-bit words. */
+    xvsub.h         xr0, xr0, xr0
+    xvilvl.h        xr2, xr0, xr1
+    xvilvh.h        xr3, xr0, xr1
+
+    andi            t8, a2, 15          /* t8 = leftover pixels (dstW % 16) */
+    srli.d          a2, a2, 4           /* a2 = full 16-pixel groups, >= 1 here */
+    beqz            a2, 2f
+1:
+    xvld            xr1, a0, 0          /* 16 x int16 source samples */
+    addi.d          a0, a0, 32
+    /* xr0 = {d0, d0, d2, d2}: low half of each lane moved to its high half. */
+    xvpermi.d       xr0, xr1, 0xa0
+    xvexth.w.h      xr4, xr0            /* samples 0..3 and 8..11 as int32 */
+    xvexth.w.h      xr5, xr1            /* samples 4..7 and 12..15 as int32 */
+
+    xvadd.w         xr4, xr2, xr4
+    xvadd.w         xr5, xr3, xr5
+    xvsrai.w        xr4, xr4, 7
+    xvsrai.w        xr5, xr5, 7
+    xvclip255.w     xr4, xr4
+    xvclip255.w     xr5, xr5
+    /* Narrow per lane: pixels 0..7 in xr0 low lane, 8..15 in its high lane. */
+    xvpickev.h      xr1, xr5, xr4
+    xvpickev.b      xr0, xr1, xr1
+    xvpermi.q       xr1, xr0, 1         /* high lane of xr0 -> low lane of xr1 */
+    fst.d           f0, a1, 0
+    fst.d           f1, a1, 8
+    addi.d          a1, a1, 16
+    addi.d          a2, a2, -1
+    bnez            a2, 1b
+2:
+    beqz            t8, 4f              /* no remainder: done */
+3:
+    /*
+     * Remainder: redo the final 16 pixels of the row. The window starts t8
+     * pixels after the last full group, so the dither phase advances by t8.
+     */
+    add.w           a4, a4, t8
+    addi.w          t1, a4, 1
+    addi.w          t2, a4, 2
+    addi.w          t3, a4, 3
+    addi.w          t4, a4, 4
+    addi.w          t5, a4, 5
+    addi.w          t6, a4, 6
+    addi.w          t7, a4, 7
+    andi            t0, a4, 7
+    andi            t1, t1, 7
+    andi            t2, t2, 7
+    andi            t3, t3, 7
+    andi            t4, t4, 7
+    andi            t5, t5, 7
+    andi            t6, t6, 7
+    andi            t7, t7, 7
+    ldx.bu          t0, a3, t0
+    ldx.bu          t1, a3, t1
+    ldx.bu          t2, a3, t2
+    ldx.bu          t3, a3, t3
+    ldx.bu          t4, a3, t4
+    ldx.bu          t5, a3, t5
+    ldx.bu          t6, a3, t6
+    ldx.bu          t7, a3, t7
+    vinsgr2vr.h     vr1, t0, 0
+    vinsgr2vr.h     vr1, t1, 1
+    vinsgr2vr.h     vr1, t2, 2
+    vinsgr2vr.h     vr1, t3, 3
+    vinsgr2vr.h     vr1, t4, 4
+    vinsgr2vr.h     vr1, t5, 5
+    vinsgr2vr.h     vr1, t6, 6
+    vinsgr2vr.h     vr1, t7, 7
+    xvpermi.q       xr1, xr1, 0
+    xvsub.h         xr0, xr0, xr0
+    xvilvl.h        xr2, xr0, xr1
+    xvilvh.h        xr3, xr0, xr1
+
+    /* Step back one group and forward t8 pixels (2 bytes each in src). */
+    addi.d          a0, a0, -32
+    add.d           a0, a0, t8
+    add.d           a0, a0, t8
+    addi.d          a1, a1, -16
+    add.d           a1, a1, t8
+
+    xvld            xr1, a0, 0
+    xvpermi.d       xr0, xr1, 0xa0
+    xvexth.w.h      xr4, xr0
+    xvexth.w.h      xr5, xr1
+
+    xvadd.w         xr4, xr2, xr4
+    xvadd.w         xr5, xr3, xr5
+    xvsrai.w        xr4, xr4, 7
+    xvsrai.w        xr5, xr5, 7
+    xvclip255.w     xr4, xr4
+    xvclip255.w     xr5, xr5
+    xvpickev.h      xr1, xr5, xr4
+    xvpickev.b      xr0, xr1, xr1
+    xvpermi.q       xr1, xr0, 1
+    fst.d           f0, a1, 0
+    fst.d           f1, a1, 8
+    b               4f
+
+5:
+    /* Scalar path for dstW < 16; a0/a1/a4 still hold their entry values. */
+    bge             zero, a2, 4f        /* dstW <= 0: nothing to write */
+    ori             t2, zero, 255       /* upper clip bound */
+6:
+    andi            t0, a4, 7
+    ldx.bu          t0, a3, t0          /* dither[(i + offset) & 7] */
+    ld.h            t1, a0, 0           /* src[i], sign-extended */
+    add.w           t0, t0, t1
+    srai.w          t0, t0, 7
+    slt             t1, t0, zero        /* t1 = (val < 0) */
+    masknez         t0, t0, t1          /* negative -> 0 */
+    slt             t1, t2, t0          /* t1 = (val > 255) */
+    masknez         t0, t0, t1          /* above range -> 0 ... */
+    maskeqz         t1, t2, t1          /* ... and 255 here */
+    or              t0, t0, t1
+    st.b            t0, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 1
+    addi.w          a4, a4, 1
+    addi.w          a2, a2, -1
+    blt             zero, a2, 6b
+4:
+endfunc