about summary refs log tree commit diff stats
path: root/libswscale/loongarch
diff options
context:
space:
mode:
author Shiyou Yin <yinshiyou-hf@loongson.cn> 2024-03-16 11:03:31 +0800
committer Michael Niedermayer <michael@niedermayer.cc> 2024-04-11 23:53:41 +0200
commit f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded (patch)
tree 8cafffea066099b0f68245f1db4db558fcbfe5cb /libswscale/loongarch
parent dd5f665b4010f8a0142ce3cba3305b173eb37dfe (diff)
download ffmpeg-f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded.tar.gz
swscale: [LA] Optimize range convert for yuvj420p.
Reviewed-by: 陈昊 <chenhao@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libswscale/loongarch')
-rw-r--r-- libswscale/loongarch/swscale.S 368
-rw-r--r-- libswscale/loongarch/swscale_init_loongarch.c 33
-rw-r--r-- libswscale/loongarch/swscale_loongarch.h 11
3 files changed, 412 insertions, 0 deletions
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
+
+function lumRangeFromJpeg_lsx // a0 = dst (int16_t *), a1 = width; in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ li.w t0, 14071 // multiplier
+ li.w t1, 33561947 // additive offset, applied before the shift
+ vreplgr2vr.h vr0, t0 // multiplier in every 16-bit lane
+ srli.w t2, a1, 3 // t2 = width / 8: number of full 8-sample vectors
+ andi t3, a1, 7 // t3 = width % 8: leftover samples
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 8 samples (16 bytes) per pass
+ vld vr1, a0, 0
+ vreplgr2vr.w vr2, t1 // both 32-bit accumulators start at the offset
+ vreplgr2vr.w vr3, t1
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed samples, widened to 32 bits
+ vsrai.w vr2, vr2, 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // interleave the low halves of the even/odd results back into sample order
+ vst vr1, a0, 0
+ addi.d a0, a0, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one sample per pass, same formula
+ ld.h t4, a0, 0
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeFromJpeg_lasx // a0 = dst (int16_t *), a1 = width; in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ li.w t0, 14071 // multiplier
+ li.w t1, 33561947 // additive offset, applied before the shift
+ xvreplgr2vr.h xr0, t0 // multiplier in every 16-bit lane
+ srli.w t2, a1, 4 // t2 = width / 16: number of full 16-sample vectors
+ andi t3, a1, 15 // t3 = width % 16: leftover samples
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 16 samples (32 bytes) per pass
+ xvld xr1, a0, 0
+ xvreplgr2vr.w xr2, t1 // both 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr3, t1
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed samples, widened to 32 bits
+ xvsrai.w xr2, xr2, 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // interleave the low halves of the even/odd results back into sample order
+ xvst xr1, a0, 0
+ addi.d a0, a0, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one sample per pass, same formula
+ ld.h t4, a0, 0
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lsx // a0 = dst (int16_t *), a1 = width; in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ li.w t0, 19077 // multiplier
+ li.w t1, -39057361 // additive offset, applied before the shift
+ li.w t2, 30189 // upper clamp on the input; 30189 itself maps to 32767
+ vreplgr2vr.h vr0, t0 // multiplier in every 16-bit lane
+ vreplgr2vr.h vr4, t2 // clamp in every 16-bit lane
+ srli.w t2, a1, 3 // t2 reused: width / 8 = number of full 8-sample vectors
+ andi t3, a1, 7 // t3 = width % 8: leftover samples
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 8 samples (16 bytes) per pass
+ vld vr1, a0, 0
+ vreplgr2vr.w vr2, t1 // both 32-bit accumulators start at the offset
+ vreplgr2vr.w vr3, t1
+ vmin.h vr1, vr1, vr4 // signed clamp of the input to 30189
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed samples, widened to 32 bits
+ vsrai.w vr2, vr2, 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // interleave the low halves of the even/odd results back into sample order
+ vst vr1, a0, 0
+ addi.d a0, a0, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one sample per pass, same formula
+ ld.h t4, a0, 0
+ vreplgr2vr.h vr1, t4 // round-trip through a vector register to reuse the vmin.h clamp
+ vmin.h vr1, vr1, vr4
+ vpickve2gr.h t4, vr1, 0 // clamped sample back in t4, sign-extended
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lasx // a0 = dst (int16_t *), a1 = width; in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ li.w t0, 19077 // multiplier
+ li.w t1, -39057361 // additive offset, applied before the shift
+ li.w t2, 30189 // upper clamp on the input; 30189 itself maps to 32767
+ xvreplgr2vr.h xr0, t0 // multiplier in every 16-bit lane
+ xvreplgr2vr.h xr4, t2 // clamp in every 16-bit lane
+ srli.w t2, a1, 4 // t2 reused: width / 16 = number of full 16-sample vectors
+ andi t3, a1, 15 // t3 = width % 16: leftover samples
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 16 samples (32 bytes) per pass
+ xvld xr1, a0, 0
+ xvreplgr2vr.w xr2, t1 // both 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr3, t1
+ xvmin.h xr1, xr1, xr4 // signed clamp of the input to 30189
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed samples, widened to 32 bits
+ xvsrai.w xr2, xr2, 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // interleave the low halves of the even/odd results back into sample order
+ xvst xr1, a0, 0
+ addi.d a0, a0, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one sample per pass, same formula
+ ld.h t4, a0, 0
+ vreplgr2vr.h vr1, t4 // round-trip through a vector register to reuse the vmin.h clamp
+ vmin.h vr1, vr1, vr4 // vr4 is the low 128 bits of xr4, so it still holds the clamp
+ vpickve2gr.h t4, vr1, 0 // clamped sample back in t4, sign-extended
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lsx // a0 = dstU, a1 = dstV (int16_t *), a2 = width; in place on both: x = (x * 1799 + 4081085) >> 11
+ li.w t0, 1799 // multiplier
+ li.w t1, 4081085 // additive offset, applied before the shift
+ vreplgr2vr.h vr0, t0 // multiplier in every 16-bit lane
+ srli.w t2, a2, 3 // t2 = width / 8: number of full 8-sample vectors per plane
+ andi t3, a2, 7 // t3 = width % 8: leftover samples per plane
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 8 samples (16 bytes) of each plane per pass
+ vld vr1, a0, 0 // U samples
+ vld vr2, a1, 0 // V samples
+ vreplgr2vr.w vr3, t1 // all four 32-bit accumulators start at the offset
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmaddwev.w.h vr3, vr0, vr1 // U, even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr4, vr0, vr1 // U, odd-indexed samples
+ vmaddwev.w.h vr5, vr0, vr2 // V, even-indexed samples
+ vmaddwod.w.h vr6, vr0, vr2 // V, odd-indexed samples
+ vsrai.w vr3, vr3, 11
+ vsrai.w vr4, vr4, 11
+ vsrai.w vr5, vr5, 11
+ vsrai.w vr6, vr6, 11
+ vpackev.h vr1, vr4, vr3 // interleave the low halves of the even/odd U results back into sample order
+ vpackev.h vr2, vr6, vr5 // same for V
+ vst vr1, a0, 0
+ vst vr2, a1, 0
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one U and one V sample per pass, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lasx // a0 = dstU, a1 = dstV (int16_t *), a2 = width; in place on both: x = (x * 1799 + 4081085) >> 11
+ li.w t0, 1799 // multiplier
+ li.w t1, 4081085 // additive offset, applied before the shift
+ xvreplgr2vr.h xr0, t0 // multiplier in every 16-bit lane
+ srli.w t2, a2, 4 // t2 = width / 16: number of full 16-sample vectors per plane
+ andi t3, a2, 15 // t3 = width % 16: leftover samples per plane
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 16 samples (32 bytes) of each plane per pass
+ xvld xr1, a0, 0 // U samples
+ xvld xr2, a1, 0 // V samples
+ xvreplgr2vr.w xr3, t1 // all four 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmaddwev.w.h xr3, xr0, xr1 // U, even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr4, xr0, xr1 // U, odd-indexed samples
+ xvmaddwev.w.h xr5, xr0, xr2 // V, even-indexed samples
+ xvmaddwod.w.h xr6, xr0, xr2 // V, odd-indexed samples
+ xvsrai.w xr3, xr3, 11
+ xvsrai.w xr4, xr4, 11
+ xvsrai.w xr5, xr5, 11
+ xvsrai.w xr6, xr6, 11
+ xvpackev.h xr1, xr4, xr3 // interleave the low halves of the even/odd U results back into sample order
+ xvpackev.h xr2, xr6, xr5 // same for V
+ xvst xr1, a0, 0
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one U and one V sample per pass, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lsx // a0 = dstU, a1 = dstV (int16_t *), a2 = width; in place on both: x = (min(x, 30775) * 4663 - 9289992) >> 12
+ li.w t0, 4663 // multiplier
+ li.w t1, -9289992 // additive offset, applied before the shift
+ li.w t2, 30775 // upper clamp on the input; 30775 itself maps to 32767
+ vreplgr2vr.h vr0, t0 // multiplier in every 16-bit lane
+ vreplgr2vr.h vr7, t2 // clamp in every 16-bit lane
+ srli.w t2, a2, 3 // t2 reused: width / 8 = number of full 8-sample vectors per plane
+ andi t3, a2, 7 // t3 = width % 8: leftover samples per plane
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 8 samples (16 bytes) of each plane per pass
+ vld vr1, a0, 0 // U samples
+ vld vr2, a1, 0 // V samples
+ vreplgr2vr.w vr3, t1 // all four 32-bit accumulators start at the offset
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmin.h vr1, vr1, vr7 // signed clamp of the inputs to 30775
+ vmin.h vr2, vr2, vr7
+ vmaddwev.w.h vr3, vr0, vr1 // U, even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr4, vr0, vr1 // U, odd-indexed samples
+ vmaddwev.w.h vr5, vr0, vr2 // V, even-indexed samples
+ vmaddwod.w.h vr6, vr0, vr2 // V, odd-indexed samples
+ vsrai.w vr3, vr3, 12
+ vsrai.w vr4, vr4, 12
+ vsrai.w vr5, vr5, 12
+ vsrai.w vr6, vr6, 12
+ vpackev.h vr1, vr4, vr3 // interleave the low halves of the even/odd U results back into sample order
+ vpackev.h vr2, vr6, vr5 // same for V
+ vst vr1, a0, 0
+ vst vr2, a1, 0
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one U and one V sample per pass, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4 // round-trip through vector registers to reuse the vmin.h clamp
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // clamped samples back in t4/t5, sign-extended
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lasx // a0 = dstU, a1 = dstV (int16_t *), a2 = width; in place on both: x = (min(x, 30775) * 4663 - 9289992) >> 12
+ li.w t0, 4663 // multiplier
+ li.w t1, -9289992 // additive offset, applied before the shift
+ li.w t2, 30775 // upper clamp on the input; 30775 itself maps to 32767
+ xvreplgr2vr.h xr0, t0 // multiplier in every 16-bit lane
+ xvreplgr2vr.h xr7, t2 // clamp in every 16-bit lane
+ srli.w t2, a2, 4 // t2 reused: width / 16 = number of full 16-sample vectors per plane
+ andi t3, a2, 15 // t3 = width % 16: leftover samples per plane
+ beqz t2, 2f // no full vector: go straight to the scalar tail
+1: // vector loop, 16 samples (32 bytes) of each plane per pass
+ xvld xr1, a0, 0 // U samples
+ xvld xr2, a1, 0 // V samples
+ xvreplgr2vr.w xr3, t1 // all four 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmin.h xr1, xr1, xr7 // signed clamp of the inputs to 30775
+ xvmin.h xr2, xr2, xr7
+ xvmaddwev.w.h xr3, xr0, xr1 // U, even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr4, xr0, xr1 // U, odd-indexed samples
+ xvmaddwev.w.h xr5, xr0, xr2 // V, even-indexed samples
+ xvmaddwod.w.h xr6, xr0, xr2 // V, odd-indexed samples
+ xvsrai.w xr3, xr3, 12
+ xvsrai.w xr4, xr4, 12
+ xvsrai.w xr5, xr5, 12
+ xvsrai.w xr6, xr6, 12
+ xvpackev.h xr1, xr4, xr3 // interleave the low halves of the even/odd U results back into sample order
+ xvpackev.h xr2, xr6, xr5 // same for V
+ xvst xr1, a0, 0
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3: // scalar tail, one U and one V sample per pass, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4 // round-trip through vector registers to reuse the vmin.h clamp
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7 // vr7 is the low 128 bits of xr7, so it still holds the clamp
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // clamped samples back in t4/t5, sign-extended
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0 // stores the low 16 bits of the result
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,38 @@
#include "libswscale/rgb2rgb.h"
#include "libavutil/loongarch/cpu.h"
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ if (c->dstBpc <= 14) {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg_lsx;
+ c->chrConvertRange = chrRangeFromJpeg_lsx;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg_lsx;
+ c->chrConvertRange = chrRangeToJpeg_lsx;
+ }
+ }
+ }
+ }
+ if (have_lasx(cpu_flags)) {
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ if (c->dstBpc <= 14) {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg_lasx;
+ c->chrConvertRange = chrRangeFromJpeg_lasx;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg_lasx;
+ c->chrConvertRange = chrRangeToJpeg_lasx;
+ }
+ }
+ }
+ }
+}
+
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
+ ff_sws_init_range_convert_loongarch(c);
}
av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
+void lumRangeFromJpeg_lsx(int16_t *dst, int width); // in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width); // in place on both planes: x = (x * 1799 + 4081085) >> 11
+void lumRangeToJpeg_lsx(int16_t *dst, int width); // in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width); // in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12
+
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
+void lumRangeFromJpeg_lasx(int16_t *dst, int width); // in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width); // in place on both planes: x = (x * 1799 + 4081085) >> 11
+void lumRangeToJpeg_lasx(int16_t *dst, int width); // in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width); // in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12
+
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */