diff options
author | Shiyou Yin <yinshiyou-hf@loongson.cn> | 2024-03-16 11:03:31 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2024-04-11 23:53:41 +0200 |
commit | f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded (patch) | |
tree | 8cafffea066099b0f68245f1db4db558fcbfe5cb /libswscale/loongarch | |
parent | dd5f665b4010f8a0142ce3cba3305b173eb37dfe (diff) | |
download | ffmpeg-f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded.tar.gz |
swscale: [LA] Optimize range convert for yuvj420p.
Reviewed-by: 陈昊 <chenhao@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libswscale/loongarch')
-rw-r--r-- | libswscale/loongarch/swscale.S | 400 | ||||
-rw-r--r-- | libswscale/loongarch/swscale_init_loongarch.c | 40 | ||||
-rw-r--r-- | libswscale/loongarch/swscale_loongarch.h | 11 |
3 files changed, 451 insertions, 0 deletions
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,403 @@ function ff_hscale_16_to_19_sub_lsx
     ld.d            s8, sp, 64
     addi.d          sp, sp, 72
 endfunc
+
+/* void lumRangeFromJpeg_lsx(int16_t *dst, int width)
+ * dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ * 8 samples per vector iteration, scalar loop for the width & 7 tail.
+ */
+function lumRangeFromJpeg_lsx
+    li.w            t0, 14071
+    li.w            t1, 33561947
+    vreplgr2vr.h    vr0, t0
+    srli.w          t2, a1, 3
+    andi            t3, a1, 7
+    beqz            t2, 2f
+1:
+    vld             vr1, a0, 0
+    vreplgr2vr.w    vr2, t1
+    vreplgr2vr.w    vr3, t1
+    vmaddwev.w.h    vr2, vr0, vr1
+    vmaddwod.w.h    vr3, vr0, vr1
+    vsrai.w         vr2, vr2, 14
+    vsrai.w         vr3, vr3, 14
+    vpackev.h       vr1, vr3, vr2
+    vst             vr1, a0, 0
+    addi.d          a0, a0, 16
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    mul.w           t4, t4, t0
+    add.w           t4, t4, t1
+    srai.w          t4, t4, 14
+    st.h            t4, a0, 0
+    addi.d          a0, a0, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void lumRangeFromJpeg_lasx(int16_t *dst, int width)
+ * dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ * 16 samples per vector iteration, scalar loop for the width & 15 tail.
+ */
+function lumRangeFromJpeg_lasx
+    li.w            t0, 14071
+    li.w            t1, 33561947
+    xvreplgr2vr.h   xr0, t0
+    srli.w          t2, a1, 4
+    andi            t3, a1, 15
+    beqz            t2, 2f
+1:
+    xvld            xr1, a0, 0
+    xvreplgr2vr.w   xr2, t1
+    xvreplgr2vr.w   xr3, t1
+    xvmaddwev.w.h   xr2, xr0, xr1
+    xvmaddwod.w.h   xr3, xr0, xr1
+    xvsrai.w        xr2, xr2, 14
+    xvsrai.w        xr3, xr3, 14
+    xvpackev.h      xr1, xr3, xr2
+    xvst            xr1, a0, 0
+    addi.d          a0, a0, 32
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    mul.w           t4, t4, t0
+    add.w           t4, t4, t1
+    srai.w          t4, t4, 14
+    st.h            t4, a0, 0
+    addi.d          a0, a0, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void lumRangeToJpeg_lsx(int16_t *dst, int width)
+ * dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ * 8 samples per vector iteration, scalar loop for the width & 7 tail.
+ */
+function lumRangeToJpeg_lsx
+    li.w            t0, 19077
+    li.w            t1, -39057361
+    li.w            t2, 30189
+    vreplgr2vr.h    vr0, t0
+    vreplgr2vr.h    vr4, t2
+    srli.w          t2, a1, 3
+    andi            t3, a1, 7
+    beqz            t2, 2f
+1:
+    vld             vr1, a0, 0
+    vreplgr2vr.w    vr2, t1
+    vreplgr2vr.w    vr3, t1
+    vmin.h          vr1, vr1, vr4
+    vmaddwev.w.h    vr2, vr0, vr1
+    vmaddwod.w.h    vr3, vr0, vr1
+    vsrai.w         vr2, vr2, 14
+    vsrai.w         vr3, vr3, 14
+    vpackev.h       vr1, vr3, vr2
+    vst             vr1, a0, 0
+    addi.d          a0, a0, 16
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    vreplgr2vr.h    vr1, t4
+    vmin.h          vr1, vr1, vr4
+    vpickve2gr.h    t4, vr1, 0
+    mul.w           t4, t4, t0
+    add.w           t4, t4, t1
+    srai.w          t4, t4, 14
+    st.h            t4, a0, 0
+    addi.d          a0, a0, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void lumRangeToJpeg_lasx(int16_t *dst, int width)
+ * dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ * 16 samples per vector iteration, scalar loop for the width & 15 tail.
+ */
+function lumRangeToJpeg_lasx
+    li.w            t0, 19077
+    li.w            t1, -39057361
+    li.w            t2, 30189
+    xvreplgr2vr.h   xr0, t0
+    xvreplgr2vr.h   xr4, t2
+    srli.w          t2, a1, 4
+    andi            t3, a1, 15
+    beqz            t2, 2f
+1:
+    xvld            xr1, a0, 0
+    xvreplgr2vr.w   xr2, t1
+    xvreplgr2vr.w   xr3, t1
+    xvmin.h         xr1, xr1, xr4
+    xvmaddwev.w.h   xr2, xr0, xr1
+    xvmaddwod.w.h   xr3, xr0, xr1
+    xvsrai.w        xr2, xr2, 14
+    xvsrai.w        xr3, xr3, 14
+    xvpackev.h      xr1, xr3, xr2
+    xvst            xr1, a0, 0
+    addi.d          a0, a0, 32
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    vreplgr2vr.h    vr1, t4
+    vmin.h          vr1, vr1, vr4
+    vpickve2gr.h    t4, vr1, 0
+    mul.w           t4, t4, t0
+    add.w           t4, t4, t1
+    srai.w          t4, t4, 14
+    st.h            t4, a0, 0
+    addi.d          a0, a0, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
+ * x = (x * 1799 + 4081085) >> 11, applied to dstU[i] and dstV[i]
+ * 8 samples per vector iteration, scalar loop for the width & 7 tail.
+ */
+function chrRangeFromJpeg_lsx
+    li.w            t0, 1799
+    li.w            t1, 4081085
+    vreplgr2vr.h    vr0, t0
+    srli.w          t2, a2, 3
+    andi            t3, a2, 7
+    beqz            t2, 2f
+1:
+    vld             vr1, a0, 0
+    vld             vr2, a1, 0
+    vreplgr2vr.w    vr3, t1
+    vreplgr2vr.w    vr4, t1
+    vreplgr2vr.w    vr5, t1
+    vreplgr2vr.w    vr6, t1
+    vmaddwev.w.h    vr3, vr0, vr1
+    vmaddwod.w.h    vr4, vr0, vr1
+    vmaddwev.w.h    vr5, vr0, vr2
+    vmaddwod.w.h    vr6, vr0, vr2
+    vsrai.w         vr3, vr3, 11
+    vsrai.w         vr4, vr4, 11
+    vsrai.w         vr5, vr5, 11
+    vsrai.w         vr6, vr6, 11
+    vpackev.h       vr1, vr4, vr3
+    vpackev.h       vr2, vr6, vr5
+    vst             vr1, a0, 0
+    vst             vr2, a1, 0
+    addi.d          a0, a0, 16
+    addi.d          a1, a1, 16
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    ld.h            t5, a1, 0
+    mul.w           t4, t4, t0
+    mul.w           t5, t5, t0
+    add.w           t4, t4, t1
+    add.w           t5, t5, t1
+    srai.w          t4, t4, 11
+    srai.w          t5, t5, 11
+    st.h            t4, a0, 0
+    st.h            t5, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
+ * x = (x * 1799 + 4081085) >> 11, applied to dstU[i] and dstV[i]
+ * 16 samples per vector iteration, scalar loop for the width & 15 tail.
+ */
+function chrRangeFromJpeg_lasx
+    li.w            t0, 1799
+    li.w            t1, 4081085
+    xvreplgr2vr.h   xr0, t0
+    srli.w          t2, a2, 4
+    andi            t3, a2, 15
+    beqz            t2, 2f
+1:
+    xvld            xr1, a0, 0
+    xvld            xr2, a1, 0
+    xvreplgr2vr.w   xr3, t1
+    xvreplgr2vr.w   xr4, t1
+    xvreplgr2vr.w   xr5, t1
+    xvreplgr2vr.w   xr6, t1
+    xvmaddwev.w.h   xr3, xr0, xr1
+    xvmaddwod.w.h   xr4, xr0, xr1
+    xvmaddwev.w.h   xr5, xr0, xr2
+    xvmaddwod.w.h   xr6, xr0, xr2
+    xvsrai.w        xr3, xr3, 11
+    xvsrai.w        xr4, xr4, 11
+    xvsrai.w        xr5, xr5, 11
+    xvsrai.w        xr6, xr6, 11
+    xvpackev.h      xr1, xr4, xr3
+    xvpackev.h      xr2, xr6, xr5
+    xvst            xr1, a0, 0
+    xvst            xr2, a1, 0
+    addi.d          a0, a0, 32
+    addi.d          a1, a1, 32
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    ld.h            t5, a1, 0
+    mul.w           t4, t4, t0
+    mul.w           t5, t5, t0
+    add.w           t4, t4, t1
+    add.w           t5, t5, t1
+    srai.w          t4, t4, 11
+    srai.w          t5, t5, 11
+    st.h            t4, a0, 0
+    st.h            t5, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
+ * x = (min(x, 30775) * 4663 - 9289992) >> 12, applied to dstU[i] and dstV[i]
+ * 8 samples per vector iteration, scalar loop for the width & 7 tail.
+ */
+function chrRangeToJpeg_lsx
+    li.w            t0, 4663
+    li.w            t1, -9289992
+    li.w            t2, 30775
+    vreplgr2vr.h    vr0, t0
+    vreplgr2vr.h    vr7, t2
+    srli.w          t2, a2, 3
+    andi            t3, a2, 7
+    beqz            t2, 2f
+1:
+    vld             vr1, a0, 0
+    vld             vr2, a1, 0
+    vreplgr2vr.w    vr3, t1
+    vreplgr2vr.w    vr4, t1
+    vreplgr2vr.w    vr5, t1
+    vreplgr2vr.w    vr6, t1
+    vmin.h          vr1, vr1, vr7
+    vmin.h          vr2, vr2, vr7
+    vmaddwev.w.h    vr3, vr0, vr1
+    vmaddwod.w.h    vr4, vr0, vr1
+    vmaddwev.w.h    vr5, vr0, vr2
+    vmaddwod.w.h    vr6, vr0, vr2
+    vsrai.w         vr3, vr3, 12
+    vsrai.w         vr4, vr4, 12
+    vsrai.w         vr5, vr5, 12
+    vsrai.w         vr6, vr6, 12
+    vpackev.h       vr1, vr4, vr3
+    vpackev.h       vr2, vr6, vr5
+    vst             vr1, a0, 0
+    vst             vr2, a1, 0
+    addi.d          a0, a0, 16
+    addi.d          a1, a1, 16
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    ld.h            t5, a1, 0
+    vreplgr2vr.h    vr1, t4
+    vreplgr2vr.h    vr2, t5
+    vmin.h          vr1, vr1, vr7
+    vmin.h          vr2, vr2, vr7
+    vpickve2gr.h    t4, vr1, 0
+    vpickve2gr.h    t5, vr2, 0
+    mul.w           t4, t4, t0
+    mul.w           t5, t5, t0
+    add.w           t4, t4, t1
+    add.w           t5, t5, t1
+    srai.w          t4, t4, 12
+    srai.w          t5, t5, 12
+    st.h            t4, a0, 0
+    st.h            t5, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
+
+/* void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
+ * x = (min(x, 30775) * 4663 - 9289992) >> 12, applied to dstU[i] and dstV[i]
+ * 16 samples per vector iteration, scalar loop for the width & 15 tail.
+ */
+function chrRangeToJpeg_lasx
+    li.w            t0, 4663
+    li.w            t1, -9289992
+    li.w            t2, 30775
+    xvreplgr2vr.h   xr0, t0
+    xvreplgr2vr.h   xr7, t2
+    srli.w          t2, a2, 4
+    andi            t3, a2, 15
+    beqz            t2, 2f
+1:
+    xvld            xr1, a0, 0
+    xvld            xr2, a1, 0
+    xvreplgr2vr.w   xr3, t1
+    xvreplgr2vr.w   xr4, t1
+    xvreplgr2vr.w   xr5, t1
+    xvreplgr2vr.w   xr6, t1
+    xvmin.h         xr1, xr1, xr7
+    xvmin.h         xr2, xr2, xr7
+    xvmaddwev.w.h   xr3, xr0, xr1
+    xvmaddwod.w.h   xr4, xr0, xr1
+    xvmaddwev.w.h   xr5, xr0, xr2
+    xvmaddwod.w.h   xr6, xr0, xr2
+    xvsrai.w        xr3, xr3, 12
+    xvsrai.w        xr4, xr4, 12
+    xvsrai.w        xr5, xr5, 12
+    xvsrai.w        xr6, xr6, 12
+    xvpackev.h      xr1, xr4, xr3
+    xvpackev.h      xr2, xr6, xr5
+    xvst            xr1, a0, 0
+    xvst            xr2, a1, 0
+    addi.d          a0, a0, 32
+    addi.d          a1, a1, 32
+    addi.d          t2, t2, -1
+    bnez            t2, 1b
+2:
+    beqz            t3, 4f
+3:
+    ld.h            t4, a0, 0
+    ld.h            t5, a1, 0
+    vreplgr2vr.h    vr1, t4
+    vreplgr2vr.h    vr2, t5
+    vmin.h          vr1, vr1, vr7
+    vmin.h          vr2, vr2, vr7
+    vpickve2gr.h    t4, vr1, 0
+    vpickve2gr.h    t5, vr2, 0
+    mul.w           t4, t4, t0
+    mul.w           t5, t5, t0
+    add.w           t4, t4, t1
+    add.w           t5, t5, t1
+    srai.w          t4, t4, 12
+    srai.w          t5, t5, 12
+    st.h            t4, a0, 0
+    st.h            t5, a1, 0
+    addi.d          a0, a0, 2
+    addi.d          a1, a1, 2
+    addi.d          t3, t3, -1
+    bnez            t3, 3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,45 @@
 #include "libswscale/rgb2rgb.h"
 #include "libavutil/loongarch/cpu.h"
 
+/**
+ * Select the LoongArch SIMD routines for From/To-JPEG range conversion.
+ *
+ * The function pointers are only replaced when the source and destination
+ * ranges differ, the destination is not an RGB format and dstBpc <= 14.
+ * LASX is checked after LSX so that it takes precedence when both exist.
+ */
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) ||
+        c->dstBpc > 14)
+        return;
+
+    if (have_lsx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lsx;
+            c->chrConvertRange = chrRangeFromJpeg_lsx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lsx;
+            c->chrConvertRange = chrRangeToJpeg_lsx;
+        }
+    }
+#if HAVE_LASX
+    /* Guarded like the LASX setup in ff_sws_init_swscale_loongarch(), so
+     * builds without LASX never reference the *_lasx symbols. */
+    if (have_lasx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lasx;
+            c->chrConvertRange = chrRangeFromJpeg_lasx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lasx;
+            c->chrConvertRange = chrRangeToJpeg_lasx;
+        }
+    }
+#endif // #if HAVE_LASX
+}
+
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -77,6 +116,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
 #endif // #if HAVE_LASX
+    ff_sws_init_range_convert_loongarch(c);
 }
 
 av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
                                 const uint8_t *_src, const int16_t *filter,
                                 const int32_t *filterPos, int filterSize, int sh);
 
+void lumRangeFromJpeg_lsx(int16_t *dst, int width);
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lsx(int16_t *dst, int width);
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
                           int width, int32_t *rgb2yuv, void *opq);
 
@@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                              const uint8_t *_src, const int16_t *filter,
                              const int32_t *filterPos, int filterSize);
 
+void lumRangeFromJpeg_lasx(int16_t *dst, int width);
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lasx(int16_t *dst, int width);
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
                            int width, int32_t *rgb2yuv, void *opq);
 
@@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const uint8_t *dither, int offset);
 
 av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
 #endif // #if HAVE_LASX
 
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */