diff options
author | Shiyou Yin <yinshiyou-hf@loongson.cn> | 2024-03-16 11:03:32 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2024-04-11 23:53:59 +0200 |
commit | 8b76df914285b1e10460c16134715531050e7a74 (patch) | |
tree | 66465c4e0cd6a2fac5819854f2a959b50ade54aa | |
parent | f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded (diff) | |
download | ffmpeg-8b76df914285b1e10460c16134715531050e7a74.tar.gz |
swscale: [LA] Optimize yuv2plane1_8_c.
Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- | libswscale/loongarch/output.S | 254 |
-rw-r--r-- | libswscale/loongarch/output_lasx.c | 23 |
-rw-r--r-- | libswscale/loongarch/output_lsx.c | 22 |
-rw-r--r-- | libswscale/loongarch/swscale_init_loongarch.c | 12 |
-rw-r--r-- | libswscale/loongarch/swscale_loongarch.h | 29 |
5 files changed, 324 insertions, 16 deletions
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S index b44bac502a..d71667e38a 100644 --- a/libswscale/loongarch/output.S +++ b/libswscale/loongarch/output.S @@ -23,11 +23,11 @@ #include "libavcodec/loongarch/loongson_asm.S" -/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize, +/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize, * const int16_t **src, uint8_t *dest, int dstW, * const uint8_t *dither, int offset) */ -function ff_yuv2planeX_8_lsx +function yuv2planeX_8_lsx addi.w t1, a6, 1 addi.w t2, a6, 2 addi.w t3, a6, 3 @@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx blt zero, a4, .DEST .END: endfunc + +/* + * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW, + * const uint8_t *dither, int offset) + */ +function yuv2plane1_8_lsx + addi.w t1, a4, 1 + addi.w t2, a4, 2 + addi.w t3, a4, 3 + addi.w t4, a4, 4 + addi.w t5, a4, 5 + addi.w t6, a4, 6 + addi.w t7, a4, 7 + andi t0, a4, 7 + andi t1, t1, 7 + andi t2, t2, 7 + andi t3, t3, 7 + andi t4, t4, 7 + andi t5, t5, 7 + andi t6, t6, 7 + andi t7, t7, 7 + ldx.bu t0, a3, t0 + ldx.bu t1, a3, t1 + ldx.bu t2, a3, t2 + ldx.bu t3, a3, t3 + ldx.bu t4, a3, t4 + ldx.bu t5, a3, t5 + ldx.bu t6, a3, t6 + ldx.bu t7, a3, t7 + vinsgr2vr.h vr1, t0, 0 + vinsgr2vr.h vr1, t1, 1 + vinsgr2vr.h vr1, t2, 2 + vinsgr2vr.h vr1, t3, 3 + vinsgr2vr.h vr1, t4, 4 + vinsgr2vr.h vr1, t5, 5 + vinsgr2vr.h vr1, t6, 6 + vinsgr2vr.h vr1, t7, 7 + vsub.h vr0, vr0, vr0 + vilvl.h vr2, vr0, vr1 + vilvh.h vr3, vr0, vr1 + + andi t8, a2, 7 + srli.d a2, a2, 3 + beqz a2, 2f +1: + vld vr1, a0, 0 + addi.d a0, a0, 16 + vshuf4i.d vr0, vr1, 8 + vexth.w.h vr4, vr0 + vexth.w.h vr5, vr1 + + vadd.w vr4, vr2, vr4 + vadd.w vr5, vr3, vr5 + vsrai.w vr4, vr4, 7 + vsrai.w vr5, vr5, 7 + vclip255.w vr4, vr4 + vclip255.w vr5, vr5 + vpickev.h vr1, vr5, vr4 + vpickev.b vr1, vr1, vr1 + fst.d f1, a1, 0 + addi.d a1, a1, 8 + addi.d a2, a2, -1 + bnez a2, 1b +2: + beqz t8, 4f +3: + add.w a4, a4, t8 + addi.w t1, 
a4, 1 + addi.w t2, a4, 2 + addi.w t3, a4, 3 + addi.w t4, a4, 4 + addi.w t5, a4, 5 + addi.w t6, a4, 6 + addi.w t7, a4, 7 + andi t0, a4, 7 + andi t1, t1, 7 + andi t2, t2, 7 + andi t3, t3, 7 + andi t4, t4, 7 + andi t5, t5, 7 + andi t6, t6, 7 + andi t7, t7, 7 + ldx.bu t0, a3, t0 + ldx.bu t1, a3, t1 + ldx.bu t2, a3, t2 + ldx.bu t3, a3, t3 + ldx.bu t4, a3, t4 + ldx.bu t5, a3, t5 + ldx.bu t6, a3, t6 + ldx.bu t7, a3, t7 + vinsgr2vr.h vr1, t0, 0 + vinsgr2vr.h vr1, t1, 1 + vinsgr2vr.h vr1, t2, 2 + vinsgr2vr.h vr1, t3, 3 + vinsgr2vr.h vr1, t4, 4 + vinsgr2vr.h vr1, t5, 5 + vinsgr2vr.h vr1, t6, 6 + vinsgr2vr.h vr1, t7, 7 + vsub.h vr0, vr0, vr0 + vilvl.h vr2, vr0, vr1 + vilvh.h vr3, vr0, vr1 + + addi.d a0, a0, -16 + add.d a0, a0, t8 + add.d a0, a0, t8 + addi.d a1, a1, -8 + add.d a1, a1, t8 + + vld vr1, a0, 0 + vshuf4i.d vr0, vr1, 8 + vexth.w.h vr4, vr0 + vexth.w.h vr5, vr1 + + vadd.w vr4, vr2, vr4 + vadd.w vr5, vr3, vr5 + vsrai.w vr4, vr4, 7 + vsrai.w vr5, vr5, 7 + vclip255.w vr4, vr4 + vclip255.w vr5, vr5 + vpickev.h vr1, vr5, vr4 + vpickev.b vr1, vr1, vr1 + fst.d f1, a1, 0 +4: +endfunc + +function yuv2plane1_8_lasx + addi.w t1, a4, 1 + addi.w t2, a4, 2 + addi.w t3, a4, 3 + addi.w t4, a4, 4 + addi.w t5, a4, 5 + addi.w t6, a4, 6 + addi.w t7, a4, 7 + andi t0, a4, 7 + andi t1, t1, 7 + andi t2, t2, 7 + andi t3, t3, 7 + andi t4, t4, 7 + andi t5, t5, 7 + andi t6, t6, 7 + andi t7, t7, 7 + ldx.bu t0, a3, t0 + ldx.bu t1, a3, t1 + ldx.bu t2, a3, t2 + ldx.bu t3, a3, t3 + ldx.bu t4, a3, t4 + ldx.bu t5, a3, t5 + ldx.bu t6, a3, t6 + ldx.bu t7, a3, t7 + vinsgr2vr.h vr1, t0, 0 + vinsgr2vr.h vr1, t1, 1 + vinsgr2vr.h vr1, t2, 2 + vinsgr2vr.h vr1, t3, 3 + vinsgr2vr.h vr1, t4, 4 + vinsgr2vr.h vr1, t5, 5 + vinsgr2vr.h vr1, t6, 6 + vinsgr2vr.h vr1, t7, 7 + xvpermi.q xr1, xr1, 0 + xvsub.h xr0, xr0, xr0 + xvilvl.h xr2, xr0, xr1 + xvilvh.h xr3, xr0, xr1 + + andi t8, a2, 15 + srli.d a2, a2, 4 + beqz a2, 2f +1: + xvld xr1, a0, 0 + addi.d a0, a0, 32 + xvpermi.d xr0, xr1, 0xa0 + xvexth.w.h xr4, xr0 + 
xvexth.w.h xr5, xr1 + + xvadd.w xr4, xr2, xr4 + xvadd.w xr5, xr3, xr5 + xvsrai.w xr4, xr4, 7 + xvsrai.w xr5, xr5, 7 + xvclip255.w xr4, xr4 + xvclip255.w xr5, xr5 + xvpickev.h xr1, xr5, xr4 + xvpickev.b xr0, xr1, xr1 + xvpermi.q xr1, xr0, 1 + fst.d f0, a1, 0 + fst.d f1, a1, 8 + addi.d a1, a1, 16 + addi.d a2, a2, -1 + bnez a2, 1b +2: + beqz t8, 4f +3: + add.w a4, a4, t8 + addi.w t1, a4, 1 + addi.w t2, a4, 2 + addi.w t3, a4, 3 + addi.w t4, a4, 4 + addi.w t5, a4, 5 + addi.w t6, a4, 6 + addi.w t7, a4, 7 + andi t0, a4, 7 + andi t1, t1, 7 + andi t2, t2, 7 + andi t3, t3, 7 + andi t4, t4, 7 + andi t5, t5, 7 + andi t6, t6, 7 + andi t7, t7, 7 + ldx.bu t0, a3, t0 + ldx.bu t1, a3, t1 + ldx.bu t2, a3, t2 + ldx.bu t3, a3, t3 + ldx.bu t4, a3, t4 + ldx.bu t5, a3, t5 + ldx.bu t6, a3, t6 + ldx.bu t7, a3, t7 + vinsgr2vr.h vr1, t0, 0 + vinsgr2vr.h vr1, t1, 1 + vinsgr2vr.h vr1, t2, 2 + vinsgr2vr.h vr1, t3, 3 + vinsgr2vr.h vr1, t4, 4 + vinsgr2vr.h vr1, t5, 5 + vinsgr2vr.h vr1, t6, 6 + vinsgr2vr.h vr1, t7, 7 + xvpermi.q xr1, xr1, 0 + xvsub.h xr0, xr0, xr0 + xvilvl.h xr2, xr0, xr1 + xvilvh.h xr3, xr0, xr1 + + addi.d a0, a0, -32 + add.d a0, a0, t8 + add.d a0, a0, t8 + addi.d a1, a1, -16 + add.d a1, a1, t8 + + xvld xr1, a0, 0 + xvpermi.d xr0, xr1, 0xa0 + xvexth.w.h xr4, xr0 + xvexth.w.h xr5, xr1 + + xvadd.w xr4, xr2, xr4 + xvadd.w xr5, xr3, xr5 + xvsrai.w xr4, xr4, 7 + xvsrai.w xr5, xr5, 7 + xvclip255.w xr4, xr4 + xvclip255.w xr5, xr5 + xvpickev.h xr1, xr5, xr4 + xvpickev.b xr0, xr1, xr1 + xvpermi.q xr1, xr0, 1 + fst.d f0, a1, 0 + fst.d f1, a1, 8 +4: +endfunc diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c index 277d7063e6..bc8ab8cf36 100644 --- a/libswscale/loongarch/output_lasx.c +++ b/libswscale/loongarch/output_lasx.c @@ -22,7 +22,7 @@ #include "swscale_loongarch.h" #include "libavutil/loongarch/loongson_intrinsics.h" -void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize, +void yuv2planeX_8_lasx(const int16_t *filter, int filterSize, const 
int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { @@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0) YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0) -av_cold void ff_sws_init_output_lasx(SwsContext *c) +av_cold void ff_sws_init_output_lasx(SwsContext *c, + yuv2planar1_fn *yuv2plane1, + yuv2planarX_fn *yuv2planeX, + yuv2interleavedX_fn *yuv2nv12cX, + yuv2packed1_fn *yuv2packed1, + yuv2packed2_fn *yuv2packed2, + yuv2packedX_fn *yuv2packedX, + yuv2anyX_fn *yuv2anyX) { + enum AVPixelFormat dstFormat = c->dstFormat; + + /* Add initialization once optimized */ + if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) { + } else if (is16BPS(dstFormat)) { + } else if (isNBPS(dstFormat)) { + } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) { + } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) { + } else { + *yuv2plane1 = yuv2plane1_8_lasx; + *yuv2planeX = yuv2planeX_8_lasx; + } if(c->flags & SWS_FULL_CHR_H_INT) { switch (c->dstFormat) { diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c index 768cc3abc6..de9b1534ee 100644 --- a/libswscale/loongarch/output_lsx.c +++ b/libswscale/loongarch/output_lsx.c @@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0) YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0) -av_cold void ff_sws_init_output_lsx(SwsContext *c) +av_cold void ff_sws_init_output_lsx(SwsContext *c, + yuv2planar1_fn *yuv2plane1, + yuv2planarX_fn *yuv2planeX, + yuv2interleavedX_fn *yuv2nv12cX, + yuv2packed1_fn *yuv2packed1, + yuv2packed2_fn *yuv2packed2, + yuv2packedX_fn *yuv2packedX, + yuv2anyX_fn *yuv2anyX) { + enum AVPixelFormat dstFormat = c->dstFormat; + + /* Add initialization once optimized */ + if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) { + } else if (is16BPS(dstFormat)) { + } else if (isNBPS(dstFormat)) { + } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) { + } else if (dstFormat == 
AV_PIX_FMT_GRAYF32LE) { + } else { + *yuv2plane1 = yuv2plane1_8_lsx; + *yuv2planeX = yuv2planeX_8_lsx; + } + if(c->flags & SWS_FULL_CHR_H_INT) { switch (c->dstFormat) { case AV_PIX_FMT_RGBA: diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c index 6d2786c55f..04d2553fa4 100644 --- a/libswscale/loongarch/swscale_init_loongarch.c +++ b/libswscale/loongarch/swscale_init_loongarch.c @@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); if (have_lsx(cpu_flags)) { - ff_sws_init_output_lsx(c); + ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX, + &c->yuv2nv12cX, &c->yuv2packed1, + &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX); if (c->srcBpc == 8) { if (c->dstBpc <= 14) { c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx; @@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c) } break; } - if (c->dstBpc == 8) - c->yuv2planeX = ff_yuv2planeX_8_lsx; } #if HAVE_LASX if (have_lasx(cpu_flags)) { - ff_sws_init_output_lasx(c); + ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX, + &c->yuv2nv12cX, &c->yuv2packed1, + &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX); if (c->srcBpc == 8) { if (c->dstBpc <= 14) { c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx; @@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c) } break; } - if (c->dstBpc == 8) - c->yuv2planeX = ff_yuv2planeX_8_lasx; } #endif // #if HAVE_LASX ff_sws_init_range_convert_loongarch(c); diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h index c96b085982..ea93881f8e 100644 --- a/libswscale/loongarch/swscale_loongarch.h +++ b/libswscale/loongarch/swscale_loongarch.h @@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq); -void ff_yuv2planeX_8_lsx(const 
int16_t *filter, int filterSize, +void yuv2planeX_8_lsx(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset); -av_cold void ff_sws_init_output_lsx(SwsContext *c); +void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + +av_cold void ff_sws_init_output_lsx(SwsContext *c, + yuv2planar1_fn *yuv2plane1, + yuv2planarX_fn *yuv2planeX, + yuv2interleavedX_fn *yuv2nv12cX, + yuv2packed1_fn *yuv2packed1, + yuv2packed2_fn *yuv2packed2, + yuv2packedX_fn *yuv2packedX, + yuv2anyX_fn *yuv2anyX); int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]); @@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride); -void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize, +void yuv2planeX_8_lasx(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset); -av_cold void ff_sws_init_output_lasx(SwsContext *c); - +void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + +av_cold void ff_sws_init_output_lasx(SwsContext *c, + yuv2planar1_fn *yuv2plane1, + yuv2planarX_fn *yuv2planeX, + yuv2interleavedX_fn *yuv2nv12cX, + yuv2packed1_fn *yuv2packed1, + yuv2packed2_fn *yuv2packed2, + yuv2packedX_fn *yuv2packedX, + yuv2anyX_fn *yuv2anyX); #endif // #if HAVE_LASX #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */ |