diff options
author | James Almer <jamrial@gmail.com> | 2024-06-05 13:41:32 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2024-06-09 13:43:11 -0300 |
commit | c578bb9864de0e9d63dbd2e334a624ad1b99eaf2 (patch) | |
tree | 264b192c1a8543871822e3a9f9b4431818872f02 | |
parent | e9cfd532579cb33cd814ce7fb9e7480cd7054750 (diff) | |
download | ffmpeg-c578bb9864de0e9d63dbd2e334a624ad1b99eaf2.tar.gz |
swscale/x86/input: add AVX2 optimized uyvytoyuv422
uyvytoyuv422_c: 23991.8
uyvytoyuv422_sse2: 2817.8
uyvytoyuv422_avx: 2819.3
uyvytoyuv422_avx2: 1972.3
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libswscale/x86/rgb2rgb.c | 6 | ||||
-rw-r--r-- | libswscale/x86/rgb_2_rgb.asm | 32 |
2 files changed, 30 insertions, 8 deletions
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 2bfab2cf16..1dc8f1549c 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2363,6 +2363,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); #endif #define DEINTERLEAVE_BYTES(cpuext) \ @@ -2435,5 +2438,8 @@ av_cold void rgb2rgb_init_x86(void) shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2; #endif } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + uyvytoyuv422 = ff_uyvytoyuv422_avx2; + } #endif } diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 76ca1eec03..0bf1278718 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION .text -%macro RSHIFT_COPY 3 +%macro RSHIFT_COPY 5 ; %1 dst ; %2 src ; %3 shift -%if cpuflag(avx) - psrldq %1, %2, %3 +%if mmsize == 32 + vperm2i128 %1, %2, %3, %5 + RSHIFT %1, %4 +%elif cpuflag(avx) + psrldq %1, %2, %4 %else mova %1, %2 - RSHIFT %1, %3 + RSHIFT %1, %4 %endif %endmacro @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s jge .end_line .loop_simd: +%if mmsize == 32 + movu xm2, [srcq + wtwoq ] + movu xm3, [srcq + wtwoq + 16 ] + movu xm4, [srcq + wtwoq + 16 * 2] + movu xm5, [srcq + wtwoq + 16 * 3] + vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1 + vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1 + vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1 + vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1 +%else movu m2, [srcq + wtwoq ] movu m3, [srcq + wtwoq + mmsize ] movu m4, [srcq + wtwoq + mmsize * 2] movu m5, [srcq + wtwoq + mmsize * 3] +%endif ; extract y part 1 - RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... movu [ydstq + wq], m6 ; extract y part 2 - RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... @@ -309,4 +323,6 @@ UYVY_TO_YUV422 INIT_XMM avx UYVY_TO_YUV422 +INIT_YMM avx2 +UYVY_TO_YUV422 %endif |