diff options
author | Alan Kelly <alankelly@google.com> | 2021-04-01 12:00:15 +0200 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2021-04-03 19:43:39 +0200 |
commit | 4aeedf4c2a8f35be667d5dd40c84bd27730ef1db (patch) | |
tree | 3b2643d5d7d771199b2bba29ca616ca5ba5210bb | |
parent | 95aacf30e3803c57d91ff62975b375e394f61d49 (diff) | |
download | ffmpeg-4aeedf4c2a8f35be667d5dd40c84bd27730ef1db.tar.gz |
libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
(cherry picked from commit 3ce8d092448827842c451807f03010ad5129fd8f)
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- | libswscale/x86/yuv2yuvX.asm | 14 |
1 files changed, 13 insertions, 1 deletions
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddw m3, m3, m2 paddw m4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddw m6, m6, m2 paddw m1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psraw m3, m3, 3 psraw m4, m4, 3 +%if cpuflag(sse3) psraw m6, m6, 3 psraw m1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 - add offsetq, mmsize * 2 +%endif + add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop |