author     Andreas Rheinhardt <andreas.rheinhardt@outlook.com>   2022-06-09 16:57:34 +0200
committer  Andreas Rheinhardt <andreas.rheinhardt@outlook.com>   2022-06-22 13:36:04 +0200
commit     a05f22eaf393177b94432431c145cbc5ba10390a (patch)
tree       31a2ae01b520b2578477ea4b6b2febf1d0e6c1e6 /libswscale/x86/scale.asm
parent     2831837182fe26f0a19a4d366f3f0553311f1291 (diff)
download   ffmpeg-a05f22eaf393177b94432431c145cbc5ba10390a.tar.gz
swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2, which means
that some functions for MMX, MMXEXT, SSE and 3dnow are always
overridden by other functions (unless one e.g. explicitly
disables SSE2). So, given that the only systems that
benefit from these functions are truly ancient 32-bit x86s,
they are removed.
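
To make the override concrete, here is a minimal sketch of the CPU-flag
dispatch pattern the paragraph above refers to; it is not the actual
swscale init code, and the hscale_* names are placeholders. Since
av_get_cpu_flags() always reports AV_CPU_FLAG_SSE2 on x86-64, the later
assignment wins and the MMX pointer can never end up being used there:

    #include <stdint.h>
    #include "libavutil/cpu.h"   /* av_get_cpu_flags(), AV_CPU_FLAG_* */

    typedef void (*hscale_fn)(int16_t *dst, const uint8_t *src, int dst_w);

    /* Stand-ins for the C, MMX and SSE2 implementations. */
    static void hscale_c   (int16_t *dst, const uint8_t *src, int dst_w) { /* ... */ }
    static void hscale_mmx (int16_t *dst, const uint8_t *src, int dst_w) { /* ... */ }
    static void hscale_sse2(int16_t *dst, const uint8_t *src, int dst_w) { /* ... */ }

    static hscale_fn select_hscale(void)
    {
        int cpu_flags = av_get_cpu_flags();
        hscale_fn fn  = hscale_c;     /* portable fallback */

        if (cpu_flags & AV_CPU_FLAG_MMX)
            fn = hscale_mmx;          /* overwritten below whenever SSE2 is present */
        if (cpu_flags & AV_CPU_FLAG_SSE2)
            fn = hscale_sse2;         /* x86-64 always ends up here */
        return fn;
    }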
Moreover, some of the removed code was buggy/not bitexact
and led to failures involving the f32le and f32be versions of
gray, gbrp and gbrap on x86-32 when SSE2 was not disabled.
See e.g.
https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx
Notice that yuv2yuvX_mmx is not removed, because it is used
by the SSE3 and AVX2 versions as a fallback for unaligned data
and for tail processing (sketched below). I don't know why yuv2yuvX_mmxext
isn't being used for this; an earlier version [1] of
554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used it, but
the version that was eventually applied does not.
[1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html
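
The fallback arrangement mentioned above follows a common pattern: a wide
kernel covers the bulk of each row and a narrower kernel is reused for the
unaligned head/tail it cannot handle. A rough sketch with simplified,
placeholder signatures (the function names and the 32-pixel granularity are
illustrative, not the real yuv2yuvX interface):

    #include <stdint.h>

    /* Stand-ins for the wide (SSE3/AVX2-style) and narrow (MMX-style) kernels. */
    static void yuv2yuvX_wide  (const int16_t *src, uint8_t *dest, int n) { /* ... */ }
    static void yuv2yuvX_narrow(const int16_t *src, uint8_t *dest, int n) { /* ... */ }

    static void yuv2yuvX(const int16_t *src, uint8_t *dest, int dstW)
    {
        int bulk = dstW & ~31;              /* span the wide kernel can cover whole */
        if (bulk)
            yuv2yuvX_wide(src, dest, bulk);
        if (dstW > bulk)                    /* leftover tail: reuse the narrow kernel */
            yuv2yuvX_narrow(src + bulk, dest + bulk, dstW - bulk);
    }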
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Diffstat (limited to 'libswscale/x86/scale.asm')
-rw-r--r--   libswscale/x86/scale.asm   45
1 file changed, 8 insertions, 37 deletions
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 83cabff722..c62ae3dcc2 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
 %define mov32 mov
 %endif ; x86-64
 %if %2 == 19
-%if mmsize == 8 ; mmx
-    mova          m2, [max_19bit_int]
-%elif cpuflag(sse4)
+%if cpuflag(sse4)
     mova          m2, [max_19bit_int]
 %else ; ssse3/sse2
     mova          m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %endif ; %2 == 19
 %if %1 == 16
     mova          m6, [minshort]
@@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     pmaddwd       m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

     ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
-    movq          m4, m0
-    punpckldq     m0, m1
-    punpckhdq     m4, m1
-    paddd         m0, m4
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
     mova          m4, m0
     shufps        m0, m1, 10001000b
     shufps        m4, m1, 11011101b
@@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
     ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
     ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %else ; %3 == 8, i.e. filterSize == 8 scaling
     ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
     mov32       pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
@@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     pmaddwd       m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

     ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
-    paddd         m0, m1
-    paddd         m4, m5
-    movq          m1, m0
-    punpckldq     m0, m4
-    punpckhdq     m1, m4
-    paddd         m0, m1
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
 %if %1 == 8
 %define mex m6
 %else
@@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
     ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
     ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %endif ; %3 == 4/8

 %else ; %3 == X, i.e. any filterSize scaling
@@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     mov         srcq, srcmemmp

 .innerloop:
-    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
     movbh         m0, [srcq+ pos0q *srcmul]     ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
     movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
 %if %1 == 8
@@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi

     lea      filterq, [filterq+(fltsizeq+dlt)*2]

-%if mmsize == 8 ; mmx
-    movq          m0, m4
-    punpckldq     m4, m5
-    punpckhdq     m0, m5
-    paddd         m0, m4
-%else ; mmsize == 16
 %if notcpuflag(ssse3) ; sse2
     mova          m1, m4
     punpcklqdq    m4, m5
@@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
     phaddd        m4, m4
     SWAP           0, 4
 %endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
 %endif ; %3 ==/!= X

 %if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
@@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
 %endif ; %3 ==/!= X
 %endif ; %2 == 15/19
 %ifnidn %3, X
-    add           wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+    add           wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
                                        ; per iteration. see "shl wq,1" above as for why we do this
 %else ; %3 == X
     add           wq, 2
@@ -385,12 +364,8 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
 %macro SCALE_FUNCS 3
 SCALE_FUNC %1, %2, 4, 4, 6, %3
 SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
 SCALE_FUNC %1, %2, X, X4, 7, %3
 SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
 %endmacro

 ; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
@@ -411,10 +386,6 @@
 SCALE_FUNCS 14, 19, %2
 SCALE_FUNCS 16, 19, %3
 %endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
 INIT_XMM sse2
 SCALE_FUNCS2 7, 6, 8
 INIT_XMM ssse3
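
For readers who do not want to decode the asm: the SSE2 horizontal-add step
that survives this cleanup (shufps + paddd) can be rendered roughly with C
intrinsics as below; the deleted MMX block performed the same reduction with
punpckldq/punpckhdq on 64-bit mm registers. The helper name is illustrative
only, not a symbol from the tree:

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* After pmaddwd, sum01 and sum23 each hold two partial dword sums per
     * output pixel.  shufps with 0x88/0xDD gathers the even and odd dwords
     * of both registers, and paddd folds them into one sum per pixel. */
    static __m128i hadd_pairs_sse2(__m128i sum01, __m128i sum23)
    {
        __m128 even = _mm_shuffle_ps(_mm_castsi128_ps(sum01),
                                     _mm_castsi128_ps(sum23), 0x88);
        __m128 odd  = _mm_shuffle_ps(_mm_castsi128_ps(sum01),
                                     _mm_castsi128_ps(sum23), 0xDD);
        return _mm_add_epi32(_mm_castps_si128(even), _mm_castps_si128(odd));
    }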