diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2014-06-28 05:33:52 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-06-28 05:39:07 +0200 |
commit | 5bca5f87d1a32669e0357790e0d0ad8a5c9c998b (patch) | |
tree | 9ac36e13704260d32a0646ec86dd06ad28d96dd6 /libavcodec/x86/videodsp.asm | |
parent | 4d1fa38d984cba322284cfce416c4c969da58f51 (diff) | |
download | ffmpeg-5bca5f87d1a32669e0357790e0d0ad8a5c9c998b.tar.gz |
Revert "x86/videodsp: add emulated_edge_mc_mmxext"
The commit causes minor out of array reads and was mainly intended for
future optimizations which turned out not to be meassurably faster.
Itself it was just 1 cpu cycle faster
Approved-by: jamrial
This reverts commit 057d2704e78b52fec357b4fc646c9de37a085413.
Diffstat (limited to 'libavcodec/x86/videodsp.asm')
-rw-r--r-- | libavcodec/x86/videodsp.asm | 59 |
1 files changed, 24 insertions, 35 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index ad15af9a27..1ac02574d6 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -92,13 +92,21 @@ INIT_XMM sse vvar_fn %macro hvar_fn 0 -cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w lea dstq, [dstq+n_wordsq*2] neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { + ; FIXME also write a ssse3 version using pshufb + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd mov wq, n_wordsq ; initialize w - SPLATB_LOAD m0, dstq+start_xq, m1 ; read(1); splat +%if cpuflag(sse2) + pshufd m0, m0, q0000 ; splat +%else ; mmx + punpckldq m0, m0 ; splat +%endif ; mmx/sse .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -114,8 +122,6 @@ cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w %if ARCH_X86_32 INIT_MMX mmx hvar_fn -INIT_MMX mmxext -hvar_fn %endif INIT_XMM sse2 @@ -338,12 +344,16 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. %macro READ_V_PIXEL 2 -%if notcpuflag(mmxext) && %1 < 8 - movzx vald, byte [%2] + movzx vald, byte %2 imul vald, 0x01010101 +%if %1 >= 8 + movd m0, vald +%if mmsize == 16 + pshufd m0, m0, q0000 %else - SPLATB_LOAD m0, %2, m1 -%endif ; %1 < 8 + punpckldq m0, m0 +%endif ; mmsize == 16 +%endif ; %1 > 16 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 @@ -378,42 +388,26 @@ VERTICAL_EXTEND 16, 22 %endif %endif ; %1-%%off >= 4 -%if %1-%%off == 2 - movd [%2+%%off-2], m0 -%endif ; (%1-%%off)/2 - %else ; %1 < 8 -%if cpuflag(mmxext) - movd [%2+%%off], m0 -%if %1 == 6 - movd [%2+%%off+2], m0 -%endif ; (%1-%%off)/2 - -%else ; notcpuflag(mmxext) - %rep %1/4 mov [%2+%%off], vald %assign %%off %%off+4 %endrep ; %1/4 +%endif ; %1 >=/< 8 + %if %1-%%off == 2 mov [%2+%%off], valw %endif ; (%1-%%off)/2 -%endif ; cpuflag -%endif ; %1 >=/< 8 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 -%if %%n < 8 && notcpuflag(mmxext) -cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val -%else -cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh -%endif +cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val .loop_y: ; do { - READ_V_PIXEL %%n, dstq+start_xq ; $variable_regs = read($n) + READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) add dstq, dst_strideq ; dst += dst_stride dec bhq ; } while (--bh) @@ -424,16 +418,11 @@ cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh %endmacro ; H_EXTEND INIT_MMX mmx -H_EXTEND 2, 2 -%if ARCH_X86_32 -H_EXTEND 4, 22 -%endif - -INIT_MMX mmxext -H_EXTEND 4, 14 +H_EXTEND 2, 14 %if ARCH_X86_32 H_EXTEND 16, 22 %endif + INIT_XMM sse2 H_EXTEND 16, 22 |