diff options
author | James Almer <jamrial@gmail.com> | 2014-06-23 15:51:53 -0300 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-06-26 17:58:57 +0200 |
commit | 057d2704e78b52fec357b4fc646c9de37a085413 (patch) | |
tree | 060823a1e5b0c8cbad9a12ed34250ce35da4e6b0 /libavcodec/x86/videodsp.asm | |
parent | e18b48c6fdfdc91fe56b1d1a1312b6d486d9fe43 (diff) | |
download | ffmpeg-057d2704e78b52fec357b4fc646c9de37a085413.tar.gz |
x86/videodsp: add emulated_edge_mc_mmxext
This also changes hfix8_mmx and above to use mmx regs instead of
gprs, and makes emulated_edge_mc_sse and emulated_edge_mc_sse2 use
mmxext hfix and hvar functions instead of mmx where possible.
This is mostly in preparation for an ssse3 version.
Signed-off-by: James Almer <jamrial@gmail.com>
code is about 1 cpu cycle faster approximately
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/videodsp.asm')
-rw-r--r-- | libavcodec/x86/videodsp.asm | 59 |
1 files changed, 35 insertions, 24 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index 1ac02574d6..ad15af9a27 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -92,21 +92,13 @@ INIT_XMM sse vvar_fn %macro hvar_fn 0 -cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w +cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w lea dstq, [dstq+n_wordsq*2] neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { - ; FIXME also write a ssse3 version using pshufb - movzx wd, byte [dstq+start_xq] ; w = read(1) - imul wd, 0x01010101 ; w *= 0x01010101 - movd m0, wd mov wq, n_wordsq ; initialize w -%if cpuflag(sse2) - pshufd m0, m0, q0000 ; splat -%else ; mmx - punpckldq m0, m0 ; splat -%endif ; mmx/sse + SPLATB_LOAD m0, dstq+start_xq, m1 ; read(1); splat .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -122,6 +114,8 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w %if ARCH_X86_32 INIT_MMX mmx hvar_fn +INIT_MMX mmxext +hvar_fn %endif INIT_XMM sse2 @@ -344,16 +338,12 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. %macro READ_V_PIXEL 2 - movzx vald, byte %2 +%if notcpuflag(mmxext) && %1 < 8 + movzx vald, byte [%2] imul vald, 0x01010101 -%if %1 >= 8 - movd m0, vald -%if mmsize == 16 - pshufd m0, m0, q0000 %else - punpckldq m0, m0 -%endif ; mmsize == 16 -%endif ; %1 > 16 + SPLATB_LOAD m0, %2, m1 +%endif ; %1 < 8 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 @@ -388,26 +378,42 @@ VERTICAL_EXTEND 16, 22 %endif %endif ; %1-%%off >= 4 +%if %1-%%off == 2 + movd [%2+%%off-2], m0 +%endif ; (%1-%%off)/2 + %else ; %1 < 8 +%if cpuflag(mmxext) + movd [%2+%%off], m0 +%if %1 == 6 + movd [%2+%%off+2], m0 +%endif ; (%1-%%off)/2 + +%else ; notcpuflag(mmxext) + %rep %1/4 mov [%2+%%off], vald %assign %%off %%off+4 %endrep ; %1/4 -%endif ; %1 >=/< 8 - %if %1-%%off == 2 mov [%2+%%off], valw %endif ; (%1-%%off)/2 +%endif ; cpuflag +%endif ; %1 >=/< 8 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 -cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val +%if %%n < 8 && notcpuflag(mmxext) +cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val +%else +cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh +%endif .loop_y: ; do { - READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) + READ_V_PIXEL %%n, dstq+start_xq ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) add dstq, dst_strideq ; dst += dst_stride dec bhq ; } while (--bh) @@ -418,11 +424,16 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val %endmacro ; H_EXTEND INIT_MMX mmx -H_EXTEND 2, 14 +H_EXTEND 2, 2 %if ARCH_X86_32 -H_EXTEND 16, 22 +H_EXTEND 4, 22 %endif +INIT_MMX mmxext +H_EXTEND 4, 14 +%if ARCH_X86_32 +H_EXTEND 16, 22 +%endif INIT_XMM sse2 H_EXTEND 16, 22 |