diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2014-06-28 05:33:52 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-06-28 05:39:07 +0200 |
commit | 5bca5f87d1a32669e0357790e0d0ad8a5c9c998b (patch) | |
tree | 9ac36e13704260d32a0646ec86dd06ad28d96dd6 | |
parent | 4d1fa38d984cba322284cfce416c4c969da58f51 (diff) | |
download | ffmpeg-5bca5f87d1a32669e0357790e0d0ad8a5c9c998b.tar.gz |
Revert "x86/videodsp: add emulated_edge_mc_mmxext"
The commit causes minor out-of-array reads and was mainly intended for
future optimizations which turned out not to be measurably faster.
By itself it was just 1 CPU cycle faster.
Approved-by: jamrial
This reverts commit 057d2704e78b52fec357b4fc646c9de37a085413.
-rw-r--r-- | libavcodec/x86/videodsp.asm | 59 | ||||
-rw-r--r-- | libavcodec/x86/videodsp_init.c | 44 |
2 files changed, 29 insertions, 74 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index ad15af9a27..1ac02574d6 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -92,13 +92,21 @@ INIT_XMM sse vvar_fn %macro hvar_fn 0 -cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w lea dstq, [dstq+n_wordsq*2] neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { + ; FIXME also write a ssse3 version using pshufb + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd mov wq, n_wordsq ; initialize w - SPLATB_LOAD m0, dstq+start_xq, m1 ; read(1); splat +%if cpuflag(sse2) + pshufd m0, m0, q0000 ; splat +%else ; mmx + punpckldq m0, m0 ; splat +%endif ; mmx/sse .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -114,8 +122,6 @@ cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w %if ARCH_X86_32 INIT_MMX mmx hvar_fn -INIT_MMX mmxext -hvar_fn %endif INIT_XMM sse2 @@ -338,12 +344,16 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. 
%macro READ_V_PIXEL 2 -%if notcpuflag(mmxext) && %1 < 8 - movzx vald, byte [%2] + movzx vald, byte %2 imul vald, 0x01010101 +%if %1 >= 8 + movd m0, vald +%if mmsize == 16 + pshufd m0, m0, q0000 %else - SPLATB_LOAD m0, %2, m1 -%endif ; %1 < 8 + punpckldq m0, m0 +%endif ; mmsize == 16 +%endif ; %1 > 16 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 @@ -378,42 +388,26 @@ VERTICAL_EXTEND 16, 22 %endif %endif ; %1-%%off >= 4 -%if %1-%%off == 2 - movd [%2+%%off-2], m0 -%endif ; (%1-%%off)/2 - %else ; %1 < 8 -%if cpuflag(mmxext) - movd [%2+%%off], m0 -%if %1 == 6 - movd [%2+%%off+2], m0 -%endif ; (%1-%%off)/2 - -%else ; notcpuflag(mmxext) - %rep %1/4 mov [%2+%%off], vald %assign %%off %%off+4 %endrep ; %1/4 +%endif ; %1 >=/< 8 + %if %1-%%off == 2 mov [%2+%%off], valw %endif ; (%1-%%off)/2 -%endif ; cpuflag -%endif ; %1 >=/< 8 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 -%if %%n < 8 && notcpuflag(mmxext) -cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val -%else -cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh -%endif +cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val .loop_y: ; do { - READ_V_PIXEL %%n, dstq+start_xq ; $variable_regs = read($n) + READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) add dstq, dst_strideq ; dst += dst_stride dec bhq ; } while (--bh) @@ -424,16 +418,11 @@ cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh %endmacro ; H_EXTEND INIT_MMX mmx -H_EXTEND 2, 2 -%if ARCH_X86_32 -H_EXTEND 4, 22 -%endif - -INIT_MMX mmxext -H_EXTEND 4, 14 +H_EXTEND 2, 14 %if ARCH_X86_32 H_EXTEND 16, 22 %endif + INIT_XMM sse2 H_EXTEND 16, 22 diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c index bd61ab461f..3218abdd88 100644 --- a/libavcodec/x86/videodsp_init.c +++ b/libavcodec/x86/videodsp_init.c @@ -117,34 +117,15 @@ static emu_edge_hfix_func *hfixtbl_mmx[11] = { }; 
#endif extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; -extern emu_edge_hfix_func ff_emu_edge_hfix4_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix6_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix8_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix10_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix12_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix14_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix16_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix18_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix20_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix22_mmxext; -#if ARCH_X86_32 -static emu_edge_hfix_func *hfixtbl_mmxext[11] = { - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmxext, ff_emu_edge_hfix6_mmxext, - ff_emu_edge_hfix8_mmxext, ff_emu_edge_hfix10_mmxext, ff_emu_edge_hfix12_mmxext, - ff_emu_edge_hfix14_mmxext, ff_emu_edge_hfix16_mmxext, ff_emu_edge_hfix18_mmxext, - ff_emu_edge_hfix20_mmxext, ff_emu_edge_hfix22_mmxext -}; -#endif -extern emu_edge_hvar_func ff_emu_edge_hvar_mmxext; extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; static emu_edge_hfix_func *hfixtbl_sse2[11] = { - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmxext, ff_emu_edge_hfix6_mmxext, - ff_emu_edge_hfix8_mmxext, ff_emu_edge_hfix10_mmxext, ff_emu_edge_hfix12_mmxext, - ff_emu_edge_hfix14_mmxext, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, - ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 }; extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; @@ -234,17 +215,6 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, hfixtbl_mmx, 
&ff_emu_edge_hvar_mmx); } -static av_noinline void emulated_edge_mc_mmxext(uint8_t *buf, const uint8_t *src, - ptrdiff_t buf_stride, - ptrdiff_t src_stride, - int block_w, int block_h, - int src_x, int src_y, int w, int h) -{ - emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, - src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, - hfixtbl_mmxext, &ff_emu_edge_hvar_mmxext); -} - static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, @@ -253,7 +223,7 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, { emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, - hfixtbl_mmxext, &ff_emu_edge_hvar_mmxext); + hfixtbl_mmx, &ff_emu_edge_hvar_mmx); } #endif @@ -288,10 +258,6 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) #endif /* ARCH_X86_32 */ if (EXTERNAL_MMXEXT(cpu_flags)) { ctx->prefetch = ff_prefetch_mmxext; -#if ARCH_X86_32 - if (bpc <= 8) - ctx->emulated_edge_mc = emulated_edge_mc_mmxext; -#endif /* ARCH_X86_32 */ } #if ARCH_X86_32 if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) { |