diff options
author | James Almer <jamrial@gmail.com> | 2014-09-23 18:42:35 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-09-24 16:12:55 -0300 |
commit | 70277d1d234b33a80477f75435758a194fed5873 (patch) | |
tree | 1d795d1fd6bf7ca36882d729e67a5d03ece87678 /libavcodec | |
parent | 280ef183db554bd4eaeaa7fe487ad398ec5208fb (diff) | |
download | ffmpeg-70277d1d234b33a80477f75435758a194fed5873.tar.gz |
x86/videodsp: add ff_emu_edge_{hfix,hvar}_avx2
~15% faster than sse2.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/videodsp.asm | 28 | ||||
-rw-r--r-- | libavcodec/x86/videodsp_init.c | 36 |
2 files changed, 63 insertions, 1 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index 1ac02574d6..25d43640ab 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { - ; FIXME also write a ssse3 version using pshufb +%if cpuflag(avx2) + vpbroadcastb m0, [dstq+start_xq] + mov wq, n_wordsq ; initialize w +%else movzx wd, byte [dstq+start_xq] ; w = read(1) imul wd, 0x01010101 ; w *= 0x01010101 movd m0, wd @@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w %else ; mmx punpckldq m0, m0 ; splat %endif ; mmx/sse +%endif ; avx2 .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -127,6 +131,11 @@ hvar_fn INIT_XMM sse2 hvar_fn +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +hvar_fn +%endif + ; macro to read/write a horizontal number of pixels (%2) to/from registers ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels ; - if (%2 & 8) fills 8 bytes into xmm$next @@ -344,6 +353,9 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. %macro READ_V_PIXEL 2 +%if cpuflag(avx2) + vpbroadcastb m0, %2 +%else movzx vald, byte %2 imul vald, 0x01010101 %if %1 >= 8 @@ -354,6 +366,7 @@ VERTICAL_EXTEND 16, 22 punpckldq m0, m0 %endif ; mmsize == 16 %endif ; %1 > 16 +%endif ; avx2 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 @@ -398,14 +411,22 @@ VERTICAL_EXTEND 16, 22 %endif ; %1 >=/< 8 %if %1-%%off == 2 +%if cpuflag(avx2) + movd [%2+%%off-2], m0 +%else mov [%2+%%off], valw +%endif ; avx2 %endif ; (%1-%%off)/2 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 +%if cpuflag(avx2) +cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh +%else cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val +%endif .loop_y: ; do { READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) @@ -426,6 +447,11 @@ H_EXTEND 16, 22 INIT_XMM sse2 H_EXTEND 16, 22 +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +H_EXTEND 8, 22 +%endif + %macro PREFETCH_FN 1 cglobal prefetch, 3, 3, 0, buf, stride, h .loop: diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c index 602f141c05..885cdf1d8c 100644 --- a/libavcodec/x86/videodsp_init.c +++ b/libavcodec/x86/videodsp_init.c @@ -128,6 +128,23 @@ static emu_edge_hfix_func * const hfixtbl_sse2[11] = { ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 }; extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; +#if HAVE_AVX2_EXTERNAL +extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; +static emu_edge_hfix_func * const hfixtbl_avx2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, + ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, + ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; +#endif static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, ptrdiff_t dst_stride, @@ -238,6 +255,20 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, hfixtbl_sse2, &ff_emu_edge_hvar_sse2); } + +#if HAVE_AVX2_EXTERNAL +static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_avx2, &ff_emu_edge_hvar_avx2); +} +#endif /* HAVE_AVX2_EXTERNAL */ #endif /* HAVE_YASM */ void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); @@ -267,5 +298,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { ctx->emulated_edge_mc = emulated_edge_mc_sse2; } +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_avx2; + } +#endif #endif /* HAVE_YASM */ } |