diff options
author | James Almer <jamrial@gmail.com> | 2014-09-23 18:42:35 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-09-24 16:12:55 -0300 |
commit | 70277d1d234b33a80477f75435758a194fed5873 (patch) | |
tree | 1d795d1fd6bf7ca36882d729e67a5d03ece87678 /libavcodec/x86/videodsp.asm | |
parent | 280ef183db554bd4eaeaa7fe487ad398ec5208fb (diff) | |
download | ffmpeg-70277d1d234b33a80477f75435758a194fed5873.tar.gz |
x86/videodsp: add ff_emu_edge_{hfix,hvar}_avx2
~15% faster than sse2.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/videodsp.asm')
-rw-r--r-- | libavcodec/x86/videodsp.asm | 28 |
1 files changed, 27 insertions, 1 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 1ac02574d6..25d43640ab 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     neg n_wordsq
     lea start_xq, [start_xq+n_wordsq*2]
 .y_loop: ; do {
-    ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+    vpbroadcastb m0, [dstq+start_xq]
+    mov wq, n_wordsq ; initialize w
+%else
     movzx wd, byte [dstq+start_xq] ; w = read(1)
     imul wd, 0x01010101 ; w *= 0x01010101
     movd m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
 %else ; mmx
     punpckldq m0, m0 ; splat
 %endif ; mmx/sse
+%endif ; avx2
 .x_loop: ; do {
     movu [dstq+wq*2], m0 ; write($reg, $mmsize)
     add wq, mmsize/2 ; w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
 INIT_XMM sse2
 hvar_fn
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
 ; macro to read/write a horizontal number of pixels (%2) to/from registers
 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
 ; - if (%2 & 8) fills 8 bytes into xmm$next
@@ -344,6 +353,9 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
+%if cpuflag(avx2)
+    vpbroadcastb m0, %2
+%else
     movzx vald, byte %2
     imul vald, 0x01010101
 %if %1 >= 8
@@ -354,6 +366,7 @@ VERTICAL_EXTEND 16, 22
     punpckldq m0, m0
 %endif ; mmsize == 16
 %endif ; %1 > 16
+%endif ; avx2
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
@@ -398,14 +411,22 @@ VERTICAL_EXTEND 16, 22
 %endif ; %1 >=/< 8
 
 %if %1-%%off == 2
+%if cpuflag(avx2)
+    movd [%2+%%off-2], m0
+%else
     mov [%2+%%off], valw
+%endif ; avx2
 %endif ; (%1-%%off)/2
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
 .loop_y: ; do {
     READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
     WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
@@ -426,6 +447,11 @@ H_EXTEND 16, 22
 INIT_XMM sse2
 H_EXTEND 16, 22
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
 %macro PREFETCH_FN 1
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop: