diff options
author | James Almer <jamrial@gmail.com> | 2017-04-11 21:29:09 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-04-11 21:35:35 -0300 |
commit | f1d80bc6305221506ad96f0ab82088f9229881ab (patch) | |
tree | 3f8a091209491b47e9886c8864cc9eb04155a899 | |
parent | 5b441d2981f35e6183b0ac29fa89e089c91cf7ba (diff) | |
download | ffmpeg-f1d80bc6305221506ad96f0ab82088f9229881ab.tar.gz |
x86/float_dsp: add ff_vector_fmul_reverse_avx2
~20% faster than AVX.
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavutil/x86/float_dsp.asm | 15 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 5 |
2 files changed, 19 insertions, 1 deletion
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 9affacb72b..edade0d55d 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -22,6 +22,9 @@ %include "x86util.asm" +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + SECTION .text ;----------------------------------------------------------------------------- @@ -359,10 +362,16 @@ VECTOR_FMUL_ADD ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_REVERSE 0 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + mova m2, [pd_reverse] +%endif lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: -%if cpuflag(avx) +%if cpuflag(avx2) + vpermd m0, m2, [src1q] + vpermd m1, m2, [src1q+mmsize] +%elif cpuflag(avx) vmovaps xmm0, [src1q + 16] vinsertf128 m0, m0, [src1q], 1 vshufps m0, m0, m0, q0123 @@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE INIT_YMM avx VECTOR_FMUL_REVERSE %endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) INIT_XMM sse diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 09c7a4d3b2..122087a196 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_avx(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, + const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); @@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2; + } if (EXTERNAL_FMA3_FAST(cpu_flags)) { 
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; |