diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2012-06-08 23:20:59 -0400 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2012-06-18 18:01:14 -0400 |
commit | 82b2df979069063beb14be340350501c8340f9cd (patch) | |
tree | 3c6a61d185f9fbab0d21ebce259ef232973d7219 | |
parent | cb5042d02c66aed68643633446f6bf623b72416e (diff) | |
download | ffmpeg-82b2df979069063beb14be340350501c8340f9cd.tar.gz |
float_dsp: add x86-optimized functions for vector_fmac_scalar()
-rw-r--r-- | libavutil/float_dsp.h | 6 | ||||
-rw-r--r-- | libavutil/x86/float_dsp.asm | 47 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 7 |
3 files changed, 57 insertions, 3 deletions
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 4e266304da..95cef62f29 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -42,12 +42,12 @@ typedef struct AVFloatDSPContext {
      * overlap exactly or not at all.
      *
      * @param dst result vector
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param src input vector
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param mul scalar value
      * @param len length of vector
-     *            constraints: multiple of 4
+     *            constraints: multiple of 16
      */
     void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
                                int len);
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 53be7ab99a..66ef09398d 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -19,6 +19,7 @@
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION .text
 
@@ -53,3 +54,49 @@ VECTOR_FMUL
 INIT_YMM avx
 VECTOR_FMUL
 %endif
+
+;------------------------------------------------------------------------------
+; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_FMAC_SCALAR 0
+%if UNIX64
+cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+%else
+cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+%endif
+%if WIN64
+    SWAP 0, 2
+%endif
+%if ARCH_X86_32
+    VBROADCASTSS m0, mulm
+%else
+    shufps      xmm0, xmm0, 0
+%if cpuflag(avx)
+    vinsertf128 m0, m0, xmm0, 1
+%endif
+%endif
+    lea    lenq, [lend*4-2*mmsize]
+.loop
+    mulps    m1, m0, [srcq+lenq       ]
+    mulps    m2, m0, [srcq+lenq+mmsize]
+    addps    m1, m1, [dstq+lenq       ]
+    addps    m2, m2, [dstq+lenq+mmsize]
+    mova  [dstq+lenq       ], m1
+    mova  [dstq+lenq+mmsize], m2
+    sub    lenq, 2*mmsize
+    jge .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMAC_SCALAR
+%if HAVE_AVX
+INIT_YMM avx
+VECTOR_FMAC_SCALAR
+%endif
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 10bb226f23..d259a367e0 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -26,6 +26,11 @@ extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
 extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                                int len);
 
+extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
+                                      int len);
+extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
+                                      int len);
+
 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 {
 #if HAVE_YASM
@@ -33,9 +38,11 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 
     if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
         fdsp->vector_fmul = ff_vector_fmul_sse;
+        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
     }
     if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
         fdsp->vector_fmul = ff_vector_fmul_avx;
+        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
     }
 #endif
 }