diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2013-04-12 21:07:01 +0200 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2013-05-03 08:08:02 +0200 |
commit | 566b7a20fd0cab44d344329538d314454a0bcc2f (patch) | |
tree | f2ca45002f7479bdaa9e1cb4abc92c794b42e338 /libavutil/x86 | |
parent | b333f3a22a4db4cf65d6a0457ac82ecbe7c7ac44 (diff) | |
download | ffmpeg-566b7a20fd0cab44d344329538d314454a0bcc2f.tar.gz |
x86: float dsp: butterflies_float SSE
97c -> 49c
Some codecs could benefit from more unrolling, but AAC doesn't.
Diffstat (limited to 'libavutil/x86')
-rw-r--r-- | libavutil/x86/float_dsp.asm | 26 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 3 |
2 files changed, 29 insertions, 0 deletions
```diff
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 779339c575..10330ff336 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -252,3 +252,29 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
     fld dword r0m
 %endif
     RET
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_float(float *src0, float *src1, int len);
+;-----------------------------------------------------------------------------
+INIT_XMM sse
+cglobal butterflies_float, 3,3,3, src0, src1, len
+%if ARCH_X86_64
+    movsxd lenq, lend
+%endif
+    test lenq, lenq
+    jz .end
+    shl lenq, 2
+    lea src0q, [src0q + lenq]
+    lea src1q, [src1q + lenq]
+    neg lenq
+.loop:
+    mova m0, [src0q + lenq]
+    mova m1, [src1q + lenq]
+    subps m2, m0, m1
+    addps m0, m0, m1
+    mova [src1q + lenq], m2
+    mova [src0q + lenq], m0
+    add lenq, mmsize
+    jl .loop
+.end:
+    REP_RET
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index b5e9af935f..34863013cc 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -53,6 +53,8 @@ void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
 
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 
+void ff_butterflies_float_sse(float *src0, float *src1, int len);
+
 #if HAVE_6REGS && HAVE_INLINE_ASM
 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                         const float *src1, const float *win,
@@ -138,6 +140,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
         fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
+        fdsp->butterflies_float = ff_butterflies_float_sse;
     }
     if (EXTERNAL_SSE2(mm_flags)) {
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
```