diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-02-14 16:00:46 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-02-28 19:20:03 +0100 |
commit | 169243112c1e310d90c030fb258092f6d2e46117 (patch) | |
tree | 6febfd2d736cafa28b2cac5a556f1fa0800be397 /libavcodec | |
parent | 90f674d55bd76e23a70ed68b8cb104b5d8b9505f (diff) | |
download | ffmpeg-169243112c1e310d90c030fb258092f6d2e46117.tar.gz |
x86: dcadsp: implement SSE lfe_fir
Results for Arrandale/Windows:
32: 1670 -> 316
64: 728 -> 298
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/dcadsp.asm | 87 | ||||
-rw-r--r-- | libavcodec/x86/dcadsp_init.c | 4 |
2 files changed, 91 insertions, 0 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index a0995c909f..f4149d2658 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -88,3 +88,90 @@ INT8X8_FMUL_INT32 INIT_XMM sse4 INT8X8_FMUL_INT32 + +; %1=v0/v1 %2=in1 %3=in2 +%macro FIR_LOOP 2-3 +.loop%1: +%define va m1 +%define vb m2 +%if %1 +%define OFFSET 0 +%else +%define OFFSET NUM_COEF*count +%endif +; for v0, incrementint and for v1, decrementing + mova va, [cf0q + OFFSET] + mova vb, [cf0q + OFFSET + 4*NUM_COEF] +%if %0 == 3 + mova m4, [cf0q + OFFSET + mmsize] + mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize] +%endif + mulps va, %2 + mulps vb, %2 +%if %0 == 3 + mulps m4, %3 + mulps m0, %3 + addps va, m4 + addps vb, m0 +%endif + ; va = va1 va2 va3 va4 + ; vb = vb1 vb2 vb3 vb4 +%if %1 + SWAP va, vb +%endif + mova m4, va + unpcklps va, vb ; va3 vb3 va4 vb4 + unpckhps m4, vb ; va1 vb1 va2 vb2 + addps m4, va ; va1+3 vb1+3 va2+4 vb2+4 + movhlps vb, m4 ; va1+3 vb1+3 + addps vb, m4 ; va0..4 vb0..4 + movh [outq + count], vb +%if %1 + sub cf0q, 8*NUM_COEF +%endif + add count, 8 + jl .loop%1 +%endmacro + +; dca_lfe_fir(float *out, float *in, float *coefs) +%macro DCA_LFE_FIR 1 +cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0 +%define IN1 m3 +%define IN2 m5 +%define count inq +%define NUM_COEF 4*(2-%1) +%define NUM_OUT 32*(%1+1) + + movu IN1, [inq + 4 - 1*mmsize] + shufps IN1, IN1, q0123 +%if %1 == 0 + movu IN2, [inq + 4 - 2*mmsize] + shufps IN2, IN2, q0123 +%endif + + mov count, -4*NUM_OUT + add cf0q, 4*NUM_COEF*NUM_OUT + add outq, 4*NUM_OUT + ; compute v0 first +%if %1 == 0 + FIR_LOOP 0, IN1, IN2 +%else + FIR_LOOP 0, IN1 +%endif + shufps IN1, IN1, q0123 + mov count, -4*NUM_OUT + ; cf1 already correctly positioned + add outq, 4*NUM_OUT ; outq now at out2 + sub cf0q, 8*NUM_COEF +%if %1 == 0 + shufps IN2, IN2, q0123 + FIR_LOOP 1, IN2, IN1 +%else + FIR_LOOP 1, IN1 +%endif + RET +%endmacro + +INIT_XMM sse +DCA_LFE_FIR 0 +DCA_LFE_FIR 1 diff --git a/libavcodec/x86/dcadsp_init.c 
b/libavcodec/x86/dcadsp_init.c index adb454fa02..664019d991 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -26,6 +26,8 @@ void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale); void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale); void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale); +void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); +void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); av_cold void ff_dcadsp_init_x86(DCADSPContext *s) { @@ -35,6 +37,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) #if ARCH_X86_32 s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse; #endif + s->lfe_fir[0] = ff_dca_lfe_fir0_sse; + s->lfe_fir[1] = ff_dca_lfe_fir1_sse; } if (EXTERNAL_SSE2(cpu_flags)) { |