diff options
author | Vitor Sessak <vitor1001@gmail.com> | 2011-07-30 18:39:25 +0200 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-08-02 10:17:29 -0700 |
commit | 18b131de0473a3110c63966cd7c6cd2ab118d401 (patch) | |
tree | 822a1fd96cdd780477b3a22622b72a9245735e97 | |
parent | 6f7fe4723b9bfbb52341568906e6168966f486b3 (diff) | |
download | ffmpeg-18b131de0473a3110c63966cd7c6cd2ab118d401.tar.gz |
dct32: Add SSE2 ASM optimizations
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/x86/dct32_sse.asm | 39 | ||||
-rw-r--r-- | libavcodec/x86/fft.c | 2 | ||||
-rw-r--r-- | libavcodec/x86/fft.h | 1 |
3 files changed, 33 insertions, 9 deletions
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 46daa43d8c..720a061078 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -63,6 +63,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
     mulps %1, %3
 %endmacro
 
+%macro BUTTERFLY0_SSE2 5
+    pshufd %4, %1, %5
+    xorps %1, %2
+    addps %1, %4
+    mulps %1, %3
+%endmacro
+
 %macro BUTTERFLY0_AVX 5
     vshufps %4, %1, %1, %5
     vxorps %1, %1, %2
@@ -405,18 +412,17 @@ INIT_XMM
 
 
 INIT_XMM
+%macro DCT32_FUNC 1
 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_sse, 2,3,16, out, in, tmp
+cglobal dct32_float_%1, 2,3,16, out, in, tmp
     ; pass 1
 
     movaps m0, [inq+0]
-    movaps m1, [inq+112]
-    shufps m1, m1, 0x1b
+    LOAD_INV m1, [inq+112]
     BUTTERFLY m0, m1, [ps_cos_vec], m3
 
     movaps m7, [inq+64]
-    movaps m4, [inq+48]
-    shufps m4, m4, 0x1b
+    LOAD_INV m4, [inq+48]
     BUTTERFLY m7, m4, [ps_cos_vec+32], m3
 
     ; pass 2
@@ -427,13 +433,11 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
     ; pass 1
 
     movaps m1, [inq+16]
-    movaps m6, [inq+96]
-    shufps m6, m6, 0x1b
+    LOAD_INV m6, [inq+96]
     BUTTERFLY m1, m6, [ps_cos_vec+16], m3
 
     movaps m4, [inq+80]
-    movaps m5, [inq+32]
-    shufps m5, m5, 0x1b
+    LOAD_INV m5, [inq+32]
     BUTTERFLY m4, m5, [ps_cos_vec+48], m3
 
     ; pass 2
@@ -492,3 +496,20 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
     PASS5
     PASS6
     RET
+%endmacro
+
+%macro LOAD_INV_SSE 2
+    movaps %1, %2
+    shufps %1, %1, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE
+DCT32_FUNC sse
+
+%macro LOAD_INV_SSE2 2
+    pshufd %1, %2, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE2
+%define BUTTERFLY0 BUTTERFLY0_SSE2
+DCT32_FUNC sse2
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 899f0f7ad5..f7308cca32 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -60,6 +60,8 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
     int has_vectors = av_get_cpu_flags();
     if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
         s->dct32 = ff_dct32_float_avx;
+    else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
+        s->dct32 = ff_dct32_float_sse2;
     else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
         s->dct32 = ff_dct32_float_sse;
 #endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 0ade2b2e7b..9d68d5b219 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,6 +35,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
 void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
 
 #endif /* AVCODEC_X86_FFT_H */