diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2011-04-24 17:50:17 -0400 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2011-05-18 17:27:05 -0400 |
commit | 32f8fb8ecf8178b9c9ec8d7152f1fdd8537f7f3a (patch) | |
tree | 75ece9cd372dafb36670424033068b6edafaceda /libavcodec/x86 | |
parent | f907ad9b85d5e08e4a024e24734181940cd4fc48 (diff) | |
download | ffmpeg-32f8fb8ecf8178b9c9ec8d7152f1fdd8537f7f3a.tar.gz |
Add float_interleave() to FmtConvertContext with x86-optimized versions.
Partially based on patches by clsid2 in ffdshow-tryout.
ff_float_interleave6() x86 improvements by Loren Merrit.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/fmtconvert.asm | 141 | ||||
-rw-r--r-- | libavcodec/x86/fmtconvert_mmx.c | 30 |
2 files changed, 171 insertions, 0 deletions
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 5cd8f6c596..e023b48322 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -20,6 +20,7 @@ ;****************************************************************************** %include "x86inc.asm" +%include "x86util.asm" section .text align=16 @@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow %undef pswapd FLOAT_TO_INT16_INTERLEAVE6 3dn2 %undef cvtps2pi + +;----------------------------------------------------------------------------- +; void ff_float_interleave6(float *dst, const float **src, unsigned int len); +;----------------------------------------------------------------------------- + +%macro BUTTERFLYPS 3 + movaps m%3, m%1 + unpcklps m%1, m%2 + unpckhps m%3, m%2 + SWAP %2, %3 +%endmacro + +%macro FLOAT_INTERLEAVE6 2 +cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 +%ifdef ARCH_X86_64 + %define lend r10d + mov lend, r2d +%else + %define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq +.loop: +%ifidn %1, sse + movaps m0, [srcq] + movaps m1, [srcq+src1q] + movaps m2, [srcq+src2q] + movaps m3, [srcq+src3q] + movaps m4, [srcq+src4q] + movaps m5, [srcq+src5q] + + BUTTERFLYPS 0, 1, 6 + BUTTERFLYPS 2, 3, 6 + BUTTERFLYPS 4, 5, 6 + + movaps m6, m4 + shufps m4, m0, 0xe4 + movlhps m0, m2 + movhlps m6, m2 + movaps [dstq ], m0 + movaps [dstq+16], m4 + movaps [dstq+32], m6 + + movaps m6, m5 + shufps m5, m1, 0xe4 + movlhps m1, m3 + movhlps m6, m3 + movaps [dstq+48], m1 + movaps [dstq+64], m5 + movaps [dstq+80], m6 +%else ; mmx + movq m0, [srcq] + movq m1, [srcq+src1q] + movq m2, [srcq+src2q] + movq m3, [srcq+src3q] + movq m4, [srcq+src4q] + movq m5, [srcq+src5q] + + SBUTTERFLY dq, 0, 1, 6 + SBUTTERFLY dq, 2, 3, 6 + SBUTTERFLY dq, 4, 5, 6 + movq [dstq ], m0 + movq [dstq+ 8], m2 + movq [dstq+16], m4 + movq [dstq+24], m1 + movq [dstq+32], m3 + movq [dstq+40], m5 +%endif + add srcq, mmsize + add dstq, mmsize*6 + sub lend, mmsize/4 + jg .loop +%ifidn %1, mmx + emms +%endif + REP_RET +%endmacro + +INIT_MMX +FLOAT_INTERLEAVE6 mmx, 0 +INIT_XMM +FLOAT_INTERLEAVE6 sse, 7 + +;----------------------------------------------------------------------------- +; void ff_float_interleave2(float *dst, const float **src, unsigned int len); +;----------------------------------------------------------------------------- + +%macro FLOAT_INTERLEAVE2 2 +cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1 + mov src1q, [srcq+gprsize] + mov srcq, [srcq ] + sub src1q, srcq +.loop + MOVPS m0, [srcq ] + MOVPS m1, [srcq+src1q ] + MOVPS m3, [srcq +mmsize] + MOVPS m4, [srcq+src1q+mmsize] + + MOVPS m2, m0 + PUNPCKLDQ m0, m1 + PUNPCKHDQ m2, m1 + + MOVPS m1, m3 + PUNPCKLDQ m3, m4 + PUNPCKHDQ m1, m4 + + MOVPS [dstq ], m0 + MOVPS [dstq+1*mmsize], m2 + MOVPS [dstq+2*mmsize], m3 + MOVPS [dstq+3*mmsize], m1 + + add srcq, mmsize*2 + add dstq, mmsize*4 + sub lend, mmsize/2 + jg .loop +%ifidn %1, mmx + emms +%endif + REP_RET +%endmacro + +INIT_MMX +%define MOVPS movq +%define PUNPCKLDQ punpckldq +%define PUNPCKHDQ punpckhdq +FLOAT_INTERLEAVE2 mmx, 0 +INIT_XMM +%define MOVPS movaps +%define PUNPCKLDQ unpcklps +%define PUNPCKHDQ unpckhps +FLOAT_INTERLEAVE2 sse, 5 diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 847bd80fcd..61a4272a69 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long float_to_int16_interleave_3dnow(dst, src, len, channels); } +void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); +void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); + +void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); +void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); + +static void float_interleave_mmx(float *dst, const float **src, + unsigned int len, int channels) +{ + if (channels == 2) { + ff_float_interleave2_mmx(dst, src, len); + } else if (channels == 6) + ff_float_interleave6_mmx(dst, src, len); + else + ff_float_interleave_c(dst, src, len, channels); +} + +static void float_interleave_sse(float *dst, const float **src, + unsigned int len, int channels) +{ + if (channels == 2) { + ff_float_interleave2_sse(dst, src, len); + } else if (channels == 6) + ff_float_interleave6_sse(dst, src, len); + else + ff_float_interleave_c(dst, src, len, channels); +} + void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); if (mm_flags & AV_CPU_FLAG_MMX) { + c->float_interleave = float_interleave_mmx; if(mm_flags & AV_CPU_FLAG_3DNOW){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ @@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->float_to_int16 = float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; + c->float_interleave = float_interleave_sse; } if(mm_flags & AV_CPU_FLAG_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |