diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-05-05 15:31:06 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-05-05 18:32:34 +0200 |
commit | 47055b8913c96a1c41a3bbdf30205255c8453f25 (patch) | |
tree | aead4a059b9d199ab0dd5f7aad19a93d92b90de2 | |
parent | fec3700dcd8c7cba7b85c5ce6f9c5f3092c1bc17 (diff) | |
download | ffmpeg-47055b8913c96a1c41a3bbdf30205255c8453f25.tar.gz |
swr: implement stereo S16/S32/FLT->S16/S32/FLT planar->packed in SSE/SSE2
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libswresample/x86/audio_convert.asm | 148 | ||||
-rw-r--r-- | libswresample/x86/swresample_x86.c | 34 |
2 files changed, 182 insertions, 0 deletions
diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm
index 31723529e4..e5209652c4 100644
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@@ -227,6 +227,135 @@ int32_to_int16_u_int %+ SUFFIX
     REP_RET
 %endmacro
+;to, from, a/u, log2_outsize, log_intsize, const
+%macro PACK_2CH 5-7
+cglobal pack_2ch_%2_to_%1_%3, 3, 4, 5, dst, src, len, src2
+    mov src2q , [srcq+gprsize]
+    mov srcq , [srcq]
+    mov dstq , [dstq]
+%ifidn %3, a
+    test dstq, mmsize-1
+    jne pack_2ch_%1_to_%2_u_int %+ SUFFIX
+    test srcq, mmsize-1
+    jne pack_2ch_%1_to_%2_u_int %+ SUFFIX
+    test src2q, mmsize-1
+    jne pack_2ch_%1_to_%2_u_int %+ SUFFIX
+%else
+pack_2ch_%1_to_%2_u_int %+ SUFFIX
+%endif
+    lea srcq , [srcq + (1<<%5)*lenq]
+    lea src2q, [src2q + (1<<%5)*lenq]
+    lea dstq , [dstq + (2<<%4)*lenq]
+    neg lenq
+    %7
+.next:
+    mov%3 m0, [ srcq +(1<<%5)*lenq]
+    mova m1, m0
+    mov%3 m2, [ src2q+(1<<%5)*lenq]
+%if %5 == 1
+    punpcklwd m0, m2
+    punpckhwd m1, m2
+%else
+    punpckldq m0, m2
+    punpckhdq m1, m2
+%endif
+%if %4 < %5
+    mov%3 m2, [mmsize + srcq +(1<<%5)*lenq]
+    mova m3, m2
+    mov%3 m4, [mmsize + src2q+(1<<%5)*lenq]
+    punpckldq m2, m4
+    punpckhdq m3, m4
+%endif
+    %6
+    mov%3 [ dstq+(2<<%4)*lenq], m0
+    mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1
+%if %4 > %5
+    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
+    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
+    add lenq, 4*mmsize/(2<<%4)
+%else
+    add lenq, 2*mmsize/(2<<%4)
+%endif
+    jl .next
+    REP_RET
+%endmacro
+
+%macro INT16_TO_INT32_N 0
+    pxor m2, m2
+    pxor m3, m3
+    punpcklwd m2, m1
+    punpckhwd m3, m1
+    SWAP 4,0
+    pxor m0, m0
+    pxor m1, m1
+    punpcklwd m0, m4
+    punpckhwd m1, m4
+%endmacro
+
+%macro INT32_TO_INT16_N 0
+    psrad m0, 16
+    psrad m1, 16
+    psrad m2, 16
+    psrad m3, 16
+    packssdw m0, m1
+    packssdw m2, m3
+    SWAP 1,2
+%endmacro
+
+%macro INT32_TO_FLOAT_INIT 0
+    mova m3, [flt2pm31]
+%endmacro
+%macro INT32_TO_FLOAT_N 0
+    cvtdq2ps m0, m0
+    cvtdq2ps m1, m1
+    mulps m0, m0, m3
+    mulps m1, m1, m3
+%endmacro
+
+%macro FLOAT_TO_INT32_INIT 0
+    mova m3, [flt2p31]
+%endmacro
+%macro FLOAT_TO_INT32_N 0
+    mulps m0, m3
+    mulps m1, m3
+    cvtps2dq m2, m0
+    cvtps2dq m4, m1
+    cmpnltps m0, m3
+    cmpnltps m1, m3
+    paddd m0, m2
+    paddd m1, m4
+%endmacro
+
+%macro INT16_TO_FLOAT_INIT 0
+    mova m5, [flt2pm31]
+%endmacro
+%macro INT16_TO_FLOAT_N 0
+    INT16_TO_INT32_N
+    cvtdq2ps m0, m0
+    cvtdq2ps m1, m1
+    cvtdq2ps m2, m2
+    cvtdq2ps m3, m3
+    mulps m0, m0, m5
+    mulps m1, m1, m5
+    mulps m2, m2, m5
+    mulps m3, m3, m5
+%endmacro
+
+%macro FLOAT_TO_INT16_INIT 0
+    mova m5, [flt2p15]
+%endmacro
+%macro FLOAT_TO_INT16_N 0
+    mulps m0, m5
+    mulps m1, m5
+    mulps m2, m5
+    mulps m3, m5
+    cvtps2dq m0, m0
+    cvtps2dq m1, m1
+    packssdw m0, m1
+    cvtps2dq m1, m2
+    cvtps2dq m3, m3
+    packssdw m1, m3
+%endmacro
 INIT_MMX mmx
 INT16_TO_INT32 u
@@ -240,6 +369,15 @@ INT16_TO_INT32 a
 INT32_TO_INT16 u
 INT32_TO_INT16 a
+PACK_2CH int16, int16, u, 1, 1
+PACK_2CH int16, int16, a, 1, 1
+PACK_2CH int32, int32, u, 2, 2
+PACK_2CH int32, int32, a, 2, 2
+PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N
+PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N
+PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N
+PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N
+
 INIT_XMM sse2
 INT32_TO_FLOAT u
 INT32_TO_FLOAT a
@@ -250,6 +388,16 @@ FLOAT_TO_INT32 a
 FLOAT_TO_INT16 u
 FLOAT_TO_INT16 a
+PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+
 %if HAVE_AVX
 INIT_YMM avx
 INT32_TO_FLOAT u
diff --git a/libswresample/x86/swresample_x86.c b/libswresample/x86/swresample_x86.c
index dc6ade5709..9370221513 100644
--- a/libswresample/x86/swresample_x86.c
+++ b/libswresample/x86/swresample_x86.c
@@ -35,6 +35,16 @@ void ff_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);
 void ff_int32_to_float_a_avx(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int32_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int32_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len);
+
+void ff_pack_2ch_int32_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_float_to_int32_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+
 void swri_audio_convert_init_x86(struct AudioConvert *ac,
                                  enum AVSampleFormat out_fmt,
                                  enum AVSampleFormat in_fmt,
@@ -56,6 +66,19 @@ void swri_audio_convert_init_x86(struct AudioConvert *ac,
 MULTI_CAPS_FUNC(AV_CPU_FLAG_MMX, mmx)
 MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)
+    if(mm_flags & AV_CPU_FLAG_SSE) {
+        if(channels == 2) {
+            if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f = ff_pack_2ch_int32_to_int32_a_sse;
+            if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f = ff_pack_2ch_int16_to_int16_a_sse;
+            if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f = ff_pack_2ch_int16_to_int32_a_sse;
+            if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f = ff_pack_2ch_int32_to_int16_a_sse;
+        }
+    }
+
     if(mm_flags & AV_CPU_FLAG_SSE2) {
         if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
             ac->simd_f = ff_int32_to_float_a_sse2;
@@ -65,6 +88,17 @@ MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)
             ac->simd_f = ff_float_to_int32_a_sse2;
         if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
             ac->simd_f = ff_float_to_int16_a_sse2;
+
+        if(channels == 2) {
+            if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f = ff_pack_2ch_int32_to_float_a_sse2;
+            if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f = ff_pack_2ch_float_to_int32_a_sse2;
+            if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f = ff_pack_2ch_int16_to_float_a_sse2;
+            if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f = ff_pack_2ch_float_to_int16_a_sse2;
+        }
     }
     if(HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
         if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) |