aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustin Ruggles <justin.ruggles@gmail.com>2011-10-09 23:52:03 -0400
committerJustin Ruggles <justin.ruggles@gmail.com>2011-10-21 10:13:05 -0400
commit4e8e2624767f4af0eaa932c543d072fed96fd586 (patch)
tree56c363bfc5cdff42c6346d883680d620826fde22
parent185142a5ea93ef723f70a3ea43797f6c8827eb79 (diff)
downloadffmpeg-4e8e2624767f4af0eaa932c543d072fed96fd586.tar.gz
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm
-rw-r--r--libavcodec/x86/dsputil_yasm.asm8
-rw-r--r--libavcodec/x86/fmtconvert.asm46
-rw-r--r--libavcodec/x86/fmtconvert_mmx.c59
-rw-r--r--libavutil/x86/x86util.asm12
4 files changed, 65 insertions, 60 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 2a2108404a..fe96d8b12b 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
-%macro SPLATD_MMX 1
- punpckldq %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
- pshufd %1, %1, 0
-%endmacro
-
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index d314a4e14e..e3eb5d2286 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,52 @@
SECTION_TEXT
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%ifdef ARCH_X86_64
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+ movss m0, mulm
+%endif
+ SPLATD m0
+ shl lenq, 2
+ add srcq, lenq
+ add dstq, lenq
+ neg lenq
+.loop:
+%ifidn %1, sse2
+ cvtdq2ps m1, [srcq+lenq ]
+ cvtdq2ps m2, [srcq+lenq+16]
+%else
+ cvtpi2ps m1, [srcq+lenq ]
+ cvtpi2ps m3, [srcq+lenq+ 8]
+ cvtpi2ps m2, [srcq+lenq+16]
+ cvtpi2ps m4, [srcq+lenq+24]
+ movlhps m1, m3
+ movlhps m2, m4
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+16], m2
+ add lenq, 32
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE
+%define movdqa movaps
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5
+%undef movdqa
+%define SPLATD SPLATD_SSE2
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+%undef SPLATD
+
+
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 6e43280d66..86957b45ff 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,52 +26,11 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
#if HAVE_YASM
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
+
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
@@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
- if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
c->float_interleave = float_interleave_mmx;
if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
@@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
-#endif
if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-#if HAVE_YASM
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
-#endif
}
if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-#if HAVE_YASM
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-#endif
}
}
+#endif
}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 7e16c15db2..874443a2ef 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
%endif
%endmacro
+%macro SPLATD_MMX 1
+ punpckldq %1, %1
+%endmacro
+
+%macro SPLATD_SSE 1
+ shufps %1, %1, 0
+%endmacro
+
+%macro SPLATD_SSE2 1
+ pshufd %1, %1, 0
+%endmacro
+
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3