aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Justin Ruggles <justin.ruggles@gmail.com> 2012-09-24 15:00:53 -0400
committer: Justin Ruggles <justin.ruggles@gmail.com> 2012-12-05 11:23:36 -0500
commit: ac7eb4cb20ea84cfc911794722695d501b354ee9 (patch)
tree: b9736adb1c0e993f0051b11cc249b67404e518f3
parent: da025d115a1ada58081cd869e85b81a1c183ae9e (diff)
download: ffmpeg-ac7eb4cb20ea84cfc911794722695d501b354ee9.tar.gz
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles
Include x86-optimized versions for SSE2 and AVX.
-rw-r--r-- libavutil/float_dsp.c | 9
-rw-r--r-- libavutil/float_dsp.h | 15
-rw-r--r-- libavutil/x86/float_dsp.asm | 45
-rw-r--r-- libavutil/x86/float_dsp_init.c | 9
-rw-r--r-- libavutil/x86/x86util.asm | 11
5 files changed, 89 insertions, 0 deletions
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index b6b11818b5..22139defe4 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -44,11 +44,20 @@ static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
dst[i] = src[i] * mul;
}
+/**
+ * Portable C version of vector_dmul_scalar: writes src[i] * mul to dst[i]
+ * for every i in [0, len). Does nothing when len <= 0.
+ *
+ * @param dst result vector
+ * @param src input vector
+ * @param mul scalar each element is multiplied by
+ * @param len number of elements to process
+ */
+static void vector_dmul_scalar_c(double *dst, const double *src, double mul,
+ int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ dst[i] = src[i] * mul;
+}
+
void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
{
fdsp->vector_fmul = vector_fmul_c;
fdsp->vector_fmac_scalar = vector_fmac_scalar_c;
fdsp->vector_fmul_scalar = vector_fmul_scalar_c;
+ fdsp->vector_dmul_scalar = vector_dmul_scalar_c;
#if ARCH_ARM
ff_float_dsp_init_arm(fdsp);
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index cb4b28f0e2..41b73c5b26 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -66,6 +66,21 @@ typedef struct AVFloatDSPContext {
*/
void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
int len);
+
+ /**
+ * Multiply a vector of doubles by a scalar double. Source and
+ * destination vectors must overlap exactly or not at all.
+ *
+ * @param dst result vector
+ * constraints: 32-byte aligned
+ * @param src input vector
+ * constraints: 32-byte aligned
+ * @param mul scalar value
+ * @param len length of vector
+ * constraints: multiple of 8
+ */
+ void (*vector_dmul_scalar)(double *dst, const double *src, double mul,
+ int len);
} AVFloatDSPContext;
/**
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 317df9c3c1..d8fd93a625 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -114,3 +114,48 @@ cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
INIT_XMM sse
VECTOR_FMUL_SCALAR
+
+;------------------------------------------------------------------------------
+; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
+; int len)
+;------------------------------------------------------------------------------
+
+; Emit vector_dmul_scalar for the instruction set selected by the preceding
+; INIT_XMM/INIT_YMM: dst[i] = src[i] * mul, processed 2*mmsize bytes at a time
+; from the end of the buffers towards the start. The loop body always runs at
+; least once, so len must be a positive multiple of the per-iteration count.
+%macro VECTOR_DMUL_SCALAR 0
+%if ARCH_X86_32
+; All arguments are on the stack and the 8-byte double 'mul' fills two 4-byte
+; slots (args 2 and 3), so the real 'len' lives in the fifth slot. Name that
+; slot 'lenaddr' and load it into the 'len' register by hand.
+cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
+    mov          lenq, lenaddrm
+%elif UNIX64
+; 'mul' arrives in xmm0, so only the three integer arguments are declared.
+cglobal vector_dmul_scalar, 3,3,3, dst, src, len
+%else
+; WIN64: arguments are positional, 'mul' arrives in xmm2 and 'len' is arg 4.
+cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
+%endif
+%if ARCH_X86_32
+    ; Use m0 (not a literal xmm0) so the AVX build broadcasts into ymm0.
+    VBROADCASTSD   m0, mulm
+%else
+%if WIN64
+    ; Duplicate the scalar across xmm2 (and ymm2 for AVX), then rename it m0.
+    movlhps      xmm2, xmm2
+%if cpuflag(avx)
+    vinsertf128  ymm2, ymm2, xmm2, 1
+%endif
+    SWAP 0, 2
+%else
+    ; Duplicate the scalar across xmm0 (and ymm0 for AVX).
+    movlhps      xmm0, xmm0
+%if cpuflag(avx)
+    vinsertf128  ymm0, ymm0, xmm0, 1
+%endif
+%endif
+%endif
+    ; lenq = byte offset of the last 2*mmsize-byte chunk.
+    lea          lenq, [lend*8-2*mmsize]
+.loop:
+    mulpd          m1, m0, [srcq+lenq       ]
+    mulpd          m2, m0, [srcq+lenq+mmsize]
+    mova   [dstq+lenq       ], m1
+    mova   [dstq+lenq+mmsize], m2
+    sub          lenq, 2*mmsize
+    jge .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMUL_SCALAR
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMUL_SCALAR
+%endif
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index d14ec6a377..b3b7ff4c3a 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -35,6 +35,11 @@ extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
+/* Prototypes for the SSE2 and AVX versions of vector_dmul_scalar; presumably
+ * the symbols emitted by VECTOR_DMUL_SCALAR in float_dsp.asm. */
+extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
+ double mul, int len);
+extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,
+ double mul, int len);
+
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int mm_flags = av_get_cpu_flags();
@@ -44,8 +49,12 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
}
+ if (EXTERNAL_SSE2(mm_flags)) {
+ fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
+ }
if (EXTERNAL_AVX(mm_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
+ fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
}
}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index d3c0d86056..16ee6cfe94 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -631,6 +631,17 @@
%endif
%endmacro
+; Load one 64-bit value from memory and replicate it into every 64-bit lane of
+; the destination register, choosing the instruction sequence from the active
+; cpuflags and register size.
+;   %1 = destination xmm/ymm register, %2 = 64-bit memory operand
+%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
+%if cpuflag(avx) && mmsize == 32
+; AVX with 32-byte registers: one instruction fills all four lanes.
+ vbroadcastsd %1, %2
+%elif cpuflag(sse3)
+; SSE3: movddup loads the value and duplicates it into both lanes.
+ movddup %1, %2
+%else ; sse2
+; Plain SSE2: load into the low lane, then copy the low lane to the high lane.
+ movsd %1, %2
+ movlhps %1, %1
+%endif
+%endmacro
+%endmacro
+
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80