author    Diego Biurrun <diego@biurrun.de>    2013-12-29 02:32:16 +0100
committer Diego Biurrun <diego@biurrun.de>    2014-05-29 06:41:15 -0700
commit    054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (patch)
tree      87098f4b0443359b7109066486c15fdaad09dddb
parent    256da0770e495176d1b2699ec6e9c7993c2a6d7b (diff)
download  ffmpeg-054013a0fc6f2b52c60cee3e051be8cc7f82cef3.tar.gz
dsputil: Move APE-specific bits into apedsp
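The scalarproduct_and_madd_int16() function is only used by the Monkey's
Audio (APE) decoder, so move it out of the generic DSPContext into a new,
codec-specific APEDSPContext, together with its ARM NEON, PPC AltiVec and
x86 SIMD versions. The C fallback moves into apedec.c and each arch gains
a small ff_apedsp_init_*() entry point.

A condensed sketch of the new call pattern, using the names introduced by
this patch (the surrounding APEContext fields and error handling are
omitted, and the decoder context is abbreviated to s):

    #include "apedsp.h"

    /* set the C reference implementation, then let the arch-specific
     * init functions override it when SIMD is available at runtime */
    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    if (ARCH_ARM) ff_apedsp_init_arm(&s->adsp);
    if (ARCH_PPC) ff_apedsp_init_ppc(&s->adsp);
    if (ARCH_X86) ff_apedsp_init_x86(&s->adsp);

    /* the per-sample filter step then calls through the context pointer */
    res = s->adsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
                                               f->adaptcoeffs - order,
                                               order, APESIGN(*data));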
-rw-r--r--  libavcodec/apedec.c                   33
-rw-r--r--  libavcodec/apedsp.h                   44
-rw-r--r--  libavcodec/arm/Makefile                2
-rw-r--r--  libavcodec/arm/apedsp_init_arm.c      38
-rw-r--r--  libavcodec/arm/apedsp_neon.S          62
-rw-r--r--  libavcodec/arm/dsputil_init_neon.c     5
-rw-r--r--  libavcodec/arm/int_neon.S             40
-rw-r--r--  libavcodec/dsputil.c                  15
-rw-r--r--  libavcodec/dsputil.h                  10
-rw-r--r--  libavcodec/ppc/Makefile                1
-rw-r--r--  libavcodec/ppc/apedsp_altivec.c       77
-rw-r--r--  libavcodec/ppc/int_altivec.c          42
-rw-r--r--  libavcodec/x86/Makefile                2
-rw-r--r--  libavcodec/x86/apedsp.asm            167
-rw-r--r--  libavcodec/x86/apedsp_init.c          47
-rw-r--r--  libavcodec/x86/dsputil.asm           137
-rw-r--r--  libavcodec/x86/dsputil_init.c         13
17 files changed, 468 insertions, 267 deletions
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index fb41918265..6329295c9a 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -25,6 +25,7 @@
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/opt.h"
+#include "apedsp.h"
#include "avcodec.h"
#include "dsputil.h"
#include "bytestream.h"
@@ -136,6 +137,7 @@ typedef struct APEContext {
AVClass *class; ///< class for AVOptions
AVCodecContext *avctx;
DSPContext dsp;
+ APEDSPContext adsp;
int channels;
int samples; ///< samples left to decode in current frame
int bps;
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
static void predictor_decode_mono_3950(APEContext *ctx, int count);
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
-// TODO: dsputilize
-
static av_cold int ape_decode_close(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
return 0;
}
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul)
+{
+ int res = 0;
+
+ while (order--) {
+ res += *v1 * *v2++;
+ *v1++ += mul * *v3++;
+ }
+ return res;
+}
+
static av_cold int ape_decode_init(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
s->predictor_decode_stereo = predictor_decode_stereo_3950;
}
+ s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+ if (ARCH_ARM)
+ ff_apedsp_init_arm(&s->adsp);
+ if (ARCH_PPC)
+ ff_apedsp_init_ppc(&s->adsp);
+ if (ARCH_X86)
+ ff_apedsp_init_x86(&s->adsp);
+
ff_dsputil_init(&s->dsp, avctx);
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
while (count--) {
/* round fixedpoint scalar product */
- res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
- f->adaptcoeffs - order,
- order, APESIGN(*data));
+ res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+ f->delay - order,
+ f->adaptcoeffs - order,
+ order, APESIGN(*data));
res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
*data++ = res;
diff --git a/libavcodec/apedsp.h b/libavcodec/apedsp.h
new file mode 100644
index 0000000000..64e2749679
--- /dev/null
+++ b/libavcodec/apedsp.h
@@ -0,0 +1,44 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ * based upon libdemac from Dave Chapman.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
+
+#include <stdint.h>
+
+typedef struct APEDSPContext {
+ /**
+ * Calculate scalar product of v1 and v2,
+ * and v1[i] += v3[i] * mul
+ * @param len length of vectors, should be multiple of 16
+ */
+ int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+ const int16_t *v2,
+ const int16_t *v3,
+ int len, int mul);
+} APEDSPContext;
+
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
+
+#endif /* AVCODEC_APEDSP_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 9d5b6aab5b..13025af9c1 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/apedsp_init_arm.c
new file mode 100644
index 0000000000..47ea034359
--- /dev/null
+++ b/libavcodec/arm/apedsp_init_arm.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+ const int16_t *v3, int len, int mul);
+
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+ }
+}
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/apedsp_neon.S
new file mode 100644
index 0000000000..7cfbf43c6d
--- /dev/null
+++ b/libavcodec/arm/apedsp_neon.S
@@ -0,0 +1,62 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+ vld1.16 {d28[],d29[]}, [sp]
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ mov r12, r0
+
+1: vld1.16 {d16-d17}, [r0,:128]!
+ vld1.16 {d18-d19}, [r1]!
+ vld1.16 {d20-d21}, [r2]!
+ vld1.16 {d22-d23}, [r0,:128]!
+ vld1.16 {d24-d25}, [r1]!
+ vld1.16 {d26-d27}, [r2]!
+ vmul.s16 q10, q10, q14
+ vmul.s16 q13, q13, q14
+ vmlal.s16 q0, d16, d18
+ vmlal.s16 q1, d17, d19
+ vadd.s16 q10, q8, q10
+ vadd.s16 q13, q11, q13
+ vmlal.s16 q2, d22, d24
+ vmlal.s16 q3, d23, d25
+ vst1.16 {q10}, [r12,:128]!
+ subs r3, r3, #16
+ vst1.16 {q13}, [r12,:128]!
+ bne 1b
+
+ vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d18, d4, d5
+ vpadd.s32 d19, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d18, d19
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 16e052dddd..c9bdaa5a78 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
- const int16_t *v3, int len, int mul);
-
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_neon;
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 3d2faffa48..42f37392e1 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
vmov.32 r0, d3[0]
bx lr
endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
- vld1.16 {d28[],d29[]}, [sp]
- vmov.i16 q0, #0
- vmov.i16 q1, #0
- vmov.i16 q2, #0
- vmov.i16 q3, #0
- mov r12, r0
-
-1: vld1.16 {d16-d17}, [r0,:128]!
- vld1.16 {d18-d19}, [r1]!
- vld1.16 {d20-d21}, [r2]!
- vld1.16 {d22-d23}, [r0,:128]!
- vld1.16 {d24-d25}, [r1]!
- vld1.16 {d26-d27}, [r2]!
- vmul.s16 q10, q10, q14
- vmul.s16 q13, q13, q14
- vmlal.s16 q0, d16, d18
- vmlal.s16 q1, d17, d19
- vadd.s16 q10, q8, q10
- vadd.s16 q13, q11, q13
- vmlal.s16 q2, d22, d24
- vmlal.s16 q3, d23, d25
- vst1.16 {q10}, [r12,:128]!
- subs r3, r3, #16
- vst1.16 {q13}, [r12,:128]!
- bne 1b
-
- vpadd.s32 d16, d0, d1
- vpadd.s32 d17, d2, d3
- vpadd.s32 d18, d4, d5
- vpadd.s32 d19, d6, d7
- vpadd.s32 d0, d16, d17
- vpadd.s32 d1, d18, d19
- vpadd.s32 d2, d0, d1
- vpaddl.s32 d3, d2
- vmov.32 r0, d3[0]
- bx lr
-endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 11447c01e8..6b846588fa 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
return res;
}
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul)
-{
- int res = 0;
-
- while (order--) {
- res += *v1 * *v2++;
- *v1++ += mul * *v3++;
- }
- return res;
-}
-
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len)
{
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
- c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
c->scalarproduct_int16 = scalarproduct_int16_c;
c->vector_clip_int32 = vector_clip_int32_c;
c->vector_clipf = vector_clipf_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d261f7e702..471988bddd 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -255,16 +255,6 @@ typedef struct DSPContext {
*/
int32_t (*scalarproduct_int16)(const int16_t *v1,
const int16_t *v2 /* align 16 */, int len);
- /* ape functions */
- /**
- * Calculate scalar product of v1 and v2,
- * and v1[i] += v3[i] * mul
- * @param len length of vectors, should be multiple of 16
- */
- int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
- const int16_t *v2,
- const int16_t *v3,
- int len, int mul);
/**
* Clip each element in an array of int32_t to a given minimum and
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index ec0674c817..b78d4be8ae 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
+OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/apedsp_altivec.c
new file mode 100644
index 0000000000..de9df45c6c
--- /dev/null
+++ b/libavcodec/ppc/apedsp_altivec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavcodec/apedsp.h"
+
+#if HAVE_ALTIVEC
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
+ const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul)
+{
+ LOAD_ZERO;
+ vec_s16 *pv1 = (vec_s16 *) v1;
+ register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
+ register vec_s16 t0, t1, i0, i1, i4;
+ register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+ register vec_s32 res = zero_s32v;
+ register vec_u8 align = vec_lvsl(0, v2);
+ int32_t ires;
+
+ order >>= 4;
+ do {
+ i1 = vec_ld(16, v2);
+ t0 = vec_perm(i2, i1, align);
+ i2 = vec_ld(32, v2);
+ t1 = vec_perm(i1, i2, align);
+ i0 = pv1[0];
+ i1 = pv1[1];
+ res = vec_msum(t0, i0, res);
+ res = vec_msum(t1, i1, res);
+ i4 = vec_ld(16, v3);
+ t0 = vec_perm(i3, i4, align);
+ i3 = vec_ld(32, v3);
+ t1 = vec_perm(i4, i3, align);
+ pv1[0] = vec_mladd(t0, muls, i0);
+ pv1[1] = vec_mladd(t1, muls, i1);
+ pv1 += 2;
+ v2 += 16;
+ v3 += 16;
+ } while (--order);
+ res = vec_splat(vec_sums(res, zero_s32v), 3);
+ vec_ste(res, 0, &ires);
+
+ return ires;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+{
+#if HAVE_ALTIVEC
+ c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index fa3cb66095..d76d34a5b1 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
return ires;
}
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
- const int16_t *v2,
- const int16_t *v3,
- int order, int mul)
-{
- LOAD_ZERO;
- vec_s16 *pv1 = (vec_s16 *) v1;
- register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
- register vec_s16 t0, t1, i0, i1, i4;
- register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
- register vec_s32 res = zero_s32v;
- register vec_u8 align = vec_lvsl(0, v2);
- int32_t ires;
-
- order >>= 4;
- do {
- i1 = vec_ld(16, v2);
- t0 = vec_perm(i2, i1, align);
- i2 = vec_ld(32, v2);
- t1 = vec_perm(i1, i2, align);
- i0 = pv1[0];
- i1 = pv1[1];
- res = vec_msum(t0, i0, res);
- res = vec_msum(t1, i1, res);
- i4 = vec_ld(16, v3);
- t0 = vec_perm(i3, i4, align);
- i3 = vec_ld(32, v3);
- t1 = vec_perm(i4, i3, align);
- pv1[0] = vec_mladd(t0, muls, i0);
- pv1[1] = vec_mladd(t1, muls, i1);
- pv1 += 2;
- v2 += 16;
- v3 += 16;
- } while (--order);
- res = vec_splat(vec_sums(res, zero_s32v), 3);
- vec_ste(res, 0, &ires);
-
- return ires;
-}
-
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
- c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22a8f..10242269c2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000000..d721ebda6b
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+; int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+%if mmsize == 16
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+%else
+ pshufw m7, m7, 0
+%endif
+ pxor m6, m6
+ add v1q, orderq
+ add v2q, orderq
+ add v3q, orderq
+ neg orderq
+.loop:
+ movu m0, [v2q + orderq]
+ movu m1, [v2q + orderq + mmsize]
+ mova m4, [v1q + orderq]
+ mova m5, [v1q + orderq + mmsize]
+ movu m2, [v3q + orderq]
+ movu m3, [v3q + orderq + mmsize]
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmullw m2, m7
+ pmullw m3, m7
+ paddd m6, m0
+ paddd m6, m1
+ paddw m2, m4
+ paddw m3, m5
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ add orderq, mmsize*2
+ jl .loop
+%if mmsize == 16
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+%else
+ pshufw m0, m6, 0x4e
+%endif
+ paddd m6, m0
+ movd eax, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+ sub orderq, mmsize*2
+%if %1
+ mova m1, m4
+ mova m4, [v2q + orderq]
+ mova m0, [v2q + orderq + mmsize]
+ palignr m1, m0, %1
+ palignr m0, m4, %1
+ mova m3, m5
+ mova m5, [v3q + orderq]
+ mova m2, [v3q + orderq + mmsize]
+ palignr m3, m2, %1
+ palignr m2, m5, %1
+%else
+ mova m0, [v2q + orderq]
+ mova m1, [v2q + orderq + mmsize]
+ mova m2, [v3q + orderq]
+ mova m3, [v3q + orderq + mmsize]
+%endif
+ %define t0 [v1q + orderq]
+ %define t1 [v1q + orderq + mmsize]
+%if ARCH_X86_64
+ mova m8, t0
+ mova m9, t1
+ %define t0 m8
+ %define t1 m9
+%endif
+ pmaddwd m0, t0
+ pmaddwd m1, t1
+ pmullw m2, m7
+ pmullw m3, m7
+ paddw m2, t0
+ paddw m3, t1
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ jg .loop%1
+%if %1
+ jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+; int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+ pxor m6, m6
+ mov r4d, v2d
+ and r4d, 15
+ and v2q, ~15
+ and v3q, ~15
+ mova m4, [v2q + orderq]
+ mova m5, [v3q + orderq]
+ ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+ cmp r4d, 0
+ je .loop0
+ cmp r4d, 2
+ je .loop2
+ cmp r4d, 4
+ je .loop4
+ cmp r4d, 6
+ je .loop6
+ cmp r4d, 8
+ je .loop8
+ cmp r4d, 10
+ je .loop10
+ cmp r4d, 12
+ je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+ paddd m6, m0
+ movd eax, m6
+ RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000000..f692c2b9b6
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+ if (EXTERNAL_SSSE3(cpu_flags) &&
+ !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b7fc..b5d6d3cc65 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
paddd m2, m0
movd eax, m2
RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-; int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
-%if mmsize == 16
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
-%else
- pshufw m7, m7, 0
-%endif
- pxor m6, m6
- add v1q, orderq
- add v2q, orderq
- add v3q, orderq
- neg orderq
-.loop:
- movu m0, [v2q + orderq]
- movu m1, [v2q + orderq + mmsize]
- mova m4, [v1q + orderq]
- mova m5, [v1q + orderq + mmsize]
- movu m2, [v3q + orderq]
- movu m3, [v3q + orderq + mmsize]
- pmaddwd m0, m4
- pmaddwd m1, m5
- pmullw m2, m7
- pmullw m3, m7
- paddd m6, m0
- paddd m6, m1
- paddw m2, m4
- paddw m3, m5
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- add orderq, mmsize*2
- jl .loop
-%if mmsize == 16
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
-%else
- pshufw m0, m6, 0x4e
-%endif
- paddd m6, m0
- movd eax, m6
- RET
%endmacro
INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
- sub orderq, mmsize*2
-%if %1
- mova m1, m4
- mova m4, [v2q + orderq]
- mova m0, [v2q + orderq + mmsize]
- palignr m1, m0, %1
- palignr m0, m4, %1
- mova m3, m5
- mova m5, [v3q + orderq]
- mova m2, [v3q + orderq + mmsize]
- palignr m3, m2, %1
- palignr m2, m5, %1
-%else
- mova m0, [v2q + orderq]
- mova m1, [v2q + orderq + mmsize]
- mova m2, [v3q + orderq]
- mova m3, [v3q + orderq + mmsize]
-%endif
- %define t0 [v1q + orderq]
- %define t1 [v1q + orderq + mmsize]
-%if ARCH_X86_64
- mova m8, t0
- mova m9, t1
- %define t0 m8
- %define t1 m9
-%endif
- pmaddwd m0, t0
- pmaddwd m1, t1
- pmullw m2, m7
- pmullw m3, m7
- paddw m2, t0
- paddw m3, t1
- paddd m6, m0
- paddd m6, m1
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- jg .loop%1
-%if %1
- jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-; int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
- pxor m6, m6
- mov r4d, v2d
- and r4d, 15
- and v2q, ~15
- and v3q, ~15
- mova m4, [v2q + orderq]
- mova m5, [v3q + orderq]
- ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
- cmp r4d, 0
- je .loop0
- cmp r4d, 2
- je .loop2
- cmp r4d, 4
- je .loop4
- cmp r4d, 6
- je .loop6
- cmp r4d, 8
- je .loop8
- cmp r4d, 10
- je .loop10
- cmp r4d, 12
- je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
- paddd m6, m0
- movd eax, m6
- RET
-
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166db4..9b0788ff73 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
#endif /* HAVE_MMXEXT_EXTERNAL */
}
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
#if HAVE_SSE2_EXTERNAL
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_SSSE3_EXTERNAL
- if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}