aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJames Almer <jamrial@gmail.com>2016-01-24 23:52:19 -0300
committerJames Almer <jamrial@gmail.com>2016-01-25 14:57:38 -0300
commit209f50e16b5e66424d593ba4f9d4d8be5feff947 (patch)
tree6c1961c7a75a07ec7af466047d3bfe50edc09630
parent5dc37a5d8abd18755c5a2cae0840c32579ec299c (diff)
downloadffmpeg-209f50e16b5e66424d593ba4f9d4d8be5feff947.tar.gz
avcodec/synth_filter: split off remaining code from dcadec files
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r--libavcodec/aarch64/Makefile3
-rw-r--r--libavcodec/aarch64/dcadsp_init.c21
-rw-r--r--libavcodec/aarch64/synth_filter_init.c47
-rw-r--r--libavcodec/arm/Makefile3
-rw-r--r--libavcodec/arm/dcadsp_init_arm.c22
-rw-r--r--libavcodec/arm/synth_filter_init_arm.c49
-rw-r--r--libavcodec/x86/Makefile6
-rw-r--r--libavcodec/x86/dcadsp.asm222
-rw-r--r--libavcodec/x86/dcadsp_init.c51
-rw-r--r--libavcodec/x86/synth_filter.asm246
-rw-r--r--libavcodec/x86/synth_filter_init.c74
11 files changed, 424 insertions, 320 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 022ed847a3..99f590c650 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,4 +1,5 @@
-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o \
+ aarch64/synth_filter_init.o
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
index 78642a5ed8..4440e4b95f 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -24,23 +24,10 @@
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/dcadsp.h"
-#include "libavcodec/fft.h"
-
-#include "asm-offsets.h"
-
-#if HAVE_NEON || HAVE_VFP
-AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
-#endif
void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-void ff_synth_filter_float_neon(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -50,11 +37,3 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
}
}
-
-av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_neon;
-}
diff --git a/libavcodec/aarch64/synth_filter_init.c b/libavcodec/aarch64/synth_filter_init.c
new file mode 100644
index 0000000000..767b01112a
--- /dev/null
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
+
+#include "asm-offsets.h"
+
+#if HAVE_NEON || HAVE_VFP
+AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
+#endif
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
+
+av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+ /* AArch64 init: point s->synth_filter_float at the NEON routine when have_neon() accepts the flags; otherwise *s is not modified. */
+ if (have_neon(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_neon;
+}
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index cdd35b08ea..6a29a5fbb7 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -36,7 +36,8 @@ OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
+ arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 0f2e4c49c9..febb4445d2 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -37,18 +37,6 @@ void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
const float window[512], float *samples_out,
float raXin[32], float scale);
-void ff_synth_filter_float_vfp(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
-void ff_synth_filter_float_neon(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -63,13 +51,3 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
}
}
-
-av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_vfp;
- if (have_neon(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_neon;
-}
diff --git a/libavcodec/arm/synth_filter_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c
new file mode 100644
index 0000000000..ea0ce148d4
--- /dev/null
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
+
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
+
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+ /* ARM init: the two checks are independent; the NEON assignment comes last, so it replaces the VFP one when both checks pass. */
+ if (have_vfp_vm(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_vfp;
+ if (have_neon(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_neon;
+}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 0d09fe6663..bcb42332a0 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -44,7 +44,8 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o \
+ x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
@@ -132,7 +133,8 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
+YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o \
+ x86/synth_filter.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o \
x86/dwt_yasm.o
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 502b70a4cb..55e73bcc29 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -121,225 +121,3 @@ DCA_LFE_FIR 1
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
-
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
- pxor %1, %1
-%else
- xorps %1, %1, %1
-%endif
-%endmacro
-
-%macro SHUF 3
-%if cpuflag(avx)
- mova %3, [%2 - 16]
- vperm2f128 %1, %3, %3, 1
- vshufps %1, %1, %1, q0123
-%elif cpuflag(sse2)
- pshufd %1, [%2], q0123
-%else
- mova %1, [%2]
- shufps %1, %1, q0123
-%endif
-%endmacro
-
-%macro INNER_LOOP 1
- ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
- ;~ a += window[i + j] * (-synth_buf[15 - i + j])
- ;~ b += window[i + j + 16] * (synth_buf[i + j])
- SHUF m5, ptr2 + j + (15 - 3) * 4, m6
- mova m6, [ptr1 + j]
-%if ARCH_X86_64
- SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
- mova m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
- fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
- fnmaddps m1, m5, [win + %1 + j], m1
-%if ARCH_X86_64
- fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
- fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
- mulps m6, m6, [win + %1 + j + 16 * 4]
- mulps m5, m5, [win + %1 + j]
-%if ARCH_X86_64
- mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m2, m6
- subps m1, m1, m5
-%if ARCH_X86_64
- addps m8, m8, m12
- subps m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
- ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
- ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- SHUF m6, ptr2 + j + (31 - 3) * 4, m5
- mova m5, [ptr1 + j + 16 * 4]
-%if ARCH_X86_64
- SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
- mova m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
-%if cpuflag(fma3)
- fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
- fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
- fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
- fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
- mulps m5, m5, [win + %1 + j + 32 * 4]
- mulps m6, m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m3, m5
- addps m4, m4, m6
-%if ARCH_X86_64
- addps m9, m9, m11
- addps m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
- sub j, 64 * 4
-%endmacro
-
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
- synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
- movd scale, scalem
- SPLATD m0
-%else
- VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ r4q
-%else
- SPLATD xmm0
-%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ offq
-%endif
- ; prepare inner counter limit 1
- mov r5q, 480
- sub r5q, offmp
- and r5q, -64
- shl r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
- mov OFFQ, r5q
-%define i r5q
- mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
-%else
-%define i 0
-%define OFFQ r5q
-%endif
-
-%define buf2 synth_buf2q
-%if ARCH_X86_32
- mov buf2, synth_buf2mp
-%endif
-.mainloop:
- ; m1 = a m2 = b m3 = c m4 = d
- SETZERO m3
- SETZERO m4
- mova m1, [buf2 + i]
- mova m2, [buf2 + i + 16 * 4]
-%if ARCH_X86_32
-%define ptr1 r0q
-%define ptr2 r1q
-%define win r2q
-%define j r3q
- mov win, windowm
- mov ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
- add win, i
- add ptr1, i
-%endif
-%else ; ARCH_X86_64
-%define ptr1 r6q
-%define ptr2 r7q ; must be loaded
-%define win r8q
-%define j r9q
- SETZERO m9
- SETZERO m10
- mova m7, [buf2 + i + mmsize]
- mova m8, [buf2 + i + mmsize + 16 * 4]
- lea win, [windowq + i]
- lea ptr1, [synth_bufq + i]
-%endif
- mov ptr2, synth_bufmp
- ; prepare the inner loop counter
- mov j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
- sub ptr2, i
-%endif
-.loop1:
- INNER_LOOP 0
- jge .loop1
-
- mov j, 448 * 4
- sub j, OFFQ
- jz .end
- sub ptr1, j
- sub ptr2, j
- add win, OFFQ ; now at j-64, so define OFFSET
- sub j, 64 * 4
-.loop2:
- INNER_LOOP 64 * 4
- jge .loop2
-
-.end:
-%if ARCH_X86_32
- mov buf2, synth_buf2m ; needed for next iteration anyway
- mov outq, outmp ; j, which will be set again during it
-%endif
- ;~ out[i] = a * scale;
- ;~ out[i + 16] = b * scale;
- mulps m1, m1, scale
- mulps m2, m2, scale
-%if ARCH_X86_64
- mulps m7, m7, scale
- mulps m8, m8, scale
-%endif
- ;~ synth_buf2[i] = c;
- ;~ synth_buf2[i + 16] = d;
- mova [buf2 + i + 0 * 4], m3
- mova [buf2 + i + 16 * 4], m4
-%if ARCH_X86_64
- mova [buf2 + i + 0 * 4 + mmsize], m9
- mova [buf2 + i + 16 * 4 + mmsize], m10
-%endif
- ;~ out[i] = a;
- ;~ out[i + 16] = a;
- mova [outq + i + 0 * 4], m1
- mova [outq + i + 16 * 4], m2
-%if ARCH_X86_64
- mova [outq + i + 0 * 4 + mmsize], m7
- mova [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
- sub i, (ARCH_X86_64 + 1) * mmsize
- jge .mainloop
-%endif
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
-%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 1321dda652..c27c045d1d 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -40,54 +40,3 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
}
}
-
-
-#define SYNTH_FILTER_FUNC(opt) \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
- const float window[512], \
- float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct, \
- float *synth_buf_ptr, int *synth_buf_offset, \
- float synth_buf2[32], const float window[512], \
- float out[32], const float in[32], float scale) \
-{ \
- float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
- \
- imdct->imdct_half(imdct, synth_buf, in); \
- \
- ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
- out, *synth_buf_offset, scale); \
- \
- *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
-} \
-
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_SSE(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse;
- }
-#endif
- if (EXTERNAL_SSE2(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse2;
- }
- if (EXTERNAL_AVX_FAST(cpu_flags)) {
- s->synth_filter_float = synth_filter_avx;
- }
- if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
- s->synth_filter_float = synth_filter_fma3;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm
new file mode 100644
index 0000000000..bc1a48f409
--- /dev/null
+++ b/libavcodec/x86/synth_filter.asm
@@ -0,0 +1,246 @@
+;******************************************************************************
+;* SSE/SSE2/AVX/FMA3-optimized synth filter functions
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1
+%else
+ xorps %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+ mova %3, [%2 - 16]
+ vperm2f128 %1, %3, %3, 1
+ vshufps %1, %1, %1, q0123
+%elif cpuflag(sse2)
+ pshufd %1, [%2], q0123
+%else
+ mova %1, [%2]
+ shufps %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP 1
+ ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+ ;~ a += window[i + j] * (-synth_buf[15 - i + j])
+ ;~ b += window[i + j + 16] * (synth_buf[i + j])
+ SHUF m5, ptr2 + j + (15 - 3) * 4, m6
+ mova m6, [ptr1 + j]
+%if ARCH_X86_64
+ SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
+ mova m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+ fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
+ fnmaddps m1, m5, [win + %1 + j], m1
+%if ARCH_X86_64
+ fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+ fnmaddps m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+ mulps m6, m6, [win + %1 + j + 16 * 4]
+ mulps m5, m5, [win + %1 + j]
+%if ARCH_X86_64
+ mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
+ mulps m11, m11, [win + %1 + j + mmsize]
+%endif
+ addps m2, m2, m6
+ subps m1, m1, m5
+%if ARCH_X86_64
+ addps m8, m8, m12
+ subps m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+ ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+ ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+ SHUF m6, ptr2 + j + (31 - 3) * 4, m5
+ mova m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+ SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
+ mova m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+ fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
+ fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+ fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+ fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+ mulps m5, m5, [win + %1 + j + 32 * 4]
+ mulps m6, m6, [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+ mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
+ mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+ addps m3, m3, m5
+ addps m4, m4, m6
+%if ARCH_X86_64
+ addps m9, m9, m11
+ addps m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+ sub j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+; const float window[512], float out[32],
+; intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+ synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd scale, scalem
+ SPLATD m0
+%else
+ VBROADCASTSS m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ r4q
+%else
+ SPLATD xmm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xmm0, 1
+%endif
+%define OFFQ offq
+%endif
+ ; prepare inner counter limit 1
+ mov r5q, 480
+ sub r5q, offmp
+ and r5q, -64
+ shl r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+ mov OFFQ, r5q
+%define i r5q
+ mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+%else
+%define i 0
+%define OFFQ r5q
+%endif
+
+%define buf2 synth_buf2q
+%if ARCH_X86_32
+ mov buf2, synth_buf2mp
+%endif
+.mainloop:
+ ; m1 = a m2 = b m3 = c m4 = d
+ SETZERO m3
+ SETZERO m4
+ mova m1, [buf2 + i]
+ mova m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1 r0q
+%define ptr2 r1q
+%define win r2q
+%define j r3q
+ mov win, windowm
+ mov ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+ add win, i
+ add ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1 r6q
+%define ptr2 r7q ; must be loaded
+%define win r8q
+%define j r9q
+ SETZERO m9
+ SETZERO m10
+ mova m7, [buf2 + i + mmsize]
+ mova m8, [buf2 + i + mmsize + 16 * 4]
+ lea win, [windowq + i]
+ lea ptr1, [synth_bufq + i]
+%endif
+ mov ptr2, synth_bufmp
+ ; prepare the inner loop counter
+ mov j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub ptr2, i
+%endif
+.loop1:
+ INNER_LOOP 0
+ jge .loop1
+
+ mov j, 448 * 4
+ sub j, OFFQ
+ jz .end
+ sub ptr1, j
+ sub ptr2, j
+ add win, OFFQ ; now at j-64, so define OFFSET
+ sub j, 64 * 4
+.loop2:
+ INNER_LOOP 64 * 4
+ jge .loop2
+
+.end:
+%if ARCH_X86_32
+ mov buf2, synth_buf2m ; needed for next iteration anyway
+ mov outq, outmp ; j, which will be set again during it
+%endif
+ ;~ out[i] = a * scale;
+ ;~ out[i + 16] = b * scale;
+ mulps m1, m1, scale
+ mulps m2, m2, scale
+%if ARCH_X86_64
+ mulps m7, m7, scale
+ mulps m8, m8, scale
+%endif
+ ;~ synth_buf2[i] = c;
+ ;~ synth_buf2[i + 16] = d;
+ mova [buf2 + i + 0 * 4], m3
+ mova [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+ mova [buf2 + i + 0 * 4 + mmsize], m9
+ mova [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+ ;~ out[i] = a;
+ ;~ out[i + 16] = b;
+ mova [outq + i + 0 * 4], m1
+ mova [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+ mova [outq + i + 0 * 4 + mmsize], m7
+ mova [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub i, (ARCH_X86_64 + 1) * mmsize
+ jge .mainloop
+%endif
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c
new file mode 100644
index 0000000000..0649ea20a6
--- /dev/null
+++ b/libavcodec/x86/synth_filter_init.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt) /* declares ff_synth_filter_inner_<opt> and defines the static wrapper synth_filter_<opt> around it */ \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
+ const float window[512], \
+ float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct, \
+ float *synth_buf_ptr, int *synth_buf_offset, \
+ float synth_buf2[32], const float window[512], \
+ float out[32], const float in[32], float scale) \
+{ \
+ float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
+ /* call the context's imdct_half hook on the buffer at the current offset and in[] */ \
+ imdct->imdct_half(imdct, synth_buf, in); \
+ /* pass that buffer position and the current offset to the <opt> inner routine declared above */ \
+ ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
+ out, *synth_buf_offset, scale); \
+ /* move the stored offset back by 32 and mask it into 0..511 for the next call */ \
+ *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
+} \
+
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_YASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+ /* x86 init: checks run in sequence and each match overwrites s->synth_filter_float, so the last matching variant wins; without HAVE_YASM the body is empty. */
+#if ARCH_X86_32
+ if (EXTERNAL_SSE(cpu_flags)) { /* the plain-SSE wrapper is only instantiated for 32-bit builds */
+ s->synth_filter_float = synth_filter_sse;
+ }
+#endif
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse2;
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ s->synth_filter_float = synth_filter_avx;
+ }
+ if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) { /* FMA3 variant is not selected when the AVXSLOW flag is set */
+ s->synth_filter_float = synth_filter_fma3;
+ }
+#endif /* HAVE_YASM */
+}