aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ramiro Polla <ramiro.polla@gmail.com> 2009-05-23 00:23:30 +0000
committer Ramiro Polla <ramiro.polla@gmail.com> 2009-05-23 00:23:30 +0000
commit 5624766d18f0529487d074e96dce9084b6561996 (patch)
tree d1af178e0d17d70889548e9224be2750622cc777
parent c2b4c859a61f290b46252b0a884e888e81bd152c (diff)
download ffmpeg-5624766d18f0529487d074e96dce9084b6561996.tar.gz
MLP DSP functions x86-optimized.
12.59% overall speedup in x86_32 and 9.98% overall speedup in x86_64, compared to gcc 4.3.3. Originally committed as revision 18903 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/Makefile 2
-rw-r--r-- libavcodec/mlpdsp.c 4
-rw-r--r-- libavcodec/x86/mlpdsp.c 190
3 files changed, 196 insertions, 0 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 551576a7bb..18ca9eae62 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -445,8 +445,10 @@ MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o
MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o
MMX-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flacdsp_mmx.o
MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
+MMX-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp_mmx.o
MMX-OBJS-$(CONFIG_THEORA_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 6519b16eab..79059d925a 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -55,7 +55,11 @@ static void ff_mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}
+void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx); /* x86 hook; prototype is declared locally in this file */
+/* ff_mlp_init: install the C filter, then let the x86 initializer override it on x86 builds. */
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx)
{
c->mlp_filter_channel = ff_mlp_filter_channel;
+ if (ARCH_X86) /* ordinary C condition on the ARCH_X86 configuration macro, not an #if */
+ ff_mlp_init_x86(c, avctx);
}
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp.c
new file mode 100644
index 0000000000..7577b09fd5
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.c
@@ -0,0 +1,190 @@
+/*
+ * MLP DSP functions x86-optimized
+ * Copyright (c) 2009 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mlp.h"
+
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+/* FIR entry labels; they are defined by the inline asm in mlp_filter_channel_x86() and only their addresses are used. */
+extern char ff_mlp_firorder_8;
+extern char ff_mlp_firorder_7;
+extern char ff_mlp_firorder_6;
+extern char ff_mlp_firorder_5;
+extern char ff_mlp_firorder_4;
+extern char ff_mlp_firorder_3;
+extern char ff_mlp_firorder_2;
+extern char ff_mlp_firorder_1;
+extern char ff_mlp_firorder_0;
+/* IIR entry labels, declared as char (not void) so that taking their address is valid C. */
+extern char ff_mlp_iirorder_4;
+extern char ff_mlp_iirorder_3;
+extern char ff_mlp_iirorder_2;
+extern char ff_mlp_iirorder_1;
+extern char ff_mlp_iirorder_0;
+/* Read-only jump tables: index N holds the address of the asm label for filter order N. */
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+ &ff_mlp_firorder_2, &ff_mlp_firorder_3,
+ &ff_mlp_firorder_4, &ff_mlp_firorder_5,
+ &ff_mlp_firorder_6, &ff_mlp_firorder_7,
+ &ff_mlp_firorder_8 };
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+ &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
+ &ff_mlp_iirorder_4 };
+/* Architecture-specific building blocks for the asm template: each macro expands to string literals. */
+#if ARCH_X86_64
+/* 64-bit variant: the running sum lives in the single register rsi. */
+#define MLPMUL(label, offset, offs, offc) \
+ MANGLE(label)": \n\t" /* emits the entry label for this tap */ \
+ "movslq "offset"+"offs"(%0), %%rax\n\t" /* sign-extend the 32-bit value at operand 0 + offset + offs */ \
+ "movslq "offset"+"offc"(%1), %%rdx\n\t" /* sign-extend the 32-bit value at operand 1 + offset + offc */ \
+ "imul %%rdx, %%rax\n\t" \
+ "add %%rax, %%rsi\n\t"
+/* Like MLPMUL, but the multiplier comes from asm operand number firc instead of memory. */
+#define FIRMULREG(label, offset, firc)\
+ MANGLE(label)": \n\t" \
+ "movslq "#offset"(%0), %%rax\n\t" \
+ "imul %"#firc", %%rax\n\t" \
+ "add %%rax, %%rsi\n\t"
+/* Zero the running sum. */
+#define CLEAR_ACCUM \
+ "xor %%rsi, %%rsi\n\t"
+/* Shift the running sum right by the count in cl (see COUNTER). */
+#define SHIFT_ACCUM \
+ "shr %%cl, %%rsi\n\t"
+/* Register names spliced into the template after SHIFT_ACCUM has run. */
+#define ACCUM "%%rdx" /* receives a copy of the shifted sum */
+#define RESULT "%%rsi" /* holds the shifted sum on exit from SHIFT_ACCUM */
+#define RESULT32 "%%esi" /* low 32 bits of RESULT, used for the stores */
+/* Operand constraints: values are kept in registers, and the shift count is pinned to the c register. */
+#define READVAL "r"
+#define RDWRVAL "+r"
+#define COUNTER "c"
+#define ECXUSED /* empty: no extra clobber entry is appended in this variant */
+
+#else /* if ARCH_X86_32 */
+/* 32-bit variant: the 64-bit running sum is the register pair ecx:esi (high:low). */
+#define MLPMUL(label, offset, offs, offc) \
+ MANGLE(label)": \n\t" /* emits the entry label for this tap */ \
+ "mov "offset"+"offs"(%0), %%eax\n\t" \
+ "imull "offset"+"offc"(%1) \n\t" /* one-operand imul: edx:eax = eax * memory operand */ \
+ "add %%eax , %%esi\n\t" \
+ "adc %%edx , %%ecx\n\t" /* carry the low-half addition into the high half */
+/* The firc argument is ignored here; the coefficient is read from memory like any other tap. */
+#define FIRMULREG(label, offset, firc) \
+ MLPMUL(label, #offset, "0", "0")
+/* Zero both halves of the running sum. */
+#define CLEAR_ACCUM \
+ "xor %%esi, %%esi\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+/* Move the sum to edx:eax, load the shift count from operand 7 into ecx, leave the shifted low word in eax. */
+#define SHIFT_ACCUM \
+ "mov %%ecx, %%edx\n\t" \
+ "mov %%esi, %%eax\n\t" \
+ "movzbl %7 , %%ecx\n\t" /* only the low byte of operand 7 is read */ \
+ "shrd %%cl, %%edx, %%eax\n\t" /* NOTE: the trailing backslash splices the following empty line into this macro */ \
+
+#define ACCUM "%%edx" /* receives a copy of the shifted sum */
+#define RESULT "%%eax" /* holds the shifted low word on exit from SHIFT_ACCUM */
+#define RESULT32 "%%eax" /* same register as RESULT in this variant */
+/* Operand constraints: values are kept in memory; ecx is used as a scratch register and is listed as a clobber. */
+#define READVAL "m"
+#define RDWRVAL "+m"
+#define COUNTER "m"
+#define ECXUSED , "ecx"
+
+#endif /* !ARCH_X86_64 */
+/* Byte offsets, stringified so they can be pasted into the asm template. */
+#define BINC AV_STRINGIFY(4* MAX_CHANNELS) /* amount added to the sample_buffer pointer after each sample */
+#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE)) /* offset from the state pointer used by the IIR taps */
+#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER) /* offset from the coeff pointer used by the IIR taps */
+/* One multiply-accumulate tap: FIR taps use no extra displacement, IIR taps add IOFFS/IOFFC. */
+#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
+#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
+/* x86 inline-asm channel filter: per sample, sum the selected FIR and IIR taps, shift, add the input sample, mask and store. */
+static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, /* state is read, written and walked backwards; coeff is read only */
+ int firorder, int iirorder, /* indices into firtable[9] / iirtable[5]; not range-checked here */
+ unsigned int filter_shift, int32_t mask, /* right-shift count for the sum; AND mask for the output */
+ int blocksize, int32_t *sample_buffer) /* number of samples; buffer updated in place, BINC bytes apart */
+{
+ const void *firjump = firtable[firorder]; /* address of the first FIR tap to execute */
+ const void *iirjump = iirtable[iirorder]; /* address of the first IIR tap to execute */
+
+ blocksize = -blocksize; /* count up towards zero so that "incl; js" can end the loop */
+
+ __asm__ volatile(
+ "1: \n\t"
+ CLEAR_ACCUM
+ "jmp *%5 \n\t" /* enter the FIR chain at the label for firorder; execution falls through the lower taps */
+ FIRMUL (ff_mlp_firorder_8, 0x1c )
+ FIRMUL (ff_mlp_firorder_7, 0x18 )
+ FIRMUL (ff_mlp_firorder_6, 0x14 )
+ FIRMUL (ff_mlp_firorder_5, 0x10 )
+ FIRMUL (ff_mlp_firorder_4, 0x0c )
+ FIRMULREG(ff_mlp_firorder_3, 0x08,10) /* on x86_64 the multiplier is operand 10 (coeff[2]) */
+ FIRMULREG(ff_mlp_firorder_2, 0x04, 9) /* on x86_64 the multiplier is operand 9 (coeff[1]) */
+ FIRMULREG(ff_mlp_firorder_1, 0x00, 8) /* on x86_64 the multiplier is operand 8 (coeff[0]) */
+ MANGLE (ff_mlp_firorder_0)":\n\t"
+ "jmp *%6 \n\t" /* enter the IIR chain at the label for iirorder */
+ IIRMUL (ff_mlp_iirorder_4, 0x0c )
+ IIRMUL (ff_mlp_iirorder_3, 0x08 )
+ IIRMUL (ff_mlp_iirorder_2, 0x04 )
+ IIRMUL (ff_mlp_iirorder_1, 0x00 )
+ MANGLE (ff_mlp_iirorder_0)":\n\t"
+ SHIFT_ACCUM
+ "mov "RESULT" ,"ACCUM" \n\t" /* keep a copy of the shifted sum */
+ "add (%2) ,"RESULT" \n\t" /* add the current input sample */
+ "and %4 ,"RESULT" \n\t" /* apply mask */
+ "sub $4 , %0 \n\t" /* step the state pointer back by one int32_t */
+ "mov "RESULT32", (%0) \n\t" /* store the new sample at the new state position */
+ "mov "RESULT32", (%2) \n\t" /* and overwrite the input sample with it */
+ "add $"BINC" , %2 \n\t" /* advance sample_buffer by BINC bytes */
+ "sub "ACCUM" ,"RESULT" \n\t" /* masked sample minus the shifted sum */
+ "mov "RESULT32","IOFFS"(%0) \n\t" /* stored IOFFS bytes past the state pointer, where the IIR taps read */
+ "incl %3 \n\t"
+ "js 1b \n\t" /* loop while the negated counter is still negative */
+ : /* 0*/"+r"(state),
+ /* 1*/"+r"(coeff),
+ /* 2*/"+r"(sample_buffer),
+ /* 3*/RDWRVAL(blocksize)
+ :
+ /* 4*/READVAL((x86_reg)mask),
+ /* 5*/READVAL(firjump),
+ /* 6*/READVAL(iirjump),
+ /* 7*/COUNTER(filter_shift)
+#if ARCH_X86_64
+ , /* 8*/"r"((int64_t)coeff[0])
+ , /* 9*/"r"((int64_t)coeff[1])
+ , /*10*/"r"((int64_t)coeff[2])
+#endif /* ARCH_X86_64 */
+ : "memory", REG_a, REG_d, REG_S /* "memory": the asm loads and stores through state, coeff and sample_buffer */
+ ECXUSED
+ );
+}
+
+#endif /* HAVE_7REGS && HAVE_TEN_OPERANDS */
+/* Install the inline-asm channel filter when the build supports it; otherwise c is left untouched. avctx is not used. */
+void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx)
+{
+#if HAVE_7REGS && HAVE_TEN_OPERANDS /* same condition that guards the definition of mlp_filter_channel_x86 */
+ c->mlp_filter_channel = mlp_filter_channel_x86;
+#endif /* HAVE_7REGS && HAVE_TEN_OPERANDS */
+}