aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArpi <arpi@thot.banki.hu>2003-04-16 20:03:07 +0000
committerArpi <arpi@thot.banki.hu>2003-04-16 20:03:07 +0000
commit1a7c3c85622fa0ff48127cfe38ece15e27bfb17c (patch)
treed2129e00537036d9851375e045518d4e2b1d4597
parent6814a25c676ae3f0eb73a2d7180b7fe9e62a62ec (diff)
downloadffmpeg-1a7c3c85622fa0ff48127cfe38ece15e27bfb17c.tar.gz
Utility functions (CRC calc & float->int converters)
[imported from MPlayer, based on a52dec's libao] Originally committed as revision 1780 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--libavcodec/liba52/a52_internal.h2
-rw-r--r--libavcodec/liba52/a52_util.h32
-rw-r--r--libavcodec/liba52/crc.c73
-rw-r--r--libavcodec/liba52/mm_accel.h5
-rw-r--r--libavcodec/liba52/resample.c45
-rw-r--r--libavcodec/liba52/resample_c.c183
-rw-r--r--libavcodec/liba52/resample_mmx.c518
7 files changed, 858 insertions, 0 deletions
diff --git a/libavcodec/liba52/a52_internal.h b/libavcodec/liba52/a52_internal.h
index a158227699..5235704ffe 100644
--- a/libavcodec/liba52/a52_internal.h
+++ b/libavcodec/liba52/a52_internal.h
@@ -118,6 +118,8 @@ void a52_upmix (sample_t * samples, int acmod, int output);
void a52_imdct_init (uint32_t mm_accel);
void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
+//extern void (* a52_imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
+//extern void (* a52_imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
#define ROUND(x) ((int)((x) + ((x) > 0 ? 0.5 : -0.5)))
diff --git a/libavcodec/liba52/a52_util.h b/libavcodec/liba52/a52_util.h
new file mode 100644
index 0000000000..121393ec19
--- /dev/null
+++ b/libavcodec/liba52/a52_util.h
@@ -0,0 +1,32 @@
+/*
+ * a52_util.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of a52dec, a free ATSC A-52 stream decoder.
+ * See http://liba52.sourceforge.net/ for updates.
+ *
+ * a52dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * a52dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef A52_UTIL_H
+#define A52_UTIL_H
+
+#include <inttypes.h>   /* uint8_t, uint16_t, uint32_t, int16_t used below */
+
+/*
+ * Table-driven 16-bit CRC over the first num_bytes bytes of data, starting
+ * from a zero state (implemented in crc.c).
+ */
+uint16_t a52_crc16_block(uint8_t *data,uint32_t num_bytes);
+
+/*
+ * Looks up the sample converter for the given output flags and interleaved
+ * channel count (implemented in resample.c).  On success the converter is
+ * stored in a52_resample and also returned; when no converter exists for the
+ * combination, NULL is returned and a52_resample is left untouched.
+ */
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
+
+/*
+ * Converter installed by the last successful a52_resample_init() call
+ * (NULL before that).  It reads planar samples from _f, writes interleaved
+ * int16 samples to s16 and returns the number of int16 values written.
+ */
+extern int (* a52_resample) (float * _f, int16_t * s16);
+
+#endif /* A52_UTIL_H */
diff --git a/libavcodec/liba52/crc.c b/libavcodec/liba52/crc.c
new file mode 100644
index 0000000000..6698155bd4
--- /dev/null
+++ b/libavcodec/liba52/crc.c
@@ -0,0 +1,73 @@
+/*
+ * crc.c
+ *
+ * Copyright (C) Aaron Holtzman - May 1999
+ *
+ * This file is part of ac3dec, a free Dolby AC-3 stream decoder.
+ *
+ * ac3dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * ac3dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+/*
+ * Byte-wise lookup table for the 16-bit CRC below.  The table is indexed by
+ * (input byte XOR high byte of the running CRC); entry 0x01 is 0x8005.
+ */
+static const uint16_t crc_lut[256] =
+{
+ 0x0000,0x8005,0x800f,0x000a,0x801b,0x001e,0x0014,0x8011,
+ 0x8033,0x0036,0x003c,0x8039,0x0028,0x802d,0x8027,0x0022,
+ 0x8063,0x0066,0x006c,0x8069,0x0078,0x807d,0x8077,0x0072,
+ 0x0050,0x8055,0x805f,0x005a,0x804b,0x004e,0x0044,0x8041,
+ 0x80c3,0x00c6,0x00cc,0x80c9,0x00d8,0x80dd,0x80d7,0x00d2,
+ 0x00f0,0x80f5,0x80ff,0x00fa,0x80eb,0x00ee,0x00e4,0x80e1,
+ 0x00a0,0x80a5,0x80af,0x00aa,0x80bb,0x00be,0x00b4,0x80b1,
+ 0x8093,0x0096,0x009c,0x8099,0x0088,0x808d,0x8087,0x0082,
+ 0x8183,0x0186,0x018c,0x8189,0x0198,0x819d,0x8197,0x0192,
+ 0x01b0,0x81b5,0x81bf,0x01ba,0x81ab,0x01ae,0x01a4,0x81a1,
+ 0x01e0,0x81e5,0x81ef,0x01ea,0x81fb,0x01fe,0x01f4,0x81f1,
+ 0x81d3,0x01d6,0x01dc,0x81d9,0x01c8,0x81cd,0x81c7,0x01c2,
+ 0x0140,0x8145,0x814f,0x014a,0x815b,0x015e,0x0154,0x8151,
+ 0x8173,0x0176,0x017c,0x8179,0x0168,0x816d,0x8167,0x0162,
+ 0x8123,0x0126,0x012c,0x8129,0x0138,0x813d,0x8137,0x0132,
+ 0x0110,0x8115,0x811f,0x011a,0x810b,0x010e,0x0104,0x8101,
+ 0x8303,0x0306,0x030c,0x8309,0x0318,0x831d,0x8317,0x0312,
+ 0x0330,0x8335,0x833f,0x033a,0x832b,0x032e,0x0324,0x8321,
+ 0x0360,0x8365,0x836f,0x036a,0x837b,0x037e,0x0374,0x8371,
+ 0x8353,0x0356,0x035c,0x8359,0x0348,0x834d,0x8347,0x0342,
+ 0x03c0,0x83c5,0x83cf,0x03ca,0x83db,0x03de,0x03d4,0x83d1,
+ 0x83f3,0x03f6,0x03fc,0x83f9,0x03e8,0x83ed,0x83e7,0x03e2,
+ 0x83a3,0x03a6,0x03ac,0x83a9,0x03b8,0x83bd,0x83b7,0x03b2,
+ 0x0390,0x8395,0x839f,0x039a,0x838b,0x038e,0x0384,0x8381,
+ 0x0280,0x8285,0x828f,0x028a,0x829b,0x029e,0x0294,0x8291,
+ 0x82b3,0x02b6,0x02bc,0x82b9,0x02a8,0x82ad,0x82a7,0x02a2,
+ 0x82e3,0x02e6,0x02ec,0x82e9,0x02f8,0x82fd,0x82f7,0x02f2,
+ 0x02d0,0x82d5,0x82df,0x02da,0x82cb,0x02ce,0x02c4,0x82c1,
+ 0x8243,0x0246,0x024c,0x8249,0x0258,0x825d,0x8257,0x0252,
+ 0x0270,0x8275,0x827f,0x027a,0x826b,0x026e,0x0264,0x8261,
+ 0x0220,0x8225,0x822f,0x022a,0x823b,0x023e,0x0234,0x8231,
+ 0x8213,0x0216,0x021c,0x8219,0x0208,0x820d,0x8207,0x0202
+};
+
+/*
+ * Runs the table-driven CRC over data[0 .. num_bytes-1].
+ *
+ * data       bytes to checksum; not dereferenced when num_bytes is 0
+ * num_bytes  number of bytes to process
+ *
+ * Returns the 16-bit CRC state after the last byte; the state starts at 0,
+ * so an empty block yields 0.
+ */
+uint16_t a52_crc16_block(uint8_t *data,uint32_t num_bytes)
+{
+    const uint8_t *cur = data;
+    uint32_t remaining = num_bytes;
+    uint16_t crc = 0;
+
+    while (remaining--) {
+        /* the top byte of the running CRC selects the table row for this byte */
+        const unsigned slot = *cur++ ^ (crc >> 8);
+
+        /* shift the old state up one byte; the assignment truncates to 16 bits */
+        crc = crc_lut[slot] ^ (crc << 8);
+    }
+
+    return crc;
+}
diff --git a/libavcodec/liba52/mm_accel.h b/libavcodec/liba52/mm_accel.h
index 25258c3683..8afbd354cd 100644
--- a/libavcodec/liba52/mm_accel.h
+++ b/libavcodec/liba52/mm_accel.h
@@ -31,6 +31,11 @@
#define MM_ACCEL_X86_MMX 0x80000000
#define MM_ACCEL_X86_3DNOW 0x40000000
#define MM_ACCEL_X86_MMXEXT 0x20000000
+#define MM_ACCEL_X86_SSE 0x10000000
+#define MM_ACCEL_X86_3DNOWEXT 0x08000000
+
+/* PPC accelerations */
+#define MM_ACCEL_PPC_ALTIVEC 0x00010000
uint32_t mm_accel (void);
diff --git a/libavcodec/liba52/resample.c b/libavcodec/liba52/resample.c
new file mode 100644
index 0000000000..284cbbe78d
--- /dev/null
+++ b/libavcodec/liba52/resample.c
@@ -0,0 +1,45 @@
+
+// a52_resample_init should find the requested converter (from type flags ->
+// given number of channels) and set up some function pointers...
+
+// a52_resample() should do the conversion.
+
+#include <inttypes.h>
+#include <stdio.h>
+#include "a52.h"
+#include "mm_accel.h"
+#include "config.h"
+#include "../libpostproc/mangle.h"
+
+// Currently selected converter; NULL until a52_resample_init() below finds one.
+int (* a52_resample) (float * _f, int16_t * s16)=NULL;
+
+#include "resample_c.c"
+
+#ifdef ARCH_X86
+#include "resample_mmx.c"
+#endif
+
+/*
+ * Selects the converter for the given flags / channel count and installs it
+ * in a52_resample.
+ *
+ * mm_accel  MM_ACCEL_* capability bits; only MM_ACCEL_X86_MMX is examined,
+ *           and only when built with ARCH_X86
+ * flags     passed unchanged to the per-implementation lookup functions
+ * chans     number of interleaved output channels requested
+ *
+ * The MMX lookup is tried first (when allowed), then the plain C lookup.
+ * A one-line notice is printed to stderr only when no converter had been
+ * installed before this call.  Returns the converter, or NULL (after
+ * printing a diagnostic) when neither lookup knows the combination; in that
+ * case a52_resample is left unchanged.
+ */
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans){
+    void *converter = NULL;
+    const char *notice = NULL;
+
+#ifdef ARCH_X86
+    if (mm_accel & MM_ACCEL_X86_MMX) {
+        converter = a52_resample_MMX(flags, chans);
+        notice = "Using MMX optimized resampler\n";
+    }
+#endif
+
+    if (!converter) {
+        converter = a52_resample_C(flags, chans);
+        notice = "No accelerated resampler found\n";
+    }
+
+    if (!converter) {
+        fprintf(stderr, "Unimplemented resampler for mode 0x%X -> %d channels conversion - Contact MPlayer developers!\n", flags, chans);
+        return NULL;
+    }
+
+    /* announce the choice only on the first successful initialisation */
+    if (a52_resample == NULL)
+        fputs(notice, stderr);
+
+    a52_resample = converter;
+    return converter;
+}
diff --git a/libavcodec/liba52/resample_c.c b/libavcodec/liba52/resample_c.c
new file mode 100644
index 0000000000..a618ec6e9e
--- /dev/null
+++ b/libavcodec/liba52/resample_c.c
@@ -0,0 +1,183 @@
+// this code is based on a52dec/libao/audio_out_oss.c
+
+/*
+ * Maps the raw 32-bit pattern of a sample to a signed 16-bit value.
+ * Patterns inside [0x43bf8000, 0x43c07fff] are offset by 0x43c00000, which
+ * yields -32768 .. 32767; anything above or below that window saturates.
+ */
+static inline int16_t convert (int32_t i)
+{
+    enum {
+        WINDOW_LOW  = 0x43bf8000,
+        WINDOW_ZERO = 0x43c00000,
+        WINDOW_HIGH = 0x43c07fff
+    };
+
+    /* range checks come first so the subtraction below can never overflow */
+    if (i < WINDOW_LOW)
+        return -32768;
+    if (i > WINDOW_HIGH)
+        return 32767;
+
+    return i - WINDOW_ZERO;
+}
+
+/*
+ * Converts one 256-sample plane (_f[0..255]) into 256 interleaved frames of
+ * five int16 values: slots 0-3 of every frame are zero, slot 4 carries the
+ * converted sample.  Returns the number of int16 values written (5*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_MONO_to_5_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
+        pun.flt = _f[i];        s16[5*i+4] = convert (pun.bits);
+    }
+    return 5*256;
+}
+
+/*
+ * Converts one 256-sample plane (_f[0..255]) into 256 consecutive int16
+ * values.  Returns the number of int16 values written (1*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_MONO_to_1_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i];        s16[i] = convert (pun.bits);
+    }
+    return 1*256;
+}
+
+/*
+ * Interleaves two 256-sample planes into 256 frames of two int16 values:
+ * slot 0 <- _f[i], slot 1 <- _f[i+256].
+ * Returns the number of int16 values written (2*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_STEREO_to_2_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i];        s16[2*i]   = convert (pun.bits);
+        pun.flt = _f[i+256];    s16[2*i+1] = convert (pun.bits);
+    }
+    return 2*256;
+}
+
+/*
+ * Interleaves three 256-sample planes into 256 frames of five int16 values:
+ * slot 0 <- _f[i], slot 1 <- _f[i+512], slots 2 and 3 <- 0,
+ * slot 4 <- _f[i+256].
+ * Returns the number of int16 values written (5*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_3F_to_5_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i];        s16[5*i]   = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[5*i+1] = convert (pun.bits);
+        s16[5*i+2] = s16[5*i+3] = 0;
+        pun.flt = _f[i+256];    s16[5*i+4] = convert (pun.bits);
+    }
+    return 5*256;
+}
+
+/*
+ * Interleaves four 256-sample planes into 256 frames of four int16 values:
+ * slot k <- _f[i + 256*k] for k = 0..3.
+ * Returns the number of int16 values written (4*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_2F_2R_to_4_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i];        s16[4*i]   = convert (pun.bits);
+        pun.flt = _f[i+256];    s16[4*i+1] = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[4*i+2] = convert (pun.bits);
+        pun.flt = _f[i+768];    s16[4*i+3] = convert (pun.bits);
+    }
+    return 4*256;
+}
+
+/*
+ * Interleaves five 256-sample planes into 256 frames of five int16 values:
+ * slot 0 <- _f[i], slot 1 <- _f[i+512], slot 2 <- _f[i+768],
+ * slot 3 <- _f[i+1024], slot 4 <- _f[i+256].
+ * Returns the number of int16 values written (5*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_3F_2R_to_5_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i];        s16[5*i]   = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[5*i+1] = convert (pun.bits);
+        pun.flt = _f[i+768];    s16[5*i+2] = convert (pun.bits);
+        pun.flt = _f[i+1024];   s16[5*i+3] = convert (pun.bits);
+        pun.flt = _f[i+256];    s16[5*i+4] = convert (pun.bits);
+    }
+    return 5*256;
+}
+
+/*
+ * Interleaves two 256-sample planes into 256 frames of six int16 values:
+ * slots 0-3 <- 0, slot 4 <- _f[i+256], slot 5 <- _f[i].
+ * Returns the number of int16 values written (6*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_MONO_LFE_to_6_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
+        pun.flt = _f[i+256];    s16[6*i+4] = convert (pun.bits);
+        pun.flt = _f[i];        s16[6*i+5] = convert (pun.bits);
+    }
+    return 6*256;
+}
+
+/*
+ * Interleaves three 256-sample planes into 256 frames of six int16 values:
+ * slot 0 <- _f[i+256], slot 1 <- _f[i+512], slots 2-4 <- 0,
+ * slot 5 <- _f[i].
+ * Returns the number of int16 values written (6*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_STEREO_LFE_to_6_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i+256];    s16[6*i]   = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[6*i+1] = convert (pun.bits);
+        s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
+        pun.flt = _f[i];        s16[6*i+5] = convert (pun.bits);
+    }
+    return 6*256;
+}
+
+/*
+ * Interleaves four 256-sample planes into 256 frames of six int16 values:
+ * slot 0 <- _f[i+256], slot 1 <- _f[i+768], slots 2 and 3 <- 0,
+ * slot 4 <- _f[i+512], slot 5 <- _f[i].
+ * Returns the number of int16 values written (6*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_3F_LFE_to_6_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i+256];    s16[6*i]   = convert (pun.bits);
+        pun.flt = _f[i+768];    s16[6*i+1] = convert (pun.bits);
+        s16[6*i+2] = s16[6*i+3] = 0;
+        pun.flt = _f[i+512];    s16[6*i+4] = convert (pun.bits);
+        pun.flt = _f[i];        s16[6*i+5] = convert (pun.bits);
+    }
+    return 6*256;
+}
+
+/*
+ * Interleaves five 256-sample planes into 256 frames of six int16 values:
+ * slot 0 <- _f[i+256], slot 1 <- _f[i+512], slot 2 <- _f[i+768],
+ * slot 3 <- _f[i+1024], slot 4 <- 0, slot 5 <- _f[i].
+ * Returns the number of int16 values written (6*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_2F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i+256];    s16[6*i]   = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[6*i+1] = convert (pun.bits);
+        pun.flt = _f[i+768];    s16[6*i+2] = convert (pun.bits);
+        pun.flt = _f[i+1024];   s16[6*i+3] = convert (pun.bits);
+        s16[6*i+4] = 0;
+        pun.flt = _f[i];        s16[6*i+5] = convert (pun.bits);
+    }
+    return 6*256;
+}
+
+/*
+ * Interleaves six 256-sample planes into 256 frames of six int16 values:
+ * slot 0 <- _f[i+256], slot 1 <- _f[i+768], slot 2 <- _f[i+1024],
+ * slot 3 <- _f[i+1280], slot 4 <- _f[i+512], slot 5 <- _f[i].
+ * Returns the number of int16 values written (6*256).
+ *
+ * The float bits are reinterpreted through a union instead of an
+ * (int32_t *) cast, which would break the strict-aliasing rule.  Like the
+ * rest of this file it relies on float being 32 bits wide.
+ */
+static int a52_resample_3F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+    union { float flt; int32_t bits; } pun;
+    int i;
+    for (i = 0; i < 256; i++) {
+        pun.flt = _f[i+256];    s16[6*i]   = convert (pun.bits);
+        pun.flt = _f[i+768];    s16[6*i+1] = convert (pun.bits);
+        pun.flt = _f[i+1024];   s16[6*i+2] = convert (pun.bits);
+        pun.flt = _f[i+1280];   s16[6*i+3] = convert (pun.bits);
+        pun.flt = _f[i+512];    s16[6*i+4] = convert (pun.bits);
+        pun.flt = _f[i];        s16[6*i+5] = convert (pun.bits);
+    }
+    return 6*256;
+}
+
+
+/*
+ * Looks up the portable C converter for an exact flags value and an
+ * interleaved output channel count.
+ *
+ * flags  must equal one of the A52_* combinations listed below
+ * ch     number of int16 values per output frame
+ *
+ * Returns the matching converter function, or NULL when the flags value is
+ * not listed or the channel count does not fit it.
+ */
+static void* a52_resample_C(int flags, int ch){
+    void *conv = NULL;
+
+    switch (flags) {
+    case A52_MONO:
+        if (ch == 5)
+            conv = a52_resample_MONO_to_5_C;
+        else if (ch == 1)
+            conv = a52_resample_MONO_to_1_C;
+        break;
+    case A52_CHANNEL:
+    case A52_STEREO:
+    case A52_DOLBY:
+        if (ch == 2)
+            conv = a52_resample_STEREO_to_2_C;
+        break;
+    case A52_3F:
+        if (ch == 5)
+            conv = a52_resample_3F_to_5_C;
+        break;
+    case A52_2F2R:
+        if (ch == 4)
+            conv = a52_resample_2F_2R_to_4_C;
+        break;
+    case A52_3F2R:
+        if (ch == 5)
+            conv = a52_resample_3F_2R_to_5_C;
+        break;
+    case A52_MONO | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_MONO_LFE_to_6_C;
+        break;
+    case A52_CHANNEL | A52_LFE:
+    case A52_STEREO | A52_LFE:
+    case A52_DOLBY | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_STEREO_LFE_to_6_C;
+        break;
+    case A52_3F | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_3F_LFE_to_6_C;
+        break;
+    case A52_2F2R | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_2F_2R_LFE_to_6_C;
+        break;
+    case A52_3F2R | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_3F_2R_LFE_to_6_C;
+        break;
+    default:
+        break;
+    }
+
+    return conv;
+}
diff --git a/libavcodec/liba52/resample_mmx.c b/libavcodec/liba52/resample_mmx.c
new file mode 100644
index 0000000000..a4079798f7
--- /dev/null
+++ b/libavcodec/liba52/resample_mmx.c
@@ -0,0 +1,518 @@
+
+// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
+
+/* optimization TODO / NOTES
+ movntq is slightly faster (0.5% with the current test.c benchmark)
+ (but that's just test.c, so it needs to be tested in reality)
+ and it would mean (C / MMX2 / MMX / 3DNOW) versions
+*/
+
+// Two copies of 0x43c00000, the same offset convert() in resample_c.c subtracts;
+// psubd with this value turns two raw 32-bit sample words into integers at once.
+static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
+// 16-bit lane masks (used by a52_resample_MONO_to_5_MMX):
+// wm1010 keeps words 1 and 3, wm0101 keeps words 0 and 2,
+// wm1100 keeps the upper two words (bits 32..63).
+static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
+static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
+static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
+
+/*
+ * MMX version of the one-plane -> 5-slot converter.
+ * Every loop pass loads four 32-bit sample words (A, B, C, D), subtracts
+ * magicF2W, packs them to int16 with signed saturation (packssdw) and stores
+ * them in slot 4 of four consecutive 5-value output frames; all other slots
+ * are written as zero.
+ * %esi runs from -512 up to 0 in steps of 8 and is scaled by 2 to address
+ * the input; %edi = 5 * %esi is the output byte offset.  Both offsets are
+ * relative to the end-of-buffer pointers passed as asm operands.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-512, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "movq "MANGLE(wm1100)", %%mm3 \n\t"
+ "movq "MANGLE(wm0101)", %%mm4 \n\t"
+ "movq "MANGLE(wm1010)", %%mm5 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq (%1, %%esi, 2), %%mm0 \n\t"
+ "movq 8(%1, %%esi, 2), %%mm1 \n\t"
+ "leal (%%esi, %%esi, 4), %%edi \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "pand %%mm4, %%mm0 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
+ "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
+ "pand %%mm3, %%mm0 \n\t"
+ "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
+ "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
+ "pand %%mm3, %%mm1 \n\t"
+ "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
+ "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
+ "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 D
+ "addl $8, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1280), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 5*256;
+}
+
+/*
+ * MMX version of the two-plane -> 2-slot converter.
+ * Every loop pass loads four sample words from the first plane and four from
+ * the plane 1024 bytes further on, subtracts magicF2W, packs each group to
+ * int16 with signed saturation and interleaves the two groups word by word
+ * (punpcklwd / punpckhwd) into 16 output bytes.
+ * %esi is a byte offset running from -1024 up to 0 in steps of 16, relative
+ * to the end-of-buffer pointers passed as asm operands; input and output use
+ * the same offset.
+ * Returns the number of int16 values written (2*256).
+ * The first asm block below is disabled (it sits inside a comment).
+ */
+static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
+#ifdef HAVE_SSE
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "1: \n\t"
+ "cvtps2pi (%1, %%esi), %%mm0 \n\t"
+ "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+512), "r" (f+256)
+ :"%esi", "memory"
+ );*/
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "1: \n\t"
+ "movq (%1, %%esi), %%mm0 \n\t"
+ "movq 8(%1, %%esi), %%mm1 \n\t"
+ "movq 1024(%1, %%esi), %%mm2 \n\t"
+ "movq 1032(%1, %%esi), %%mm3 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%0, %%esi) \n\t"
+ "movq %%mm1, 8(%0, %%esi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+512), "r" (f+256)
+ :"%esi", "memory"
+ );
+ return 2*256;
+}
+
+/*
+ * MMX version of the three-plane -> 5-slot converter.
+ * Every loop pass handles four samples: words are gathered from the planes
+ * at byte offsets 0, 1024 and 2048 directly in output order, magicF2W is
+ * subtracted, and the results are packed to int16 with signed saturation
+ * into five 8-byte stores (40 bytes = four 5-value frames).
+ * The zero filler slots are produced in two ways: a register half preloaded
+ * with magicF2W becomes 0 after the subtraction, and mm5 (magicF2W in the
+ * low dword, 0 in the high dword) leaves the empty upper half of a
+ * movd-loaded register at 0.
+ * %esi is the input byte offset (-1024 .. 0, step 16); %edi = %esi * 5 / 2
+ * is the output byte offset.  Both are relative to the end-of-buffer
+ * pointers passed as asm operands.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "movq %%mm7, %%mm5 \n\t"
+ "punpckldq %%mm6, %%mm5 \n\t"
+ "1: \n\t"
+ "movd (%1, %%esi), %%mm0 \n\t"
+ "punpckldq 2048(%1, %%esi), %%mm0\n\t"
+ "movd 1024(%1, %%esi), %%mm1 \n\t"
+ "punpckldq 4(%1, %%esi), %%mm1 \n\t"
+ "movd 2052(%1, %%esi), %%mm2 \n\t"
+ "movq %%mm7, %%mm3 \n\t"
+ "punpckldq 1028(%1, %%esi), %%mm3\n\t"
+ "movd 8(%1, %%esi), %%mm4 \n\t"
+ "punpckldq 2056(%1, %%esi), %%mm4\n\t"
+ "leal (%%esi, %%esi, 4), %%edi \n\t"
+ "sarl $1, %%edi \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm5, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "psubd %%mm7, %%mm4 \n\t"
+ "packssdw %%mm6, %%mm0 \n\t"
+ "packssdw %%mm2, %%mm1 \n\t"
+ "packssdw %%mm4, %%mm3 \n\t"
+ "movq %%mm0, (%0, %%edi) \n\t"
+ "movq %%mm1, 8(%0, %%edi) \n\t"
+ "movq %%mm3, 16(%0, %%edi) \n\t"
+
+ "movd 1032(%1, %%esi), %%mm1 \n\t"
+ "punpckldq 12(%1, %%esi), %%mm1\n\t"
+ "movd 2060(%1, %%esi), %%mm2 \n\t"
+ "movq %%mm7, %%mm3 \n\t"
+ "punpckldq 1036(%1, %%esi), %%mm3\n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm5, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, 24(%0, %%edi) \n\t"
+ "movq %%mm2, 32(%0, %%edi) \n\t"
+
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1280), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 5*256;
+}
+
+/*
+ * MMX version of the four-plane -> 4-slot converter.
+ * Every loop pass loads four sample words from each of the planes at byte
+ * offsets 0, 1024, 2048 and 3072, subtracts magicF2W, packs each plane to
+ * int16 with signed saturation and transposes the 4x4 block (word unpack
+ * followed by dword unpack) so that each 8-byte store holds one 4-value
+ * output frame.
+ * %esi is the input byte offset (-1024 .. 0, step 16); the output is
+ * addressed with %esi scaled by 2.  Both are relative to the end-of-buffer
+ * pointers passed as asm operands.
+ * Returns the number of int16 values written (4*256).
+ */
+static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "1: \n\t"
+ "movq (%1, %%esi), %%mm0 \n\t"
+ "movq 8(%1, %%esi), %%mm1 \n\t"
+ "movq 1024(%1, %%esi), %%mm2 \n\t"
+ "movq 1032(%1, %%esi), %%mm3 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "movq 2048(%1, %%esi), %%mm3 \n\t"
+ "movq 2056(%1, %%esi), %%mm4 \n\t"
+ "movq 3072(%1, %%esi), %%mm5 \n\t"
+ "movq 3080(%1, %%esi), %%mm6 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "psubd %%mm7, %%mm4 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "psubd %%mm7, %%mm6 \n\t"
+ "packssdw %%mm4, %%mm3 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "punpcklwd %%mm5, %%mm3 \n\t"
+ "punpckhwd %%mm5, %%mm4 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm5 \n\t"
+ "punpckldq %%mm3, %%mm0 \n\t"
+ "punpckhdq %%mm3, %%mm2 \n\t"
+ "punpckldq %%mm4, %%mm1 \n\t"
+ "punpckhdq %%mm4, %%mm5 \n\t"
+ "movq %%mm0, (%0, %%esi,2) \n\t"
+ "movq %%mm2, 8(%0, %%esi,2) \n\t"
+ "movq %%mm1, 16(%0, %%esi,2) \n\t"
+ "movq %%mm5, 24(%0, %%esi,2) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1024), "r" (f+256)
+ :"%esi", "memory"
+ );
+ return 4*256;
+}
+
+/*
+ * MMX version of the five-plane -> 5-slot converter.
+ * Every loop pass handles four samples: words are gathered with
+ * movd / punpckldq from the planes at byte offsets 0, 1024, 2048, 3072 and
+ * 4096 directly in output order, magicF2W is subtracted, and pairs of
+ * registers are packed to int16 with signed saturation into five 8-byte
+ * stores (40 bytes = four 5-value frames).
+ * %esi is the input byte offset (-1024 .. 0, step 16); %edi = %esi * 5 / 2
+ * is the output byte offset.  Both are relative to the end-of-buffer
+ * pointers passed as asm operands.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "1: \n\t"
+ "movd (%1, %%esi), %%mm0 \n\t"
+ "punpckldq 2048(%1, %%esi), %%mm0\n\t"
+ "movd 3072(%1, %%esi), %%mm1 \n\t"
+ "punpckldq 4096(%1, %%esi), %%mm1\n\t"
+ "movd 1024(%1, %%esi), %%mm2 \n\t"
+ "punpckldq 4(%1, %%esi), %%mm2 \n\t"
+ "movd 2052(%1, %%esi), %%mm3 \n\t"
+ "punpckldq 3076(%1, %%esi), %%mm3\n\t"
+ "movd 4100(%1, %%esi), %%mm4 \n\t"
+ "punpckldq 1028(%1, %%esi), %%mm4\n\t"
+ "movd 8(%1, %%esi), %%mm5 \n\t"
+ "punpckldq 2056(%1, %%esi), %%mm5\n\t"
+ "leal (%%esi, %%esi, 4), %%edi \n\t"
+ "sarl $1, %%edi \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "psubd %%mm7, %%mm4 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "packssdw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0, %%edi) \n\t"
+ "movq %%mm2, 8(%0, %%edi) \n\t"
+ "movq %%mm4, 16(%0, %%edi) \n\t"
+
+ "movd 3080(%1, %%esi), %%mm0 \n\t"
+ "punpckldq 4104(%1, %%esi), %%mm0\n\t"
+ "movd 1032(%1, %%esi), %%mm1 \n\t"
+ "punpckldq 12(%1, %%esi), %%mm1\n\t"
+ "movd 2060(%1, %%esi), %%mm2 \n\t"
+ "punpckldq 3084(%1, %%esi), %%mm2\n\t"
+ "movd 4108(%1, %%esi), %%mm3 \n\t"
+ "punpckldq 1036(%1, %%esi), %%mm3\n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, 24(%0, %%edi) \n\t"
+ "movq %%mm2, 32(%0, %%edi) \n\t"
+
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1280), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 5*256;
+}
+
+/*
+ * MMX version of the two-plane -> 6-slot converter.
+ * Every loop pass loads four sample words from the plane at byte offset 1024
+ * and four from the plane at offset 0, subtracts magicF2W, packs both groups
+ * to int16 with signed saturation and interleaves them word by word.  Each
+ * resulting pair is stored in slots 4 and 5 of a 6-value output frame;
+ * slots 0-3 are filled from the zeroed mm6.
+ * %esi is the input byte offset (-1024 .. 0, step 16); %edi = 3 * %esi is
+ * the output byte offset.  Both are relative to the end-of-buffer pointers
+ * passed as asm operands.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 1032(%1, %%esi), %%mm1 \n\t"
+ "movq (%1, %%esi), %%mm2 \n\t"
+ "movq 8(%1, %%esi), %%mm3 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "leal (%%esi, %%esi, 2), %%edi \n\t"
+ "movq %%mm6, (%0, %%edi) \n\t"
+ "movd %%mm0, 8(%0, %%edi) \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movq %%mm6, 12(%0, %%edi) \n\t"
+ "movd %%mm0, 20(%0, %%edi) \n\t"
+ "movq %%mm6, 24(%0, %%edi) \n\t"
+ "movd %%mm1, 32(%0, %%edi) \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm6, 36(%0, %%edi) \n\t"
+ "movd %%mm1, 44(%0, %%edi) \n\t"
+ "addl $16, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1536), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 6*256;
+}
+
+/*
+ * MMX version of the three-plane -> 6-slot converter.
+ * Every loop pass handles two samples: it loads two sample words from each
+ * of the planes at byte offsets 1024 (a/A), 2048 (b/B) and 0 (f/F),
+ * subtracts magicF2W, packs to int16 with signed saturation and shuffles the
+ * words into three 8-byte stores that form two 6-value frames
+ * "a b 0 0 0 f" and "A B 0 0 0 F".  The lane comments list words from the
+ * highest to the lowest.
+ * mm3 is read at the first punpckldq before it has been written in this
+ * block; that half (marked XX) is discarded by the following punpckhdq.
+ * %esi is the input byte offset (-1024 .. 0, step 8); %edi = 3 * %esi is the
+ * output byte offset.  Both are relative to the end-of-buffer pointers
+ * passed as asm operands.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 2048(%1, %%esi), %%mm1 \n\t"
+ "movq (%1, %%esi), %%mm5 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "leal (%%esi, %%esi, 2), %%edi \n\t"
+
+ "pxor %%mm4, %%mm4 \n\t"
+ "packssdw %%mm5, %%mm0 \n\t" // FfAa
+ "packssdw %%mm4, %%mm1 \n\t" // 00Bb
+ "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
+ "punpcklwd %%mm1, %%mm0 \n\t" // BAba
+ "movq %%mm0, %%mm1 \n\t" // BAba
+ "punpckldq %%mm4, %%mm3 \n\t" // f0XX
+ "punpckldq %%mm6, %%mm0 \n\t" // 00ba
+ "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
+
+ "movq %%mm0, (%0, %%edi) \n\t" // 00ba
+ "punpckhdq %%mm4, %%mm0 \n\t" // F000
+ "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0
+ "movq %%mm0, 16(%0, %%edi) \n\t" // F000
+ "addl $8, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1536), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 6*256;
+}
+
+/*
+ * MMX version of the four-plane -> 6-slot converter.
+ * Every loop pass handles two samples: it loads two sample words from each
+ * of the planes at byte offsets 1024 (a/A), 3072 (b/B), 2048 (e/E) and
+ * 0 (f/F), subtracts magicF2W, packs to int16 with signed saturation and
+ * shuffles the words into three 8-byte stores that form two 6-value frames
+ * "a b 0 0 e f" and "A B 0 0 E F".  The lane comments list words from the
+ * highest to the lowest.
+ * %esi is the input byte offset (-1024 .. 0, step 8); %edi = 3 * %esi is the
+ * output byte offset.  Both are relative to the end-of-buffer pointers
+ * passed as asm operands.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 3072(%1, %%esi), %%mm1 \n\t"
+ "movq 2048(%1, %%esi), %%mm4 \n\t"
+ "movq (%1, %%esi), %%mm5 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm4 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "leal (%%esi, %%esi, 2), %%edi \n\t"
+
+ "packssdw %%mm4, %%mm0 \n\t" // EeAa
+ "packssdw %%mm5, %%mm1 \n\t" // FfBb
+ "movq %%mm0, %%mm2 \n\t" // EeAa
+ "punpcklwd %%mm1, %%mm0 \n\t" // BAba
+ "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
+ "movq %%mm0, %%mm1 \n\t" // BAba
+ "punpckldq %%mm6, %%mm0 \n\t" // 00ba
+ "punpckhdq %%mm1, %%mm1 \n\t" // BABA
+
+ "movq %%mm0, (%0, %%edi) \n\t"
+ "punpckhdq %%mm2, %%mm0 \n\t" // FE00
+ "punpckldq %%mm1, %%mm2 \n\t" // BAfe
+ "movq %%mm2, 8(%0, %%edi) \n\t"
+ "movq %%mm0, 16(%0, %%edi) \n\t"
+ "addl $8, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1536), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 6*256;
+}
+
+/*
+ * MMX version of the five-plane -> 6-slot converter.
+ * Every loop pass handles two samples: it loads two sample words from each
+ * of the planes at byte offsets 1024 (a/A), 2048 (b/B), 3072 (c/C),
+ * 4096 (d/D) and 0 (f/F), subtracts magicF2W, packs to int16 with signed
+ * saturation and shuffles the words into three 8-byte stores that form two
+ * 6-value frames "a b c d 0 f" and "A B C D 0 F".  The zero slot comes from
+ * mm4, which is cleared inside the loop.  The lane comments list words from
+ * the highest to the lowest.
+ * %esi is the input byte offset (-1024 .. 0, step 8); %edi = 3 * %esi is the
+ * output byte offset.  Both are relative to the end-of-buffer pointers
+ * passed as asm operands.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+// "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 2048(%1, %%esi), %%mm1 \n\t"
+ "movq 3072(%1, %%esi), %%mm2 \n\t"
+ "movq 4096(%1, %%esi), %%mm3 \n\t"
+ "movq (%1, %%esi), %%mm5 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "leal (%%esi, %%esi, 2), %%edi \n\t"
+
+ "packssdw %%mm2, %%mm0 \n\t" // CcAa
+ "packssdw %%mm3, %%mm1 \n\t" // DdBb
+ "packssdw %%mm5, %%mm5 \n\t" // FfFf
+ "movq %%mm0, %%mm2 \n\t" // CcAa
+ "punpcklwd %%mm1, %%mm0 \n\t" // BAba
+ "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
+ "pxor %%mm4, %%mm4 \n\t" // 0000
+ "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
+ "movq %%mm0, %%mm1 \n\t" // BAba
+ "movq %%mm4, %%mm3 \n\t" // F0f0
+ "punpckldq %%mm2, %%mm0 \n\t" // dcba
+ "punpckhdq %%mm1, %%mm1 \n\t" // BABA
+ "punpckldq %%mm1, %%mm4 \n\t" // BAf0
+ "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
+
+ "movq %%mm0, (%0, %%edi) \n\t"
+ "movq %%mm4, 8(%0, %%edi) \n\t"
+ "movq %%mm2, 16(%0, %%edi) \n\t"
+ "addl $8, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1536), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 6*256;
+}
+
+/*
+ * MMX version of the six-plane -> 6-slot converter.
+ * Every loop pass handles two samples: it loads two sample words from each
+ * of the planes at byte offsets 1024 (a/A), 3072 (b/B), 4096 (c/C),
+ * 5120 (d/D), 2048 (e/E) and 0 (f/F), subtracts magicF2W, packs to int16
+ * with signed saturation and shuffles the words into three 8-byte stores
+ * that form two 6-value frames "a b c d e f" and "A B C D E F".  The lane
+ * comments list words from the highest to the lowest.
+ * %esi is the input byte offset (-1024 .. 0, step 8); %edi = 3 * %esi is the
+ * output byte offset.  Both are relative to the end-of-buffer pointers
+ * passed as asm operands.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
+ int32_t * f = (int32_t *) _f;
+ asm volatile(
+ "movl $-1024, %%esi \n\t"
+ "movq "MANGLE(magicF2W)", %%mm7 \n\t"
+// "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq 1024(%1, %%esi), %%mm0 \n\t"
+ "movq 3072(%1, %%esi), %%mm1 \n\t"
+ "movq 4096(%1, %%esi), %%mm2 \n\t"
+ "movq 5120(%1, %%esi), %%mm3 \n\t"
+ "movq 2048(%1, %%esi), %%mm4 \n\t"
+ "movq (%1, %%esi), %%mm5 \n\t"
+ "psubd %%mm7, %%mm0 \n\t"
+ "psubd %%mm7, %%mm1 \n\t"
+ "psubd %%mm7, %%mm2 \n\t"
+ "psubd %%mm7, %%mm3 \n\t"
+ "psubd %%mm7, %%mm4 \n\t"
+ "psubd %%mm7, %%mm5 \n\t"
+ "leal (%%esi, %%esi, 2), %%edi \n\t"
+
+ "packssdw %%mm2, %%mm0 \n\t" // CcAa
+ "packssdw %%mm3, %%mm1 \n\t" // DdBb
+ "packssdw %%mm4, %%mm4 \n\t" // EeEe
+ "packssdw %%mm5, %%mm5 \n\t" // FfFf
+ "movq %%mm0, %%mm2 \n\t" // CcAa
+ "punpcklwd %%mm1, %%mm0 \n\t" // BAba
+ "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
+ "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
+ "movq %%mm0, %%mm1 \n\t" // BAba
+ "movq %%mm4, %%mm3 \n\t" // FEfe
+ "punpckldq %%mm2, %%mm0 \n\t" // dcba
+ "punpckhdq %%mm1, %%mm1 \n\t" // BABA
+ "punpckldq %%mm1, %%mm4 \n\t" // BAfe
+ "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
+
+ "movq %%mm0, (%0, %%edi) \n\t"
+ "movq %%mm4, 8(%0, %%edi) \n\t"
+ "movq %%mm2, 16(%0, %%edi) \n\t"
+ "addl $8, %%esi \n\t"
+ " jnz 1b \n\t"
+ "emms \n\t"
+ :: "r" (s16+1536), "r" (f+256)
+ :"%esi", "%edi", "memory"
+ );
+ return 6*256;
+}
+
+
+/*
+ * Looks up the MMX converter for an exact flags value and an interleaved
+ * output channel count.
+ *
+ * flags  must equal one of the A52_* combinations listed below
+ * ch     number of int16 values per output frame
+ *
+ * Returns the matching converter function, or NULL when the flags value is
+ * not listed or the channel count does not fit it.
+ */
+static void* a52_resample_MMX(int flags, int ch){
+    void *conv = NULL;
+
+    switch (flags) {
+    case A52_MONO:
+        if (ch == 5)
+            conv = a52_resample_MONO_to_5_MMX;
+        break;
+    case A52_CHANNEL:
+    case A52_STEREO:
+    case A52_DOLBY:
+        if (ch == 2)
+            conv = a52_resample_STEREO_to_2_MMX;
+        break;
+    case A52_3F:
+        if (ch == 5)
+            conv = a52_resample_3F_to_5_MMX;
+        break;
+    case A52_2F2R:
+        if (ch == 4)
+            conv = a52_resample_2F_2R_to_4_MMX;
+        break;
+    case A52_3F2R:
+        if (ch == 5)
+            conv = a52_resample_3F_2R_to_5_MMX;
+        break;
+    case A52_MONO | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_MONO_LFE_to_6_MMX;
+        break;
+    case A52_CHANNEL | A52_LFE:
+    case A52_STEREO | A52_LFE:
+    case A52_DOLBY | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_STEREO_LFE_to_6_MMX;
+        break;
+    case A52_3F | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_3F_LFE_to_6_MMX;
+        break;
+    case A52_2F2R | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_2F_2R_LFE_to_6_MMX;
+        break;
+    case A52_3F2R | A52_LFE:
+        if (ch == 6)
+            conv = a52_resample_3F_2R_LFE_to_6_MMX;
+        break;
+    default:
+        break;
+    }
+
+    return conv;
+}
+
+