From e3fcb14347466095839c2a3c47ebecff02da891e Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Fri, 24 Jan 2014 11:55:16 +0100
Subject: dsputil: Split off IDCT bits into their own context

---
 libavcodec/x86/Makefile                |  10 +-
 libavcodec/x86/cavsdsp.c               |   3 +-
 libavcodec/x86/dsputil_init.c          |  85 -----------------
 libavcodec/x86/dsputil_mmx.c           | 135 --------------------------
 libavcodec/x86/dsputil_x86.h           |   7 --
 libavcodec/x86/idct_mmx_xvid.c         |   2 +-
 libavcodec/x86/idct_sse2_xvid.c        |   2 +-
 libavcodec/x86/idctdsp.h               |  31 ++++++
 libavcodec/x86/idctdsp_init.c          | 106 +++++++++++++++++++++
 libavcodec/x86/idctdsp_mmx.c           | 168 +++++++++++++++++++++++++++++++++
 libavcodec/x86/mpegvideoenc_template.c |   2 +-
 libavcodec/x86/proresdsp_init.c        |   2 +-
 libavcodec/x86/simple_idct.c           |   2 +-
 13 files changed, 318 insertions(+), 237 deletions(-)
 create mode 100644 libavcodec/x86/idctdsp.h
 create mode 100644 libavcodec/x86/idctdsp_init.c
 create mode 100644 libavcodec/x86/idctdsp_mmx.c

(limited to 'libavcodec/x86')

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 13f9affdb2..14e58f9a9c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -18,6 +18,7 @@ OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
@@ -49,13 +50,14 @@ OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
 
 MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
 MMX-OBJS-$(CONFIG_BLOCKDSP)            += x86/blockdsp_mmx.o
-MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
-                                          x86/idct_mmx_xvid.o           \
-                                          x86/idct_sse2_xvid.o          \
-                                          x86/simple_idct.o
+MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o
 MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
                                           x86/hpeldsp_mmx.o
 MMX-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
+                                          x86/idct_mmx_xvid.o           \
+                                          x86/idct_sse2_xvid.o          \
+                                          x86/simple_idct.o
 MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
 
 MMX-OBJS-$(CONFIG_SVQ1_ENCODER)        += x86/svq1enc_mmx.o
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index d5c441f1e5..f0e8cfcd17 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -28,9 +28,10 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/cavsdsp.h"
+#include "libavcodec/idctdsp.h"
 #include "constants.h"
-#include "dsputil_x86.h"
 #include "fpel.h"
+#include "idctdsp.h"
 #include "config.h"
 
 #if HAVE_MMX_INLINE
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 74dab48e72..adc7aa95d6 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -22,97 +22,18 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
-#include "libavcodec/simple_idct.h"
 #include "dsputil_x86.h"
-#include "idct_xvid.h"
-
-/* Input permutation for the simple_idct_mmx */
-static const uint8_t simple_mmx_permutation[64] = {
-    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
-    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
-    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
-    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
-    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
-    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
-    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
-    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
-};
-
-static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
-                                              int idct_permutation_type)
-{
-    int i;
-
-    switch (idct_permutation_type) {
-    case FF_SIMPLE_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = simple_mmx_permutation[i];
-        return 1;
-    case FF_SSE2_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
-        return 1;
-    }
-
-    return 0;
-}
 
 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_MMX_INLINE
-    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
-    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
-
     if (!high_bit_depth) {
         c->draw_edges   = ff_draw_edges_mmx;
-
-        switch (avctx->idct_algo) {
-        case FF_IDCT_AUTO:
-        case FF_IDCT_SIMPLEMMX:
-            c->idct_put              = ff_simple_idct_put_mmx;
-            c->idct_add              = ff_simple_idct_add_mmx;
-            c->idct                  = ff_simple_idct_mmx;
-            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
-            break;
-        case FF_IDCT_XVIDMMX:
-            c->idct_put              = ff_idct_xvid_mmx_put;
-            c->idct_add              = ff_idct_xvid_mmx_add;
-            c->idct                  = ff_idct_xvid_mmx;
-            break;
-        }
     }
 #endif /* HAVE_MMX_INLINE */
 }
 
-static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
-                                        int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_MMXEXT_INLINE
-    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
-        c->idct_put = ff_idct_xvid_mmxext_put;
-        c->idct_add = ff_idct_xvid_mmxext_add;
-        c->idct     = ff_idct_xvid_mmxext;
-    }
-#endif /* HAVE_MMXEXT_INLINE */
-}
-
-static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
-                                      int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_SSE2_INLINE
-    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
-        c->idct_put              = ff_idct_xvid_sse2_put;
-        c->idct_add              = ff_idct_xvid_sse2_add;
-        c->idct                  = ff_idct_xvid_sse2;
-        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
-    }
-#endif /* HAVE_SSE2_INLINE */
-}
-
 av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
@@ -121,12 +42,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
     if (X86_MMX(cpu_flags))
         dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);
 
-    if (X86_MMXEXT(cpu_flags))
-        dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);
-
-    if (X86_SSE2(cpu_flags))
-        dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);
-
     if (CONFIG_ENCODERS)
         ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5fa047da7b..d205a48ea4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -30,141 +30,6 @@
 
 #if HAVE_INLINE_ASM
 
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
 /* Draw the edges of width 'w' of an image of size width, height
  * this MMX version can only handle w == 8 || w == 16. */
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 4beb6c11ca..7e1e8af051 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -31,13 +31,6 @@ void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);
 void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx);
 
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size);
-
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                        int w, int h, int sides);
 
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 27723393bf..920ea4c0dc 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -44,8 +44,8 @@
 #include "config.h"
 #include "libavcodec/avcodec.h"
 #include "libavutil/mem.h"
-#include "dsputil_x86.h"
 #include "idct_xvid.h"
+#include "idctdsp.h"
 
 #if HAVE_MMX_INLINE
 
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 50655d6bc0..aadeb122c6 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -42,7 +42,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "idct_xvid.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
 
 #if HAVE_SSE2_INLINE
 
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
new file mode 100644
index 0000000000..22df3dd758
--- /dev/null
+++ b/libavcodec/x86/idctdsp.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_IDCTDSP_H
+#define AVCODEC_X86_IDCTDSP_H
+
+#include <stdint.h>
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size);
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size);
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                                      int line_size);
+
+#endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
new file mode 100644
index 0000000000..9b68497502
--- /dev/null
+++ b/libavcodec/x86/idctdsp_init.c
@@ -0,0 +1,106 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/simple_idct.h"
+#include "idct_xvid.h"
+#include "idctdsp.h"
+
+/* Input permutation for the simple_idct_mmx; copied out for FF_SIMPLE_IDCT_PERM */
+static const uint8_t simple_mmx_permutation[64] = {
+    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
+    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
+    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
+    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
+    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
+    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
+    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
+    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
+};
+
+static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; /* indexed by (i & 7) */
+
+av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+                                              int idct_permutation_type)
+{   /* Fill idct_permutation[0..63] for the permutation types handled here. */
+    int i;
+
+    switch (idct_permutation_type) {
+    case FF_SIMPLE_IDCT_PERM: /* verbatim copy of the 64-entry table */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = simple_mmx_permutation[i];
+        return 1; /* handled */
+    case FF_SSE2_IDCT_PERM: /* keep bits 3-5 of i, remap bits 0-2 via the row table */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
+        return 1; /* handled */
+    }
+
+    return 0; /* any other type: idct_permutation is left untouched */
+}
+
+av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{   /* Set function pointers in *c based on the INLINE_*(cpu_flags) checks below. */
+    int cpu_flags = av_get_cpu_flags();
+
+    if (INLINE_MMX(cpu_flags)) { /* the three pixel helpers ignore high_bit_depth */
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+
+        if (!high_bit_depth) { /* IDCT pointers are only set when this is 0 */
+            switch (avctx->idct_algo) {
+            case FF_IDCT_AUTO: /* falls through: same setup as FF_IDCT_SIMPLEMMX */
+            case FF_IDCT_SIMPLEMMX:
+                c->idct_put              = ff_simple_idct_put_mmx;
+                c->idct_add              = ff_simple_idct_add_mmx;
+                c->idct                  = ff_simple_idct_mmx;
+                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+                break;
+            case FF_IDCT_XVIDMMX: /* idct_permutation_type is not modified here */
+                c->idct_put              = ff_idct_xvid_mmx_put;
+                c->idct_add              = ff_idct_xvid_mmx_add;
+                c->idct                  = ff_idct_xvid_mmx;
+                break;
+            }
+        }
+    }
+
+    if (INLINE_MMXEXT(cpu_flags)) { /* assigned after the MMX block, replacing its Xvid pointers */
+        if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+            c->idct_put = ff_idct_xvid_mmxext_put;
+            c->idct_add = ff_idct_xvid_mmxext_add;
+            c->idct     = ff_idct_xvid_mmxext;
+        }
+    }
+
+    if (INLINE_SSE2(cpu_flags)) { /* checked last, so it replaces the MMX/MMXEXT Xvid pointers */
+        if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+            c->idct_put              = ff_idct_xvid_sse2_put;
+            c->idct_add              = ff_idct_xvid_sse2_add;
+            c->idct                  = ff_idct_xvid_sse2;
+            c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+        }
+    }
+}
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
new file mode 100644
index 0000000000..7285b1d357
--- /dev/null
+++ b/libavcodec/x86/idctdsp_mmx.c
@@ -0,0 +1,168 @@
+/*
+ * SIMD-optimized IDCT-related routines
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "idctdsp.h"
+#include "inline_asm.h"
+
+#if HAVE_INLINE_ASM
+
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size)
+{   /* Store 8 rows of 8 int16_t coefficients as bytes, saturated by packuswb. */
+    const int16_t *p;
+    uint8_t *pix;
+
+    /* local cursors over the source coefficients and the destination rows */
+    p   = block;
+    pix = pixels;
+    /* unrolled: each asm statement below converts and stores four rows */
+    __asm__ volatile (
+        "movq      (%3), %%mm0          \n\t"
+        "movq     8(%3), %%mm1          \n\t"
+        "movq    16(%3), %%mm2          \n\t"
+        "movq    24(%3), %%mm3          \n\t"
+        "movq    32(%3), %%mm4          \n\t"
+        "movq    40(%3), %%mm5          \n\t"
+        "movq    48(%3), %%mm6          \n\t"
+        "movq    56(%3), %%mm7          \n\t"
+        "packuswb %%mm1, %%mm0          \n\t"
+        "packuswb %%mm3, %%mm2          \n\t"
+        "packuswb %%mm5, %%mm4          \n\t"
+        "packuswb %%mm7, %%mm6          \n\t"
+        "movq     %%mm0, (%0)           \n\t"
+        "movq     %%mm2, (%0, %1)       \n\t"
+        "movq     %%mm4, (%0, %1, 2)    \n\t"
+        "movq     %%mm6, (%0, %2)       \n\t"
+        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+           "r" (p)
+        : "memory");
+    pix += line_size * 4; /* skip the four rows just written */
+    p   += 32; /* 32 coefficients (64 bytes) were consumed above */
+
+    // if an exact copy of the code above were placed here,
+    // the compiler would generate some very strange code,
+    // thus using "r"
+    __asm__ volatile (
+        "movq       (%3), %%mm0         \n\t"
+        "movq      8(%3), %%mm1         \n\t"
+        "movq     16(%3), %%mm2         \n\t"
+        "movq     24(%3), %%mm3         \n\t"
+        "movq     32(%3), %%mm4         \n\t"
+        "movq     40(%3), %%mm5         \n\t"
+        "movq     48(%3), %%mm6         \n\t"
+        "movq     56(%3), %%mm7         \n\t"
+        "packuswb  %%mm1, %%mm0         \n\t"
+        "packuswb  %%mm3, %%mm2         \n\t"
+        "packuswb  %%mm5, %%mm4         \n\t"
+        "packuswb  %%mm7, %%mm6         \n\t"
+        "movq      %%mm0, (%0)          \n\t"
+        "movq      %%mm2, (%0, %1)      \n\t"
+        "movq      %%mm4, (%0, %1, 2)   \n\t"
+        "movq      %%mm6, (%0, %2)      \n\t"
+        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+           "r" (p)
+        : "memory");
+}
+
+#define put_signed_pixels_clamped_mmx_half(off)             \
+    "movq          "#off"(%2), %%mm1        \n\t"           \
+    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
+    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
+    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
+    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
+    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
+    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
+    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
+    "paddb              %%mm0, %%mm1        \n\t"           \
+    "paddb              %%mm0, %%mm2        \n\t"           \
+    "paddb              %%mm0, %%mm3        \n\t"           \
+    "paddb              %%mm0, %%mm4        \n\t"           \
+    "movq               %%mm1, (%0)         \n\t"           \
+    "movq               %%mm2, (%0, %3)     \n\t"           \
+    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
+    "movq               %%mm4, (%0, %1)     \n\t"
+
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                                      int line_size)
+{   /* Pack 8 rows to signed bytes (packsswb), add ff_pb_80 bytewise, store. */
+    x86_reg line_skip = line_size; /* stride as x86_reg, used in address operands */
+    x86_reg line_skip3; /* asm output: 3 * line_skip, computed by the first lea */
+
+    __asm__ volatile (
+        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
+        "lea         (%3, %3, 2), %1        \n\t" /* %1 = 3 * stride */
+        put_signed_pixels_clamped_mmx_half(0) /* first four rows */
+        "lea         (%0, %3, 4), %0        \n\t" /* pixels += 4 * stride */
+        put_signed_pixels_clamped_mmx_half(64) /* next four rows, 64 bytes into block */
+        : "+&r" (pixels), "=&r" (line_skip3)
+        : "r" (block), "r" (line_skip)
+        : "memory");
+}
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size)
+{   /* Add 8 rows of 8 int16_t coefficients to the pixels, two rows per iteration. */
+    const int16_t *p;
+    uint8_t *pix;
+    int i;
+
+    /* local cursors over the coefficients and the pixel rows being updated */
+    p   = block;
+    pix = pixels;
+    MOVQ_ZERO(mm7); /* presumably clears mm7 (macro defined elsewhere); source of punpck* below */
+    i = 4; /* four iterations of two rows each */
+    do {
+        __asm__ volatile (
+            "movq        (%2), %%mm0    \n\t"
+            "movq       8(%2), %%mm1    \n\t"
+            "movq      16(%2), %%mm2    \n\t"
+            "movq      24(%2), %%mm3    \n\t"
+            "movq          %0, %%mm4    \n\t"
+            "movq          %1, %%mm6    \n\t"
+            "movq       %%mm4, %%mm5    \n\t"
+            "punpcklbw  %%mm7, %%mm4    \n\t"
+            "punpckhbw  %%mm7, %%mm5    \n\t"
+            "paddsw     %%mm4, %%mm0    \n\t"
+            "paddsw     %%mm5, %%mm1    \n\t"
+            "movq       %%mm6, %%mm5    \n\t"
+            "punpcklbw  %%mm7, %%mm6    \n\t"
+            "punpckhbw  %%mm7, %%mm5    \n\t"
+            "paddsw     %%mm6, %%mm2    \n\t"
+            "paddsw     %%mm5, %%mm3    \n\t"
+            "packuswb   %%mm1, %%mm0    \n\t"
+            "packuswb   %%mm3, %%mm2    \n\t"
+            "movq       %%mm0, %0       \n\t"
+            "movq       %%mm2, %1       \n\t"
+            : "+m" (*pix), "+m" (*(pix + line_size))
+            : "r" (p)
+            : "memory");
+        pix += line_size * 2; /* two rows were updated in place */
+        p   += 16; /* 16 coefficients (32 bytes) were consumed */
+    } while (--i);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index d01ff1c0f8..fa590066d6 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -229,7 +229,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     if(s->mb_intra) block[0]= level;
     else            block[0]= temp_block[0];
 
-    if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
+    if (s->idsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM) {
         if(last_non_zero_p1 <= 1) goto end;
         block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
         block[0x20] = temp_block[0x10];
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index 68ad929067..a66fc70982 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -22,7 +22,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/dsputil.h"
+#include "libavcodec/idctdsp.h"
 #include "libavcodec/proresdsp.h"
 
 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index a342110cd3..bbe5a67472 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -23,7 +23,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
 
 #if HAVE_INLINE_ASM
 
-- 
cgit v1.2.3