about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Daniel Kang <daniel.d.kang@gmail.com> 2013-01-27 03:45:43 +0000
committer Luca Barbato <lu_zero@gentoo.org> 2013-01-27 06:45:31 +0100
commit 71155d7b4157fee44c0d3d0fc1b660ebfb9ccf46 (patch)
tree d2cabb39ed7fe2930a124cc00630f0f9693c776e
parent f90ff772e7e35b4923c2de429d1fab9f2569b568 (diff)
download ffmpeg-71155d7b4157fee44c0d3d0fc1b660ebfb9ccf46.tar.gz
dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
-rw-r--r-- libavcodec/x86/Makefile 2
-rw-r--r-- libavcodec/x86/dsputil_avg_template.c 789
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 874
-rw-r--r-- libavcodec/x86/hpeldsp.asm 465
-rw-r--r-- libavcodec/x86/mpeg4qpel.asm 558
-rw-r--r-- libavcodec/x86/vc1dsp_mmx.c 4
6 files changed, 1380 insertions, 1312 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 9b8b6531d0..1feb0607d4 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -71,3 +71,5 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
YASM-OBJS += x86/dsputil.o \
x86/deinterlace.o \
x86/fmtconvert.o \
+ x86/hpeldsp.o \
+ x86/mpeg4qpel.o \
diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c
index 4fc188c982..90e4074f7a 100644
--- a/libavcodec/x86/dsputil_avg_template.c
+++ b/libavcodec/x86/dsputil_avg_template.c
@@ -24,781 +24,54 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
- clobber bug - now it will work with 2.95.2 and also with -fPIC
- */
-static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-#ifndef SKIP_FOR_3DNOW
-static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-#endif /* SKIP_FOR_3DNOW */
-
-static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%1, %3), %%mm3 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 9(%1), %%mm2 \n\t"
- PAVGB" 9(%1, %3), %%mm3 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm2, 8(%2) \n\t"
- "movq %%mm3, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%1, %3), %%mm3 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 9(%1), %%mm2 \n\t"
- PAVGB" 9(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm2, 8(%2) \n\t"
- "movq %%mm3, 8(%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-#ifndef SKIP_FOR_3DNOW
-static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-#endif /* SKIP_FOR_3DNOW */
-
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm0 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm0 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile (
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq (%1, %3,2), %%mm0 \n\t"
- "movq 1(%1, %3,2), %%mm1 \n\t"
- "movq (%1, %4), %%mm2 \n\t"
- "movq 1(%1, %4), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "movq %%mm0, (%2, %3,2) \n\t"
- "movq %%mm2, (%2, %4) \n\t"
- "lea (%1, %3,4), %1 \n\t"
- "lea (%2, %3,4), %2 \n\t"
- "subl $4, %0 \n\t"
- "jg 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
- : "memory"
- );
-}
-
-static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+//FIXME the following could be optimized too ...
+static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h)
{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D" (block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
+ DEF(ff_put_no_rnd_pixels8_x2)(block, pixels, line_size, h);
+ DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h);
}
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D" (block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
+ DEF(ff_put_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h);
}
-static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h)
{
- __asm__ volatile (
- "movq (%1), %%mm0 \n\t"
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "add %3, %1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq (%1, %3,2), %%mm1 \n\t"
- "movq (%1, %4), %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm2, (%2, %3,2) \n\t"
- "movq %%mm1, (%2, %4) \n\t"
- "lea (%1, %3,4), %1 \n\t"
- "lea (%2, %3,4), %2 \n\t"
- "subl $4, %0 \n\t"
- "jg 1b \n\t"
- :"+g"(h), "+r"(pixels), "+r" (block)
- :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
- :"memory"
- );
+ DEF(ff_put_no_rnd_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h);
}
-static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%2), %%mm0 \n\t"
- "movq (%2, %3), %%mm1 \n\t"
- PAVGB" (%1), %%mm0 \n\t"
- PAVGB" (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%2), %%mm0 \n\t"
- "movq (%2, %3), %%mm1 \n\t"
- PAVGB" (%1), %%mm0 \n\t"
- PAVGB" (%1, %3), %%mm1 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
+ DEF(ff_avg_pixels8)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h);
}
-static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm2 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm2 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
+ DEF(ff_avg_pixels8_x2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h);
}
-static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq (%2, %3), %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- PAVGB" %%mm3, %%mm0 \n\t"
- PAVGB" %%mm4, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq (%2, %3), %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- PAVGB" %%mm4, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
+ DEF(ff_avg_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h);
}
-/* Note this is not correctly rounded, but this function is only
- * used for B-frames so it does not matter. */
-static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- PAVGB" (%2), %%mm2 \n\t"
- PAVGB" (%2, %3), %%mm1 \n\t"
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-//FIXME the following could be optimized too ...
-static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
- DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_pixels8_y2)(block , pixels , line_size, h);
- DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
- DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8)(block , pixels , line_size, h);
- DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_x2)(block , pixels , line_size, h);
- DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_y2)(block , pixels , line_size, h);
- DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
- DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+ DEF(ff_avg_pixels8_xy2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h);
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index f72500e4f0..743a7c116f 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -80,6 +80,107 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+
+void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
+ uint8_t *src2, int dstStride,
+ int src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
+ const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+
+void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
+{
+ ff_put_pixels8_mmxext(block, pixels, line_size, h);
+ ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride,
+ int h);
+void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride,
+ int h);
+void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+
+
#if HAVE_INLINE_ASM
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
@@ -190,32 +291,34 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#undef PAVGB
#undef OP_AVG
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_YASM
+#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
+
/***********************************/
/* 3Dnow specific */
#define DEF(x) x ## _3dnow
-#define PAVGB "pavgusb"
-#define SKIP_FOR_3DNOW
#include "dsputil_avg_template.c"
#undef DEF
-#undef PAVGB
-#undef SKIP_FOR_3DNOW
/***********************************/
/* MMXEXT specific */
#define DEF(x) x ## _mmxext
-/* Introduced only in MMXEXT set */
-#define PAVGB "pavgb"
-
#include "dsputil_avg_template.c"
#undef DEF
-#undef PAVGB
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
@@ -815,382 +918,15 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
}
}
}
+#endif /* HAVE_INLINE_ASM */
-#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
- in0, in1, in2, in7, out, OP) \
- "paddw "#m4", "#m3" \n\t" /* x1 */ \
- "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
- "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
- "movq "#in7", "#m3" \n\t" /* d */ \
- "movq "#in0", %%mm5 \n\t" /* D */ \
- "paddw "#m3", %%mm5 \n\t" /* x4 */ \
- "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
- "movq "#in1", %%mm5 \n\t" /* C */ \
- "movq "#in2", %%mm6 \n\t" /* B */ \
- "paddw "#m6", %%mm5 \n\t" /* x3 */ \
- "paddw "#m5", %%mm6 \n\t" /* x2 */ \
- "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
- "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
- "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
- "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
- "psraw $5, %%mm5 \n\t" \
- "packuswb %%mm5, %%mm5 \n\t" \
- OP(%%mm5, out, %%mm7, d)
-
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
-static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
- uint8_t *src, \
- int dstStride, \
- int srcStride, \
- int h) \
-{ \
- uint64_t temp; \
- \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
- "paddw %%mm3, %%mm5 \n\t" /* b */ \
- "paddw %%mm2, %%mm6 \n\t" /* c */ \
- "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
- "paddw %%mm4, %%mm0 \n\t" /* a */ \
- "paddw %%mm1, %%mm5 \n\t" /* d */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
- "paddw %6, %%mm6 \n\t" \
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
- "psraw $5, %%mm0 \n\t" \
- "movq %%mm0, %5 \n\t" \
- /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
- \
- "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
- "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
- "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
- "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
- "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
- "paddw %%mm0, %%mm2 \n\t" /* b */ \
- "paddw %%mm5, %%mm3 \n\t" /* c */ \
- "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
- "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
- "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
- "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
- "paddw %%mm2, %%mm1 \n\t" /* a */ \
- "paddw %%mm6, %%mm4 \n\t" /* d */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
- "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
- "paddw %6, %%mm1 \n\t" \
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
- "psraw $5, %%mm3 \n\t" \
- "movq %5, %%mm1 \n\t" \
- "packuswb %%mm3, %%mm1 \n\t" \
- OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
- /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
- \
- "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
- "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
- "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
- "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
- "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
- "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
- "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
- "paddw %%mm1, %%mm5 \n\t" /* b */ \
- "paddw %%mm4, %%mm0 \n\t" /* c */ \
- "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
- "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
- "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
- "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
- "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
- "paddw %%mm3, %%mm2 \n\t" /* d */ \
- "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
- "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
- "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
- "paddw %%mm2, %%mm6 \n\t" /* a */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
- "paddw %6, %%mm0 \n\t" \
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
- "psraw $5, %%mm0 \n\t" \
- /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
- /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
- \
- "paddw %%mm5, %%mm3 \n\t" /* a */ \
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
- "paddw %%mm4, %%mm6 \n\t" /* b */ \
- "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
- "paddw %%mm1, %%mm4 \n\t" /* c */ \
- "paddw %%mm2, %%mm5 \n\t" /* d */ \
- "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
- "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
- "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
- "paddw %6, %%mm4 \n\t" \
- "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
- "psraw $5, %%mm4 \n\t" \
- "packuswb %%mm4, %%mm0 \n\t" \
- OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
- \
- "add %3, %0 \n\t" \
- "add %4, %1 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- : "+a"(src), "+c"(dst), "+D"(h) \
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
- /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
- : "memory" \
- ); \
-} \
- \
-static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
- uint8_t *src, \
- int dstStride, \
- int srcStride, \
- int h) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
- "paddw %%mm3, %%mm5 \n\t" /* b */ \
- "paddw %%mm2, %%mm6 \n\t" /* c */ \
- "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
- "paddw %%mm4, %%mm0 \n\t" /* a */ \
- "paddw %%mm1, %%mm5 \n\t" /* d */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
- "paddw %5, %%mm6 \n\t" \
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
- "psraw $5, %%mm0 \n\t" \
- /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
- \
- "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
- "paddw %%mm5, %%mm1 \n\t" /* a */ \
- "paddw %%mm6, %%mm2 \n\t" /* b */ \
- "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
- "paddw %%mm6, %%mm3 \n\t" /* c */ \
- "paddw %%mm5, %%mm4 \n\t" /* d */ \
- "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
- "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
- "paddw %5, %%mm1 \n\t" \
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
- "psraw $5, %%mm3 \n\t" \
- "packuswb %%mm3, %%mm0 \n\t" \
- OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
- \
- "add %3, %0 \n\t" \
- "add %4, %1 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- : "+a"(src), "+c"(dst), "+d"(h) \
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
- /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
- : "memory" \
- ); \
-}
+#if HAVE_YASM
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
-static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int dstStride, \
- int srcStride) \
-{ \
- uint64_t temp[17 * 4]; \
- uint64_t *temp_ptr = temp; \
- int count = 17; \
- \
- /* FIXME unroll */ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" \
- "movq (%0), %%mm1 \n\t" \
- "movq 8(%0), %%mm2 \n\t" \
- "movq 8(%0), %%mm3 \n\t" \
- "punpcklbw %%mm7, %%mm0 \n\t" \
- "punpckhbw %%mm7, %%mm1 \n\t" \
- "punpcklbw %%mm7, %%mm2 \n\t" \
- "punpckhbw %%mm7, %%mm3 \n\t" \
- "movq %%mm0, (%1) \n\t" \
- "movq %%mm1, 17 * 8(%1) \n\t" \
- "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
- "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
- "add $8, %1 \n\t" \
- "add %3, %0 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- : "+r"(src), "+r"(temp_ptr), "+r"(count) \
- : "r"((x86_reg)srcStride) \
- : "memory" \
- ); \
- \
- temp_ptr = temp; \
- count = 4; \
- \
- /* FIXME reorder for speed */ \
- __asm__ volatile ( \
- /* "pxor %%mm7, %%mm7 \n\t" */ \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" \
- "movq 8(%0), %%mm1 \n\t" \
- "movq 16(%0), %%mm2 \n\t" \
- "movq 24(%0), %%mm3 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
- \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
- \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
- \
- "add $136, %0 \n\t" \
- "add %6, %1 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- \
- : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
- : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
- /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
- "g"(4 - 14 * (x86_reg)dstStride) \
- : "memory" \
- ); \
-} \
- \
-static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int dstStride, \
- int srcStride) \
-{ \
- uint64_t temp[9 * 2]; \
- uint64_t *temp_ptr = temp; \
- int count = 9; \
- \
- /* FIXME unroll */ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" \
- "movq (%0), %%mm1 \n\t" \
- "punpcklbw %%mm7, %%mm0 \n\t" \
- "punpckhbw %%mm7, %%mm1 \n\t" \
- "movq %%mm0, (%1) \n\t" \
- "movq %%mm1, 9*8(%1) \n\t" \
- "add $8, %1 \n\t" \
- "add %3, %0 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- : "+r"(src), "+r"(temp_ptr), "+r"(count) \
- : "r"((x86_reg)srcStride) \
- : "memory" \
- ); \
- \
- temp_ptr = temp; \
- count = 2; \
- \
- /* FIXME reorder for speed */ \
- __asm__ volatile ( \
- /* "pxor %%mm7, %%mm7 \n\t" */ \
- "1: \n\t" \
- "movq (%0), %%mm0 \n\t" \
- "movq 8(%0), %%mm1 \n\t" \
- "movq 16(%0), %%mm2 \n\t" \
- "movq 24(%0), %%mm3 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
- \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
- \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
- \
- "add $72, %0 \n\t" \
- "add %6, %1 \n\t" \
- "decl %2 \n\t" \
- "jnz 1b \n\t" \
- \
- : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
- : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
- /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
- "g"(4 - 6 * (x86_reg)dstStride) \
- : "memory" \
- ); \
-} \
- \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
+ ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1198,16 +934,17 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
- stride, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
+ stride, 8); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
+ stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
- stride, 8); \
+ ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1215,10 +952,10 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
- stride, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
- stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
+ stride, 8); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1226,14 +963,17 @@ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
+ 8, stride); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
+ stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1241,9 +981,10 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
- OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
- stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
+ 8, stride); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1252,11 +993,13 @@ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1265,12 +1008,13 @@ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1279,11 +1023,13 @@ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1292,12 +1038,13 @@ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1306,10 +1053,11 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1318,10 +1066,11 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1329,10 +1078,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
+ 8, stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1340,11 +1091,12 @@ static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
- stride, 9); \
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1352,15 +1104,16 @@ static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[9]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
- stride, 9); \
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
+ ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1368,16 +1121,17 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
- stride, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
+ stride, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
- stride, stride, 16); \
+ ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
+ stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1385,10 +1139,10 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
- stride, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
- stride, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
+ stride, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
+ stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1396,15 +1150,17 @@ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
- stride); \
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
+ stride); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
int stride) \
{ \
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
+ stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1412,10 +1168,10 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
- stride); \
- OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
- stride, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
+ stride); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
+ stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1424,13 +1180,14 @@ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1439,13 +1196,14 @@ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1454,14 +1212,14 @@ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
- 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1470,14 +1228,14 @@ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
- 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1486,11 +1244,12 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1499,12 +1258,12 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
- 16, 16); \
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
- 16, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1512,11 +1271,12 @@ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
- stride, 17); \
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1524,11 +1284,12 @@ static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
- stride, 17); \
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
@@ -1536,9 +1297,10 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
- stride, 17); \
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
}
#define PUT_OP(a, b, temp, size) \
@@ -1549,13 +1311,13 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
"pavgb "#temp", "#a" \n\t" \
"mov"#size" "#a", "#b" \n\t"
-QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
-QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
-QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
put_pixels8_xy2_mmx(dst, src, stride, 8);
@@ -1760,20 +1522,24 @@ void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
avg_pixels16_mmx(dst, src, stride, 16);
}
+#endif /* HAVE_INLINE_ASM */
+#if HAVE_YASM
/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
int stride, int rnd)
{
- put_pixels8_mmx(dst, src, stride, 8);
+ ff_put_pixels8_mmx(dst, src, stride, 8); /* plain forward to the external put routine with a fixed last argument of 8; rnd is not used */
}
void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
int stride, int rnd)
{
- avg_pixels8_mmxext(dst, src, stride, 8);
+ ff_avg_pixels8_mmxext(dst, src, stride, 8); /* plain forward to the external avg routine with a fixed last argument of 8; rnd is not used */
}
+#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
static void vector_clipf_sse(float *dst, const float *src,
float min, float max, int len)
{
@@ -1950,7 +1716,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
@@ -1960,47 +1726,49 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
if (!high_bit_depth) {
- c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
- c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
- c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
- c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
- c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
- c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
- c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
- c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
}
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
if (!high_bit_depth) {
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
- c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
}
}
+#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
c->idct_put = ff_idct_xvid_mmxext_put;
c->idct_add = ff_idct_xvid_mmxext_add;
c->idct = ff_idct_xvid_mmxext;
}
+#endif /* HAVE_INLINE_ASM */
+#if HAVE_MMXEXT_EXTERNAL
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
-#endif /* HAVE_INLINE_ASM */
-#if HAVE_MMXEXT_EXTERNAL
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
@@ -2034,41 +1802,39 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
if (!high_bit_depth) {
- c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
- c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
- c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
- c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
- c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
- c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
- c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
- c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
- c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
}
}
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
-#endif /* HAVE_INLINE_ASM */
-#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
new file mode 100644
index 0000000000..8afd955bf0
--- /dev/null
+++ b/libavcodec/x86/hpeldsp.asm
@@ -0,0 +1,465 @@
+;******************************************************************************
+;* MMX optimized hpel functions
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_1
+
+SECTION_TEXT
+
+; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- block[x] = PAVGB(pixels[x], pixels[x+1]) for one 8-byte register per row
+%macro PUT_PIXELS8_X2 0
+cglobal put_pixels8_x2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d ; widen the int stride to pointer width where needed
+ lea r4, [r2*2] ; r4 = 2 * line_size
+.loop: ; four rows per iteration, as two unrolled row pairs
+ mova m0, [r1] ; row 0
+ mova m1, [r1+r2] ; row 1
+ PAVGB m0, [r1+1] ; average with the right-hand neighbour
+ PAVGB m1, [r1+r2+1]
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r1, r4 ; advance source and destination by two rows
+ add r0, r4
+ mova m0, [r1] ; row 2
+ mova m1, [r1+r2] ; row 3
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_PIXELS8_X2
+
+
+; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- 16-byte-wide form of put_pixels8_x2: two 8-byte halves per row
+%macro PUT_PIXELS_16 0
+cglobal put_pixels16_x2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d
+ lea r4, [r2*2] ; r4 = 2 * line_size
+.loop: ; four rows per iteration, as two unrolled row pairs
+ mova m0, [r1] ; row 0, bytes 0-7
+ mova m1, [r1+r2] ; row 1, bytes 0-7
+ mova m2, [r1+8] ; row 0, bytes 8-15
+ mova m3, [r1+r2+8] ; row 1, bytes 8-15
+ PAVGB m0, [r1+1] ; average each byte with its right-hand neighbour
+ PAVGB m1, [r1+r2+1]
+ PAVGB m2, [r1+9]
+ PAVGB m3, [r1+r2+9]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+8], m2
+ mova [r0+r2+8], m3
+ add r1, r4 ; advance both pointers by two rows
+ add r0, r4
+ mova m0, [r1] ; rows 2 and 3, same pattern
+ mova m1, [r1+r2]
+ mova m2, [r1+8]
+ mova m3, [r1+r2+8]
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ PAVGB m2, [r1+9]
+ PAVGB m3, [r1+r2+9]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+8], m2
+ mova [r0+r2+8], m3
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS_16
+INIT_MMX 3dnow
+PUT_PIXELS_16
+
+
+; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- horizontal average biased downwards: one input is decremented (saturating) before PAVGB
+%macro PUT_NO_RND_PIXELS8_X2 0
+cglobal put_no_rnd_pixels8_x2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ mova m6, [pb_1] ; external constant; presumably 1 in every byte -- confirm against its definition
+ movsxdifnidn r2, r2d
+ lea r4, [r2*2] ; r4 = 2 * line_size
+.loop: ; four rows per iteration
+ mova m0, [r1] ; row 0, pixels[x]
+ mova m2, [r1+r2] ; row 1, pixels[x]
+ mova m1, [r1+1] ; row 0, pixels[x+1]
+ mova m3, [r1+r2+1] ; row 1, pixels[x+1]
+ add r1, r4
+ psubusb m0, m6 ; saturating subtract: a byte that is already 0 stays 0, so the result is not exact for such bytes
+ psubusb m2, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1] ; rows 2 and 3, same pattern
+ mova m1, [r1+1]
+ mova m2, [r1+r2]
+ mova m3, [r1+r2+1]
+ add r0, r4
+ add r1, r4
+ psubusb m0, m6
+ psubusb m2, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ mova [r0], m0
+ mova [r0+r2], m2
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2
+
+
+; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- horizontal average computed as ~PAVGB(~a, ~b), which equals (a + b) >> 1 for every byte value
+%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
+cglobal put_no_rnd_pixels8_x2_exact, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d
+ lea r4, [r2*3] ; r4 = 3 * line_size, used to address the fourth row of each group
+ pcmpeqb m6, m6 ; m6 = all ones, the xor mask for complementing
+.loop: ; four rows per iteration at offsets 0, r2, r2*2 and r4
+ mova m0, [r1] ; row 0, pixels[x]
+ mova m2, [r1+r2] ; row 1, pixels[x]
+ mova m1, [r1+1] ; row 0, pixels[x+1]
+ mova m3, [r1+r2+1] ; row 1, pixels[x+1]
+ pxor m0, m6 ; complement all four inputs
+ pxor m2, m6
+ pxor m1, m6
+ pxor m3, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ pxor m0, m6 ; complement the averages back
+ pxor m2, m6
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1+r2*2] ; rows 2 and 3, same pattern
+ mova m1, [r1+r2*2+1]
+ mova m2, [r1+r4]
+ mova m3, [r1+r4+1]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ pxor m0, m6
+ pxor m2, m6
+ mova [r0+r2*2], m0
+ mova [r0+r4], m2
+ lea r1, [r1+r2*4] ; advance both pointers by four rows
+ lea r0, [r0+r2*4]
+ sub r3d, 4
+ jg .loop ; continue while rows remain (stops once the counter is <= 0)
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2_EXACT
+
+
+; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- vertical half-pel put: each output row is PAVGB of a source row and the row below it
+%macro PUT_PIXELS8_Y2 0
+cglobal put_pixels8_y2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d
+ lea r4, [r2*2] ; r4 = 2 * line_size
+ mova m0, [r1] ; preload source row 0; m0 carries the last row read across iterations
+ sub r0, r2 ; bias block by -1 row so the stores below can use [r0+r2] / [r0+r4]
+.loop: ; four output rows per iteration, reading source rows 1..4
+ mova m1, [r1+r2] ; source row 1
+ mova m2, [r1+r4] ; source row 2
+ add r1, r4
+ PAVGB m0, m1 ; output row 0 = avg(row 0, row 1)
+ PAVGB m1, m2 ; output row 1 = avg(row 1, row 2)
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2] ; source row 3
+ mova m0, [r1+r4] ; source row 4, kept in m0 for the next iteration
+ add r0, r4
+ add r1, r4
+ PAVGB m2, m1 ; output row 2 = avg(row 2, row 3)
+ PAVGB m1, m0 ; output row 3 = avg(row 3, row 4)
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- vertical average biased downwards: every second source row is decremented (saturating) before PAVGB
+%macro PUT_NO_RND_PIXELS8_Y2 0
+cglobal put_no_rnd_pixels8_y2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ mova m6, [pb_1] ; external constant; presumably 1 in every byte -- confirm against its definition
+ movsxdifnidn r2, r2d
+ lea r4, [r2+r2] ; r4 = 2 * line_size
+ mova m0, [r1] ; preload source row 0; m0 carries the last row read across iterations
+ sub r0, r2 ; bias block by -1 row so the stores below can use [r0+r2] / [r0+r4]
+.loop: ; four output rows per iteration
+ mova m1, [r1+r2] ; source row 1
+ mova m2, [r1+r4] ; source row 2
+ add r1, r4
+ psubusb m1, m6 ; decrement the row shared by both averages below
+ PAVGB m0, m1 ; output row 0 from rows 0 and (1 - 1)
+ PAVGB m1, m2 ; output row 1 from rows (1 - 1) and 2
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2] ; source row 3
+ mova m0, [r1+r4] ; source row 4, kept in m0 for the next iteration
+ add r0, r4
+ add r1, r4
+ psubusb m1, m6 ; same bias on row 3
+ PAVGB m2, m1
+ PAVGB m1, m0
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- vertical average computed as ~PAVGB(~a, ~b), which equals (a + b) >> 1 for every byte value
+%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
+cglobal put_no_rnd_pixels8_y2_exact, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d
+ lea r4, [r2*3] ; r4 = 3 * line_size
+ mova m0, [r1] ; source row 0
+ pcmpeqb m6, m6 ; m6 = all ones, the xor mask for complementing
+ add r1, r2 ; r1 now points at source row 1
+ pxor m0, m6 ; m0 is kept complemented while it is carried between iterations
+.loop: ; four output rows per iteration
+ mova m1, [r1] ; source row 1
+ mova m2, [r1+r2] ; source row 2
+ pxor m1, m6
+ pxor m2, m6
+ PAVGB m0, m1 ; rows 0 and 1 (complemented)
+ PAVGB m1, m2 ; rows 1 and 2 (complemented)
+ pxor m0, m6 ; complement the averages back
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova m1, [r1+r2*2] ; source row 3
+ mova m0, [r1+r4] ; source row 4
+ pxor m1, m6
+ pxor m0, m6 ; stays complemented for the next iteration
+ PAVGB m2, m1 ; rows 2 and 3 (complemented)
+ PAVGB m1, m0 ; rows 3 and 4 (complemented)
+ pxor m2, m6
+ pxor m1, m6
+ mova [r0+r2*2], m2
+ mova [r0+r4], m1
+ lea r1, [r1+r2*4] ; advance both pointers by four rows
+ lea r0, [r0+r2*4]
+ sub r3d, 4
+ jg .loop ; continue while rows remain (stops once the counter is <= 0)
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2_EXACT
+
+
+; avg_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- block[x] = PAVGB(block[x], pixels[x]) for one 8-byte register per row
+%macro AVG_PIXELS8 0
+cglobal avg_pixels8, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d ; widen the stride from the low half of its own argument register; a hard-coded edx is only right where r2 happens to be rdx/edx
+ lea r4, [r2+r2] ; r4 = 2 * line_size
+.loop: ; four rows per iteration, as two unrolled row pairs
+ mova m0, [r0] ; current destination rows 0 and 1
+ mova m1, [r0+r2]
+ PAVGB m0, [r1] ; blend with the source rows
+ PAVGB m1, [r1+r2]
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r1, r4 ; advance both pointers by two rows
+ add r0, r4
+ mova m0, [r0] ; rows 2 and 3, same pattern
+ mova m1, [r0+r2]
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnow
+AVG_PIXELS8
+
+; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- block[x] = PAVGB(block[x], PAVGB(pixels[x], pixels[x+1]))
+%macro AVG_PIXELS8_X2 0
+cglobal avg_pixels8_x2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d ; widen the stride from the low half of its own argument register; a hard-coded edx is only right where r2 happens to be rdx/edx
+ lea r4, [r2*2] ; r4 = 2 * line_size
+.loop: ; four rows per iteration, as two unrolled row pairs
+ mova m0, [r1] ; source rows 0 and 1
+ mova m2, [r1+r2]
+ PAVGB m0, [r1+1] ; horizontal half-pel: average with the right-hand neighbour
+ PAVGB m2, [r1+r2+1]
+ PAVGB m0, [r0] ; then blend with what is already in the destination
+ PAVGB m2, [r0+r2]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1] ; rows 2 and 3, same pattern
+ mova m2, [r1+r2]
+ PAVGB m0, [r1+1]
+ PAVGB m2, [r1+r2+1]
+ add r0, r4
+ add r1, r4
+ PAVGB m0, [r0]
+ PAVGB m2, [r0+r2]
+ mova [r0], m0
+ mova [r0+r2], m2
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_X2
+INIT_MMX 3dnow
+AVG_PIXELS8_X2
+
+
+; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- vertical half-pel average of adjacent source rows, then PAVGB with the existing destination row
+%macro AVG_PIXELS8_Y2 0
+cglobal avg_pixels8_y2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ movsxdifnidn r2, r2d
+ lea r4, [r2*2] ; r4 = 2 * line_size
+ mova m0, [r1] ; preload source row 0; m0 carries the last row read across iterations
+ sub r0, r2 ; bias block by -1 row so the accesses below can use [r0+r2] / [r0+r4]
+.loop: ; four output rows per iteration
+ mova m1, [r1+r2] ; source row 1
+ mova m2, [r1+r4] ; source row 2
+ add r1, r4
+ PAVGB m0, m1 ; avg(row 0, row 1)
+ PAVGB m1, m2 ; avg(row 1, row 2)
+ mova m3, [r0+r2] ; existing destination rows 0 and 1
+ mova m4, [r0+r4]
+ PAVGB m0, m3 ; blend with the destination
+ PAVGB m1, m4
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2] ; source row 3
+ mova m0, [r1+r4] ; source row 4, kept in m0 for the next iteration
+ PAVGB m2, m1 ; avg(row 2, row 3)
+ PAVGB m1, m0 ; avg(row 3, row 4)
+ add r0, r4
+ add r1, r4
+ mova m3, [r0+r2] ; existing destination rows 2 and 3
+ mova m4, [r0+r4]
+ PAVGB m2, m3
+ PAVGB m1, m4
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_Y2
+INIT_MMX 3dnow
+AVG_PIXELS8_Y2
+
+
+; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- approximate 2x2 half-pel average (nested PAVGBs, not bit-exact), then PAVGB with the existing destination row
+%macro AVG_PIXELS8_XY2 0
+cglobal avg_pixels8_xy2, 4,5 ; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 is scratch
+ mova m6, [pb_1] ; external constant; presumably 1 in every byte -- confirm against its definition
+ movsxdifnidn r2, r2d
+ lea r4, [r2*2] ; r4 = 2 * line_size
+ mova m0, [r1] ; source row 0
+ PAVGB m0, [r1+1] ; horizontal average of row 0; PAVGB wrapper (as in the rest of the file) because this macro is also built for 3dnow
+.loop: ; four output rows per iteration
+ mova m2, [r1+r4] ; source row 2
+ mova m1, [r1+r2] ; source row 1
+ psubusb m2, m6 ; decrement every second row to offset the upward rounding of the chained averages
+ PAVGB m1, [r1+r2+1] ; horizontal average of row 1
+ PAVGB m2, [r1+r4+1] ; horizontal average of row 2
+ add r1, r4
+ PAVGB m0, m1 ; vertical average, rows 0 and 1
+ PAVGB m1, m2 ; vertical average, rows 1 and 2
+ PAVGB m0, [r0] ; blend with the destination
+ PAVGB m1, [r0+r2]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova m1, [r1+r2] ; source row 3
+ mova m0, [r1+r4] ; source row 4, its horizontal average stays in m0 for the next iteration
+ PAVGB m1, [r1+r2+1]
+ PAVGB m0, [r1+r4+1]
+ add r0, r4
+ add r1, r4
+ PAVGB m2, m1 ; vertical average, rows 2 and 3
+ PAVGB m1, m0 ; vertical average, rows 3 and 4
+ PAVGB m2, [r0]
+ PAVGB m1, [r0+r2]
+ mova [r0], m2
+ mova [r0+r2], m1 ; fourth output row comes from m1; storing m2 here would duplicate the third row
+ add r0, r4
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; loops until the counter is exactly zero, so h is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_PIXELS8_XY2
diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
new file mode 100644
index 0000000000..39c9fc803f
--- /dev/null
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -0,0 +1,558 @@
+;******************************************************************************
+;* mpeg4 qpel
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_1
+cextern pw_3
+cextern pw_15
+cextern pw_16
+cextern pw_20
+
+
+SECTION_TEXT
+
+; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -- dst = (src1 + src2) >> 1 per byte; src2 is read as packed 8-byte rows
+%macro PUT_NO_RND_PIXELS8_L2 0
+cglobal put_no_rnd_pixels8_l2, 6,6 ; r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5 = h
+ movsxdifnidn r4, r4d
+ movsxdifnidn r3, r3d
+ pcmpeqb m6, m6 ; m6 = all ones; ~PAVGB(~a, ~b) gives the truncating average
+ test r5d, 1
+ je .loop ; even h: go straight to the main loop
+ mova m0, [r1] ; odd h: handle a single leading row first
+ mova m1, [r2]
+ add r1, r4
+ add r2, 8 ; src2 advances by a fixed 8 bytes per row
+ pxor m0, m6
+ pxor m1, m6
+ PAVGB m0, m1
+ pxor m0, m6
+ mova [r0], m0
+ add r0, r3
+ dec r5d
+.loop: ; four rows per iteration
+ mova m0, [r1] ; src1 rows 0 and 1
+ add r1, r4
+ mova m1, [r1]
+ add r1, r4
+ mova m2, [r2] ; src2 rows 0 and 1
+ mova m3, [r2+8]
+ pxor m0, m6 ; complement inputs
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6 ; complement results back
+ pxor m1, m6
+ mova [r0], m0
+ add r0, r3
+ mova [r0], m1
+ add r0, r3
+ mova m0, [r1] ; src1 rows 2 and 3
+ add r1, r4
+ mova m1, [r1]
+ add r1, r4
+ mova m2, [r2+16] ; src2 rows 2 and 3
+ mova m3, [r2+24]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ add r0, r3
+ mova [r0], m1
+ add r0, r3
+ add r2, 32 ; four packed src2 rows consumed
+ sub r5d, 4
+ jne .loop ; loops until the counter is exactly zero: after the optional odd row, the remainder is expected to be a multiple of 4
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_L2
+
+
+; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -- dst = (src1 + src2) >> 1 per byte; src2 is read as packed 16-byte rows
+%macro PUT_NO_RND_PIXELS16_l2 0
+cglobal put_no_rnd_pixels16_l2, 6,6 ; six arguments: r5 (h) is read below, so it must be declared and loaded like in the 8-wide version
+ movsxdifnidn r3, r3d ; widen dstStride from its 32-bit half; with identical operands the macro would emit nothing
+ movsxdifnidn r4, r4d
+ pcmpeqb m6, m6 ; m6 = all ones; ~PAVGB(~a, ~b) gives the truncating average
+ test r5d, 1
+ je .loop ; even h: go straight to the main loop
+ mova m0, [r1] ; odd h: handle a single leading row first (two 8-byte halves)
+ mova m1, [r1+8]
+ mova m2, [r2]
+ mova m3, [r2+8]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ add r1, r4
+ add r2, 16 ; src2 advances by a fixed 16 bytes per row
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ dec r5d
+.loop: ; two rows per iteration
+ mova m0, [r1] ; src1 row 0
+ mova m1, [r1+8]
+ add r1, r4
+ mova m2, [r2] ; src2 row 0
+ mova m3, [r2+8]
+ pxor m0, m6 ; complement inputs
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6 ; complement results back
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ mova m0, [r1] ; src1 row 1
+ mova m1, [r1+8]
+ add r1, r4
+ mova m2, [r2+16] ; src2 row 1
+ mova m3, [r2+24]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ add r2, 32 ; two packed src2 rows consumed
+ sub r5d, 2
+ jne .loop ; loops until the counter is exactly zero
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS16_l2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS16_l2
+
+%macro MPEG4_QPEL16_H_LOWPASS 1 ; %1 = name prefix; horizontal qpel lowpass, 16 output bytes per row; needs PW_ROUND and OP_MOV defined by the instantiation site
+cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 ; r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count; 8 bytes of stack reserved for one spill
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ pxor m7, m7 ; m7 = 0, used to widen bytes to words
+.loop: ; one row per iteration
+ mova m0, [r1] ; source bytes 0-7
+ mova m1, m0
+ mova m2, m0
+ punpcklbw m0, m7 ; m0 = bytes 0-3 as words
+ punpckhbw m1, m7 ; m1 = bytes 4-7 as words
+ pshufw m5, m0, 0x90 ; shuffled copies of the first words supply the taps left of the row start
+ pshufw m6, m0, 0x41
+ mova m3, m2
+ mova m4, m2
+ psllq m2, 8 ; byte-shifted copies give the remaining neighbouring taps
+ psllq m3, 16
+ psllq m4, 24
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m3
+ paddw m6, m2
+ paddw m5, m5 ; doubled here and multiplied by pw_3 below: weight 6
+ psubw m6, m5
+ pshufw m5, m0, 6
+ pmullw m6, [pw_3]
+ paddw m0, m4
+ paddw m5, m1
+ pmullw m0, [pw_20]
+ psubw m0, m5
+ paddw m6, [PW_ROUND] ; rounding constant chosen by the instantiation
+ paddw m0, m6
+ psraw m0, 5
+ mova [rsp], m0 ; spill output words 0-3 into the 8 bytes reserved above (was [rsp-8], below the stack pointer)
+ mova m0, [r1+5]
+ mova m5, m0
+ mova m6, m0
+ psrlq m0, 8
+ psrlq m5, 16
+ punpcklbw m0, m7
+ punpcklbw m5, m7
+ paddw m2, m0
+ paddw m3, m5
+ paddw m2, m2
+ psubw m3, m2
+ mova m2, m6
+ psrlq m6, 24
+ punpcklbw m2, m7
+ punpcklbw m6, m7
+ pmullw m3, [pw_3]
+ paddw m1, m2
+ paddw m4, m6
+ pmullw m1, [pw_20]
+ psubw m3, m4
+ paddw m1, [PW_ROUND]
+ paddw m3, m1
+ psraw m3, 5
+ mova m1, [rsp] ; reload output words 0-3
+ packuswb m1, m3 ; saturate-pack words 0-7 into 8 bytes
+ OP_MOV [r0], m1, m4 ; write output bytes 0-7 (put or avg, per OP_MOV)
+ mova m1, [r1+9]
+ mova m4, m1
+ mova m3, m1
+ psrlq m1, 8
+ psrlq m4, 16
+ punpcklbw m1, m7
+ punpcklbw m4, m7
+ paddw m5, m1
+ paddw m0, m4
+ paddw m5, m5
+ psubw m0, m5
+ mova m5, m3
+ psrlq m3, 24
+ pmullw m0, [pw_3]
+ punpcklbw m3, m7
+ paddw m2, m3
+ psubw m0, m2
+ mova m2, m5
+ punpcklbw m2, m7
+ punpckhbw m5, m7
+ paddw m6, m2
+ pmullw m6, [pw_20]
+ paddw m0, [PW_ROUND]
+ paddw m0, m6
+ psraw m0, 5
+ paddw m3, m5
+ pshufw m6, m5, 0xf9 ; shuffled copies of the last words supply the taps right of the row end
+ paddw m6, m4
+ pshufw m4, m5, 0xbe
+ pshufw m5, m5, 0x6f
+ paddw m4, m1
+ paddw m5, m2
+ paddw m6, m6
+ psubw m4, m6
+ pmullw m3, [pw_20]
+ pmullw m4, [pw_3]
+ psubw m3, m5
+ paddw m4, [PW_ROUND]
+ paddw m4, m3
+ psraw m4, 5
+ packuswb m0, m4 ; saturate-pack words 8-15 into 8 bytes
+ OP_MOV [r0+8], m0, m4 ; write output bytes 8-15
+ add r1, r3 ; next source row
+ add r0, r2 ; next destination row
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+%macro PUT_OP 2-3 ; %1 = destination, %2 = value register; an optional third argument is accepted and ignored
+ mova %1, %2 ; plain store
+%endmacro
+
+%macro AVG_OP 2-3 ; %1 = destination, %2 = value register, %3 = scratch register (used below)
+ mova %3, %1 ; fetch what is currently at the destination
+ pavgb %2, %3 ; average it with the new value
+ mova %1, %2 ; store the blend
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put ; pw_16 rounding, plain store
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL16_H_LOWPASS avg ; pw_16 rounding, averaged into the destination
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put_no_rnd ; pw_15 rounding, plain store
+
+
+
+%macro MPEG4_QPEL8_H_LOWPASS 1 ; %1 = name prefix; horizontal qpel lowpass, 8 output bytes per row; needs PW_ROUND and OP_MOV defined by the instantiation site
+cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8 ; r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count; NOTE(review): the 8 bytes of stack requested here are never referenced in this macro
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ pxor m7, m7 ; m7 = 0, used to widen bytes to words
+.loop: ; one row per iteration
+ mova m0, [r1] ; source bytes 0-7
+ mova m1, m0
+ mova m2, m0
+ punpcklbw m0, m7 ; m0 = bytes 0-3 as words
+ punpckhbw m1, m7 ; m1 = bytes 4-7 as words
+ pshufw m5, m0, 0x90 ; shuffled copies of the first words supply the taps left of the row start
+ pshufw m6, m0, 0x41
+ mova m3, m2
+ mova m4, m2
+ psllq m2, 8 ; byte-shifted copies give the remaining neighbouring taps
+ psllq m3, 16
+ psllq m4, 24
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m3
+ paddw m6, m2
+ paddw m5, m5 ; doubled here and multiplied by pw_3 below: weight 6
+ psubw m6, m5
+ pshufw m5, m0, 0x6
+ pmullw m6, [pw_3]
+ paddw m0, m4
+ paddw m5, m1
+ pmullw m0, [pw_20]
+ psubw m0, m5
+ paddw m6, [PW_ROUND] ; rounding constant chosen by the instantiation
+ paddw m0, m6
+ psraw m0, 5 ; m0 = output words 0-3
+ movh m5, [r1+5] ; load the bytes that follow for the second half
+ punpcklbw m5, m7
+ pshufw m6, m5, 0xf9 ; shuffled copies of the last words supply the taps right of the row end
+ paddw m1, m5
+ paddw m2, m6
+ pshufw m6, m5, 0xbe
+ pshufw m5, m5, 0x6f
+ paddw m3, m6
+ paddw m4, m5
+ paddw m2, m2
+ psubw m3, m2
+ pmullw m1, [pw_20]
+ pmullw m3, [pw_3]
+ psubw m3, m4
+ paddw m1, [PW_ROUND]
+ paddw m3, m1
+ psraw m3, 5 ; m3 = output words 4-7
+ packuswb m0, m3 ; saturate-pack to 8 bytes
+ OP_MOV [r0], m0, m4 ; write the row (put or avg, per OP_MOV)
+ add r1, r3 ; next source row
+ add r0, r2 ; next destination row
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL8_H_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put_no_rnd
+
+
+
+%macro QPEL_V_LOW 5 ; one vertical filter step: (20*(m0+m1) - 6*(%3+m2) + 3*(%2+m3) - (%1+%4) + PW_ROUND) >> 5, stored to %5 via OP_MOV
+ paddw m0, m1 ; sum of the two taps weighted 20
+ mova m4, [pw_20]
+ pmullw m4, m0 ; m4 = 20 * (m0 + m1)
+ mova m0, %4 ; load the next row of words into m0 (m0's old value is no longer needed)
+ mova m5, %1
+ paddw m5, m0 ; %1 + %4: the two taps weighted -1
+ psubw m4, m5
+ mova m5, %2
+ mova m6, %3
+ paddw m5, m3 ; %2 + m3: taps weighted 3
+ paddw m6, m2 ; %3 + m2: taps weighted -6
+ paddw m6, m6 ; doubled here, times pw_3 below
+ psubw m5, m6
+ pmullw m5, [pw_3] ; m5 = 3*(%2 + m3) - 6*(%3 + m2)
+ paddw m4, [PW_ROUND] ; rounding constant chosen by the instantiation
+ paddw m5, m4
+ psraw m5, 5
+ packuswb m5, m5 ; saturate to bytes
+ OP_MOV %5, m5, m7 ; store (put or avg, per OP_MOV); m7 is handed over as scratch
+ SWAP 0,1,2,3 ; rotate the names m0-m3 so the next invocation sees the window moved down one row
+%endmacro
+
+%macro MPEG4_QPEL16_V_LOWPASS 1 ; %1 = name prefix; vertical qpel lowpass over 16 columns; needs PW_ROUND and OP_MOV defined by the instantiation site
+cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 ; r0 = dst, r1 = src, r2 = dst stride, r3 = src stride; 544 bytes of stack = 4 column groups * 17 rows * 8 bytes
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+
+ mov r4d, 17 ; 17 source rows are read
+ mov r5, rsp ; r5 walks the stack buffer
+ pxor m7, m7 ; m7 = 0 for widening bytes to words
+.looph: ; pass 1: widen each source row to words and file it into four per-column-group buffers
+ mova m0, [r1]
+ mova m1, [r1]
+ mova m2, [r1+8]
+ mova m3, [r1+8]
+ punpcklbw m0, m7 ; columns 0-3
+ punpckhbw m1, m7 ; columns 4-7
+ punpcklbw m2, m7 ; columns 8-11
+ punpckhbw m3, m7 ; columns 12-15
+ mova [r5], m0 ; each group buffer is 0x88 = 17 * 8 bytes long
+ mova [r5+0x88], m1
+ mova [r5+0x110], m2
+ mova [r5+0x198], m3
+ add r5, 8 ; next row slot
+ add r1, r3 ; next source row
+ dec r4d
+ jne .looph
+
+
+ ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
+ mov r4d, 4 ; pass 2 runs once per column group
+ mov r1, 4
+ neg r2
+ lea r1, [r1+r2*8]
+ lea r1, [r1+r2*4]
+ lea r1, [r1+r2*2]
+ neg r2 ; restore the positive stride
+ mov r5, rsp ; back to the first column group
+.loopv: ; pass 2: filter one column group top to bottom, 16 output rows
+ pxor m7, m7 ; re-zero m7 on every pass (it is handed to OP_MOV as scratch inside QPEL_V_LOW)
+ mova m0, [r5+ 0x0] ; rows 0-3 seed the sliding window
+ mova m1, [r5+ 0x8]
+ mova m2, [r5+0x10]
+ mova m3, [r5+0x18]
+ QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] ; the first steps reuse the top rows for taps above the window
+ QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+ QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
+ QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
+ QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
+ QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
+ QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] ; the last steps reuse the bottom rows (0x80, 0x78, 0x70) for taps below the window
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
+ QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
+
+ add r5, 0x88 ; next column group buffer
+ add r0, r1 ; r0 advanced 14 rows above; r1 = 4 - 14*stride takes it back to the top row, 4 bytes to the right
+ dec r4d
+ jne .loopv
+ REP_RET
+%endmacro
+
+%macro PUT_OPH 2-3 ; %1 = destination, %2 = value register; an optional third argument is accepted and ignored
+ movh %1, %2 ; movh store of the result
+%endmacro
+
+%macro AVG_OPH 2-3 ; %1 = destination, %2 = value register, %3 = scratch register (used below)
+ movh %3, %1 ; fetch what is currently at the destination
+ pavgb %2, %3 ; average it with the new value
+ movh %1, %2 ; store the blend
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put ; pw_16 rounding, plain store
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL16_V_LOWPASS avg ; pw_16 rounding, averaged into the destination
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put_no_rnd ; pw_15 rounding, plain store
+
+
+
+%macro MPEG4_QPEL8_V_LOWPASS 1 ; %1 = name prefix; vertical qpel lowpass over 8 columns; needs PW_ROUND and OP_MOV defined by the instantiation site
+cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288 ; r0 = dst, r1 = src, r2 = dst stride, r3 = src stride; NOTE(review): 288 bytes are reserved but the code below only touches 2 * 0x48 = 144
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+
+ mov r4d, 9 ; 9 source rows are read
+ mov r5, rsp ; r5 walks the stack buffer
+ pxor m7, m7 ; m7 = 0 for widening bytes to words
+.looph: ; pass 1: widen each source row to words and file it into two per-column-group buffers
+ mova m0, [r1]
+ mova m1, [r1]
+ punpcklbw m0, m7 ; columns 0-3
+ punpckhbw m1, m7 ; columns 4-7
+ mova [r5], m0 ; each group buffer is 0x48 = 9 * 8 bytes long
+ mova [r5+0x48], m1
+ add r5, 8 ; next row slot
+ add r1, r3 ; next source row
+ dec r4d
+ jne .looph
+
+
+ ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
+ mov r4d, 2 ; pass 2 runs once per column group
+ mov r1, 4
+ neg r2
+ lea r1, [r1+r2*4]
+ lea r1, [r1+r2*2]
+ neg r2 ; restore the positive stride
+ mov r5, rsp ; back to the first column group
+.loopv: ; pass 2: filter one column group top to bottom, 8 output rows
+ pxor m7, m7 ; re-zero m7 on every pass (it is handed to OP_MOV as scratch inside QPEL_V_LOW)
+ mova m0, [r5+ 0x0] ; rows 0-3 seed the sliding window
+ mova m1, [r5+ 0x8]
+ mova m2, [r5+0x10]
+ mova m3, [r5+0x18]
+ QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] ; the first steps reuse the top rows for taps above the window
+ QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+ QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] ; the last steps reuse the bottom rows (0x40, 0x38, 0x30) for taps below the window
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
+ QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
+
+ add r5, 0x48 ; next column group buffer
+ add r0, r1 ; r0 advanced 6 rows above; r1 = 4 - 6*stride takes it back to the top row, 4 bytes to the right
+ dec r4d
+ jne .loopv
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL8_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put_no_rnd
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index a64ec414be..5037aeea24 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -697,7 +697,9 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
+#if HAVE_YASM
dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
+#endif /* HAVE_YASM */
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
@@ -720,7 +722,9 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
+#if HAVE_YASM
dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmxext;
+#endif /* HAVE_YASM */
dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;