* code with new PAVGB for MMX-only CPUs split into a separate file

and compiled in the same way as _avg.h * PAVG_MMX macros also accept an output parameter * implemented a faster put_pixels_xy2, but it has slightly lower precision. There is no visible difference in the image quality - it might eventually be easily switched back (#if 0 #endif) - please check Originally committed as revision 624 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Zdenek Kabelac <kabi@informatics.muni.cz> 2002-05-29 17:16:22 +0000
committer: Zdenek Kabelac <kabi@informatics.muni.cz> 2002-05-29 17:16:22 +0000
commit: 91abb473fb8432226918da4fe03365ebaf688978 (patch)
tree: 3c0635d9fe957b6793b5375d8e248efb8bf9d357 /libavcodec/i386/dsputil_mmx_rnd.h
parent: def60345ad03b26802b5bf6801e00d2f03d262d9 (diff)
download: ffmpeg-91abb473fb8432226918da4fe03365ebaf688978.tar.gz
1 files changed, 139 insertions, 0 deletions
diff --git a/libavcodec/i386/dsputil_mmx_rnd.h b/libavcodec/i386/dsputil_mmx_rnd.h
new file mode 100644
index 0000000000..787f706dc2
--- /dev/null
+++ b/libavcodec/i386/dsputil_mmx_rnd.h
@@ -0,0 +1,139 @@
+/*
+ * DSP utils mmx functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ */
+
+// Selects the longer, unrolled loop bodies (four rows per iteration instead of two) in put_pixels_x2 and put_pixels_y2.
+// TODO: it still has to be checked whether the bigger unrolled code is also better on Celerons - for now assume yes.
+#define LONG_UNROLL 1
+
+// put_pixels
+static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{   /* Horizontal half-pel "put": for every row, the 8 bytes at pixels and the
+     * 8 bytes at pixels + 1 are combined with PAVGB and 8 bytes are stored to
+     * block. Both pointers advance by line_size per row.
+     *
+     * Operands: %0 = h (rows left), %1 = pixels (esi), %2 = block (edi),
+     * %3 = line_size. eax is used as scratch and listed as clobbered.
+     *
+     * The loop subtracts 4 (LONG_UNROLL) or 2 from h and exits only when the
+     * result is exactly zero, so h must be a positive multiple of that step.
+     *
+     * NOTE(review): MOVQ_BFE and PAVGB are defined outside this file;
+     * presumably MOVQ_BFE loads a constant into mm7 that PAVGB needs and
+     * PAVGB leaves its result in mm6 (the stores below read mm6) - confirm
+     * against the including file.
+     */
+    __asm __volatile(
+	MOVQ_BFE(%%mm7)
+	"lea	(%3, %3), %%eax		\n\t" /* eax = 2 * line_size */
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* row 0, bytes 0..7 */
+	"movq	(%1, %3), %%mm2		\n\t" /* row 1, bytes 0..7 */
+	"movq	1(%1), %%mm1		\n\t" /* row 0, bytes 1..8 */
+	"movq	1(%1, %3), %%mm3	\n\t" /* row 1, bytes 1..8 */
+	PAVGB(%%mm0, %%mm1)
+	"movq	%%mm6, (%2)		\n\t" /* store output row 0 */
+	PAVGB(%%mm2, %%mm3)
+	"movq	%%mm6, (%2, %3)		\n\t" /* store output row 1 */
+	"addl	%%eax, %1		\n\t" /* advance source by two rows */
+	"addl	%%eax, %2		\n\t" /* advance destination by two rows */
+#if LONG_UNROLL
+	"movq	(%1), %%mm0		\n\t" /* second copy of the two-row step above */
+	"movq	(%1, %3), %%mm2		\n\t"
+	"movq	1(%1), %%mm1		\n\t"
+	"movq	1(%1, %3), %%mm3	\n\t"
+	PAVGB(%%mm0, %%mm1)
+	"movq	%%mm6, (%2)		\n\t"
+	PAVGB(%%mm2, %%mm3)
+	"movq	%%mm6, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"subl	$4, %0			\n\t" /* four rows done in this iteration */
+#else
+	"subl	$2, %0			\n\t" /* two rows done in this iteration */
+#endif
+	"jnz	1b			\n\t" /* loop until h reaches exactly zero */
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+/*
+ * Vertical half-pel "put": every output row is the PAVGB combination of two
+ * vertically adjacent 8-byte source rows (row n and row n + 1), so h + 1
+ * source rows are read in total.
+ *
+ * Operands: %0 = h (rows left), %1 = pixels (esi), %2 = block (edi),
+ * %3 = line_size. eax holds 2 * line_size and is listed as clobbered.
+ *
+ * Loop invariant: at label 1, mm0 holds the source row at (%1).
+ * The loop subtracts 4 (LONG_UNROLL) or 2 from h and exits only when the
+ * result is exactly zero, so h must be a positive multiple of that step.
+ *
+ * The unroll switch is tested with #if (as in put_pixels_x2), not #ifdef, so
+ * that "#define LONG_UNROLL 0" really selects the short loop.
+ *
+ * NOTE(review): MOVQ_BFE and PAVGB are defined outside this file; PAVGB is
+ * presumably leaving its result in mm6 and preserving its first operand (the
+ * unrolled half reuses mm2 after PAVGB(mm2, mm1)) - confirm against the
+ * including file.
+ */
+static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    __asm __volatile(
+	MOVQ_BFE(%%mm7)
+	"lea	(%3, %3), %%eax		\n\t" /* eax = 2 * line_size */
+	"movq	(%1), %%mm0		\n\t" /* mm0 = row 0 */
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1, %3), %%mm1		\n\t" /* mm1 = row 1 */
+	"movq	(%1, %%eax),%%mm2	\n\t" /* mm2 = row 2 */
+	PAVGB(%%mm1, %%mm0)
+	"movq	%%mm6, (%2)		\n\t" /* out 0 = rows 0,1 */
+	PAVGB(%%mm2, %%mm1)
+	"movq	%%mm6, (%2, %3)		\n\t" /* out 1 = rows 1,2 */
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+#if LONG_UNROLL
+	"movq	(%1, %3), %%mm1		\n\t" /* mm1 = row 3 */
+	"movq	(%1, %%eax),%%mm0	\n\t" /* mm0 = row 4, ready for the next pass */
+	PAVGB(%%mm1, %%mm2)
+	"movq	%%mm6, (%2)		\n\t" /* out 2 = rows 2,3 */
+	PAVGB(%%mm0, %%mm1)
+	"movq	%%mm6, (%2, %3)		\n\t" /* out 3 = rows 3,4 */
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"subl	$4, %0			\n\t"
+#else
+	/* Restore the loop invariant: the row now at (%1) is still in mm2. */
+	"movq	%%mm2, %%mm0		\n\t"
+	"subl	$2, %0			\n\t"
+#endif
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+/*
+ * Diagonal (x and y) half-pel "put", computed as ((a + b)/2 + (c + d)/2)/2:
+ * two vertically adjacent rows are combined at byte offset 0 and at byte
+ * offset 1 with PAVGBR, and the two intermediate results are then combined
+ * with PAVGB. Because the averaging is done in two stages, the result may
+ * differ slightly from a single (a + b + c + d)/4 computation.
+ * FIXME: not sure if this is properly replacing the original code.
+ *
+ * Operands: %0 = h (rows left), %1 = pixels (esi), %2 = block (edi),
+ * %3 = line_size. eax holds 2 * line_size and is listed as clobbered.
+ *
+ * Loop invariant: at label 1, mm0/mm1 hold the source row at (%1) at byte
+ * offsets 0 and 1. Two rows are produced per iteration and the loop exits
+ * only when h reaches exactly zero, so h must be a positive multiple of 2;
+ * h + 1 source rows are read in total.
+ *
+ * NOTE(review): MOVQ_BFE, PAVGB and PAVGBR are defined outside this file;
+ * presumably PAVGBR writes its result to its third argument and PAVGB to
+ * mm6 - confirm against the including file.
+ */
+static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    __asm __volatile(
+	MOVQ_BFE(%%mm7)
+	"lea	(%3, %3), %%eax		\n\t" /* eax = 2 * line_size */
+	"movq	(%1), %%mm0		\n\t" /* row 0, offset 0 (loaded once) */
+	"movq	1(%1), %%mm1		\n\t" /* row 0, offset 1 */
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1, %3), %%mm2		\n\t" /* row 1, offset 0 */
+	"movq	1(%1, %3), %%mm3	\n\t" /* row 1, offset 1 */
+	PAVGBR(%%mm2, %%mm0, %%mm4)
+	PAVGBR(%%mm3, %%mm1, %%mm5)
+	PAVGB(%%mm4, %%mm5)
+	"movq	%%mm6, (%2)		\n\t" /* out 0 from rows 0,1 */
+
+	"movq	(%1, %%eax), %%mm0	\n\t" /* row 2, offset 0: kept for the next pass */
+	"movq	1(%1, %%eax), %%mm1	\n\t" /* row 2, offset 1 */
+	PAVGBR(%%mm0, %%mm2, %%mm4)
+	PAVGBR(%%mm1, %%mm3, %%mm5)
+	PAVGB(%%mm4, %%mm5)
+	"movq	%%mm6, (%2, %3)		\n\t" /* out 1 from rows 1,2 */
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+
+	"subl	$2, %0			\n\t"
+
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+// avg_pixels
+
author	Zdenek Kabelac <kabi@informatics.muni.cz>	2002-05-29 17:16:22 +0000
committer	Zdenek Kabelac <kabi@informatics.muni.cz>	2002-05-29 17:16:22 +0000
commit	91abb473fb8432226918da4fe03365ebaf688978 (patch)
tree	3c0635d9fe957b6793b5375d8e248efb8bf9d357 /libavcodec/i386/dsputil_mmx_rnd.h
parent	def60345ad03b26802b5bf6801e00d2f03d262d9 (diff)
download	ffmpeg-91abb473fb8432226918da4fe03365ebaf688978.tar.gz