aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZdenek Kabelac <kabi@informatics.muni.cz>2002-05-30 15:14:56 +0000
committerZdenek Kabelac <kabi@informatics.muni.cz>2002-05-30 15:14:56 +0000
commitfca0f0e59f4879423a6d5396c3e8aeb6306ab09c (patch)
tree634a419cd13d78bcac7702471946ef697c88825f
parent4e33b83b18770b8f2b260e2efdaf7305ffcf45a7 (diff)
downloadffmpeg-fca0f0e59f4879423a6d5396c3e8aeb6306ab09c.tar.gz
* removed MANGLE from macros for setting constants
* using MOVQ_WONE/MOVQ_BFE as two instructions instead of a static memory value access, as it's always faster
* PAVGB_MMX macro now uses mm6, so mm7 is left unmodified
* replaced original pixels_xy2_mmx with a new, faster and equivalent implementation
* replaced usage of mm7 for anything other than the ZERO constant in the _rnd & _avg files with mm6
Originally committed as revision 632 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--libavcodec/i386/dsputil_mmx.c167
-rw-r--r--libavcodec/i386/dsputil_mmx_avg.h20
-rw-r--r--libavcodec/i386/dsputil_mmx_rnd.h114
3 files changed, 112 insertions, 189 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 2e8baef4fe..fb8fdc741e 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -49,53 +49,51 @@ void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);
/* pixel operations */
-static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL;
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
-//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
-//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
+#define MOVQ_WONE(regd) \
+ __asm __volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd ::)
+
+#define MOVQ_BFE(regd) \
+ __asm __volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
+ "paddb %%" #regd ", %%" #regd " \n\t" ::)
+
#ifndef PIC
-#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
+#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
-#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
-#define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t"
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
-#define MOVQ_WONE(regd) \
+#define MOVQ_BONE(regd) \
__asm __volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd ::)
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "packuswb %%" #regd ", %%" #regd " \n\t" ::)
#define MOVQ_WTWO(regd) \
__asm __volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd " \n\t" \
- "psllw $1, %%" #regd ::)
-
-#define MOVQ_BONE(regd) \
- "pcmpeqd " #regd ", " #regd " \n\t" \
- "psrlw $15, " #regd " \n\t"\
- "packuswb " #regd ", " #regd " \n\t"
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "psllw $1, %%" #regd " \n\t"::)
-#define MOVQ_BFE(regd) \
- "pcmpeqd " #regd ", " #regd " \n\t"\
- "paddb " #regd ", " #regd " \n\t"
#endif
-// using mm6 as temporary and for the output result
+// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
-// mm7 is supposed to contain 0xfefefefefefefefe
+// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr) \
"movq " #rega ", " #regr " \n\t"\
"pand " #regb ", " #regr " \n\t"\
"pxor " #rega ", " #regb " \n\t"\
- "pand %%mm7, " #regb " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
"psrlq $1, " #regb " \n\t"\
"paddb " #regb ", " #regr " \n\t"
@@ -103,7 +101,7 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U
"movq " #rega ", " #regr " \n\t"\
"por " #regb ", " #regr " \n\t"\
"pxor " #rega ", " #regb " \n\t"\
- "pand %%mm7, " #regb " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
"psrlq $1, " #regb " \n\t"\
"psubb " #regb ", " #regr " \n\t"
@@ -114,8 +112,8 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U
"pand " #regd ", " #regp " \n\t"\
"pxor " #rega ", " #regb " \n\t"\
"pxor " #regc ", " #regd " \n\t"\
- "pand %%mm7, " #regb " \n\t"\
- "pand %%mm7, " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
"psrlq $1, " #regb " \n\t"\
"psrlq $1, " #regd " \n\t"\
"paddb " #regb ", " #regr " \n\t"\
@@ -128,8 +126,8 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U
"por " #regd ", " #regp " \n\t"\
"pxor " #rega ", " #regb " \n\t"\
"pxor " #regc ", " #regd " \n\t"\
- "pand %%mm7, " #regb " \n\t"\
- "pand %%mm7, " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
"psrlq $1, " #regd " \n\t"\
"psrlq $1, " #regb " \n\t"\
"psubb " #regb ", " #regr " \n\t"\
@@ -138,29 +136,25 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
-
-#define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6)
-#define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c)
+#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+
#include "dsputil_mmx_rnd.h"
#undef DEF
-#undef PAVGB
-#undef PAVGBR
+#undef SET_RND
#undef PAVGBP
/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ##_mmx
-
-#define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6)
-#define PAVGBR(a, b, c) PAVGB_MMX(a, b, c)
+#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
+
#include "dsputil_mmx_rnd.h"
#undef DEF
-#undef PAVGB
-#undef PAVGBR
+#undef SET_RND
#undef PAVGBP
/***********************************/
@@ -371,103 +365,6 @@ static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int
);
}
-#if 1
-static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
- UINT8 *p;
- const UINT8 *pix;
- p = block;
- pix = pixels; // 1s
- MOVQ_ZERO(mm7);
- MOVQ_WTWO(mm6);
- JUMPALIGN();
- do {
- __asm __volatile(
- "movq %1, %%mm0\n\t"
- "movq %2, %%mm1\n\t"
- "movq 1%1, %%mm4\n\t"
- "movq 1%2, %%mm5\n\t"
- "movq %%mm0, %%mm2\n\t"
- "movq %%mm1, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpcklbw %%mm7, %%mm1\n\t"
- "punpckhbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "paddusw %%mm1, %%mm0\n\t"
- "paddusw %%mm3, %%mm2\n\t"
- "movq %%mm4, %%mm1\n\t"
- "movq %%mm5, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm4\n\t"
- "punpcklbw %%mm7, %%mm5\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "paddusw %%mm5, %%mm4\n\t"
- "paddusw %%mm3, %%mm1\n\t"
- "paddusw %%mm6, %%mm4\n\t"
- "paddusw %%mm6, %%mm1\n\t"
- "paddusw %%mm4, %%mm0\n\t"
- "paddusw %%mm1, %%mm2\n\t"
- "psrlw $2, %%mm0\n\t"
- "psrlw $2, %%mm2\n\t"
- "packuswb %%mm2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
- :"=m"(*p)
- :"m"(*pix),
- "m"(*(pix+line_size))
- :"memory");
- pix += line_size;
- p += line_size;
- } while(--h);
-}
-
-static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
- UINT8 *p;
- const UINT8 *pix;
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- MOVQ_WONE(mm6);
- JUMPALIGN();
- do {
- __asm __volatile(
- "movq %1, %%mm0\n\t"
- "movq %2, %%mm1\n\t"
- "movq 1%1, %%mm4\n\t"
- "movq 1%2, %%mm5\n\t"
- "movq %%mm0, %%mm2\n\t"
- "movq %%mm1, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpcklbw %%mm7, %%mm1\n\t"
- "punpckhbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "paddusw %%mm1, %%mm0\n\t"
- "paddusw %%mm3, %%mm2\n\t"
- "movq %%mm4, %%mm1\n\t"
- "movq %%mm5, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm4\n\t"
- "punpcklbw %%mm7, %%mm5\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "paddusw %%mm5, %%mm4\n\t"
- "paddusw %%mm3, %%mm1\n\t"
- "paddusw %%mm6, %%mm4\n\t"
- "paddusw %%mm6, %%mm1\n\t"
- "paddusw %%mm4, %%mm0\n\t"
- "paddusw %%mm1, %%mm2\n\t"
- "psrlw $2, %%mm0\n\t"
- "psrlw $2, %%mm2\n\t"
- "packuswb %%mm2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
- :"=m"(*p)
- :"m"(*pix),
- "m"(*(pix+line_size))
- :"memory");
- pix += line_size;
- p += line_size;
- } while(--h);
-}
-#endif
static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
UINT8 *p;
diff --git a/libavcodec/i386/dsputil_mmx_avg.h b/libavcodec/i386/dsputil_mmx_avg.h
index 0178144e97..a16ccc88b0 100644
--- a/libavcodec/i386/dsputil_mmx_avg.h
+++ b/libavcodec/i386/dsputil_mmx_avg.h
@@ -56,17 +56,17 @@ static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size,
/* GL: this function does incorrect rounding if overflow */
static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
+ MOVQ_BONE(mm6);
__asm __volatile(
"lea (%3, %3), %%eax \n\t"
- MOVQ_BONE(%%mm7)
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
"addl %%eax, %1 \n\t"
- "psubusb %%mm7, %%mm0 \n\t"
- "psubusb %%mm7, %%mm2 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, (%2) \n\t"
@@ -77,8 +77,8 @@ static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int lin
"movq 1(%1, %3), %%mm3 \n\t"
"addl %%eax, %2 \n\t"
"addl %%eax, %1 \n\t"
- "psubusb %%mm7, %%mm0 \n\t"
- "psubusb %%mm7, %%mm2 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, (%2) \n\t"
@@ -124,8 +124,8 @@ static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size,
/* GL: this function does incorrect rounding if overflow */
static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
+ MOVQ_BONE(mm6);
__asm __volatile(
- MOVQ_BONE(%%mm7)
"lea (%3, %3), %%eax \n\t"
"movq (%1), %%mm0 \n\t"
"subl %3, %2 \n\t"
@@ -133,7 +133,7 @@ static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int lin
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t"
"addl %%eax, %1 \n\t"
- "psubusb %%mm7, %%mm1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm2, %%mm1 \n\t"
"movq %%mm0, (%2, %3) \n\t"
@@ -142,7 +142,7 @@ static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int lin
"movq (%1, %%eax), %%mm0 \n\t"
"addl %%eax, %2 \n\t"
"addl %%eax, %1 \n\t"
- "psubusb %%mm7, %%mm1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm0, %%mm1 \n\t"
"movq %%mm2, (%2, %3) \n\t"
@@ -256,8 +256,8 @@ static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size,
// Note this is not correctly rounded, but this function is only used for b frames so it doesn't matter
static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
+ MOVQ_BONE(mm6);
__asm __volatile(
- MOVQ_BONE(%%mm7)
"lea (%3, %3), %%eax \n\t"
"movq (%1), %%mm0 \n\t"
PAVGB" 1(%1), %%mm0 \n\t"
@@ -265,7 +265,7 @@ static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size
"1: \n\t"
"movq (%1, %%eax), %%mm2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "psubusb %%mm7, %%mm2 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
PAVGB" 1(%1, %3), %%mm1 \n\t"
PAVGB" 1(%1, %%eax), %%mm2 \n\t"
"addl %%eax, %1 \n\t"
diff --git a/libavcodec/i386/dsputil_mmx_rnd.h b/libavcodec/i386/dsputil_mmx_rnd.h
index e43d4de58c..585fdb0e7f 100644
--- a/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/libavcodec/i386/dsputil_mmx_rnd.h
@@ -24,8 +24,8 @@
// put_pixels
static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
+ MOVQ_BFE(mm6);
__asm __volatile(
- MOVQ_BFE(%%mm7)
"lea (%3, %3), %%eax \n\t"
".balign 8 \n\t"
"1: \n\t"
@@ -33,18 +33,18 @@ static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm5, %%mm2, %%mm3, %%mm6)
- "movq %%mm5, (%2) \n\t"
- "movq %%mm6, (%2, %3) \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t"
"addl %%eax, %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm5, %%mm2, %%mm3, %%mm6)
- "movq %%mm5, (%2) \n\t"
- "movq %%mm6, (%2, %3) \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t"
"addl %%eax, %2 \n\t"
"subl $4, %0 \n\t"
@@ -56,24 +56,24 @@ static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size
static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
- __asm __volatile(
- MOVQ_BFE(%%mm7)
+ MOVQ_BFE(mm6);
+ __asm __volatile(
"lea (%3, %3), %%eax \n\t"
"movq (%1), %%mm0 \n\t"
".balign 8 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax),%%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm5, %%mm2, %%mm1, %%mm6)
- "movq %%mm5, (%2) \n\t"
- "movq %%mm6, (%2, %3) \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t"
"addl %%eax, %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax),%%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm5, %%mm0, %%mm1, %%mm6)
- "movq %%mm5, (%2) \n\t"
- "movq %%mm6, (%2, %3) \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t"
"addl %%eax, %2 \n\t"
"subl $4, %0 \n\t"
@@ -83,44 +83,70 @@ static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size
:"eax", "memory");
}
-// ((a + b)/2 + (c + d)/2)/2
-// not sure if this is properly replacing original code
-// - ok it's really unsable at this moment -> disabled
-static void DEF(put, disabled_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm __volatile(
- MOVQ_BFE(%%mm7)
- "lea (%3, %3), %%eax \n\t"
- "movq (%1), %%mm0 \n\t"
"movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- ".balign 8 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xorl %%eax, %%eax \n\t"
+ "addl %3, %1 \n\t"
+ ".balign 4 \n\t"
"1: \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm2, %%mm0, %%mm4, %%mm3, %%mm1, %%mm5)
- //PAVGBR(%%mm2, %%mm0, %%mm4)
- //PAVGBR(%%mm3, %%mm1, %%mm5)
- PAVGB(%%mm4, %%mm5)
- "movq %%mm6, (%2) \n\t"
-
"movq (%1, %%eax), %%mm0 \n\t"
- "movq 1(%1, %%eax), %%mm1 \n\t"
- PAVGBP(%%mm0, %%mm2, %%mm4, %%mm1, %%mm3, %%mm5)
- //PAVGBR(%%mm0, %%mm2, %%mm4)
- //PAVGBR(%%mm1, %%mm3, %%mm5)
- PAVGB(%%mm4, %%mm5)
- "movq %%mm6, (%2, %3) \n\t"
- "addl %%eax, %1 \n\t"
- "addl %%eax, %2 \n\t"
+ "movq 1(%1, %%eax), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%eax) \n\t"
+ "addl %3, %%eax \n\t"
- "subl $2, %0 \n\t"
+ "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%eax), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%eax) \n\t"
+ "addl %3, %%eax \n\t"
+ "subl $2, %0 \n\t"
"jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"(line_size)
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"(line_size)
:"eax", "memory");
}
// avg_pixels
-