aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Loren Merritt <lorenm@u.washington.edu> 2007-05-11 01:11:45 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2007-05-11 01:11:45 +0000
commit: 72946825fa6083a8074984036257ed09109d1910 (patch)
tree: 166fc01f5de70054cbff0df8d6768aa230ed5b79
parent: 164d75ebf3d6868926598661d78d8372d0020105 (diff)
download: ffmpeg-72946825fa6083a8074984036257ed09109d1910.tar.gz
sse2 version of fullpel sad.
16% faster on core2, 5% faster on p4. 10% slower (and thus disabled) on k8. Originally committed as revision 8992 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/i386/motion_est_mmx.c 32
1 files changed, 32 insertions, 0 deletions
diff --git a/libavcodec/i386/motion_est_mmx.c b/libavcodec/i386/motion_est_mmx.c
index af8db79b6b..b6998f67b5 100644
--- a/libavcodec/i386/motion_est_mmx.c
+++ b/libavcodec/i386/motion_est_mmx.c
@@ -88,6 +88,35 @@ static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
);
}
+/**
+ * Sum of absolute differences over a 16-pixel-wide block, SSE2 version.
+ *
+ * Two rows are consumed per loop iteration and the loop body always runs at
+ * least once, so h is expected to be a positive even number. Partial sums
+ * are accumulated with 16-bit word adds (paddw), so the total has to fit in
+ * 16 bits; that holds for h <= 16 (16 * 16 * 255 = 65280).
+ *
+ * @param v      not used by this function
+ * @param blk2   block read directly as the psadbw memory operand; the
+ *               instruction requires each accessed row to be 16-byte aligned
+ * @param blk1   block loaded with movdqu, so it may be unaligned
+ * @param stride byte offset between consecutive rows, applied to both blocks
+ * @param h      number of rows to compare
+ * @return the sum of absolute byte differences of the two 16 x h blocks
+ */
+static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
+{
+    int ret;
+    /* Everything lives in ONE asm statement: the compiler gives no guarantee
+     * that xmm6 survives between two separate asm statements, so the final
+     * reduction must not be split off from the accumulation loop. */
+    asm volatile(
+        "pxor %%xmm6, %%xmm6            \n\t" /* xmm6 = running sum */
+        ASMALIGN(4)
+        "1:                             \n\t"
+        "movdqu (%1), %%xmm0            \n\t" /* row n   of blk1 */
+        "movdqu (%1, %4), %%xmm1        \n\t" /* row n+1 of blk1 */
+        "psadbw (%2), %%xmm0            \n\t" /* SAD against row n   of blk2 */
+        "psadbw (%2, %4), %%xmm1        \n\t" /* SAD against row n+1 of blk2 */
+        "paddw %%xmm0, %%xmm6           \n\t"
+        "paddw %%xmm1, %%xmm6           \n\t"
+        "lea (%1,%4,2), %1              \n\t" /* advance both blocks 2 rows */
+        "lea (%2,%4,2), %2              \n\t"
+        "sub $2, %0                     \n\t"
+        " jg 1b                         \n\t"
+        /* psadbw leaves one partial sum in each 64-bit half: fold the high
+         * half onto the low half and extract the low 32 bits. */
+        "movhlps %%xmm6, %%xmm0         \n\t"
+        "paddw   %%xmm0, %%xmm6         \n\t"
+        "movd    %%xmm6, %3             \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
+        : "r" ((long)stride)
+        /* The asm reads the pixel data through the pointers, changes the
+         * flags and overwrites these SSE registers. */
+        : "memory", "cc", "xmm0", "xmm1", "xmm6"
+    );
+    return ret;
+}
+
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
asm volatile(
@@ -424,4 +453,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
c->pix_abs[1][3] = sad8_xy2_mmx2;
}
}
+ if ((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)) {
+ c->sad[0]= sad16_sse2;
+ }
}