port ape dsp functions from sse2 to mmx

now requires yasm Originally committed as revision 20722 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2009-12-03 18:53:12 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2009-12-03 18:53:12 +0000
commit: b10fa1bb8b9d4074b1d4dd7364ca236a574c9951 (patch)
tree: b7f75f84fb13d8a68ee8079da1f03089cc33b9c5 /libavcodec/x86/dsputil_mmx.c
parent: f415be684dda3958332d19e0854c080378eef4c2 (diff)
download: ffmpeg-b10fa1bb8b9d4074b1d4dd7364ca236a574c9951.tar.gz
1 files changed, 18 insertions, 75 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index af33707df0..93d4af5565 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,6 +2384,12 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2507,78 +2513,6 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
 
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqu   (%1,%2),   %%xmm0  \n\t"
-        "movdqu 16(%1,%2),   %%xmm1  \n\t"
-        "paddw    (%0,%2),   %%xmm0  \n\t"
-        "paddw  16(%0,%2),   %%xmm1  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm1,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js       1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqa    (%0,%2),   %%xmm0  \n\t"
-        "movdqa  16(%0,%2),   %%xmm2  \n\t"
-        "movdqu    (%1,%2),   %%xmm1  \n\t"
-        "movdqu  16(%1,%2),   %%xmm3  \n\t"
-        "psubw     %%xmm1,    %%xmm0  \n\t"
-        "psubw     %%xmm3,    %%xmm2  \n\t"
-        "movdqa    %%xmm0,    (%0,%2) \n\t"
-        "movdqa    %%xmm2,  16(%0,%2) \n\t"
-        "add       $32,       %2      \n\t"
-        "js        1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js        1b                     \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
-
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -3015,6 +2949,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+        if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_mmx2;
+            c->sub_int16 = ff_sub_int16_mmx2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+        }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-            c->add_int16 = add_int16_sse2;
-            c->sub_int16 = sub_int16_sse2;
-            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_sse2;
+            c->sub_int16 = ff_sub_int16_sse2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
         }
     }
author	Loren Merritt <lorenm@u.washington.edu>	2009-12-03 18:53:12 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2009-12-03 18:53:12 +0000
commit	b10fa1bb8b9d4074b1d4dd7364ca236a574c9951 (patch)
tree	b7f75f84fb13d8a68ee8079da1f03089cc33b9c5 /libavcodec/x86/dsputil_mmx.c
parent	f415be684dda3958332d19e0854c080378eef4c2 (diff)
download	ffmpeg-b10fa1bb8b9d4074b1d4dd7364ca236a574c9951.tar.gz