Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than

regular MMX code. Examples of this are the Core1 CPU. Instead, set a new flag, FF_MM_SSE2/3SLOW, which can be checked for particular SSE2/3 functions that have been checked specifically on such CPUs and are actually faster than their MMX counterparts. In addition, use this flag to enable particular VP8 and LPC SSE2 functions that are faster than their MMX counterparts. Based on a patch by Loren Merritt <lorenm AT u washington edu>. Originally committed as revision 24340 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Ronald S. Bultje <rsbultje@gmail.com> 2010-07-19 22:38:23 +0000
committer: Ronald S. Bultje <rsbultje@gmail.com> 2010-07-19 22:38:23 +0000
commit: 6526976f0cbb3fa152797b3a15bd634ad14cabe3 (patch)
tree: e4c61c62e99aa4d99b3e1adb67cde6a227a18dc1
parent: 1878f685c0f69d1bf0acc78c5fc09dae03ac48d5 (diff)
download: ffmpeg-6526976f0cbb3fa152797b3a15bd634ad14cabe3.tar.gz
4 files changed, 29 insertions, 6 deletions
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index ff295760cf..560d57594b 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1656,8 +1656,12 @@ typedef struct AVCodecContext {
 #define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
 #define FF_MM_SSE      0x0008 ///< SSE functions
 #define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
+#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
+                                  ///< than regular MMX/SSE (e.g. Core1)
 #define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
 #define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
+#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
+                                  ///< than regular MMX/SSE (e.g. Core1)
 #define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
 #define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
 #define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
diff --git a/libavcodec/x86/cpuid.c b/libavcodec/x86/cpuid.c
index 1ed4d2e7e3..f9afd6e729 100644
--- a/libavcodec/x86/cpuid.c
+++ b/libavcodec/x86/cpuid.c
@@ -42,6 +42,8 @@ int mm_support(void)
     int rval = 0;
     int eax, ebx, ecx, edx;
     int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+    int family=0, model=0;
+    union { int i[3]; char c[12]; } vendor;
 
 #if ARCH_X86_32
     x86_reg a, c;
@@ -70,10 +72,12 @@ int mm_support(void)
         return 0; /* CPUID not supported */
 #endif
 
-    cpuid(0, max_std_level, ebx, ecx, edx);
+    cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
 
     if(max_std_level >= 1){
         cpuid(1, eax, ebx, ecx, std_caps);
+        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
         if (std_caps & (1<<23))
             rval |= FF_MM_MMX;
         if (std_caps & (1<<25))
@@ -108,13 +112,24 @@ int mm_support(void)
             rval |= FF_MM_MMX2;
     }
 
+    if (!strncmp(vendor.c, "GenuineIntel", 12) &&
+        family == 6 && (model == 9 || model == 13 || model == 14)) {
+        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+         * theoretically support sse2, but it's usually slower than mmx,
+         * so let's just pretend they don't. */
+        if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2;
+        if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3;
+    }
+
 #if 0
-    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
+    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n",
         (rval&FF_MM_MMX) ? "MMX ":"",
         (rval&FF_MM_MMX2) ? "MMX2 ":"",
         (rval&FF_MM_SSE) ? "SSE ":"",
         (rval&FF_MM_SSE2) ? "SSE2 ":"",
+        (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"",
         (rval&FF_MM_SSE3) ? "SSE3 ":"",
+        (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"",
         (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
         (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
         (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index f491111ca5..d3e412a3dc 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1409,9 +1409,10 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
             c->hadamard8_diff[0]= hadamard8_diff16_sse2;
             c->hadamard8_diff[1]= hadamard8_diff_sse2;
-#if CONFIG_LPC
+        }
+
+        if (CONFIG_LPC && mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
             c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
-#endif
         }
 
 #if HAVE_SSSE3
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index c7b02d1541..f8de2d272f 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -328,7 +328,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (mm_flags & FF_MM_SSE2) {
+    if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -338,8 +338,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
+    }
+
+    if (mm_flags & FF_MM_SSE2) {
+        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
     }
author	Ronald S. Bultje <rsbultje@gmail.com>	2010-07-19 22:38:23 +0000
committer	Ronald S. Bultje <rsbultje@gmail.com>	2010-07-19 22:38:23 +0000
commit	6526976f0cbb3fa152797b3a15bd634ad14cabe3 (patch)
tree	e4c61c62e99aa4d99b3e1adb67cde6a227a18dc1
parent	1878f685c0f69d1bf0acc78c5fc09dae03ac48d5 (diff)
download	ffmpeg-6526976f0cbb3fa152797b3a15bd634ad14cabe3.tar.gz