x86: add AV_CPU_FLAG_AVXSLOW flag

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>

James Almer authored on 2015/05/27 02:29:06
Showing 5 changed files
@@ -13,6 +13,9 @@ libavutil:     2014-08-09

 API changes, most recent first:

+2015-xx-xx - xxxxxxx - lavu 54.14.0 - cpu.h
+  Add AV_CPU_FLAG_AVXSLOW.
+
 2015-xx-xx - xxxxxxx - lavc 56.23.0
   Add av_vda_default_init2.

@@ -86,6 +86,7 @@ int av_parse_cpu_flags(const char *s)
 #define CPUFLAG_SSE4     (AV_CPU_FLAG_SSE4     | CPUFLAG_SSSE3)
 #define CPUFLAG_SSE42    (AV_CPU_FLAG_SSE42    | CPUFLAG_SSE4)
 #define CPUFLAG_AVX      (AV_CPU_FLAG_AVX      | CPUFLAG_SSE42)
+#define CPUFLAG_AVXSLOW  (AV_CPU_FLAG_AVXSLOW  | CPUFLAG_AVX)
 #define CPUFLAG_XOP      (AV_CPU_FLAG_XOP      | CPUFLAG_AVX)
 #define CPUFLAG_FMA3     (AV_CPU_FLAG_FMA3     | CPUFLAG_AVX)
 #define CPUFLAG_FMA4     (AV_CPU_FLAG_FMA4     | CPUFLAG_AVX)

@@ -108,6 +109,7 @@ int av_parse_cpu_flags(const char *s)
         { "sse4.1"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE4         },    .unit = "flags" },
         { "sse4.2"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42        },    .unit = "flags" },
         { "avx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX          },    .unit = "flags" },
+        { "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVXSLOW      },    .unit = "flags" },
         { "xop"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP          },    .unit = "flags" },
         { "fma3"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3         },    .unit = "flags" },
         { "fma4"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4         },    .unit = "flags" },

@@ -219,6 +221,7 @@ static const struct {
     { AV_CPU_FLAG_SSE4,      "sse4.1"     },
     { AV_CPU_FLAG_SSE42,     "sse4.2"     },
     { AV_CPU_FLAG_AVX,       "avx"        },
+    { AV_CPU_FLAG_AVXSLOW,   "avxslow"    },
     { AV_CPU_FLAG_XOP,       "xop"        },
     { AV_CPU_FLAG_FMA3,      "fma3"       },
     { AV_CPU_FLAG_FMA4,      "fma4"       },
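
With the "avxslow" entries above, the flag can be selected by name through av_parse_cpu_flags(). As a hedged illustration of how a caller might use that (mirroring what avconv's -cpuflags handling does, but not code from this commit), parsing a user-supplied string and forcing the result could look like:

#include "libavutil/cpu.h"

/* Illustrative sketch only: parse a cpuflags string such as "avxslow" and
 * force the result, overriding runtime CPU detection. */
static int force_user_cpu_flags(const char *s)
{
    int flags = av_parse_cpu_flags(s);
    if (flags < 0)
        return flags;            /* negative AVERROR code on parse failure */
    av_force_cpu_flags(flags);   /* later av_get_cpu_flags() calls see this set */
    return 0;
}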

@@ -45,6 +45,7 @@
 #define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
 #define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
 #define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
 #define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
 #define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
 #define AV_CPU_FLAG_CMOV         0x1000 ///< i686 cmov
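
Because AV_CPU_FLAG_AVX remains set on the affected CPUs, existing AVX code keeps running; functions that specifically benefit from 256-bit YMM code are expected to check the new bit as well. A minimal sketch of that pattern, assuming a hypothetical function table (ExampleDSPContext and the add_floats_* symbols are placeholders, not part of this commit):

#include "libavutil/cpu.h"

/* Hypothetical DSP function table used only for this illustration. */
typedef struct ExampleDSPContext {
    void (*add_floats)(float *dst, const float *src, int len);
} ExampleDSPContext;

void add_floats_avx_xmm(float *dst, const float *src, int len); /* 128-bit */
void add_floats_avx_ymm(float *dst, const float *src, int len); /* 256-bit */

static void example_dsp_init(ExampleDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (cpu_flags & AV_CPU_FLAG_AVX) {
        /* On Bulldozer-class CPUs both AVX and AVXSLOW are set, so keep
         * the XMM variant there and use the YMM code everywhere else. */
        if (cpu_flags & AV_CPU_FLAG_AVXSLOW)
            dsp->add_floats = add_floats_avx_xmm;
        else
            dsp->add_floats = add_floats_avx_ymm;
    }
}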

@@ -54,8 +54,8 @@
  */

 #define LIBAVUTIL_VERSION_MAJOR 54
-#define LIBAVUTIL_VERSION_MINOR 13
-#define LIBAVUTIL_VERSION_MICRO  1
+#define LIBAVUTIL_VERSION_MINOR 14
+#define LIBAVUTIL_VERSION_MICRO  0

 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
                                                LIBAVUTIL_VERSION_MINOR, \

@@ -167,6 +167,7 @@ int ff_get_cpu_flags_x86(void)
         if (ext_caps & (1 << 22))
             rval |= AV_CPU_FLAG_MMXEXT;

+        if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
         /* Allow for selectively disabling SSE2 functions on AMD processors
            with SSE2 support but not SSE4a. This includes Athlon64, some
            Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster

@@ -174,9 +175,19 @@ int ff_get_cpu_flags_x86(void)
            AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
            so that SSE2 is used unless explicitly disabled by checking
            AV_CPU_FLAG_SSE2SLOW. */
-        if (!strncmp(vendor.c, "AuthenticAMD", 12) &&
-            rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) {
-            rval |= AV_CPU_FLAG_SSE2SLOW;
+            if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
+                rval |= AV_CPU_FLAG_SSE2SLOW;
+
+        /* Similar to the above but for AVX functions on AMD processors.
+           This is necessary only for functions using YMM registers on Bulldozer
+           based CPUs, as they lack 256-bit execution units. SSE/AVX functions
+           using XMM registers are always faster on them.
+           AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
+           used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW.
+           TODO: Confirm whether Excavator is affected by this once it's
+                 released, and update the check if necessary. Same for btver2. */
+            if (family == 0x15 && (rval & AV_CPU_FLAG_AVX))
+                rval |= AV_CPU_FLAG_AVXSLOW;
         }

         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
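
For reference, the family value compared against 0x15 above is the combined CPUID family, and 0x15 covers the Bulldozer/Piledriver/Steamroller parts this check targets. A rough sketch of how that value is conventionally derived from CPUID leaf 1 (an illustrative helper, not the code used in this file):

/* CPUID leaf 1, EAX: base family in bits 8-11, extended family in bits
 * 20-27. By convention the extended field is added only when the base
 * family reads 0xf; AMD Bulldozer reports 0xf + 0x6 = 0x15. */
static int cpuid_combined_family(unsigned eax)
{
    int family = (eax >> 8) & 0xf;
    if (family == 0xf)
        family += (eax >> 20) & 0xff;
    return family;
}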