[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
Alan Kelly
alankelly at google.com
Fri Jun 25 10:54:29 EEST 2021
Intel Broadwell and later, and AMD Zen 3 and later, have fast gather instructions.
---
Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell,
and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3.
libavutil/cpu.h | 2 ++
libavutil/x86/cpu.c | 18 ++++++++++++++++--
libavutil/x86/cpu.h | 1 +
3 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index b555422dae..f94eb79af1 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -50,6 +50,7 @@
#define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions
#define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction
#define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVX2SLOW 0x2000000 ///< AVX2 supported but gather is slower.
#define AV_CPU_FLAG_FMA3 0x10000 ///< Haswell FMA3 functions
#define AV_CPU_FLAG_BMI1 0x20000 ///< Bit Manipulation Instruction Set 1
#define AV_CPU_FLAG_BMI2 0x40000 ///< Bit Manipulation Instruction Set 2
@@ -107,6 +108,7 @@ int av_cpu_count(void);
* av_set_cpu_flags_mask(), then this function will behave as if AVX is not
* present.
*/
+
size_t av_cpu_max_align(void);
#endif /* AVUTIL_CPU_H */
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..56fcde594c 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,20 @@ int ff_get_cpu_flags_x86(void)
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
- if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+ if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)){
rval |= AV_CPU_FLAG_AVX2;
+
+ cpuid(1, eax, ebx, ecx, std_caps);
+ family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
// Haswell and earlier have slow gather
+ if(family == 6 && model < 70)
+ rval |= AV_CPU_FLAG_AVX2SLOW;
+ // Zen 2 and earlier
+ if (!strncmp(vendor.c, "AuthenticAMD", 12) && family < 25)
+ rval |= AV_CPU_FLAG_AVX2SLOW;
+ }
+
#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
@@ -194,8 +206,10 @@ int ff_get_cpu_flags_x86(void)
functions using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
- if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
+ if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)){
rval |= AV_CPU_FLAG_AVXSLOW;
+ rval |= AV_CPU_FLAG_AVX2SLOW;
+ }
}
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 937c697fa0..a42a15a997 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -78,6 +78,7 @@
#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
+#define EXTERNAL_AVX2_FAST_GATHER(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX2)
#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
--
2.32.0.93.g670b81a890-goog
More information about the ffmpeg-devel mailing list