diff --git a/lib/arm/cpu_features.c b/lib/arm/cpu_features.c index 42f81f5e..7b2584b7 100644 --- a/lib/arm/cpu_features.c +++ b/lib/arm/cpu_features.c @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -130,6 +131,51 @@ static u32 query_arm_cpu_features(void) return features; } +#ifdef ARCH_ARM64 +/* + * Return whether cpu0's MIDR_EL1 identifies one of the Arm Neoverse + * V-class server cores (V1 / V2 / V3 / V3AE). MIDR_EL1 is exposed + * unprivileged via sysfs (added in Linux 4.7). Reading cpu0 only is fine + * in practice: no Neoverse V-class server SKU has shipped as part of a + * big.LITTLE cluster. Any failure (file missing, read error, parse + * failure, unrecognized CPU) returns false. + */ +static bool arm64_cpu_is_neoverse_v_class(void) +{ + int fd; + char buf[32]; + ssize_t n; + unsigned long midr; + u32 part; + + fd = open("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", + O_RDONLY); + if (fd < 0) + return false; + do { + n = read(fd, buf, sizeof(buf) - 1); + } while (n < 0 && errno == EINTR); + close(fd); + if (n <= 0) + return false; + buf[n] = '\0'; + midr = strtoul(buf, NULL, 0); /* sysfs prints "0x%016llx\n" */ + + /* MIDR_EL1: [31:24]=Implementer, [15:4]=PartNum. */ + if (((midr >> 24) & 0xff) != 0x41) /* Implementer must be Arm Ltd. */ + return false; + part = (midr >> 4) & 0xfff; + switch (part) { + case 0xd40: /* Neoverse V1 (e.g. AWS Graviton 3) */ + case 0xd4f: /* Neoverse V2 (e.g. AWS Graviton 4) */ + case 0xd83: /* Neoverse V3AE */ + case 0xd84: /* Neoverse V3 */ + return true; + } + return false; +} +#endif /* ARCH_ARM64 */ + #elif defined(__APPLE__) /* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */ @@ -202,24 +248,40 @@ static const struct cpu_feature arm_cpu_feature_table[] = { {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, }; +/* + * Whether to set ARM_CPU_FEATURE_PREFER_PMULL on this CPU. This is the + * right choice on CPUs whose pmull pipes have more aggregate throughput + * than the crc32 unit -- in measured cases by a wide margin: the Apple M + * series sustains ~68 GB/s on pmull vs ~25 GB/s on crc32 (M1), and the Arm + * Neoverse V class sustains ~40 GB/s vs ~22 GB/s (Graviton 4 / V2). + * + * We detect Apple at compile time, and Neoverse V-class cores at runtime + * via MIDR_EL1 on Linux. Elsewhere we leave this unset, and the dispatcher + * picks the crc32-instruction path which is the right default for most + * other modern ARM CPUs. + */ +static bool arm_cpu_prefers_pmull(void) +{ +#if defined(__APPLE__) && TARGET_OS_OSX + return true; +#elif defined(__linux__) && defined(ARCH_ARM64) + if (arm64_cpu_is_neoverse_v_class()) + return true; +#endif +#ifdef TEST_SUPPORT__DO_NOT_USE + return true; +#endif + return false; +} + volatile u32 libdeflate_arm_cpu_features = 0; void libdeflate_init_arm_cpu_features(void) { u32 features = query_arm_cpu_features(); - /* - * On the Apple M1 processor, crc32 instructions max out at about 25.5 - * GB/s in the best case of using a 3-way or greater interleaved chunked - * implementation, whereas a pmull-based implementation achieves 68 GB/s - * provided that the stride length is large enough (about 10+ vectors - * with eor3, or 12+ without). - * - * Assume that crc32 instructions are preferable in other cases. - */ -#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE) - features |= ARM_CPU_FEATURE_PREFER_PMULL; -#endif + if (arm_cpu_prefers_pmull()) + features |= ARM_CPU_FEATURE_PREFER_PMULL; disable_cpu_features_for_testing(&features, arm_cpu_feature_table, ARRAY_LEN(arm_cpu_feature_table));