Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 74 additions & 12 deletions lib/arm/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

Expand Down Expand Up @@ -130,6 +131,51 @@ static u32 query_arm_cpu_features(void)
return features;
}

#ifdef ARCH_ARM64
/*
* Return whether cpu0's MIDR_EL1 identifies one of the Arm Neoverse
* V-class server cores (V1 / V2 / V3 / V3AE). MIDR_EL1 is exposed
* unprivileged via sysfs (added in Linux 4.7). Reading cpu0 only is fine
* in practice: no Neoverse V-class server SKU has shipped as part of a
* big.LITTLE cluster. Any failure (file missing, read error, parse
* failure, unrecognized CPU) returns false.
*/
static bool arm64_cpu_is_neoverse_v_class(void)
{
int fd;
char buf[32];
ssize_t n;
unsigned long midr;
u32 part;

fd = open("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
O_RDONLY);
if (fd < 0)
return false;
do {
n = read(fd, buf, sizeof(buf) - 1);
} while (n < 0 && errno == EINTR);
close(fd);
if (n <= 0)
return false;
buf[n] = '\0';
midr = strtoul(buf, NULL, 0); /* sysfs prints "0x%016llx\n" */

/* MIDR_EL1: [31:24]=Implementer, [15:4]=PartNum. */
if (((midr >> 24) & 0xff) != 0x41) /* Implementer must be Arm Ltd. */
return false;
part = (midr >> 4) & 0xfff;
switch (part) {
case 0xd40: /* Neoverse V1 (e.g. AWS Graviton 3) */
case 0xd4f: /* Neoverse V2 (e.g. AWS Graviton 4) */
case 0xd83: /* Neoverse V3AE */
case 0xd84: /* Neoverse V3 */
return true;
}
return false;
}
#endif /* ARCH_ARM64 */

#elif defined(__APPLE__)
/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */

Expand Down Expand Up @@ -202,24 +248,40 @@ static const struct cpu_feature arm_cpu_feature_table[] = {
{ARM_CPU_FEATURE_DOTPROD, "dotprod"},
};

/*
* Whether to set ARM_CPU_FEATURE_PREFER_PMULL on this CPU. This is the
* right choice on CPUs whose pmull pipes have more aggregate throughput
* than the crc32 unit -- in measured cases by a wide margin: the Apple M
* series sustains ~68 GB/s on pmull vs ~25 GB/s on crc32 (M1), and the Arm
* Neoverse V class sustains ~40 GB/s vs ~22 GB/s (Graviton 4 / V2).
*
* We detect Apple at compile time, and Neoverse V-class cores at runtime
* via MIDR_EL1 on Linux. Elsewhere we leave this unset, and the dispatcher
* picks the crc32-instruction path which is the right default for most
* other modern ARM CPUs.
*/
static bool arm_cpu_prefers_pmull(void)
{
#if defined(__APPLE__) && TARGET_OS_OSX
return true;
#elif defined(__linux__) && defined(ARCH_ARM64)
if (arm64_cpu_is_neoverse_v_class())
return true;
#endif
#ifdef TEST_SUPPORT__DO_NOT_USE
return true;
#endif
return false;
}

volatile u32 libdeflate_arm_cpu_features = 0;

void libdeflate_init_arm_cpu_features(void)
{
u32 features = query_arm_cpu_features();

/*
* On the Apple M1 processor, crc32 instructions max out at about 25.5
* GB/s in the best case of using a 3-way or greater interleaved chunked
* implementation, whereas a pmull-based implementation achieves 68 GB/s
* provided that the stride length is large enough (about 10+ vectors
* with eor3, or 12+ without).
*
* Assume that crc32 instructions are preferable in other cases.
*/
#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE)
features |= ARM_CPU_FEATURE_PREFER_PMULL;
#endif
if (arm_cpu_prefers_pmull())
features |= ARM_CPU_FEATURE_PREFER_PMULL;

disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
ARRAY_LEN(arm_cpu_feature_table));
Expand Down
Loading