diff --git a/lib/arm/cpu_features.c b/lib/arm/cpu_features.c
index 42f81f5e..7b2584b7 100644
--- a/lib/arm/cpu_features.c
+++ b/lib/arm/cpu_features.c
@@ -54,6 +54,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
@@ -130,6 +131,51 @@ static u32 query_arm_cpu_features(void)
 	return features;
 }
 
+#ifdef ARCH_ARM64
+/*
+ * Return whether cpu0's MIDR_EL1 identifies one of the Arm Neoverse
+ * V-class server cores (V1 / V2 / V3 / V3AE).  MIDR_EL1 is exposed
+ * unprivileged via sysfs (added in Linux 4.7).  Reading cpu0 only is fine
+ * in practice: no Neoverse V-class server SKU has shipped as part of a
+ * big.LITTLE cluster.  Any failure (file missing, read error, parse
+ * failure, unrecognized CPU) returns false.
+ */
+static bool arm64_cpu_is_neoverse_v_class(void)
+{
+	int fd;
+	char buf[32];
+	ssize_t n;
+	unsigned long midr;
+	u32 part;
+
+	fd = open("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
+		  O_RDONLY);
+	if (fd < 0)
+		return false;
+	do {
+		n = read(fd, buf, sizeof(buf) - 1);
+	} while (n < 0 && errno == EINTR);
+	close(fd);
+	if (n <= 0)
+		return false;
+	buf[n] = '\0';
+	midr = strtoul(buf, NULL, 0); /* sysfs prints "0x%016llx\n" */
+
+	/* MIDR_EL1: [31:24]=Implementer, [15:4]=PartNum. */
+	if (((midr >> 24) & 0xff) != 0x41) /* Implementer must be Arm Ltd. */
+		return false;
+	part = (midr >> 4) & 0xfff;
+	switch (part) {
+	case 0xd40: /* Neoverse V1   (e.g. AWS Graviton 3) */
+	case 0xd4f: /* Neoverse V2   (e.g. AWS Graviton 4) */
+	case 0xd83: /* Neoverse V3AE */
+	case 0xd84: /* Neoverse V3 */
+		return true;
+	}
+	return false;
+}
+#endif /* ARCH_ARM64 */
+
 #elif defined(__APPLE__)
 /* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */
 
@@ -202,24 +248,40 @@ static const struct cpu_feature arm_cpu_feature_table[] = {
 	{ARM_CPU_FEATURE_DOTPROD,	"dotprod"},
 };
 
+/*
+ * Whether to set ARM_CPU_FEATURE_PREFER_PMULL on this CPU.  This is the
+ * right choice on CPUs whose pmull pipes have more aggregate throughput
+ * than the crc32 unit -- in measured cases by a wide margin: the Apple M
+ * series sustains ~68 GB/s on pmull vs ~25 GB/s on crc32 (M1), and the Arm
+ * Neoverse V class sustains ~40 GB/s vs ~22 GB/s (Graviton 4 / V2).
+ *
+ * We detect Apple at compile time, and Neoverse V-class cores at runtime
+ * via MIDR_EL1 on Linux.  Elsewhere we leave this unset, and the dispatcher
+ * picks the crc32-instruction path which is the right default for most
+ * other modern ARM CPUs.
+ */
+static bool arm_cpu_prefers_pmull(void)
+{
+#if defined(__APPLE__) && TARGET_OS_OSX
+	return true;
+#elif defined(__linux__) && defined(ARCH_ARM64)
+	if (arm64_cpu_is_neoverse_v_class())
+		return true;
+#endif
+#ifdef TEST_SUPPORT__DO_NOT_USE
+	return true;
+#endif
+	return false;
+}
+
 volatile u32 libdeflate_arm_cpu_features = 0;
 
 void libdeflate_init_arm_cpu_features(void)
 {
 	u32 features = query_arm_cpu_features();
 
-	/*
-	 * On the Apple M1 processor, crc32 instructions max out at about 25.5
-	 * GB/s in the best case of using a 3-way or greater interleaved chunked
-	 * implementation, whereas a pmull-based implementation achieves 68 GB/s
-	 * provided that the stride length is large enough (about 10+ vectors
-	 * with eor3, or 12+ without).
-	 *
-	 * Assume that crc32 instructions are preferable in other cases.
-	 */
-#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE)
-	features |= ARM_CPU_FEATURE_PREFER_PMULL;
-#endif
+	if (arm_cpu_prefers_pmull())
+		features |= ARM_CPU_FEATURE_PREFER_PMULL;
 
 	disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
 					 ARRAY_LEN(arm_cpu_feature_table));