Files
HackerSM64/tools/libdeflate/arm/cpu_features.c
2025-07-03 20:55:47 +08:00

231 lines
6.4 KiB
C

/*
* arm/cpu_features.c - feature detection for ARM CPUs
*
* Copyright 2018 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* ARM CPUs don't have a standard way for unprivileged programs to detect CPU
* features. But an OS-specific way can be used when available.
*/
#ifdef __APPLE__
# undef _ANSI_SOURCE
# undef _DARWIN_C_SOURCE
# define _DARWIN_C_SOURCE /* for sysctlbyname() */
#endif
#include "../cpu_features_common.h" /* must be included first */
#include "cpu_features.h"
#ifdef ARM_CPU_FEATURES_KNOWN
/* Runtime ARM CPU feature detection is supported. */
#ifdef __linux__
/*
* On Linux, arm32 and arm64 CPU features can be detected by reading the
* AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv.
*
* Ideally we'd use the C library function getauxval(), but it's not guaranteed
* to be available: it was only added to glibc in 2.16, and in Android it was
* added to API level 18 for arm32 and level 21 for arm64.
*/
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#define AT_HWCAP 16
#define AT_HWCAP2 26
static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
{
int fd;
unsigned long auxbuf[32];
int filled = 0;
int i;
fd = open("/proc/self/auxv", O_RDONLY);
if (fd < 0)
return;
for (;;) {
do {
int ret = read(fd, &((char *)auxbuf)[filled],
sizeof(auxbuf) - filled);
if (ret <= 0) {
if (ret < 0 && errno == EINTR)
continue;
goto out;
}
filled += ret;
} while (filled < 2 * sizeof(long));
i = 0;
do {
unsigned long type = auxbuf[i];
unsigned long value = auxbuf[i + 1];
if (type == AT_HWCAP)
*hwcap = value;
else if (type == AT_HWCAP2)
*hwcap2 = value;
i += 2;
filled -= 2 * sizeof(long);
} while (filled >= 2 * sizeof(long));
memmove(auxbuf, &auxbuf[i], filled);
}
out:
close(fd);
}
static u32 query_arm_cpu_features(void)
{
u32 features = 0;
unsigned long hwcap = 0;
unsigned long hwcap2 = 0;
scan_auxv(&hwcap, &hwcap2);
#ifdef ARCH_ARM32
STATIC_ASSERT(sizeof(long) == 4);
if (hwcap & (1 << 12)) /* HWCAP_NEON */
features |= ARM_CPU_FEATURE_NEON;
#else
STATIC_ASSERT(sizeof(long) == 8);
if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap & (1 << 4)) /* HWCAP_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
if (hwcap & (1 << 7)) /* HWCAP_CRC32 */
features |= ARM_CPU_FEATURE_CRC32;
if (hwcap & (1 << 17)) /* HWCAP_SHA3 */
features |= ARM_CPU_FEATURE_SHA3;
if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */
features |= ARM_CPU_FEATURE_DOTPROD;
#endif
return features;
}
#elif defined(__APPLE__)
/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <TargetConditionals.h>
static const struct {
const char *name;
u32 feature;
} feature_sysctls[] = {
{ "hw.optional.neon", ARM_CPU_FEATURE_NEON },
{ "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON },
{ "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL },
{ "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 },
{ "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 },
{ "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 },
{ "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD },
};
static u32 query_arm_cpu_features(void)
{
u32 features = 0;
size_t i;
for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) {
const char *name = feature_sysctls[i].name;
u32 val = 0;
size_t valsize = sizeof(val);
if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 &&
valsize == sizeof(val) && val == 1)
features |= feature_sysctls[i].feature;
}
return features;
}
#elif defined(_WIN32)
#include <windows.h>
#ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE /* added in Windows SDK 20348 */
# define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
#endif
static u32 query_arm_cpu_features(void)
{
u32 features = ARM_CPU_FEATURE_NEON;
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
features |= ARM_CPU_FEATURE_PMULL;
if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE))
features |= ARM_CPU_FEATURE_CRC32;
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
features |= ARM_CPU_FEATURE_DOTPROD;
/* FIXME: detect SHA3 support too. */
return features;
}
#else
#error "unhandled case"
#endif
static const struct cpu_feature arm_cpu_feature_table[] = {
{ARM_CPU_FEATURE_NEON, "neon"},
{ARM_CPU_FEATURE_PMULL, "pmull"},
{ARM_CPU_FEATURE_PREFER_PMULL, "prefer_pmull"},
{ARM_CPU_FEATURE_CRC32, "crc32"},
{ARM_CPU_FEATURE_SHA3, "sha3"},
{ARM_CPU_FEATURE_DOTPROD, "dotprod"},
};
volatile u32 libdeflate_arm_cpu_features = 0;
void libdeflate_init_arm_cpu_features(void)
{
u32 features = query_arm_cpu_features();
/*
* On the Apple M1 processor, crc32 instructions max out at about 25.5
* GB/s in the best case of using a 3-way or greater interleaved chunked
* implementation, whereas a pmull-based implementation achieves 68 GB/s
* provided that the stride length is large enough (about 10+ vectors
* with eor3, or 12+ without).
*
* Assume that crc32 instructions are preferable in other cases.
*/
#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE)
features |= ARM_CPU_FEATURE_PREFER_PMULL;
#endif
disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
ARRAY_LEN(arm_cpu_feature_table));
libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN;
}
#endif /* ARM_CPU_FEATURES_KNOWN */