From 4b6049b643f42d5744cba685d602cd5bc79f31b3 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Jul 2024 10:17:12 +0530 Subject: [PATCH 01/94] arm64/mm: Drop PMD_SECT_VALID This just drops off the macro PMD_SECT_VALID which remains unused. Because macro PMD_TYPE_SECT with same value (_AT(pmdval_t, 1) << 0), gets used for creating or updating given block mappings. Cc: Catalin Marinas Cc: Will Deacon Cc: Ryan Roberts Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20240724044712.602210-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-hwdef.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1f60aa1bc750..86e803ea8885 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -135,7 +135,6 @@ /* * Section */ -#define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */ #define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) From 6ac96d6f9a8ec3227ceb1e935aeda61bdaeb62ac Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Jul 2024 09:44:28 +0530 Subject: [PATCH 02/94] arm64/mm: Drop TCR_SMP_FLAGS Earlier TCR_SMP_FLAGS gets conditionally set as TCR_SHARED with CONFIG_SMP. Currently CONFIG_SMP is always enabled on arm64 platforms, hence drop this indirection via TCR_SMP_FLAGS and instead always directly use TCR_SHARED. Cc: Catalin Marinas Cc: Will Deacon Cc: Ryan Roberts Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20240724041428.573748-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/proc.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f4bc6c5bac06..8abdc7fed321 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -36,8 +36,6 @@ #define TCR_KASLR_FLAGS 0 #endif -#define TCR_SMP_FLAGS TCR_SHARED - /* PTWs cacheable, inner/outer WBWA */ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA @@ -469,7 +467,7 @@ SYM_FUNC_START(__cpu_setup) tcr .req x16 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ - TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ + TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 From fc2220c9b15828319b09384e68399b4afc6276d9 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 29 Jul 2024 16:20:05 +0100 Subject: [PATCH 03/94] arm64: signal: Fix some under-bracketed UAPI macros A few SME-related sigcontext UAPI macros leave an argument unprotected from misparsing during macro expansion. Add parentheses around references to macro arguments where appropriate. 
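As a standalone illustration (not part of the patch), here is a minimal user-space C sketch of the misparsing this guards against; the REGS_SIZE_* macros and VQ_BYTES below are local stand-ins for the UAPI definitions, and the caller passes an arithmetic expression rather than a plain identifier:

#include <stdio.h>

#define VQ_BYTES		16
/* Argument left unprotected, as before the fix. */
#define REGS_SIZE_BAD(vq)	((vq * VQ_BYTES) * (vq * VQ_BYTES))
/* Argument wrapped in parentheses, as after the fix. */
#define REGS_SIZE_GOOD(vq)	(((vq) * VQ_BYTES) * ((vq) * VQ_BYTES))

int main(void)
{
	int base = 1;

	/*
	 * With "base + 1" as the argument, the unprotected macro expands to
	 * (base + 1 * 16) * (base + 1 * 16) = 289 rather than the intended
	 * ((base + 1) * 16) * ((base + 1) * 16) = 1024.
	 */
	printf("unbracketed: %d\n", REGS_SIZE_BAD(base + 1));
	printf("bracketed:   %d\n", REGS_SIZE_GOOD(base + 1));
	return 0;
}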
Signed-off-by: Dave Martin Fixes: ee072cf70804 ("arm64/sme: Implement signal handling for ZT") Fixes: 39782210eb7e ("arm64/sme: Implement ZA signal handling") Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240729152005.289844-1-Dave.Martin@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/uapi/asm/sigcontext.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8a45b7a411e0..57f76d82077e 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -320,10 +320,10 @@ struct zt_context { ((sizeof(struct za_context) + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define ZA_SIG_REGS_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) +#define ZA_SIG_REGS_SIZE(vq) (((vq) * __SVE_VQ_BYTES) * ((vq) * __SVE_VQ_BYTES)) #define ZA_SIG_ZAV_OFFSET(vq, n) (ZA_SIG_REGS_OFFSET + \ - (SVE_SIG_ZREG_SIZE(vq) * n)) + (SVE_SIG_ZREG_SIZE(vq) * (n))) #define ZA_SIG_CONTEXT_SIZE(vq) \ (ZA_SIG_REGS_OFFSET + ZA_SIG_REGS_SIZE(vq)) @@ -334,7 +334,7 @@ struct zt_context { #define ZT_SIG_REGS_OFFSET sizeof(struct zt_context) -#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * n) +#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * (n)) #define ZT_SIG_CONTEXT_SIZE(n) \ (sizeof(struct zt_context) + ZT_SIG_REGS_SIZE(n)) From 5b39db6037e7cba1659f2149aef76934370aa6d5 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 29 Jul 2024 17:25:42 +0100 Subject: [PATCH 04/94] arm64: el2_setup.h: Rename some labels to be more diff-friendly A minor anti-pattern has established itself in __init_el2_fgt, where each block of instructions is skipped by jumping to a label named for the next (typically unrelated) block. This makes diffs more noisy than necessary, since appending each new block to deal with some new architecture feature now requires altering a branch destination in the existing code. Fix it by naming the affected labels based on the block that is skipping itself instead, as is done elsewhere in the el2_setup code. No functional change. 
Signed-off-by: Dave Martin Link: https://lore.kernel.org/r/20240729162542.367059-1-Dave.Martin@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index fd87c4b8f984..8eb113a6b452 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -165,42 +165,45 @@ mrs x1, id_aa64dfr0_el1 ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 cmp x1, #3 - b.lt .Lset_debug_fgt_\@ + b.lt .Lskip_spe_fgt_\@ /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #(1 << 62) -.Lset_debug_fgt_\@: +.Lskip_spe_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 mov x0, xzr mrs x1, id_aa64pfr1_el1 ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 - cbz x1, .Lset_pie_fgt_\@ + cbz x1, .Lskip_debug_fgt_\@ /* Disable nVHE traps of TPIDR2 and SMPRI */ orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK -.Lset_pie_fgt_\@: +.Lskip_debug_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_pie_fgt_\@ /* Disable trapping of PIR_EL1 / PIRE0_EL1 */ orr x0, x0, #HFGxTR_EL2_nPIR_EL1 orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1 -.Lset_fgt_\@: +.Lskip_pie_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU ubfx x1, x1, #ID_AA64PFR0_EL1_AMU_SHIFT, #4 - cbz x1, .Lskip_fgt_\@ + cbz x1, .Lskip_amu_fgt_\@ msr_s SYS_HAFGRTR_EL2, xzr + +.Lskip_amu_fgt_\@: + .Lskip_fgt_\@: .endm From 4960f9a5a5ac7784e03c9b5fe5456e80557e4b32 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 5 Aug 2024 22:00:38 +0800 Subject: [PATCH 05/94] arm64: mm: Remove unused declaration early_io_map() Commit bf4b558eba92 ("arm64: add early_ioremap support") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240805140038.1366033-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 65977c7783c5..fc414cfd337e 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -63,7 +63,6 @@ static inline bool arm64_kernel_unmapped_at_el0(void) extern void arm64_memblock_init(void); extern void paging_init(void); extern void bootmem_init(void); -extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, From 9cd8062b38e61f11054f13b3c98695c7b1d73b11 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 Jun 2024 19:50:05 +0300 Subject: [PATCH 06/94] ACPI/IORT: Switch to use kmemdup_array() Let the kememdup_array() take care about multiplication and possible overflows. 
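For context (a sketch, not part of the patch): kmemdup_array() takes the element count and element size as separate arguments and performs the multiplication with overflow checking internally, instead of trusting the caller's open-coded product. The demo_copy_sids() helper below is a made-up name used only for illustration:

#include <linux/slab.h>
#include <linux/string.h>

static u32 *demo_copy_sids(const u32 *sids, size_t num_sids)
{
	/*
	 * Open-coded form: the caller multiplies, and an oversized num_sids
	 * could overflow before kmemdup() ever sees the size:
	 *
	 *	return kmemdup(sids, num_sids * sizeof(*sids), GFP_KERNEL);
	 *
	 * Preferred form: count and element size are handed over separately.
	 */
	return kmemdup_array(sids, num_sids, sizeof(*sids), GFP_KERNEL);
}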
Signed-off-by: Andy Shevchenko Acked-by: Hanjun Guo Link: https://lore.kernel.org/r/20240606165005.3031490-1-andriy.shevchenko@linux.intel.com Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 1b39e9ae7ac1..034d303c3df1 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -822,7 +822,7 @@ static struct iommu_iort_rmr_data *iort_rmr_alloc( return NULL; /* Create a copy of SIDs array to associate with this rmr_data */ - sids_copy = kmemdup(sids, num_sids * sizeof(*sids), GFP_KERNEL); + sids_copy = kmemdup_array(sids, num_sids, sizeof(*sids), GFP_KERNEL); if (!sids_copy) { kfree(rmr_data); return NULL; From ba8b7f7f2b7961471c2364b2b15d3a1c5407d63a Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Thu, 8 Aug 2024 21:09:46 +0800 Subject: [PATCH 07/94] ACPI: ARM64: add acpi_iort.h to MAINTAINERS IORT(Input Output Remapping Table) represents the I/O topology of an Arm-based system for use with the ACPI, so acpi_iort.h is for arm64 only. This helps git-send-email to figure out the proper maintainers when touching the file. Signed-off-by: Hanjun Guo Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20240808130946.1028376-1-guohanjun@huawei.com Signed-off-by: Will Deacon --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0..7573f344b9fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -334,6 +334,7 @@ L: linux-acpi@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/acpi/arm64 +F: include/linux/acpi_iort.h ACPI FOR RISC-V (ACPI/riscv) M: Sunil V L From 93b81abc6ea9bf4eafec2af68484c6735e4f9167 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 10 Aug 2024 17:39:44 +0800 Subject: [PATCH 08/94] arm64/sve: Remove unused declaration read_smcr_features() Commit 391208485c3a ("arm64/sve: Remove SMCR pseudo register from cpufeature code") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Mark Brown Reviewed-by: Zenghui Yu Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240810093944.2587809-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimd.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index bc69ac368d73..f2a84efc3618 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -155,8 +155,6 @@ extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused); -extern u64 read_smcr_features(void); - /* * Helpers to translate bit indices in sve_vq_map to VQ values (and * vice versa). This allows find_next_bit() to be used to find the From 48b035121a564f2695462fbab0d2af51cb4f4b7a Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 13:12:53 -0600 Subject: [PATCH 09/94] perf: arm_pmu: Use of_property_present() Use of_property_present() to test for property presence rather than of_find_property(). This is part of a larger effort to remove callers of of_find_property() and similar functions. of_find_property() leaks the DT struct property and data pointers which is a problem for dynamically allocated nodes which may be freed. 
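For illustration only (a sketch mirroring the one-line change below, not additional code from the patch): of_property_present() answers the presence question directly, so the caller never holds a struct property pointer that could go stale for a dynamically allocated node.

#include <linux/of.h>

static bool demo_has_irq_affinity(const struct device_node *node)
{
	/* Previously: return !!of_find_property(node, "interrupt-affinity", NULL); */
	return of_property_present(node, "interrupt-affinity");
}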
Signed-off-by: Rob Herring (Arm) Acked-by: Mark Rutland Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240731191312.1710417-15-robh@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_pmu_platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c index 4b1a9a92ea11..118170a5cede 100644 --- a/drivers/perf/arm_pmu_platform.c +++ b/drivers/perf/arm_pmu_platform.c @@ -59,7 +59,7 @@ static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq) static bool pmu_has_irq_affinity(struct device_node *node) { - return !!of_find_property(node, "interrupt-affinity", NULL); + return of_property_present(node, "interrupt-affinity"); } static int pmu_parse_irq_affinity(struct device *dev, int i) From bf5ffc8c80e0cf5205849cd0c9c3cb261d2beee6 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:18 -0600 Subject: [PATCH 10/94] perf: arm_pmu: Remove event index to counter remapping Xscale and Armv6 PMUs defined the cycle counter at 0 and event counters starting at 1 and had 1:1 event index to counter numbering. On Armv7 and later, this changed the cycle counter to 31 and event counters start at 0. The drivers for Armv7 and PMUv3 kept the old event index numbering and introduced an event index to counter conversion. The conversion uses masking to convert from event index to a counter number. This operation relies on having at most 32 counters so that the cycle counter index 0 can be transformed to counter number 31. Armv9.4 adds support for an additional fixed function counter (instructions) which increases possible counters to more than 32, and the conversion won't work anymore as a simple subtract and mask. The primary reason for the translation (other than history) seems to be to have a contiguous mask of counters 0-N. Keeping that would result in more complicated index to counter conversions. Instead, store a mask of available counters rather than just number of events. That provides more information in addition to the number of events. No (intended) functional changes. Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-1-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kvm/pmu-emul.c | 6 +-- drivers/perf/apple_m1_cpu_pmu.c | 4 +- drivers/perf/arm_pmu.c | 11 ++--- drivers/perf/arm_pmuv3.c | 62 +++++++++----------------- drivers/perf/arm_v6_pmu.c | 6 ++- drivers/perf/arm_v7_pmu.c | 77 +++++++++++++-------------------- drivers/perf/arm_xscale_pmu.c | 12 +++-- include/linux/perf/arm_pmu.h | 2 +- include/linux/perf/arm_pmuv3.h | 1 + 9 files changed, 75 insertions(+), 106 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 82a2a003259c..0e598f6c42c0 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -910,10 +910,10 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* - * The arm_pmu->num_events considers the cycle counter as well. - * Ignore that and return only the general-purpose counters. + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. 
*/ - return arm_pmu->num_events - 1; + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index f322e5ca1114..c8f607912567 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -400,7 +400,7 @@ static irqreturn_t m1_pmu_handle_irq(struct arm_pmu *cpu_pmu) regs = get_irq_regs(); - for (idx = 0; idx < cpu_pmu->num_events; idx++) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, M1_PMU_NR_COUNTERS) { struct perf_event *event = cpuc->events[idx]; struct perf_sample_data data; @@ -560,7 +560,7 @@ static int m1_pmu_init(struct arm_pmu *cpu_pmu, u32 flags) cpu_pmu->reset = m1_pmu_reset; cpu_pmu->set_event_filter = m1_pmu_set_event_filter; - cpu_pmu->num_events = M1_PMU_NR_COUNTERS; + bitmap_set(cpu_pmu->cntr_mask, 0, M1_PMU_NR_COUNTERS); cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &m1_pmu_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &m1_pmu_format_attr_group; return 0; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 8458fe2cebb4..398cce3d76fc 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -522,7 +522,7 @@ static void armpmu_enable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); - bool enabled = !bitmap_empty(hw_events->used_mask, armpmu->num_events); + bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) @@ -742,7 +742,7 @@ static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) struct perf_event *event; int idx; - for (idx = 0; idx < armpmu->num_events; idx++) { + for_each_set_bit(idx, armpmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { event = hw_events->events[idx]; if (!event) continue; @@ -772,7 +772,7 @@ static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, { struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); - bool enabled = !bitmap_empty(hw_events->used_mask, armpmu->num_events); + bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return NOTIFY_DONE; @@ -924,8 +924,9 @@ int armpmu_register(struct arm_pmu *pmu) if (ret) goto out_destroy; - pr_info("enabled with %s PMU driver, %d counters available%s\n", - pmu->name, pmu->num_events, + pr_info("enabled with %s PMU driver, %d (%*pb) counters available%s\n", + pmu->name, bitmap_weight(pmu->cntr_mask, ARMPMU_MAX_HWEVENTS), + ARMPMU_MAX_HWEVENTS, &pmu->cntr_mask, has_nmi ? 
", using NMIs" : ""); kvm_host_pmu_init(pmu); diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index d246840797b6..b48773ec892e 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -454,9 +454,7 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = { /* * Perf Events' indices */ -#define ARMV8_IDX_CYCLE_COUNTER 0 -#define ARMV8_IDX_COUNTER0 1 -#define ARMV8_IDX_CYCLE_COUNTER_USER 32 +#define ARMV8_IDX_CYCLE_COUNTER 31 /* * We unconditionally enable ARMv8.5-PMU long event counter support @@ -489,19 +487,12 @@ static bool armv8pmu_event_is_chained(struct perf_event *event) return !armv8pmu_event_has_user_read(event) && armv8pmu_event_is_64bit(event) && !armv8pmu_has_long_event(cpu_pmu) && - (idx != ARMV8_IDX_CYCLE_COUNTER); + (idx < ARMV8_PMU_MAX_GENERAL_COUNTERS); } /* * ARMv8 low level PMU access */ - -/* - * Perf Event to low level counters mapping - */ -#define ARMV8_IDX_TO_COUNTER(x) \ - (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) - static u64 armv8pmu_pmcr_read(void) { return read_pmcr(); @@ -521,14 +512,12 @@ static int armv8pmu_has_overflowed(u32 pmovsr) static int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) { - return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); + return pmnc & BIT(idx); } static u64 armv8pmu_read_evcntr(int idx) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - - return read_pmevcntrn(counter); + return read_pmevcntrn(idx); } static u64 armv8pmu_read_hw_counter(struct perf_event *event) @@ -557,7 +546,7 @@ static bool armv8pmu_event_needs_bias(struct perf_event *event) return false; if (armv8pmu_has_long_event(cpu_pmu) || - idx == ARMV8_IDX_CYCLE_COUNTER) + idx >= ARMV8_PMU_MAX_GENERAL_COUNTERS) return true; return false; @@ -595,9 +584,7 @@ static u64 armv8pmu_read_counter(struct perf_event *event) static void armv8pmu_write_evcntr(int idx, u64 value) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - - write_pmevcntrn(counter, value); + write_pmevcntrn(idx, value); } static void armv8pmu_write_hw_counter(struct perf_event *event, @@ -628,7 +615,6 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) static void armv8pmu_write_evtype(int idx, unsigned long val) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); unsigned long mask = ARMV8_PMU_EVTYPE_EVENT | ARMV8_PMU_INCLUDE_EL2 | ARMV8_PMU_EXCLUDE_EL0 | @@ -638,7 +624,7 @@ static void armv8pmu_write_evtype(int idx, unsigned long val) mask |= ARMV8_PMU_EVTYPE_TC | ARMV8_PMU_EVTYPE_TH; val &= mask; - write_pmevtypern(counter, val); + write_pmevtypern(idx, val); } static void armv8pmu_write_event_type(struct perf_event *event) @@ -667,7 +653,7 @@ static void armv8pmu_write_event_type(struct perf_event *event) static u32 armv8pmu_event_cnten_mask(struct perf_event *event) { - int counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + int counter = event->hw.idx; u32 mask = BIT(counter); if (armv8pmu_event_is_chained(event)) @@ -726,8 +712,7 @@ static void armv8pmu_enable_intens(u32 mask) static void armv8pmu_enable_event_irq(struct perf_event *event) { - u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); - armv8pmu_enable_intens(BIT(counter)); + armv8pmu_enable_intens(BIT(event->hw.idx)); } static void armv8pmu_disable_intens(u32 mask) @@ -741,8 +726,7 @@ static void armv8pmu_disable_intens(u32 mask) static void armv8pmu_disable_event_irq(struct perf_event *event) { - u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); - armv8pmu_disable_intens(BIT(counter)); + armv8pmu_disable_intens(BIT(event->hw.idx)); } static u32 armv8pmu_getreset_flags(void) @@ -786,7 
+770,8 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); /* Clear any unused counters to avoid leaking their contents */ - for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) { + for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, + ARMPMU_MAX_HWEVENTS) { if (i == ARMV8_IDX_CYCLE_COUNTER) write_pmccntr(0); else @@ -869,7 +854,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) * to prevent skews in group events. */ armv8pmu_stop(cpu_pmu); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -908,7 +893,7 @@ static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, { int idx; - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS) { if (!test_and_set_bit(idx, cpuc->used_mask)) return idx; } @@ -924,7 +909,9 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, * Chaining requires two consecutive event counters, where * the lower idx must be even. */ - for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS) { + if (!(idx & 0x1)) + continue; if (!test_and_set_bit(idx, cpuc->used_mask)) { /* Check if the preceding even counter is available */ if (!test_and_set_bit(idx - 1, cpuc->used_mask)) @@ -978,15 +965,7 @@ static int armv8pmu_user_event_idx(struct perf_event *event) if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event)) return 0; - /* - * We remap the cycle counter index to 32 to - * match the offset applied to the rest of - * the counter indices. 
- */ - if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER) - return ARMV8_IDX_CYCLE_COUNTER_USER; - - return event->hw.idx; + return event->hw.idx + 1; } /* @@ -1211,10 +1190,11 @@ static void __armv8pmu_probe_pmu(void *info) probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()); + bitmap_set(cpu_pmu->cntr_mask, + 0, FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read())); /* Add the CPU cycles counter */ - cpu_pmu->num_events += 1; + set_bit(ARMV8_IDX_CYCLE_COUNTER, cpu_pmu->cntr_mask); pmceid[0] = pmceid_raw[0] = read_pmceid0(); pmceid[1] = pmceid_raw[1] = read_pmceid1(); diff --git a/drivers/perf/arm_v6_pmu.c b/drivers/perf/arm_v6_pmu.c index 0bb685b4bac5..b09615bb2bb2 100644 --- a/drivers/perf/arm_v6_pmu.c +++ b/drivers/perf/arm_v6_pmu.c @@ -64,6 +64,7 @@ enum armv6_counters { ARMV6_CYCLE_COUNTER = 0, ARMV6_COUNTER0, ARMV6_COUNTER1, + ARMV6_NUM_COUNTERS }; /* @@ -254,7 +255,7 @@ armv6pmu_handle_irq(struct arm_pmu *cpu_pmu) */ armv6_pmcr_write(pmcr); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV6_NUM_COUNTERS) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -391,7 +392,8 @@ static void armv6pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->start = armv6pmu_start; cpu_pmu->stop = armv6pmu_stop; cpu_pmu->map_event = armv6_map_event; - cpu_pmu->num_events = 3; + + bitmap_set(cpu_pmu->cntr_mask, 0, ARMV6_NUM_COUNTERS); } static int armv6_1136_pmu_init(struct arm_pmu *cpu_pmu) diff --git a/drivers/perf/arm_v7_pmu.c b/drivers/perf/arm_v7_pmu.c index 928ac3d626ed..420cadd108e7 100644 --- a/drivers/perf/arm_v7_pmu.c +++ b/drivers/perf/arm_v7_pmu.c @@ -649,24 +649,12 @@ static struct attribute_group armv7_pmuv2_events_attr_group = { /* * Perf Events' indices */ -#define ARMV7_IDX_CYCLE_COUNTER 0 -#define ARMV7_IDX_COUNTER0 1 -#define ARMV7_IDX_COUNTER_LAST(cpu_pmu) \ - (ARMV7_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) - -#define ARMV7_MAX_COUNTERS 32 -#define ARMV7_COUNTER_MASK (ARMV7_MAX_COUNTERS - 1) - +#define ARMV7_IDX_CYCLE_COUNTER 31 +#define ARMV7_IDX_COUNTER_MAX 31 /* * ARMv7 low level PMNC access */ -/* - * Perf Event to low level counters mapping - */ -#define ARMV7_IDX_TO_COUNTER(x) \ - (((x) - ARMV7_IDX_COUNTER0) & ARMV7_COUNTER_MASK) - /* * Per-CPU PMNC: config reg */ @@ -725,19 +713,17 @@ static inline int armv7_pmnc_has_overflowed(u32 pmnc) static inline int armv7_pmnc_counter_valid(struct arm_pmu *cpu_pmu, int idx) { - return idx >= ARMV7_IDX_CYCLE_COUNTER && - idx <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); + return test_bit(idx, cpu_pmu->cntr_mask); } static inline int armv7_pmnc_counter_has_overflowed(u32 pmnc, int idx) { - return pmnc & BIT(ARMV7_IDX_TO_COUNTER(idx)); + return pmnc & BIT(idx); } static inline void armv7_pmnc_select_counter(int idx) { - u32 counter = ARMV7_IDX_TO_COUNTER(idx); - asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (counter)); + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (idx)); isb(); } @@ -787,29 +773,25 @@ static inline void armv7_pmnc_write_evtsel(int idx, u32 val) static inline void armv7_pmnc_enable_counter(int idx) { - u32 counter = ARMV7_IDX_TO_COUNTER(idx); - asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (BIT(counter))); + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (BIT(idx))); } static inline void armv7_pmnc_disable_counter(int idx) { - u32 counter = ARMV7_IDX_TO_COUNTER(idx); - asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (BIT(counter))); + asm volatile("mcr 
p15, 0, %0, c9, c12, 2" : : "r" (BIT(idx))); } static inline void armv7_pmnc_enable_intens(int idx) { - u32 counter = ARMV7_IDX_TO_COUNTER(idx); - asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (BIT(counter))); + asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (BIT(idx))); } static inline void armv7_pmnc_disable_intens(int idx) { - u32 counter = ARMV7_IDX_TO_COUNTER(idx); - asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (BIT(counter))); + asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (BIT(idx))); isb(); /* Clear the overflow flag in case an interrupt is pending. */ - asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (BIT(counter))); + asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (BIT(idx))); isb(); } @@ -853,15 +835,12 @@ static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu) asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (val)); pr_info("CCNT =0x%08x\n", val); - for (cnt = ARMV7_IDX_COUNTER0; - cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) { + for_each_set_bit(cnt, cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX) { armv7_pmnc_select_counter(cnt); asm volatile("mrc p15, 0, %0, c9, c13, 2" : "=r" (val)); - pr_info("CNT[%d] count =0x%08x\n", - ARMV7_IDX_TO_COUNTER(cnt), val); + pr_info("CNT[%d] count =0x%08x\n", cnt, val); asm volatile("mrc p15, 0, %0, c9, c13, 1" : "=r" (val)); - pr_info("CNT[%d] evtsel=0x%08x\n", - ARMV7_IDX_TO_COUNTER(cnt), val); + pr_info("CNT[%d] evtsel=0x%08x\n", cnt, val); } } #endif @@ -958,7 +937,7 @@ static irqreturn_t armv7pmu_handle_irq(struct arm_pmu *cpu_pmu) */ regs = get_irq_regs(); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -1027,7 +1006,7 @@ static int armv7pmu_get_event_idx(struct pmu_hw_events *cpuc, * For anything other than a cycle counter, try and use * the events counters */ - for (idx = ARMV7_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX) { if (!test_and_set_bit(idx, cpuc->used_mask)) return idx; } @@ -1073,7 +1052,7 @@ static int armv7pmu_set_event_filter(struct hw_perf_event *event, static void armv7pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u32 idx, nb_cnt = cpu_pmu->num_events, val; + u32 idx, val; if (cpu_pmu->secure_access) { asm volatile("mrc p15, 0, %0, c1, c1, 1" : "=r" (val)); @@ -1082,7 +1061,7 @@ static void armv7pmu_reset(void *info) } /* The counter and interrupt enable registers are unknown at reset. 
*/ - for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { armv7_pmnc_disable_counter(idx); armv7_pmnc_disable_intens(idx); } @@ -1161,20 +1140,22 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu) static void armv7_read_num_pmnc_events(void *info) { - int *nb_cnt = info; + int nb_cnt; + struct arm_pmu *cpu_pmu = info; /* Read the nb of CNTx counters supported from PMNC */ - *nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT) & ARMV7_PMNC_N_MASK; + nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT) & ARMV7_PMNC_N_MASK; + bitmap_set(cpu_pmu->cntr_mask, 0, nb_cnt); /* Add the CPU cycles counter */ - *nb_cnt += 1; + set_bit(ARMV7_IDX_CYCLE_COUNTER, cpu_pmu->cntr_mask); } static int armv7_probe_num_events(struct arm_pmu *arm_pmu) { return smp_call_function_any(&arm_pmu->supported_cpus, armv7_read_num_pmnc_events, - &arm_pmu->num_events, 1); + arm_pmu, 1); } static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) @@ -1524,7 +1505,7 @@ static void krait_pmu_reset(void *info) { u32 vval, fval; struct arm_pmu *cpu_pmu = info; - u32 idx, nb_cnt = cpu_pmu->num_events; + u32 idx; armv7pmu_reset(info); @@ -1538,7 +1519,7 @@ static void krait_pmu_reset(void *info) venum_post_pmresr(vval, fval); /* Reset PMxEVNCTCR to sane default */ - for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX) { armv7_pmnc_select_counter(idx); asm volatile("mcr p15, 0, %0, c9, c15, 0" : : "r" (0)); } @@ -1562,7 +1543,7 @@ static int krait_event_to_bit(struct perf_event *event, unsigned int region, * Lower bits are reserved for use by the counters (see * armv7pmu_get_event_idx() for more info) */ - bit += ARMV7_IDX_COUNTER_LAST(cpu_pmu) + 1; + bit += bitmap_weight(cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX); return bit; } @@ -1845,7 +1826,7 @@ static void scorpion_pmu_reset(void *info) { u32 vval, fval; struct arm_pmu *cpu_pmu = info; - u32 idx, nb_cnt = cpu_pmu->num_events; + u32 idx; armv7pmu_reset(info); @@ -1860,7 +1841,7 @@ static void scorpion_pmu_reset(void *info) venum_post_pmresr(vval, fval); /* Reset PMxEVNCTCR to sane default */ - for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX) { armv7_pmnc_select_counter(idx); asm volatile("mcr p15, 0, %0, c9, c15, 0" : : "r" (0)); } @@ -1883,7 +1864,7 @@ static int scorpion_event_to_bit(struct perf_event *event, unsigned int region, * Lower bits are reserved for use by the counters (see * armv7pmu_get_event_idx() for more info) */ - bit += ARMV7_IDX_COUNTER_LAST(cpu_pmu) + 1; + bit += bitmap_weight(cpu_pmu->cntr_mask, ARMV7_IDX_COUNTER_MAX); return bit; } diff --git a/drivers/perf/arm_xscale_pmu.c b/drivers/perf/arm_xscale_pmu.c index 3d8b72d6b37f..638fea9b1263 100644 --- a/drivers/perf/arm_xscale_pmu.c +++ b/drivers/perf/arm_xscale_pmu.c @@ -53,6 +53,8 @@ enum xscale_counters { XSCALE_COUNTER2, XSCALE_COUNTER3, }; +#define XSCALE1_NUM_COUNTERS 3 +#define XSCALE2_NUM_COUNTERS 5 static const unsigned xscale_perf_map[PERF_COUNT_HW_MAX] = { PERF_MAP_ALL_UNSUPPORTED, @@ -168,7 +170,7 @@ xscale1pmu_handle_irq(struct arm_pmu *cpu_pmu) regs = get_irq_regs(); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, XSCALE1_NUM_COUNTERS) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -364,7 +366,8 @@ static int xscale1pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->start = xscale1pmu_start; cpu_pmu->stop = 
xscale1pmu_stop; cpu_pmu->map_event = xscale_map_event; - cpu_pmu->num_events = 3; + + bitmap_set(cpu_pmu->cntr_mask, 0, XSCALE1_NUM_COUNTERS); return 0; } @@ -500,7 +503,7 @@ xscale2pmu_handle_irq(struct arm_pmu *cpu_pmu) regs = get_irq_regs(); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + for_each_set_bit(idx, cpu_pmu->cntr_mask, XSCALE2_NUM_COUNTERS) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -719,7 +722,8 @@ static int xscale2pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->start = xscale2pmu_start; cpu_pmu->stop = xscale2pmu_stop; cpu_pmu->map_event = xscale_map_event; - cpu_pmu->num_events = 5; + + bitmap_set(cpu_pmu->cntr_mask, 0, XSCALE2_NUM_COUNTERS); return 0; } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index b3b34f6670cf..e5d6d204beab 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -96,7 +96,7 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - int num_events; + DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 7867db04ec98..eccbdd8eb98f 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -6,6 +6,7 @@ #ifndef __PERF_ARM_PMUV3_H #define __PERF_ARM_PMUV3_H +#define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 #define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) From a4a6e2078d85a9d94bcc7eab77845cb8cd39f680 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:19 -0600 Subject: [PATCH 11/94] perf: arm_pmuv3: Prepare for more than 32 counters Various PMUv3 registers which are a mask of counters are 64-bit registers, but the accessor functions take a u32. This has been fine as the upper 32-bits have been RES0 as there has been a maximum of 32 counters prior to Armv9.4/8.9. With Armv9.4/8.9, a 33rd counter is added. Update the accessor functions to use a u64 instead. 
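A minimal sketch of why the width matters (illustrative only; DEMO_INSTR_CNT_IDX and demo_counter_enable_bit() are made-up names): once a fixed instruction counter exists at an index above 31, a 32-bit mask can no longer carry its enable/overflow bit, while the underlying registers are 64 bits wide.

#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_INSTR_CNT_IDX	32	/* above the 0-30 event counters and the cycle counter at 31 */

static u64 demo_counter_enable_bit(unsigned int idx)
{
	/* Bit 32 would be truncated away in a u32 mask; build the mask as u64. */
	return BIT_ULL(idx);
}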
Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-2-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/arm_pmuv3.h | 12 ++++----- arch/arm64/include/asm/kvm_host.h | 8 +++--- arch/arm64/kvm/pmu.c | 8 +++--- drivers/perf/arm_pmuv3.c | 40 ++++++++++++++++-------------- include/kvm/arm_pmu.h | 4 +-- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index a4697a0b6835..19b3f9150058 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -71,22 +71,22 @@ static inline u64 read_pmccntr(void) return read_sysreg(pmccntr_el0); } -static inline void write_pmcntenset(u32 val) +static inline void write_pmcntenset(u64 val) { write_sysreg(val, pmcntenset_el0); } -static inline void write_pmcntenclr(u32 val) +static inline void write_pmcntenclr(u64 val) { write_sysreg(val, pmcntenclr_el0); } -static inline void write_pmintenset(u32 val) +static inline void write_pmintenset(u64 val) { write_sysreg(val, pmintenset_el1); } -static inline void write_pmintenclr(u32 val) +static inline void write_pmintenclr(u64 val) { write_sysreg(val, pmintenclr_el1); } @@ -96,12 +96,12 @@ static inline void write_pmccfiltr(u64 val) write_sysreg(val, pmccfiltr_el0); } -static inline void write_pmovsclr(u32 val) +static inline void write_pmovsclr(u64 val) { write_sysreg(val, pmovsclr_el0); } -static inline u32 read_pmovsclr(void) +static inline u64 read_pmovsclr(void) { return read_sysreg(pmovsclr_el0); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a33f5996ca9f..c0fc753aac87 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1330,12 +1330,12 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); -void kvm_clr_pmu_events(u32 clr); +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); +void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); #else -static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} -static inline void kvm_clr_pmu_events(u32 clr) {} +static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} +static inline void kvm_clr_pmu_events(u64 clr) {} static inline bool kvm_set_pmuserenr(u64 val) { return false; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 329819806096..e633b4434c6a 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -35,7 +35,7 @@ struct kvm_pmu_events *kvm_get_pmu_events(void) * Add events to track that we may want to switch at guest entry/exit * time. 
*/ -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -51,7 +51,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) /* * Stop tracking events */ -void kvm_clr_pmu_events(u32 clr) +void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -176,7 +176,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; @@ -197,7 +197,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index b48773ec892e..bd45fbcb9a5a 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -505,14 +505,14 @@ static void armv8pmu_pmcr_write(u64 val) write_pmcr(val); } -static int armv8pmu_has_overflowed(u32 pmovsr) +static int armv8pmu_has_overflowed(u64 pmovsr) { - return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; + return !!(pmovsr & ARMV8_PMU_OVERFLOWED_MASK); } -static int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) +static int armv8pmu_counter_has_overflowed(u64 pmnc, int idx) { - return pmnc & BIT(idx); + return !!(pmnc & BIT(idx)); } static u64 armv8pmu_read_evcntr(int idx) @@ -651,17 +651,17 @@ static void armv8pmu_write_event_type(struct perf_event *event) } } -static u32 armv8pmu_event_cnten_mask(struct perf_event *event) +static u64 armv8pmu_event_cnten_mask(struct perf_event *event) { int counter = event->hw.idx; - u32 mask = BIT(counter); + u64 mask = BIT(counter); if (armv8pmu_event_is_chained(event)) mask |= BIT(counter - 1); return mask; } -static void armv8pmu_enable_counter(u32 mask) +static void armv8pmu_enable_counter(u64 mask) { /* * Make sure event configuration register writes are visible before we @@ -674,7 +674,7 @@ static void armv8pmu_enable_counter(u32 mask) static void armv8pmu_enable_event_counter(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; - u32 mask = armv8pmu_event_cnten_mask(event); + u64 mask = armv8pmu_event_cnten_mask(event); kvm_set_pmu_events(mask, attr); @@ -683,7 +683,7 @@ static void armv8pmu_enable_event_counter(struct perf_event *event) armv8pmu_enable_counter(mask); } -static void armv8pmu_disable_counter(u32 mask) +static void armv8pmu_disable_counter(u64 mask) { write_pmcntenclr(mask); /* @@ -696,7 +696,7 @@ static void armv8pmu_disable_counter(u32 mask) static void armv8pmu_disable_event_counter(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; - u32 mask = armv8pmu_event_cnten_mask(event); + u64 mask = armv8pmu_event_cnten_mask(event); kvm_clr_pmu_events(mask); @@ -705,7 +705,7 @@ static void armv8pmu_disable_event_counter(struct perf_event *event) armv8pmu_disable_counter(mask); } -static void armv8pmu_enable_intens(u32 mask) +static void armv8pmu_enable_intens(u64 mask) { write_pmintenset(mask); } @@ -715,7 +715,7 @@ static void armv8pmu_enable_event_irq(struct perf_event *event) armv8pmu_enable_intens(BIT(event->hw.idx)); } -static void armv8pmu_disable_intens(u32 mask) +static void armv8pmu_disable_intens(u64 mask) { write_pmintenclr(mask); isb(); 
@@ -729,9 +729,9 @@ static void armv8pmu_disable_event_irq(struct perf_event *event) armv8pmu_disable_intens(BIT(event->hw.idx)); } -static u32 armv8pmu_getreset_flags(void) +static u64 armv8pmu_getreset_flags(void) { - u32 value; + u64 value; /* Read */ value = read_pmovsclr(); @@ -827,7 +827,7 @@ static void armv8pmu_stop(struct arm_pmu *cpu_pmu) static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) { - u32 pmovsr; + u64 pmovsr; struct perf_sample_data data; struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); struct pt_regs *regs; @@ -1040,14 +1040,16 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u64 pmcr; + u64 pmcr, mask; + + bitmap_to_arr64(&mask, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS); /* The counter and interrupt enable registers are unknown at reset. */ - armv8pmu_disable_counter(U32_MAX); - armv8pmu_disable_intens(U32_MAX); + armv8pmu_disable_counter(mask); + armv8pmu_disable_intens(mask); /* Clear the counters we flip at guest entry/exit */ - kvm_clr_pmu_events(U32_MAX); + kvm_clr_pmu_events(mask); /* * Initialize & Reset PMNC. Request overflow interrupt for diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 35d4ca4f6122..334d7c5503cf 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -19,8 +19,8 @@ struct kvm_pmc { }; struct kvm_pmu_events { - u32 events_host; - u32 events_guest; + u64 events_host; + u64 events_guest; }; struct kvm_pmu { From 741ee5284551cf5daae95d9c8c1e34f47382ed3c Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:20 -0600 Subject: [PATCH 12/94] KVM: arm64: pmu: Use arm_pmuv3.h register accessors Commit df29ddf4f04b ("arm64: perf: Abstract system register accesses away") split off PMU register accessor functions to a standalone header. Let's use it for KVM PMU code and get rid one copy of the ugly switch macro. 
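The net effect, as a sketch (nothing beyond what the diff below introduces; demo_read_evtype() is a made-up wrapper): per-counter access becomes a plain helper call, with only the cycle counter special-cased.

#include <asm/arm_pmuv3.h>

static u64 demo_read_evtype(int idx)
{
	/* Counter 31 (the cycle counter) is filtered via PMCCFILTR_EL0;
	 * counters 0-30 use PMEVTYPER<n>_EL0 behind read_pmevtypern(). */
	if (idx == ARMV8_PMU_CYCLE_IDX)
		return read_pmccfiltr();

	return read_pmevtypern(idx);
}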
Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-3-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/arm_pmuv3.h | 13 ++++++ arch/arm64/kvm/pmu.c | 66 ++++-------------------------- 2 files changed, 21 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 19b3f9150058..36c3e82b4eec 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -33,6 +33,14 @@ static inline void write_pmevtypern(int n, unsigned long val) PMEVN_SWITCH(n, WRITE_PMEVTYPERN); } +#define RETURN_READ_PMEVTYPERN(n) \ + return read_sysreg(pmevtyper##n##_el0) +static inline unsigned long read_pmevtypern(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVTYPERN); + return 0; +} + static inline unsigned long read_pmmir(void) { return read_cpuid(PMMIR_EL1); @@ -96,6 +104,11 @@ static inline void write_pmccfiltr(u64 val) write_sysreg(val, pmccfiltr_el0); } +static inline u64 read_pmccfiltr(void) +{ + return read_sysreg(pmccfiltr_el0); +} + static inline void write_pmovsclr(u64 val) { write_sysreg(val, pmovsclr_el0); diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index e633b4434c6a..a47ae311d4a8 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,7 @@ */ #include #include +#include static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -62,63 +63,16 @@ void kvm_clr_pmu_events(u64 clr) pmu->events_guest &= ~clr; } -#define PMEVTYPER_READ_CASE(idx) \ - case idx: \ - return read_sysreg(pmevtyper##idx##_el0) - -#define PMEVTYPER_WRITE_CASE(idx) \ - case idx: \ - write_sysreg(val, pmevtyper##idx##_el0); \ - break - -#define PMEVTYPER_CASES(readwrite) \ - PMEVTYPER_##readwrite##_CASE(0); \ - PMEVTYPER_##readwrite##_CASE(1); \ - PMEVTYPER_##readwrite##_CASE(2); \ - PMEVTYPER_##readwrite##_CASE(3); \ - PMEVTYPER_##readwrite##_CASE(4); \ - PMEVTYPER_##readwrite##_CASE(5); \ - PMEVTYPER_##readwrite##_CASE(6); \ - PMEVTYPER_##readwrite##_CASE(7); \ - PMEVTYPER_##readwrite##_CASE(8); \ - PMEVTYPER_##readwrite##_CASE(9); \ - PMEVTYPER_##readwrite##_CASE(10); \ - PMEVTYPER_##readwrite##_CASE(11); \ - PMEVTYPER_##readwrite##_CASE(12); \ - PMEVTYPER_##readwrite##_CASE(13); \ - PMEVTYPER_##readwrite##_CASE(14); \ - PMEVTYPER_##readwrite##_CASE(15); \ - PMEVTYPER_##readwrite##_CASE(16); \ - PMEVTYPER_##readwrite##_CASE(17); \ - PMEVTYPER_##readwrite##_CASE(18); \ - PMEVTYPER_##readwrite##_CASE(19); \ - PMEVTYPER_##readwrite##_CASE(20); \ - PMEVTYPER_##readwrite##_CASE(21); \ - PMEVTYPER_##readwrite##_CASE(22); \ - PMEVTYPER_##readwrite##_CASE(23); \ - PMEVTYPER_##readwrite##_CASE(24); \ - PMEVTYPER_##readwrite##_CASE(25); \ - PMEVTYPER_##readwrite##_CASE(26); \ - PMEVTYPER_##readwrite##_CASE(27); \ - PMEVTYPER_##readwrite##_CASE(28); \ - PMEVTYPER_##readwrite##_CASE(29); \ - PMEVTYPER_##readwrite##_CASE(30) - /* * Read a value direct from PMEVTYPER where idx is 0-30 * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). 
*/ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { - switch (idx) { - PMEVTYPER_CASES(READ); - case ARMV8_PMU_CYCLE_IDX: - return read_sysreg(pmccfiltr_el0); - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); - return 0; + return read_pmevtypern(idx); } /* @@ -127,14 +81,10 @@ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { - switch (idx) { - PMEVTYPER_CASES(WRITE); - case ARMV8_PMU_CYCLE_IDX: - write_sysreg(val, pmccfiltr_el0); - break; - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else + write_pmevtypern(idx, val); } /* From f9b11aa00708d94a0cd78bfde34b68c0f95d8b50 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:21 -0600 Subject: [PATCH 13/94] KVM: arm64: pmu: Use generated define for PMSELR_EL0.SEL access ARMV8_PMU_COUNTER_MASK is really a mask for the PMSELR_EL0.SEL register field. Make that clear by adding a standard sysreg definition for the register, and using it instead. Reviewed-by: Mark Rutland Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-4-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 1 - arch/arm64/kvm/sys_regs.c | 10 +++++----- arch/arm64/tools/sysreg | 5 +++++ include/linux/perf/arm_pmuv3.h | 1 - 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4a9ea103817e..00af1c331c1e 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -403,7 +403,6 @@ #define SYS_PMCNTENCLR_EL0 sys_reg(3, 3, 9, 12, 2) #define SYS_PMOVSCLR_EL0 sys_reg(3, 3, 9, 12, 3) #define SYS_PMSWINC_EL0 sys_reg(3, 3, 9, 12, 4) -#define SYS_PMSELR_EL0 sys_reg(3, 3, 9, 12, 5) #define SYS_PMCEID0_EL0 sys_reg(3, 3, 9, 12, 6) #define SYS_PMCEID1_EL0 sys_reg(3, 3, 9, 12, 7) #define SYS_PMCCNTR_EL0 sys_reg(3, 3, 9, 13, 0) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436..33497db257fb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -887,7 +887,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } @@ -979,7 +979,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + & PMSELR_EL0_SEL_MASK; return true; } @@ -1047,8 +1047,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, if (pmu_access_event_counter_el0_disabled(vcpu)) return false; - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) @@ -1098,7 +1098,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = 
PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7ceaa1e0b4bc..37aa7eaad07b 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2153,6 +2153,11 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg PMSELR_EL0 3 3 9 12 5 +Res0 63:5 +Field 4:0 SEL +EndSysreg + SysregFields CONTEXTIDR_ELx Res0 63:32 Field 31:0 PROCID diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index eccbdd8eb98f..792b8e10b72a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -8,7 +8,6 @@ #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 -#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) /* * Common architectural and microarchitectural event numbers. From 126d7d7cce5e048fb82477a9842d088d10ff0df6 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:22 -0600 Subject: [PATCH 14/94] arm64: perf/kvm: Use a common PMU cycle counter define The PMUv3 and KVM code each have a define for the PMU cycle counter index. Move KVM's define to a shared location and use it for PMUv3 driver. Reviewed-by: Marc Zyngier Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-5-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kvm/sys_regs.c | 1 + drivers/perf/arm_pmuv3.c | 19 +++++++------------ include/kvm/arm_pmu.h | 1 - include/linux/perf/arm_pmuv3.h | 3 +++ 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 33497db257fb..7db24de37ed6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index bd45fbcb9a5a..18046cf4b3a3 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -451,11 +451,6 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = { .attrs = armv8_pmuv3_caps_attrs, }; -/* - * Perf Events' indices - */ -#define ARMV8_IDX_CYCLE_COUNTER 31 - /* * We unconditionally enable ARMv8.5-PMU long event counter support * (64-bit events) where supported. 
Indicate if this arm_pmu has long @@ -574,7 +569,7 @@ static u64 armv8pmu_read_counter(struct perf_event *event) int idx = hwc->idx; u64 value; - if (idx == ARMV8_IDX_CYCLE_COUNTER) + if (idx == ARMV8_PMU_CYCLE_IDX) value = read_pmccntr(); else value = armv8pmu_read_hw_counter(event); @@ -607,7 +602,7 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) value = armv8pmu_bias_long_counter(event, value); - if (idx == ARMV8_IDX_CYCLE_COUNTER) + if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccntr(value); else armv8pmu_write_hw_counter(event, value); @@ -644,7 +639,7 @@ static void armv8pmu_write_event_type(struct perf_event *event) armv8pmu_write_evtype(idx - 1, hwc->config_base); armv8pmu_write_evtype(idx, chain_evt); } else { - if (idx == ARMV8_IDX_CYCLE_COUNTER) + if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccfiltr(hwc->config_base); else armv8pmu_write_evtype(idx, hwc->config_base); @@ -772,7 +767,7 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) /* Clear any unused counters to avoid leaking their contents */ for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, ARMPMU_MAX_HWEVENTS) { - if (i == ARMV8_IDX_CYCLE_COUNTER) + if (i == ARMV8_PMU_CYCLE_IDX) write_pmccntr(0); else armv8pmu_write_evcntr(i, 0); @@ -933,8 +928,8 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, /* Always prefer to place a cycle counter into the cycle counter. */ if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && !armv8pmu_event_get_threshold(&event->attr)) { - if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return ARMV8_IDX_CYCLE_COUNTER; + if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) + return ARMV8_PMU_CYCLE_IDX; else if (armv8pmu_event_is_64bit(event) && armv8pmu_event_want_user_access(event) && !armv8pmu_has_long_event(cpu_pmu)) @@ -1196,7 +1191,7 @@ static void __armv8pmu_probe_pmu(void *info) 0, FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read())); /* Add the CPU cycles counter */ - set_bit(ARMV8_IDX_CYCLE_COUNTER, cpu_pmu->cntr_mask); + set_bit(ARMV8_PMU_CYCLE_IDX, cpu_pmu->cntr_mask); pmceid[0] = pmceid_raw[0] = read_pmceid0(); pmceid[1] = pmceid_raw[1] = read_pmceid1(); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 334d7c5503cf..871067fb2616 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -10,7 +10,6 @@ #include #include -#define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM) struct kvm_pmc { diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 792b8e10b72a..f4ec76f725a3 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -9,6 +9,9 @@ #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 +#define ARMV8_PMU_CYCLE_IDX 31 + + /* * Common architectural and microarchitectural event numbers. */ From 2f62701fa5b0ee94c68d2fcfc470d08aef195441 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:23 -0600 Subject: [PATCH 15/94] KVM: arm64: Refine PMU defines for number of counters There are 2 defines for the number of PMU counters: ARMV8_PMU_MAX_COUNTERS and ARMPMU_MAX_HWEVENTS. Both are the same currently, but Armv9.4/8.9 increases the number of possible counters from 32 to 33. With this change, the maximum number of counters will differ for KVM's PMU emulation which is PMUv3.4. Give KVM PMU emulation its own define to decouple it from the rest of the kernel's number PMU counters. 
The VHE PMU code needs to match the PMU driver, so switch it to use ARMPMU_MAX_HWEVENTS instead. Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-6-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kvm/pmu-emul.c | 8 ++++---- arch/arm64/kvm/pmu.c | 5 +++-- include/kvm/arm_pmu.h | 3 ++- include/linux/perf/arm_pmuv3.h | 2 -- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 0e598f6c42c0..ac36c438b8c1 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -233,7 +233,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } @@ -260,7 +260,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } @@ -291,7 +291,7 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) @@ -323,7 +323,7 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index a47ae311d4a8..215b74875815 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,7 @@ */ #include #include +#include #include static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -95,7 +96,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -110,7 +111,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 871067fb2616..e08aeec5d936 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -10,6 +10,7 @@ #include #include +#define KVM_ARMV8_PMU_MAX_COUNTERS 32 #if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM) struct kvm_pmc { @@ -25,7 +26,7 @@ struct kvm_pmu_events { struct kvm_pmu { struct irq_work overflow_work; struct kvm_pmu_events events; - struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; + struct kvm_pmc pmc[KVM_ARMV8_PMU_MAX_COUNTERS]; int irq_num; bool created; bool irq_level; diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index f4ec76f725a3..4f7a7f2222e5 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -7,8 +7,6 @@ #define __PERF_ARM_PMUV3_H #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 
-#define ARMV8_PMU_MAX_COUNTERS 32 - #define ARMV8_PMU_CYCLE_IDX 31 From d8226d8cfbaf5eb9771af8ad8b4e58697e2ffb74 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:24 -0600 Subject: [PATCH 16/94] perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter Armv9.4/8.9 PMU adds optional support for a fixed instruction counter similar to the fixed cycle counter. Support for the feature is indicated in the ID_AA64DFR1_EL1 register PMICNTR field. The counter is not accessible in AArch32. Existing userspace using direct counter access won't know how to handle the fixed instruction counter, so we have to avoid using the counter when user access is requested. Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-7-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 20 ++++++++++++++++++++ arch/arm64/include/asm/arm_pmuv3.h | 28 ++++++++++++++++++++++++++++ arch/arm64/kvm/pmu.c | 8 ++++++-- arch/arm64/tools/sysreg | 25 +++++++++++++++++++++++++ drivers/perf/arm_pmuv3.c | 25 +++++++++++++++++++++++++ include/linux/perf/arm_pmu.h | 8 ++++++-- include/linux/perf/arm_pmuv3.h | 6 ++++-- 7 files changed, 114 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index a41b503b7dcd..f63ba8986b24 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -127,6 +127,12 @@ static inline u32 read_pmuver(void) return (dfr0 >> 24) & 0xf; } +static inline bool pmuv3_has_icntr(void) +{ + /* FEAT_PMUv3_ICNTR not accessible for 32-bit */ + return false; +} + static inline void write_pmcr(u32 val) { write_sysreg(val, PMCR); @@ -152,6 +158,13 @@ static inline u64 read_pmccntr(void) return read_sysreg(PMCCNTR); } +static inline void write_pmicntr(u64 val) {} + +static inline u64 read_pmicntr(void) +{ + return 0; +} + static inline void write_pmcntenset(u32 val) { write_sysreg(val, PMCNTENSET); @@ -177,6 +190,13 @@ static inline void write_pmccfiltr(u32 val) write_sysreg(val, PMCCFILTR); } +static inline void write_pmicfiltr(u64 val) {} + +static inline u64 read_pmicfiltr(void) +{ + return 0; +} + static inline void write_pmovsclr(u32 val) { write_sysreg(val, PMOVSR); diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 36c3e82b4eec..468a049bc63b 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -54,6 +54,14 @@ static inline u32 read_pmuver(void) ID_AA64DFR0_EL1_PMUVer_SHIFT); } +static inline bool pmuv3_has_icntr(void) +{ + u64 dfr1 = read_sysreg(id_aa64dfr1_el1); + + return !!cpuid_feature_extract_unsigned_field(dfr1, + ID_AA64DFR1_EL1_PMICNTR_SHIFT); +} + static inline void write_pmcr(u64 val) { write_sysreg(val, pmcr_el0); @@ -79,6 +87,16 @@ static inline u64 read_pmccntr(void) return read_sysreg(pmccntr_el0); } +static inline void write_pmicntr(u64 val) +{ + write_sysreg_s(val, SYS_PMICNTR_EL0); +} + +static inline u64 read_pmicntr(void) +{ + return read_sysreg_s(SYS_PMICNTR_EL0); +} + static inline void write_pmcntenset(u64 val) { write_sysreg(val, pmcntenset_el0); @@ -109,6 +127,16 @@ static inline u64 read_pmccfiltr(void) return read_sysreg(pmccfiltr_el0); } +static inline void write_pmicfiltr(u64 val) +{ + write_sysreg_s(val, SYS_PMICFILTR_EL0); +} + +static inline u64 read_pmicfiltr(void) +{ + return read_sysreg_s(SYS_PMICFILTR_EL0); +} + static inline void write_pmovsclr(u64 val) { 
write_sysreg(val, pmovsclr_el0); diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 215b74875815..0b3adf3e17b4 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -66,24 +66,28 @@ void kvm_clr_pmu_events(u64 clr) /* * Read a value direct from PMEVTYPER where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { if (idx == ARMV8_PMU_CYCLE_IDX) return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); else write_pmevtypern(idx, val); } diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 37aa7eaad07b..8d637ac4b7c6 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2029,6 +2029,31 @@ Sysreg FAR_EL1 3 0 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg PMICNTR_EL0 3 3 9 4 0 +Field 63:0 ICNT +EndSysreg + +Sysreg PMICFILTR_EL0 3 3 9 6 0 +Res0 63:59 +Field 58 SYNC +Field 57:56 VS +Res0 55:32 +Field 31 P +Field 30 U +Field 29 NSK +Field 28 NSU +Field 27 NSH +Field 26 M +Res0 25 +Field 24 SH +Field 23 T +Field 22 RLK +Field 21 RLU +Field 20 RLH +Res0 19:16 +Field 15:0 evtCount +EndSysreg + Sysreg PMSCR_EL1 3 0 9 9 0 Res0 63:8 Field 7:6 PCT diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 18046cf4b3a3..4d000532a07f 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -571,6 +571,8 @@ static u64 armv8pmu_read_counter(struct perf_event *event) if (idx == ARMV8_PMU_CYCLE_IDX) value = read_pmccntr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + value = read_pmicntr(); else value = armv8pmu_read_hw_counter(event); @@ -604,6 +606,8 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccntr(value); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicntr(value); else armv8pmu_write_hw_counter(event, value); } @@ -641,6 +645,8 @@ static void armv8pmu_write_event_type(struct perf_event *event) } else { if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccfiltr(hwc->config_base); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(hwc->config_base); else armv8pmu_write_evtype(idx, hwc->config_base); } @@ -769,6 +775,8 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) ARMPMU_MAX_HWEVENTS) { if (i == ARMV8_PMU_CYCLE_IDX) write_pmccntr(0); + else if (i == ARMV8_PMU_INSTR_IDX) + write_pmicntr(0); else armv8pmu_write_evcntr(i, 0); } @@ -936,6 +944,19 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, return -EAGAIN; } + /* + * Always prefer to place a instruction counter into the instruction counter, + * but don't expose the instruction counter to userspace access as userspace + * may not know how to handle it. 
+	 */
+	if ((evtype == ARMV8_PMUV3_PERFCTR_INST_RETIRED) &&
+	    !armv8pmu_event_get_threshold(&event->attr) &&
+	    test_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask) &&
+	    !armv8pmu_event_want_user_access(event)) {
+		if (!test_and_set_bit(ARMV8_PMU_INSTR_IDX, cpuc->used_mask))
+			return ARMV8_PMU_INSTR_IDX;
+	}
+
 	/*
 	 * Otherwise use events counters
 	 */
@@ -1193,6 +1214,10 @@ static void __armv8pmu_probe_pmu(void *info)
 	/* Add the CPU cycles counter */
 	set_bit(ARMV8_PMU_CYCLE_IDX, cpu_pmu->cntr_mask);
 
+	/* Add the CPU instructions counter */
+	if (pmuv3_has_icntr())
+		set_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask);
+
 	pmceid[0] = pmceid_raw[0] = read_pmceid0();
 	pmceid[1] = pmceid_raw[1] = read_pmceid1();
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index e5d6d204beab..4b5b83677e3f 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -17,10 +17,14 @@
 #ifdef CONFIG_ARM_PMU
 
 /*
- * The ARMv7 CPU PMU supports up to 32 event counters.
+ * The Armv7 and Armv8.8 or less CPU PMU supports up to 32 event counters.
+ * The Armv8.9/9.4 CPU PMU supports up to 33 event counters.
  */
+#ifdef CONFIG_ARM
 #define ARMPMU_MAX_HWEVENTS		32
-
+#else
+#define ARMPMU_MAX_HWEVENTS		33
+#endif
 /*
  * ARM PMU hw_event flags
  */
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 4f7a7f2222e5..3372c1b56486 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -8,7 +8,7 @@
 #define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
 
 #define ARMV8_PMU_CYCLE_IDX		31
-
+#define ARMV8_PMU_INSTR_IDX		32	/* Not accessible from AArch32 */
 
 /*
  * Common architectural and microarchitectural event numbers.
@@ -228,8 +228,10 @@
  */
 #define ARMV8_PMU_OVSR_P		GENMASK(30, 0)
 #define ARMV8_PMU_OVSR_C		BIT(31)
+#define ARMV8_PMU_OVSR_F		BIT_ULL(32) /* arm64 only */
 /* Mask for writable bits is both P and C fields */
-#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C)
+#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C | \
+					 ARMV8_PMU_OVSR_F)
 
 /*
  * PMXEVTYPER: Event selection reg

From f3b78b470f28bb2a3a40e88bdf5c6de6a35a9b76 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Wed, 31 Jul 2024 17:26:58 +0800
Subject: [PATCH 17/94] ACPI/IORT: Add PMCG platform information for HiSilicon HIP10/11

HiSilicon HIP10/11 platforms use the same SMMU PMCG as HIP09 and thus
suffer from the same erratum. List them in the PMCG platform information
list without introducing a new SMMU PMCG Model.

Update the silicon-errata.rst as well.

Signed-off-by: Yicong Yang
Link: https://lore.kernel.org/r/20240731092658.11012-1-yangyicong@huawei.com
Signed-off-by: Will Deacon
---
 Documentation/arch/arm64/silicon-errata.rst | 4 ++--
 drivers/acpi/arm64/iort.c                   | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index 50327c05be8d..35892c28d831 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -249,8 +249,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+ | Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A | +----------------+-----------------+-----------------+-----------------------------+ -| Hisilicon | Hip08 SMMU PMCG | #162001900 | N/A | -| | Hip09 SMMU PMCG | | | +| Hisilicon | Hip{08,09,10,10C| #162001900 | N/A | +| | ,11} SMMU PMCG | | | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 034d303c3df1..4c745a26226b 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1703,6 +1703,13 @@ static struct acpi_platform_list pmcg_plat_info[] __initdata = { /* HiSilicon Hip09 Platform */ {"HISI ", "HIP09 ", 0, ACPI_SIG_IORT, greater_than_or_equal, "Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP09}, + /* HiSilicon Hip10/11 Platform uses the same SMMU IP with Hip09 */ + {"HISI ", "HIP10 ", 0, ACPI_SIG_IORT, greater_than_or_equal, + "Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP09}, + {"HISI ", "HIP10C ", 0, ACPI_SIG_IORT, greater_than_or_equal, + "Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP09}, + {"HISI ", "HIP11 ", 0, ACPI_SIG_IORT, greater_than_or_equal, + "Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP09}, { } }; From 5225b6562b9a7dc808d5a1e465aaf5e2ebb220cd Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 21 Aug 2024 17:44:01 +0100 Subject: [PATCH 18/94] kselftest/arm64: signal: fix/refactor SVE vector length enumeration Currently a number of SVE/SME related tests have almost identical functions to enumerate all supported vector lengths. However over time the copy&pasted code has diverged, allowing some bugs to creep in: - fake_sigreturn_sme_change_vl reports a failure, not a SKIP if only one vector length is supported (but the SVE version is fine) - fake_sigreturn_sme_change_vl tries to set the SVE vector length, not the SME one (but the other SME tests are fine) - za_no_regs keeps iterating forever if only one vector length is supported (but za_regs is correct) Since those bugs seem to be mostly copy&paste ones, let's consolidate the enumeration loop into one shared function, and just call that from each test. That should fix the above bugs, and prevent similar issues from happening again. 
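To make the non-termination bug concrete, here is a simplified sketch of
the old per-test loop (not the literal testcase code), assuming a single
implemented vector length of 64 bytes (VQ 4). Because the SME prctl() may
report a VL larger than the one just requested, vq is reset to 4 on every
iteration and the loop never finishes:

	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
		vl = prctl(PR_SME_SET_VL, vq * 16);	/* returns 64 for every request */
		vl &= PR_SME_VL_LEN_MASK;
		vq = sve_vq_from_vl(vl);		/* resets vq back to 4 ... */
		vls[nvls++] = vl;
	}						/* ... so --vq never reaches 0 */

The shared helper introduced below avoids this by bailing out as soon as
the kernel reports a VL higher than the one that was just requested.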
Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240821164401.3598545-1-andre.przywara@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/Makefile | 2 +- .../selftests/arm64/signal/sve_helpers.c | 56 +++++++++++++++++++ .../selftests/arm64/signal/sve_helpers.h | 21 +++++++ .../testcases/fake_sigreturn_sme_change_vl.c | 32 +++-------- .../testcases/fake_sigreturn_sve_change_vl.c | 30 ++-------- .../arm64/signal/testcases/ssve_regs.c | 36 +++--------- .../arm64/signal/testcases/ssve_za_regs.c | 36 +++--------- .../arm64/signal/testcases/sve_regs.c | 32 +++-------- .../arm64/signal/testcases/za_no_regs.c | 32 +++-------- .../arm64/signal/testcases/za_regs.c | 36 +++--------- 10 files changed, 132 insertions(+), 181 deletions(-) create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.c create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.h diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index 8f5febaf1a9a..edb3613513b8 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -23,7 +23,7 @@ $(TEST_GEN_PROGS): $(PROGS) # Common test-unit targets to build common-layout test-cases executables # Needs secondary expansion to properly include the testcase c-file in pre-reqs COMMON_SOURCES := test_signals.c test_signals_utils.c testcases/testcases.c \ - signals.S + signals.S sve_helpers.c COMMON_HEADERS := test_signals.h test_signals_utils.h testcases/testcases.h .SECONDEXPANSION: diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.c b/tools/testing/selftests/arm64/signal/sve_helpers.c new file mode 100644 index 000000000000..0acc121af306 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#include +#include +#include +#include + +unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls) +{ + int vq, vl; + int pr_set_vl = use_sme ? PR_SME_SET_VL : PR_SVE_SET_VL; + int len_mask = use_sme ? PR_SME_VL_LEN_MASK : PR_SVE_VL_LEN_MASK; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(pr_set_vl, vq * 16); + if (vl == -1) + return KSFT_FAIL; + + vl &= len_mask; + + /* + * Unlike SVE, SME does not require the minimum vector length + * to be implemented, or the VLs to be consecutive, so any call + * to the prctl might return the single implemented VL, which + * might be larger than 16. So to avoid this loop never + * terminating, bail out here when we find a higher VL than + * we asked for. + * See the ARM ARM, DDI 0487K.a, B1.4.2: I_QQRNR and I_NWYBP. 
+ */ + if (vq < sve_vq_from_vl(vl)) + break; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + if (nvls < min_vls) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return KSFT_SKIP; + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h new file mode 100644 index 000000000000..50948ce471cc --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#ifndef __SVE_HELPERS_H__ +#define __SVE_HELPERS_H__ + +#include + +#define VLS_USE_SVE false +#define VLS_USE_SME true + +extern unsigned int vls[]; +extern unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls); + +#endif diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index ebd5815b54bb..cb8c051b5c8f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -6,44 +6,28 @@ * handler, this is not supported and is expected to segfault. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static int fake_sigreturn_ssve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c index e2a452190511..e1ccf8f85a70 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c @@ -12,40 +12,22 @@ #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); + if (res == KSFT_SKIP) td->result = KSFT_SKIP; - return false; - } - return true; + return false; } static int fake_sigreturn_sve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c 
b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 3d37daafcff5..6dbe48cf8b09 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -6,51 +6,31 @@ * set up as expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static void setup_ssve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c index 9dc5f128bbc0..5557e116e973 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c @@ -6,51 +6,31 @@ * signal frames is set up as expected when enabled simultaneously. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static void setup_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c index 8b16eabbb769..8143eb1c58c1 100644 --- a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c @@ -6,47 +6,31 @@ * expected. 
*/ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SVE_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static void setup_sve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c index 4d6f94b6178f..ce26e9c2fa5e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c @@ -6,47 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index 174ad6656696..b9e13f27f1f9 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -6,51 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Did we find the lowest supported VL? 
*/ - if (vq < sve_vq_from_vl(vl)) - break; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static void setup_za_regs(void) From fdfa588124b6356cd08e5d3f0c3643c4ec3d6887 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 21 Aug 2024 14:53:57 -0700 Subject: [PATCH 19/94] arm64: smp: smp_send_stop() and crash_smp_send_stop() should try non-NMI first When testing hard lockup handling on my sc7180-trogdor-lazor device with pseudo-NMI enabled, with serial console enabled and with kgdb disabled, I found that the stack crawls printed to the serial console ended up as a jumbled mess. After rebooting, the pstore-based console looked fine though. Also, enabling kgdb to trap the panic made the console look fine and avoided the mess. After a bit of tracking down, I came to the conclusion that this was what was happening: 1. The panic path was stopping all other CPUs with panic_other_cpus_shutdown(). 2. At least one of those other CPUs was in the middle of printing to the serial console and holding the console port's lock, which is grabbed with "irqsave". ...but since we were stopping with an NMI we didn't care about the "irqsave" and interrupted anyway. 3. Since we stopped the CPU while it was holding the lock it would never release it. 4. All future calls to output to the console would end up failing to get the lock in qcom_geni_serial_console_write(). This isn't _totally_ unexpected at panic time but it's a code path that's not well tested, hard to get right, and apparently doesn't work terribly well on the Qualcomm geni serial driver. The Qualcomm geni serial driver was fixed to be a bit better in commit 9e957a155005 ("serial: qcom-geni: Don't cancel/abort if we can't get the port lock") but it's nice not to get into this situation in the first place. Taking a page from what x86 appears to do in native_stop_other_cpus(), do this: 1. First, try to stop other CPUs with a normal IPI and wait a second. This gives them a chance to leave critical sections. 2. If CPUs fail to stop then retry with an NMI, but give a much lower timeout since there's no good reason for a CPU not to react quickly to a NMI. This works well and avoids the corrupted console and (presumably) could help avoid other similar issues. In order to do this, we need to do a little re-organization of our IPIs since we don't have any more free IDs. Do what was suggested in previous conversations and combine "stop" and "crash stop". That frees up an IPI so now we can have a "stop" and "stop NMI". In order to do this we also need a slight change in the way we keep track of which CPUs still need to be stopped. We need to know specifically which CPUs haven't stopped yet when we fall back to NMI but in the "crash stop" case the "cpu_online_mask" isn't updated as CPUs go down. This is why that code path had an atomic of the number of CPUs left. Solve this by also updating the "cpu_online_mask" for crash stops. All of the above lets us combine the logic for "stop" and "crash stop" code, which appeared to have a bunch of arbitrary implementation differences. Aside from the above change where we try a normal IPI and then an NMI, the combined function has a few subtle differences: * In the normal smp_send_stop(), if we fail to stop one or more CPUs then we won't include the current CPU (the one running smp_send_stop()) in the error message. 
* In crash_smp_send_stop(), if we fail to stop some CPUs we'll print the CPUs that we failed to stop instead of printing all _but_ the current running CPU. * In crash_smp_send_stop(), we will now only print "SMP: stopping secondary CPUs" if (system_state <= SYSTEM_RUNNING). Fixes: d7402513c935 ("arm64: smp: IPI_CPU_STOP and IPI_CPU_CRASH_STOP should try for NMI") Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20240821145353.v3.1.Id4817adef610302554b8aa42b090d57270dc119c@changeid Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 164 ++++++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 65 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5e18fbcee9a2..33ebcac231da 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -68,7 +68,7 @@ enum ipi_msg_type { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, + IPI_CPU_STOP_NMI, IPI_TIMER, IPI_IRQ_WORK, NR_IPI, @@ -85,6 +85,8 @@ static int ipi_irq_base __ro_after_init; static int nr_ipi __ro_after_init = NR_IPI; static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; +static bool crash_stop; + static void ipi_setup(int cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -821,7 +823,7 @@ static const char *ipi_types[MAX_IPI] __tracepoint_string = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", - [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", [IPI_TIMER] = "Timer broadcast interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", @@ -865,9 +867,9 @@ void arch_irq_work_raise(void) } #endif -static void __noreturn local_cpu_stop(void) +static void __noreturn local_cpu_stop(unsigned int cpu) { - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); @@ -881,21 +883,26 @@ static void __noreturn local_cpu_stop(void) */ void __noreturn panic_smp_self_stop(void) { - local_cpu_stop(); + local_cpu_stop(smp_processor_id()); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif - static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. 
+ */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -960,14 +967,12 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: - local_cpu_stop(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { ipi_cpu_crash_stop(cpu, get_irq_regs()); - unreachable(); + } else { + local_cpu_stop(cpu); } break; @@ -1022,8 +1027,7 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi) return false; switch (ipi) { - case IPI_CPU_STOP: - case IPI_CPU_CRASH_STOP: + case IPI_CPU_STOP_NMI: case IPI_CPU_BACKTRACE: case IPI_KGDB_ROUNDUP: return true; @@ -1136,47 +1140,10 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { - unsigned long timeout; - - if (num_other_online_cpus()) { - cpumask_t mask; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); - } - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while (num_other_online_cpus() && timeout--) - udelay(1); - - if (num_other_online_cpus()) - pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); - - sdei_mask_local_cpu(); -} - -#ifdef CONFIG_KEXEC_CORE -void crash_smp_send_stop(void) -{ - static int cpus_stopped; + static unsigned long stop_in_progress; cpumask_t mask; unsigned long timeout; - /* - * This function can be called twice in panic path, but obviously - * we execute this only once. - */ - if (cpus_stopped) - return; - - cpus_stopped = 1; - /* * If this cpu is the only one alive at this point in time, online or * not, there are no stop messages to be sent around, so just back out. @@ -1184,31 +1151,98 @@ void crash_smp_send_stop(void) if (num_other_online_cpus() == 0) goto skip_ipi; + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; + + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. 
+ */ cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) + while (num_other_online_cpus() && timeout--) udelay(1); - if (atomic_read(&waiting_for_crash_ipi) > 0) + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. + */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); + } + + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(&mask)); + } skip_ipi: sdei_mask_local_cpu(); +} + +#ifdef CONFIG_KEXEC_CORE +void crash_smp_send_stop(void) +{ + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". + */ + if (crash_stop) + return; + crash_stop = 1; + + smp_send_stop(); + sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif From 3cce331ee2a4cc99328757a9bd297ad9a39609e8 Mon Sep 17 00:00:00 2001 From: Yangyu Chen Date: Wed, 7 Aug 2024 11:35:18 +0900 Subject: [PATCH 20/94] drivers/perf: apple_m1: add known PMU events This patch adds known PMU events that can be found on /usr/share/kpep in macOS. The m1_pmu_events and m1_pmu_event_affinity are generated from the script [1], which consumes the plist file from Apple. And then added these events to m1_pmu_perf_map and m1_pmu_event_attrs with Apple's documentation [2]. Link: https://github.com/cyyself/m1-pmu-gen [1] Link: https://developer.apple.com/download/apple-silicon-cpu-optimization-guide/ [2] Signed-off-by: Yangyu Chen Acked-by: Hector Martin Link: https://lore.kernel.org/r/tencent_C5DA658E64B8D13125210C8D707CD8823F08@qq.com Signed-off-by: Will Deacon --- drivers/perf/apple_m1_cpu_pmu.c | 178 +++++++++++++++++++------------- 1 file changed, 105 insertions(+), 73 deletions(-) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index c8f607912567..1d4d01e1275e 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -47,46 +47,79 @@ * implementations, we'll have to introduce per cpu-type tables. 
*/ enum m1_pmu_events { - M1_PMU_PERFCTR_UNKNOWN_01 = 0x01, - M1_PMU_PERFCTR_CPU_CYCLES = 0x02, - M1_PMU_PERFCTR_INSTRUCTIONS = 0x8c, - M1_PMU_PERFCTR_UNKNOWN_8d = 0x8d, - M1_PMU_PERFCTR_UNKNOWN_8e = 0x8e, - M1_PMU_PERFCTR_UNKNOWN_8f = 0x8f, - M1_PMU_PERFCTR_UNKNOWN_90 = 0x90, - M1_PMU_PERFCTR_UNKNOWN_93 = 0x93, - M1_PMU_PERFCTR_UNKNOWN_94 = 0x94, - M1_PMU_PERFCTR_UNKNOWN_95 = 0x95, - M1_PMU_PERFCTR_UNKNOWN_96 = 0x96, - M1_PMU_PERFCTR_UNKNOWN_97 = 0x97, - M1_PMU_PERFCTR_UNKNOWN_98 = 0x98, - M1_PMU_PERFCTR_UNKNOWN_99 = 0x99, - M1_PMU_PERFCTR_UNKNOWN_9a = 0x9a, - M1_PMU_PERFCTR_UNKNOWN_9b = 0x9b, - M1_PMU_PERFCTR_UNKNOWN_9c = 0x9c, - M1_PMU_PERFCTR_UNKNOWN_9f = 0x9f, - M1_PMU_PERFCTR_UNKNOWN_bf = 0xbf, - M1_PMU_PERFCTR_UNKNOWN_c0 = 0xc0, - M1_PMU_PERFCTR_UNKNOWN_c1 = 0xc1, - M1_PMU_PERFCTR_UNKNOWN_c4 = 0xc4, - M1_PMU_PERFCTR_UNKNOWN_c5 = 0xc5, - M1_PMU_PERFCTR_UNKNOWN_c6 = 0xc6, - M1_PMU_PERFCTR_UNKNOWN_c8 = 0xc8, - M1_PMU_PERFCTR_UNKNOWN_ca = 0xca, - M1_PMU_PERFCTR_UNKNOWN_cb = 0xcb, - M1_PMU_PERFCTR_UNKNOWN_f5 = 0xf5, - M1_PMU_PERFCTR_UNKNOWN_f6 = 0xf6, - M1_PMU_PERFCTR_UNKNOWN_f7 = 0xf7, - M1_PMU_PERFCTR_UNKNOWN_f8 = 0xf8, - M1_PMU_PERFCTR_UNKNOWN_fd = 0xfd, - M1_PMU_PERFCTR_LAST = M1_PMU_CFG_EVENT, + M1_PMU_PERFCTR_RETIRE_UOP = 0x1, + M1_PMU_PERFCTR_CORE_ACTIVE_CYCLE = 0x2, + M1_PMU_PERFCTR_L1I_TLB_FILL = 0x4, + M1_PMU_PERFCTR_L1D_TLB_FILL = 0x5, + M1_PMU_PERFCTR_MMU_TABLE_WALK_INSTRUCTION = 0x7, + M1_PMU_PERFCTR_MMU_TABLE_WALK_DATA = 0x8, + M1_PMU_PERFCTR_L2_TLB_MISS_INSTRUCTION = 0xa, + M1_PMU_PERFCTR_L2_TLB_MISS_DATA = 0xb, + M1_PMU_PERFCTR_MMU_VIRTUAL_MEMORY_FAULT_NONSPEC = 0xd, + M1_PMU_PERFCTR_SCHEDULE_UOP = 0x52, + M1_PMU_PERFCTR_INTERRUPT_PENDING = 0x6c, + M1_PMU_PERFCTR_MAP_STALL_DISPATCH = 0x70, + M1_PMU_PERFCTR_MAP_REWIND = 0x75, + M1_PMU_PERFCTR_MAP_STALL = 0x76, + M1_PMU_PERFCTR_MAP_INT_UOP = 0x7c, + M1_PMU_PERFCTR_MAP_LDST_UOP = 0x7d, + M1_PMU_PERFCTR_MAP_SIMD_UOP = 0x7e, + M1_PMU_PERFCTR_FLUSH_RESTART_OTHER_NONSPEC = 0x84, + M1_PMU_PERFCTR_INST_ALL = 0x8c, + M1_PMU_PERFCTR_INST_BRANCH = 0x8d, + M1_PMU_PERFCTR_INST_BRANCH_CALL = 0x8e, + M1_PMU_PERFCTR_INST_BRANCH_RET = 0x8f, + M1_PMU_PERFCTR_INST_BRANCH_TAKEN = 0x90, + M1_PMU_PERFCTR_INST_BRANCH_INDIR = 0x93, + M1_PMU_PERFCTR_INST_BRANCH_COND = 0x94, + M1_PMU_PERFCTR_INST_INT_LD = 0x95, + M1_PMU_PERFCTR_INST_INT_ST = 0x96, + M1_PMU_PERFCTR_INST_INT_ALU = 0x97, + M1_PMU_PERFCTR_INST_SIMD_LD = 0x98, + M1_PMU_PERFCTR_INST_SIMD_ST = 0x99, + M1_PMU_PERFCTR_INST_SIMD_ALU = 0x9a, + M1_PMU_PERFCTR_INST_LDST = 0x9b, + M1_PMU_PERFCTR_INST_BARRIER = 0x9c, + M1_PMU_PERFCTR_UNKNOWN_9f = 0x9f, + M1_PMU_PERFCTR_L1D_TLB_ACCESS = 0xa0, + M1_PMU_PERFCTR_L1D_TLB_MISS = 0xa1, + M1_PMU_PERFCTR_L1D_CACHE_MISS_ST = 0xa2, + M1_PMU_PERFCTR_L1D_CACHE_MISS_LD = 0xa3, + M1_PMU_PERFCTR_LD_UNIT_UOP = 0xa6, + M1_PMU_PERFCTR_ST_UNIT_UOP = 0xa7, + M1_PMU_PERFCTR_L1D_CACHE_WRITEBACK = 0xa8, + M1_PMU_PERFCTR_LDST_X64_UOP = 0xb1, + M1_PMU_PERFCTR_LDST_XPG_UOP = 0xb2, + M1_PMU_PERFCTR_ATOMIC_OR_EXCLUSIVE_SUCC = 0xb3, + M1_PMU_PERFCTR_ATOMIC_OR_EXCLUSIVE_FAIL = 0xb4, + M1_PMU_PERFCTR_L1D_CACHE_MISS_LD_NONSPEC = 0xbf, + M1_PMU_PERFCTR_L1D_CACHE_MISS_ST_NONSPEC = 0xc0, + M1_PMU_PERFCTR_L1D_TLB_MISS_NONSPEC = 0xc1, + M1_PMU_PERFCTR_ST_MEMORY_ORDER_VIOLATION_NONSPEC = 0xc4, + M1_PMU_PERFCTR_BRANCH_COND_MISPRED_NONSPEC = 0xc5, + M1_PMU_PERFCTR_BRANCH_INDIR_MISPRED_NONSPEC = 0xc6, + M1_PMU_PERFCTR_BRANCH_RET_INDIR_MISPRED_NONSPEC = 0xc8, + M1_PMU_PERFCTR_BRANCH_CALL_INDIR_MISPRED_NONSPEC = 0xca, + M1_PMU_PERFCTR_BRANCH_MISPRED_NONSPEC = 0xcb, + 
M1_PMU_PERFCTR_L1I_TLB_MISS_DEMAND = 0xd4, + M1_PMU_PERFCTR_MAP_DISPATCH_BUBBLE = 0xd6, + M1_PMU_PERFCTR_L1I_CACHE_MISS_DEMAND = 0xdb, + M1_PMU_PERFCTR_FETCH_RESTART = 0xde, + M1_PMU_PERFCTR_ST_NT_UOP = 0xe5, + M1_PMU_PERFCTR_LD_NT_UOP = 0xe6, + M1_PMU_PERFCTR_UNKNOWN_f5 = 0xf5, + M1_PMU_PERFCTR_UNKNOWN_f6 = 0xf6, + M1_PMU_PERFCTR_UNKNOWN_f7 = 0xf7, + M1_PMU_PERFCTR_UNKNOWN_f8 = 0xf8, + M1_PMU_PERFCTR_UNKNOWN_fd = 0xfd, + M1_PMU_PERFCTR_LAST = M1_PMU_CFG_EVENT, /* * From this point onwards, these are not actual HW events, * but attributes that get stored in hw->config_base. */ - M1_PMU_CFG_COUNT_USER = BIT(8), - M1_PMU_CFG_COUNT_KERNEL = BIT(9), + M1_PMU_CFG_COUNT_USER = BIT(8), + M1_PMU_CFG_COUNT_KERNEL = BIT(9), }; /* @@ -96,46 +129,45 @@ enum m1_pmu_events { * counters had strange affinities. */ static const u16 m1_pmu_event_affinity[M1_PMU_PERFCTR_LAST + 1] = { - [0 ... M1_PMU_PERFCTR_LAST] = ANY_BUT_0_1, - [M1_PMU_PERFCTR_UNKNOWN_01] = BIT(7), - [M1_PMU_PERFCTR_CPU_CYCLES] = ANY_BUT_0_1 | BIT(0), - [M1_PMU_PERFCTR_INSTRUCTIONS] = BIT(7) | BIT(1), - [M1_PMU_PERFCTR_UNKNOWN_8d] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_8e] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_8f] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_90] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_93] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_94] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_95] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_96] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_97] = BIT(7), - [M1_PMU_PERFCTR_UNKNOWN_98] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_99] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_9a] = BIT(7), - [M1_PMU_PERFCTR_UNKNOWN_9b] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_9c] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_9f] = BIT(7), - [M1_PMU_PERFCTR_UNKNOWN_bf] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c0] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c1] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c4] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c5] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c6] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_c8] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_ca] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_cb] = ONLY_5_6_7, - [M1_PMU_PERFCTR_UNKNOWN_f5] = ONLY_2_4_6, - [M1_PMU_PERFCTR_UNKNOWN_f6] = ONLY_2_4_6, - [M1_PMU_PERFCTR_UNKNOWN_f7] = ONLY_2_4_6, - [M1_PMU_PERFCTR_UNKNOWN_f8] = ONLY_2_TO_7, - [M1_PMU_PERFCTR_UNKNOWN_fd] = ONLY_2_4_6, + [0 ... 
M1_PMU_PERFCTR_LAST] = ANY_BUT_0_1, + [M1_PMU_PERFCTR_RETIRE_UOP] = BIT(7), + [M1_PMU_PERFCTR_CORE_ACTIVE_CYCLE] = ANY_BUT_0_1 | BIT(0), + [M1_PMU_PERFCTR_INST_ALL] = BIT(7) | BIT(1), + [M1_PMU_PERFCTR_INST_BRANCH] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_BRANCH_CALL] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_BRANCH_RET] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_BRANCH_TAKEN] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_BRANCH_INDIR] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_BRANCH_COND] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_INT_LD] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_INT_ST] = BIT(7), + [M1_PMU_PERFCTR_INST_INT_ALU] = BIT(7), + [M1_PMU_PERFCTR_INST_SIMD_LD] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_SIMD_ST] = ONLY_5_6_7, + [M1_PMU_PERFCTR_INST_SIMD_ALU] = BIT(7), + [M1_PMU_PERFCTR_INST_LDST] = BIT(7), + [M1_PMU_PERFCTR_INST_BARRIER] = ONLY_5_6_7, + [M1_PMU_PERFCTR_UNKNOWN_9f] = BIT(7), + [M1_PMU_PERFCTR_L1D_CACHE_MISS_LD_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_L1D_CACHE_MISS_ST_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_L1D_TLB_MISS_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_ST_MEMORY_ORDER_VIOLATION_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_BRANCH_COND_MISPRED_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_BRANCH_INDIR_MISPRED_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_BRANCH_RET_INDIR_MISPRED_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_BRANCH_CALL_INDIR_MISPRED_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_BRANCH_MISPRED_NONSPEC] = ONLY_5_6_7, + [M1_PMU_PERFCTR_UNKNOWN_f5] = ONLY_2_4_6, + [M1_PMU_PERFCTR_UNKNOWN_f6] = ONLY_2_4_6, + [M1_PMU_PERFCTR_UNKNOWN_f7] = ONLY_2_4_6, + [M1_PMU_PERFCTR_UNKNOWN_f8] = ONLY_2_TO_7, + [M1_PMU_PERFCTR_UNKNOWN_fd] = ONLY_2_4_6, }; static const unsigned m1_pmu_perf_map[PERF_COUNT_HW_MAX] = { PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = M1_PMU_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = M1_PMU_PERFCTR_INSTRUCTIONS, - /* No idea about the rest yet */ + [PERF_COUNT_HW_CPU_CYCLES] = M1_PMU_PERFCTR_CORE_ACTIVE_CYCLE, + [PERF_COUNT_HW_INSTRUCTIONS] = M1_PMU_PERFCTR_INST_ALL, }; /* sysfs definitions */ @@ -154,8 +186,8 @@ static ssize_t m1_pmu_events_sysfs_show(struct device *dev, PMU_EVENT_ATTR_ID(name, m1_pmu_events_sysfs_show, config) static struct attribute *m1_pmu_event_attrs[] = { - M1_PMU_EVENT_ATTR(cycles, M1_PMU_PERFCTR_CPU_CYCLES), - M1_PMU_EVENT_ATTR(instructions, M1_PMU_PERFCTR_INSTRUCTIONS), + M1_PMU_EVENT_ATTR(cycles, M1_PMU_PERFCTR_CORE_ACTIVE_CYCLE), + M1_PMU_EVENT_ATTR(instructions, M1_PMU_PERFCTR_INST_ALL), NULL, }; From a3dd920977dccc453c550260c4b7605b280b79c3 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Thu, 22 Aug 2024 11:33:31 +0800 Subject: [PATCH 21/94] drivers/perf: Fix ali_drw_pmu driver interrupt status clearing The alibaba_uncore_pmu driver forgot to clear all interrupt status in the interrupt processing function. After the PMU counter overflow interrupt occurred, an interrupt storm occurred, causing the system to hang. Therefore, clear the correct interrupt status in the interrupt handling function to fix it. 
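To spell out the one-line change below with illustrative values (assuming
the clear register acknowledges whichever bits are written back):
FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, 1) encodes the constant 1 into
the field, so only the field's lowest status bit is ever written, whereas
FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, status) writes back every
overflow bit that was actually reported:

	/* e.g. common counters 2 and 5 overflowed: status = 0b100100 */
	clr_status = FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, 1);		/* old: only the lowest bit written */
	clr_status = FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, status);	/* new: all reported bits written back */

With the old form the unacknowledged overflow bits keep the interrupt
asserted, which is the interrupt storm described above.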
Fixes: cf7b61073e45 ("drivers/perf: add DDR Sub-System Driveway PMU driver for Yitian 710 SoC")
Signed-off-by: Jing Zhang
Reviewed-by: Shuai Xue
Acked-by: Mark Rutland
Link: https://lore.kernel.org/r/1724297611-20686-1-git-send-email-renyu.zj@linux.alibaba.com
Signed-off-by: Will Deacon
---
 drivers/perf/alibaba_uncore_drw_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
index 38a2947ae813..c6ff1bc7d336 100644
--- a/drivers/perf/alibaba_uncore_drw_pmu.c
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -400,7 +400,7 @@ static irqreturn_t ali_drw_pmu_isr(int irq_num, void *data)
 		}
 
 		/* clear common counter intr status */
-		clr_status = FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, 1);
+		clr_status = FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, status);
 		writel(clr_status, drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_CLR);
 	}
 
From e669388537c472142804eb5a0449cc23d5409694 Mon Sep 17 00:00:00 2001
From: Krishna chaitanya chundru
Date: Fri, 16 Aug 2024 20:47:20 +0530
Subject: [PATCH 22/94] perf/dwc_pcie: Fix registration issue in multi PCIe controller instances

When there are multiple instances of PCIe controllers, registration
with the perf driver fails with this error:

sysfs: cannot create duplicate filename '/devices/platform/dwc_pcie_pmu.0'
CPU: 0 PID: 166 Comm: modprobe Not tainted 6.10.0-rc2-next-20240607-dirty
Hardware name: Qualcomm SA8775P Ride (DT)
Call trace:
 dump_backtrace.part.8+0x98/0xf0
 show_stack+0x14/0x1c
 dump_stack_lvl+0x74/0x88
 dump_stack+0x14/0x1c
 sysfs_warn_dup+0x60/0x78
 sysfs_create_dir_ns+0xe8/0x100
 kobject_add_internal+0x94/0x224
 kobject_add+0xa8/0x118
 device_add+0x298/0x7b4
 platform_device_add+0x1a0/0x228
 platform_device_register_full+0x11c/0x148
 dwc_pcie_register_dev+0x74/0xf0 [dwc_pcie_pmu]
 dwc_pcie_pmu_init+0x7c/0x1000 [dwc_pcie_pmu]
 do_one_initcall+0x58/0x1c0
 do_init_module+0x58/0x208
 load_module+0x1804/0x188c
 __do_sys_init_module+0x18c/0x1f0
 __arm64_sys_init_module+0x14/0x1c
 invoke_syscall+0x40/0xf8
 el0_svc_common.constprop.1+0x70/0xf4
 do_el0_svc+0x18/0x20
 el0_svc+0x28/0xb0
 el0t_64_sync_handler+0x9c/0xc0
 el0t_64_sync+0x160/0x164
kobject: kobject_add_internal failed for dwc_pcie_pmu.0 with -EEXIST, don't try to register things with the same name in the same directory.

This happens because devices under two different controllers can have the
same bdf value. Update the logic to use sbdf, which stays unique even with
multiple controller instances.
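As a sketch of the idea (reusing the example root port from the
documentation update later in this series): PCI_DEVID() folds only the bus
number and devfn, so two root ports on different PCI segments collide,
while prefixing the domain number keeps them apart:

	u32 bdf  = PCI_DEVID(pdev->bus->number, pdev->devfn);	/* e.g. 0x3018 for a 30:03.0 port on either segment */
	u32 sbdf = (pci_domain_nr(pdev->bus) << 16) | bdf;	/* e.g. 0x03018 on segment 0000, 0x13018 on segment 0001 */

The platform device id and the "dwc_rootport_<sbdf>" PMU name are then
derived from sbdf instead of bdf, as done in the hunks below.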
Fixes: af9597adc2f1 ("drivers/perf: add DesignWare PCIe PMU driver") Signed-off-by: Krishna chaitanya chundru Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20240816-dwc_pmu_fix-v2-1-198b8ab1077c@quicinc.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index c5e328f23841..85a5155d6018 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -556,10 +556,10 @@ static int dwc_pcie_register_dev(struct pci_dev *pdev) { struct platform_device *plat_dev; struct dwc_pcie_dev_info *dev_info; - u32 bdf; + u32 sbdf; - bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); - plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", bdf, + sbdf = (pci_domain_nr(pdev->bus) << 16) | PCI_DEVID(pdev->bus->number, pdev->devfn); + plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", sbdf, pdev, sizeof(*pdev)); if (IS_ERR(plat_dev)) @@ -611,15 +611,15 @@ static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) struct pci_dev *pdev = plat_dev->dev.platform_data; struct dwc_pcie_pmu *pcie_pmu; char *name; - u32 bdf, val; + u32 sbdf, val; u16 vsec; int ret; vsec = pci_find_vsec_capability(pdev, pdev->vendor, DWC_PCIE_VSEC_RAS_DES_ID); pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val); - bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); - name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x", bdf); + sbdf = plat_dev->id; + name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x", sbdf); if (!name) return -ENOMEM; @@ -650,7 +650,7 @@ static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state, &pcie_pmu->cpuhp_node); if (ret) { - pci_err(pdev, "Error %d registering hotplug @%x\n", ret, bdf); + pci_err(pdev, "Error %d registering hotplug @%x\n", ret, sbdf); return ret; } @@ -663,7 +663,7 @@ static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) ret = perf_pmu_register(&pcie_pmu->pmu, name, -1); if (ret) { - pci_err(pdev, "Error %d registering PMU @%x\n", ret, bdf); + pci_err(pdev, "Error %d registering PMU @%x\n", ret, sbdf); return ret; } ret = devm_add_action_or_reset(&plat_dev->dev, dwc_pcie_unregister_pmu, From 96a37ec98664303e581a24934df4350e3a5c7070 Mon Sep 17 00:00:00 2001 From: Krishna chaitanya chundru Date: Fri, 16 Aug 2024 20:47:21 +0530 Subject: [PATCH 23/94] Documentation: dwc_pcie_pmu: Update bdf to sbdf Update document to reflect the driver change to use sbdf instead of bdf alone. Signed-off-by: Krishna chaitanya chundru Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20240816-dwc_pmu_fix-v2-2-198b8ab1077c@quicinc.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/dwc_pcie_pmu.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst index d47cd229d710..39b8e1fdd0cd 100644 --- a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst +++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst @@ -46,16 +46,16 @@ Some of the events only exist for specific configurations. DesignWare Cores (DWC) PCIe PMU Driver ======================================= -This driver adds PMU devices for each PCIe Root Port named based on the BDF of +This driver adds PMU devices for each PCIe Root Port named based on the SBDF of the Root Port. 
For example, - 30:03.0 PCI bridge: Device 1ded:8000 (rev 01) + 0001:30:03.0 PCI bridge: Device 1ded:8000 (rev 01) -the PMU device name for this Root Port is dwc_rootport_3018. +the PMU device name for this Root Port is dwc_rootport_13018. The DWC PCIe PMU driver registers a perf PMU driver, which provides description of available events and configuration options in sysfs, see -/sys/bus/event_source/devices/dwc_rootport_{bdf}. +/sys/bus/event_source/devices/dwc_rootport_{sbdf}. The "format" directory describes format of the config fields of the perf_event_attr structure. The "events" directory provides configuration @@ -66,16 +66,16 @@ The "perf list" command shall list the available events from sysfs, e.g.:: $# perf list | grep dwc_rootport <...> - dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/ [Kernel PMU event] + dwc_rootport_13018/Rx_PCIe_TLP_Data_Payload/ [Kernel PMU event] <...> - dwc_rootport_3018/rx_memory_read,lane=?/ [Kernel PMU event] + dwc_rootport_13018/rx_memory_read,lane=?/ [Kernel PMU event] Time Based Analysis Event Usage ------------------------------- Example usage of counting PCIe RX TLP data payload (Units of bytes):: - $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/ + $# perf stat -a -e dwc_rootport_13018/Rx_PCIe_TLP_Data_Payload/ The average RX/TX bandwidth can be calculated using the following formula: @@ -88,7 +88,7 @@ Lane Event Usage Each lane has the same event set and to avoid generating a list of hundreds of events, the user need to specify the lane ID explicitly, e.g.:: - $# perf stat -a -e dwc_rootport_3018/rx_memory_read,lane=4/ + $# perf stat -a -e dwc_rootport_13018/rx_memory_read,lane=4/ The driver does not support sampling, therefore "perf record" will not work. Per-task (without "-a") perf sessions are not supported. From b94b05478fb6a09033bf70c6edd03f8930a0fe24 Mon Sep 17 00:00:00 2001 From: Krishna chaitanya chundru Date: Fri, 16 Aug 2024 20:47:22 +0530 Subject: [PATCH 24/94] perf/dwc_pcie: Always register for PCIe bus notifier When the PCIe devices are discovered late, the driver can't find the PCIe devices and returns in the init without registering with the bus notifier. Due to that the devices which are discovered late the driver can't register for this. Register for bus notifier & driver even if the device is not found as part of init. Fixes: af9597adc2f1 ("drivers/perf: add DesignWare PCIe PMU driver") Signed-off-by: Krishna chaitanya chundru Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20240816-dwc_pmu_fix-v2-3-198b8ab1077c@quicinc.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 85a5155d6018..f205ecad2e4c 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -726,7 +726,6 @@ static struct platform_driver dwc_pcie_pmu_driver = { static int __init dwc_pcie_pmu_init(void) { struct pci_dev *pdev = NULL; - bool found = false; int ret; for_each_pci_dev(pdev) { @@ -738,11 +737,7 @@ static int __init dwc_pcie_pmu_init(void) pci_dev_put(pdev); return ret; } - - found = true; } - if (!found) - return -ENODEV; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/dwc_pcie_pmu:online", From db9e7a83d30821ba50a84e9726099946900abde8 Mon Sep 17 00:00:00 2001 From: Krishna chaitanya chundru Date: Fri, 16 Aug 2024 20:47:23 +0530 Subject: [PATCH 25/94] perf/dwc_pcie: Add support for QCOM vendor devices Update the vendor table with QCOM PCIe vendorid. 
Signed-off-by: Krishna chaitanya chundru Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20240816-dwc_pmu_fix-v2-4-198b8ab1077c@quicinc.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index f205ecad2e4c..4ca50f9b6dfe 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -107,6 +107,7 @@ struct dwc_pcie_vendor_id { static const struct dwc_pcie_vendor_id dwc_pcie_vendor_ids[] = { {.vendor_id = PCI_VENDOR_ID_ALIBABA }, + {.vendor_id = PCI_VENDOR_ID_QCOM }, {} /* terminator */ }; From 3e9e67e129434fdeae905a5b60648d10126c4a8d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 23 Aug 2024 18:54:13 -0700 Subject: [PATCH 26/94] arm64: Implement prctl(PR_{G,S}ET_TSC) On arm64, this prctl controls access to CNTVCT_EL0, CNTVCTSS_EL0 and CNTFRQ_EL0 via CNTKCTL_EL1.EL0VCTEN. Since this bit is also used to implement various erratum workarounds, check whether the CPU needs a workaround whenever we potentially need to change it. This is needed for a correct implementation of non-instrumenting record-replay debugging on arm64 (i.e. rr; https://rr-project.org/). rr must trap and record any sources of non-determinism from the userspace program's perspective so it can be replayed later. This includes the results of syscalls as well as the results of access to architected timers exposed directly to the program. This prctl was originally added for x86 by commit 8fb402bccf20 ("generic, x86: add prctl commands PR_GET_TSC and PR_SET_TSC"), and rr uses it to trap RDTSC on x86 for the same reason. We also considered exposing this as a PTRACE_EVENT. However, prctl seems like a better choice for these reasons: 1) In general an in-process control seems more useful than an out-of-process control, since anything that you would be able to do with ptrace could also be done with prctl (tracer can inject a call to the prctl and handle signal-delivery-stops), and it avoids needing an additional process (which will complicate debugging of the ptraced process since it cannot have more than one tracer, and will be incompatible with ptrace_scope=3) in cases where that is not otherwise necessary. 2) Consistency with x86_64. Note that on x86_64, RDTSC has been there since the start, so it's the same situation as on arm64. 
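For illustration only (not part of this patch), a minimal userspace sketch
of the new control on arm64, assuming the existing PR_{G,S}ET_TSC constants
from <linux/prctl.h>:

	#include <sys/prctl.h>
	#include <linux/prctl.h>

	static unsigned long read_cntvct(void)
	{
		unsigned long val;

		/* direct EL0 read of the virtual counter */
		asm volatile("mrs %0, cntvct_el0" : "=r" (val));
		return val;
	}

	int main(void)
	{
		read_cntvct();				/* fine: tasks start out in PR_TSC_ENABLE mode */

		prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	/* trap further counter-timer reads */
		read_cntvct();				/* now raises SIGSEGV, which a recorder
							 * such as rr can catch and emulate */
		return 0;
	}

A tracer would typically inject the prctl() into the tracee and then
service the resulting SIGSEGV stops, recording (or replaying) the counter
value itself.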
Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/I233a1867d1ccebe2933a347552e7eae862344421 Link: https://lore.kernel.org/r/20240824015415.488474-1-pcc@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 5 ++ arch/arm64/include/asm/thread_info.h | 2 + arch/arm64/kernel/process.c | 73 +++++++++++++++++++++++----- arch/arm64/kernel/traps.c | 20 +++++--- 4 files changed, 82 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index f77371232d8c..347bd3464fcb 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -402,5 +402,10 @@ long get_tagged_addr_ctrl(struct task_struct *task); #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif +int get_tsc_mode(unsigned long adr); +int set_tsc_mode(unsigned int val); +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e72a3bf9e563..1114c1c3300a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -81,6 +81,7 @@ void arch_setup_new_exec(void); #define TIF_SME 27 /* SME in use */ #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ +#define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -97,6 +98,7 @@ void arch_setup_new_exec(void); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_TSC_SIGSEGV (1 << TIF_TSC_SIGSEGV) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4ae31b7af6c3..1b6bbf839bb5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -472,27 +473,52 @@ static void entry_task_switch(struct task_struct *next) } /* - * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. - * Ensure access is disabled when switching to a 32bit task, ensure - * access is enabled when switching to a 64bit task. + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. 
*/ -static void erratum_1418040_thread_switch(struct task_struct *next) +static void update_cntkctl_el1(struct task_struct *next) { - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || - !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) - return; + struct thread_info *ti = task_thread_info(next); - if (is_compat_thread(task_thread_info(next))) + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); else sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); } -static void erratum_1418040_new_exec(void) +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) { + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) +{ + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + preempt_disable(); - erratum_1418040_thread_switch(current); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); preempt_enable(); + + return 0; } /* @@ -528,7 +554,7 @@ struct task_struct *__switch_to(struct task_struct *prev, contextidr_thread_switch(next); entry_task_switch(next); ssbs_thread_switch(next); - erratum_1418040_thread_switch(next); + cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); /* @@ -645,7 +671,7 @@ void arch_setup_new_exec(void) current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); - erratum_1418040_new_exec(); + do_set_tsc_mode(PR_TSC_ENABLE); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, @@ -754,3 +780,26 @@ int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, return prot; } #endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; + + return do_set_tsc_mode(val); +} diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9e22683aa921..baf02ac437f8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -601,18 +601,26 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_read_counter()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, 
AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) From 684fbd42d323c9c4a69a905451ea270678e36d1e Mon Sep 17 00:00:00 2001 From: Huang Xiaojia Date: Mon, 26 Aug 2024 23:12:50 +0800 Subject: [PATCH 27/94] arm64: Constify struct kobj_type 'struct kobj_type' is not modified. It is only used in kobject_init() which takes a 'const struct kobj_type *ktype' parameter. Constifying this structure moves some data to a read-only section, so increase over all security. On a x86_64, compiled with arm defconfig: Before: ====== text data bss dec hex filename 5602 548 352 6502 1966 arch/arm64/kernel/cpuinfo.o After: ====== text data bss dec hex filename 5650 500 352 6502 1966 arch/arm64/kernel/cpuinfo.o Signed-off-by: Huang Xiaojia Link: https://lore.kernel.org/r/20240826151250.3500302-1-huangxiaojia2@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpuinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 09eeaa24d456..369dd72a8b22 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -280,7 +280,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; From 5e9629d0ae977d6f6916d7e519724804e95f0b07 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 27 Aug 2024 15:51:12 +0100 Subject: [PATCH 28/94] drivers/perf: arm_spe: Use perf_allow_kernel() for permissions Use perf_allow_kernel() for 'pa_enable' (physical addresses), 'pct_enable' (physical timestamps) and context IDs. This means that perf_event_paranoid is now taken into account and LSM hooks can be used, which is more consistent with other perf_event_open calls. For example PERF_SAMPLE_PHYS_ADDR uses perf_allow_kernel() rather than just perfmon_capable(). This also indirectly fixes the following error message which is misleading because perf_event_paranoid is not taken into account by perfmon_capable(): $ perf record -e arm_spe/pa_enable/ Error: Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting ... Suggested-by: Al Grant Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240827145113.1224604-1-james.clark@linaro.org Link: https://lore.kernel.org/all/20240807120039.GD37996@noisy.programming.kicks-ass.net/ Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 9 ++++----- include/linux/perf_event.h | 8 +------- kernel/events/core.c | 9 +++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 9100d82bfabc..3569050f9cf3 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -41,7 +41,7 @@ /* * Cache if the event is allowed to trace Context information. - * This allows us to perform the check, i.e, perfmon_capable(), + * This allows us to perform the check, i.e, perf_allow_kernel(), * in the context of the event owner, once, during the event_init(). 
*/ #define SPE_PMU_HW_FLAGS_CX 0x00001 @@ -50,7 +50,7 @@ static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_C static void set_spe_event_has_cx(struct perf_event *event) { - if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable()) + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && !perf_allow_kernel(&event->attr)) event->hw.flags |= SPE_PMU_HW_FLAGS_CX; } @@ -745,9 +745,8 @@ static int arm_spe_pmu_event_init(struct perf_event *event) set_spe_event_has_cx(event); reg = arm_spe_event_to_pmscr(event); - if (!perfmon_capable() && - (reg & (PMSCR_EL1_PA | PMSCR_EL1_PCT))) - return -EACCES; + if (reg & (PMSCR_EL1_PA | PMSCR_EL1_PCT)) + return perf_allow_kernel(&event->attr); return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a8942277dda..e336306b8c08 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1602,13 +1602,7 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -static inline int perf_allow_kernel(struct perf_event_attr *attr) -{ - if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) - return -EACCES; - - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); -} +int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..ae7d63c0c593 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13351,6 +13351,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. * From daecd3373a16a039ad241086e30a1ec46fc9d61f Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 29 Aug 2024 17:03:30 +0800 Subject: [PATCH 29/94] drivers/perf: hisi_pcie: Record hardware counts correctly Currently we set the period and record it as the initial value of the counter without checking it's set to the hardware successfully or not. However the counter maybe unwritable if the target event is unsupported by the device. In such case we will pass user a wrong count: [start counts when setting the period] hwc->prev_count = 0x8000000000000000 device.counter_value = 0 // the counter is not set as the period [when user reads the counter] event->count = device.counter_value - hwc->prev_count = 0x8000000000000000 // wrong. should be 0. Fix this by record the hardware counter counts correctly when setting the period. 
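With the check in place, the same unwritable-counter case becomes (using the
notation above):

  [start counts when setting the period]
  orig_cnt = device.counter_value = 0
  hwc->prev_count = 0x8000000000000000   // tentative
  device.counter_value = 0               // the write was ignored
  cnt = 0 == orig_cnt                    // counts unchanged, so
  hwc->prev_count = 0                    // corrected to the real value

  [when user reads the counter]
  event->count = device.counter_value - hwc->prev_count = 0 // correct
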
Fixes: 8404b0fbc7fb ("drivers/perf: hisi: Add driver for HiSilicon PCIe PMU") Signed-off-by: Yicong Yang Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240829090332.28756-2-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index f06027574a24..fba569a8640c 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -452,10 +452,24 @@ static void hisi_pcie_pmu_set_period(struct perf_event *event) struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + u64 orig_cnt, cnt; + + orig_cnt = hisi_pcie_pmu_read_counter(event); local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL); hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL); hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL); + + /* + * The counter maybe unwritable if the target event is unsupported. + * Check this by comparing the counts after setting the period. If + * the counts stay unchanged after setting the period then update + * the hwc->prev_count correctly. Otherwise the final counts user + * get maybe totally wrong. + */ + cnt = hisi_pcie_pmu_read_counter(event); + if (orig_cnt == cnt) + local64_set(&hwc->prev_count, cnt); } static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) From 17bf68aeb3642221e3e770399b5a52f370747ac1 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 29 Aug 2024 17:03:31 +0800 Subject: [PATCH 30/94] drivers/perf: hisi_pcie: Fix TLP headers bandwidth counting We make the initial value of event ctrl register as HISI_PCIE_INIT_SET and modify according to the user options. This will make TLP headers bandwidth only counting never take effect since HISI_PCIE_INIT_SET configures to count the TLP payloads bandwidth. Fix this by making the initial value of event ctrl register as 0. Fixes: 17d573984d4d ("drivers/perf: hisi: Add TLP filter support") Signed-off-by: Yicong Yang Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240829090332.28756-3-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index fba569a8640c..f7d6c59d9930 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -208,7 +208,7 @@ static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, static u64 hisi_pcie_pmu_get_event_ctrl_val(struct perf_event *event) { u64 port, trig_len, thr_len, len_mode; - u64 reg = HISI_PCIE_INIT_SET; + u64 reg = 0; /* Config HISI_PCIE_EVENT_CTRL according to event. */ reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event)); From d1c93d5c67ebc7cf5b70ecff7172a0c399975d55 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 29 Aug 2024 17:03:32 +0800 Subject: [PATCH 31/94] drivers/perf: hisi_pcie: Export supported Root Ports [bdf_min, bdf_max] Currently users can get the Root Ports supported by the PCIe PMU by "bus" sysfs attributes which indicates the PCIe bus number where Root Ports are located. This maybe insufficient since Root Ports supported by different PCIe PMUs may be located on the same PCIe bus. So export the BDF range the Root Ports additionally. 
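A usage sketch of the new attributes (PMU instance name and values are
illustrative):

  $# cd /sys/bus/event_source/devices/hisi_pcie0_core0
  $# cat bus bdf_min bdf_max
  0x30
  0x3000
  0x30ff
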
Signed-off-by: Yicong Yang Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240829090332.28756-4-yangyicong@huawei.com Signed-off-by: Will Deacon --- .../admin-guide/perf/hisi-pcie-pmu.rst | 4 +++- drivers/perf/hisilicon/hisi_pcie_pmu.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst index 5541ff40e06a..083ca50de896 100644 --- a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst @@ -28,7 +28,9 @@ The "identifier" sysfs file allows users to identify the version of the PMU hardware device. The "bus" sysfs file allows users to get the bus number of Root Ports -monitored by PMU. +monitored by PMU. Furthermore users can get the Root Ports range in +[bdf_min, bdf_max] from "bdf_min" and "bdf_max" sysfs attributes +respectively. Example usage of perf:: diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index f7d6c59d9930..c5394d007b61 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -141,6 +141,22 @@ static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char } static DEVICE_ATTR_RO(bus); +static ssize_t bdf_min_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#04x\n", pcie_pmu->bdf_min); +} +static DEVICE_ATTR_RO(bdf_min); + +static ssize_t bdf_max_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#04x\n", pcie_pmu->bdf_max); +} +static DEVICE_ATTR_RO(bdf_max); + static struct hisi_pcie_reg_pair hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off) { @@ -763,6 +779,8 @@ static const struct attribute_group hisi_pcie_pmu_format_group = { static struct attribute *hisi_pcie_pmu_bus_attrs[] = { &dev_attr_bus.attr, + &dev_attr_bdf_max.attr, + &dev_attr_bdf_min.attr, NULL }; From 69231a6fcb638b7929e9fc88c4fa73a04e6d4e0c Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Fri, 3 May 2024 14:01:26 +0100 Subject: [PATCH 32/94] KVM: arm64: Make kvm_at() take an OP_AT_* To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction. 
Signed-off-by: Joey Gouly Cc: Oliver Upton Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Marc Zyngier Reviewed-by: Anshuman Khandual Acked-by: Will Deacon Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 3 ++- arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2181a11b9d92..25f49f5fc4a6 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -10,6 +10,7 @@ #include #include #include +#include #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) @@ -259,7 +260,7 @@ extern u64 __kvm_get_mdcr_el2(void); asm volatile( \ " mrs %1, spsr_el2\n" \ " mrs %2, elr_el2\n" \ - "1: at "at_op", %3\n" \ + "1: " __msr_s(at_op, "%3") "\n" \ " isb\n" \ " b 9f\n" \ "2: msr spsr_el2, %1\n" \ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9e13c1bc2ad5..487c06099d6f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + if (!__kvm_at(OP_AT_S1E1R, far)) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ From 6f0315330af7a57c1c00587fdfb69c7778bf1c50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 29 Aug 2024 18:20:09 +0100 Subject: [PATCH 33/94] kselftest/arm64: Actually test SME vector length changes via sigreturn The test case for SME vector length changes via sigreturn use a bit too much cut'n'paste and only actually changed the SVE vector length in the test itself. Andre's recent factoring out of the initialisation code caused this to be exposed and the test to start failing. Fix the test to actually cover the thing it's supposed to test. 
Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Mark Brown Reviewed-by: Andre Przywara Tested-by: Andre Przywara Link: https://lore.kernel.org/r/20240829-arm64-sme-signal-vl-change-test-v1-1-42d7534cb818@kernel.org Signed-off-by: Will Deacon --- .../testcases/fake_sigreturn_sme_change_vl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index cb8c051b5c8f..dfd6a2badf9f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -35,30 +35,30 @@ static int fake_sigreturn_ssve_change_vl(struct tdescr *td, { size_t resv_sz, offset; struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); - struct sve_context *sve; + struct za_context *za; /* Get a signal context with a SME ZA frame in it */ if (!get_current_context(td, &sf.uc, sizeof(sf.uc))) return 1; resv_sz = GET_SF_RESV_SIZE(sf); - head = get_header(head, SVE_MAGIC, resv_sz, &offset); + head = get_header(head, ZA_MAGIC, resv_sz, &offset); if (!head) { - fprintf(stderr, "No SVE context\n"); + fprintf(stderr, "No ZA context\n"); return 1; } - if (head->size != sizeof(struct sve_context)) { + if (head->size != sizeof(struct za_context)) { fprintf(stderr, "Register data present, aborting\n"); return 1; } - sve = (struct sve_context *)head; + za = (struct za_context *)head; /* No changes are supported; init left us at minimum VL so go to max */ fprintf(stderr, "Attempting to change VL from %d to %d\n", - sve->vl, vls[0]); - sve->vl = vls[0]; + za->vl, vls[0]); + za->vl = vls[0]; fake_sigreturn(&sf, sizeof(sf), 0); From db0d8a84348b876df7c4276f0cbce5df3b769f5f Mon Sep 17 00:00:00 2001 From: D Scott Phillips Date: Tue, 27 Aug 2024 14:17:01 -0700 Subject: [PATCH 34/94] arm64: errata: Enable the AC03_CPU_38 workaround for ampere1a The ampere1a cpu is affected by erratum AC04_CPU_10 which is the same bug as AC03_CPU_38. Add ampere1a to the AC03_CPU_38 workaround midr list. Cc: Signed-off-by: D Scott Phillips Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240827211701.2216719-1-scott@os.amperecomputing.com Signed-off-by: Will Deacon --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/cpu_errata.c | 10 +++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 50327c05be8d..39c52385f11f 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -55,6 +55,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | Ampere | AmpereOne | AC03_CPU_38 | AMPERE_ERRATUM_AC03_CPU_38 | +----------------+-----------------+-----------------+-----------------------------+ +| Ampere | AmpereOne AC04 | AC04_CPU_10 | AMPERE_ERRATUM_AC03_CPU_38 | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..c8cba20a4d11 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -423,7 +423,7 @@ config AMPERE_ERRATUM_AC03_CPU_38 default y help This option adds an alternative code sequence to work around Ampere - erratum AC03_CPU_38 on AmpereOne. + errata AC03_CPU_38 and AC04_CPU_10 on AmpereOne. The affected design reports FEAT_HAFDBS as not implemented in ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0 diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5fd7caea4419..5a7dfeb8e8eb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -143,6 +143,7 @@ #define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define AMPERE_CPU_PART_AMPERE1A 0xAC4 #define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ @@ -212,6 +213,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_AMPERE1A MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1A) #define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index f6b6b4507357..dfefbdf4073a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -456,6 +456,14 @@ static const struct midr_range erratum_spec_ssbs_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -772,7 +780,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { .desc = "AmpereOne erratum AC03_CPU_38", .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, - ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif { From d736d4fc763090f9a02dc5556174de9768093f43 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 30 Aug 2024 10:59:11 +0530 Subject: [PATCH 35/94] kselftest/arm64: Fix build warnings for ptrace A "%s" is missing in ksft_exit_fail_msg(); instead, use the newly introduced ksft_exit_fail_perror(). 
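For reference, a hedged sketch of the two correct alternatives (the patch
uses the latter, which appends the errno description itself):

  ksft_exit_fail_msg("PTRACE_TRACEME: %s\n", strerror(errno));
  /* or, more simply */
  ksft_exit_fail_perror("PTRACE_TRACEME");
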
Signed-off-by: Dev Jain Reviewed-by: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240830052911.4040970-1-dev.jain@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index e4fa507cbdd0..b51d21f78cf9 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -163,10 +163,10 @@ static void test_hw_debug(pid_t child, int type, const char *type_name) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_perror("PTRACE_TRACEME"); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_perror("raise(SIGSTOP)"); return EXIT_SUCCESS; } From 0ba5b4ba61781f1eca843d9e5d499da329a8a275 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 30 Aug 2024 14:01:44 +0100 Subject: [PATCH 36/94] firmware/smccc: Call arch-specific hook on discovering KVM services arm64 will soon require its own callback to initialise services that are only available on this architecture. Introduce a hook that can be overloaded by the architecture. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-2-will@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/hypervisor.h | 2 ++ arch/arm64/include/asm/hypervisor.h | 4 ++++ drivers/firmware/smccc/kvm_guest.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index bd61502b9715..8a648e506540 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -7,4 +7,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +static inline void kvm_arch_init_hyp_services(void) { }; + #endif diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 0ae427f352c8..8cab2ab535b7 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -7,4 +7,8 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +static inline void kvm_arch_init_hyp_services(void) +{ +}; + #endif diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 89a68e7eeaa6..f3319be20b36 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -39,6 +39,8 @@ void __init kvm_init_hyp_services(void) pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", res.a3, res.a2, res.a1, res.a0); + + kvm_arch_init_hyp_services(); } bool kvm_arm_hyp_service_available(u32 func_id) From a06c3fad49a50d5d5eb078f93e70f4d3eca5d5a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:45 +0100 Subject: [PATCH 37/94] drivers/virt: pkvm: Add initial support for running as a protected guest Implement a pKVM protected guest driver to probe the presence of pKVM and determine the memory protection granule using the HYP_MEMINFO hypercall. 
Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-3-will@kernel.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hypercalls.rst | 22 +++++++++++ arch/arm64/include/asm/hypervisor.h | 7 ++++ drivers/virt/coco/Kconfig | 2 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/pkvm-guest/Kconfig | 10 +++++ drivers/virt/coco/pkvm-guest/Makefile | 2 + drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c | 37 +++++++++++++++++++ include/linux/arm-smccc.h | 7 ++++ 8 files changed, 88 insertions(+) create mode 100644 drivers/virt/coco/pkvm-guest/Kconfig create mode 100644 drivers/virt/coco/pkvm-guest/Makefile create mode 100644 drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index 17be111f493f..16515eb42149 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -44,3 +44,25 @@ Provides a discovery mechanism for other KVM/arm64 hypercalls. ---------------------------------------- See ptp_kvm.rst + +``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO`` +---------------------------------- + +Query the memory protection parameters for a pKVM protected virtual machine. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; pKVM protected guests only. | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000002 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else | +| | | | memory protection granule in bytes | ++---------------------+----------+----+---------------------------------------------+ diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 8cab2ab535b7..409e239834d1 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -7,8 +7,15 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +#ifdef CONFIG_ARM_PKVM_GUEST +void pkvm_init_hyp_services(void); +#else +static inline void pkvm_init_hyp_services(void) { }; +#endif + static inline void kvm_arch_init_hyp_services(void) { + pkvm_init_hyp_services(); }; #endif diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index 87d142c1f932..d9ff676bf48d 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -9,6 +9,8 @@ config TSM_REPORTS source "drivers/virt/coco/efi_secret/Kconfig" +source "drivers/virt/coco/pkvm-guest/Kconfig" + source "drivers/virt/coco/sev-guest/Kconfig" source "drivers/virt/coco/tdx-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index 18c1aba5edb7..b69c30c1c720 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -4,5 +4,6 @@ # obj-$(CONFIG_TSM_REPORTS) += tsm.o obj-$(CONFIG_EFI_SECRET) += efi_secret/ +obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/ 
obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ diff --git a/drivers/virt/coco/pkvm-guest/Kconfig b/drivers/virt/coco/pkvm-guest/Kconfig new file mode 100644 index 000000000000..d2f344f1f98f --- /dev/null +++ b/drivers/virt/coco/pkvm-guest/Kconfig @@ -0,0 +1,10 @@ +config ARM_PKVM_GUEST + bool "Arm pKVM protected guest driver" + depends on ARM64 + help + Protected guests running under the pKVM hypervisor on arm64 + are isolated from the host and must issue hypercalls to enable + interaction with virtual devices. This driver implements + support for probing and issuing these hypercalls. + + If unsure, say 'N'. diff --git a/drivers/virt/coco/pkvm-guest/Makefile b/drivers/virt/coco/pkvm-guest/Makefile new file mode 100644 index 000000000000..4bee24579423 --- /dev/null +++ b/drivers/virt/coco/pkvm-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ARM_PKVM_GUEST) += arm-pkvm-guest.o diff --git a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c new file mode 100644 index 000000000000..a5148701d2f1 --- /dev/null +++ b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support for the hypercall interface exposed to protected guests by + * pKVM. + * + * Author: Will Deacon + * Copyright (C) 2024 Google LLC + */ + +#include +#include +#include + +#include + +static size_t pkvm_granule; + +void pkvm_init_hyp_services(void) +{ + int i; + struct arm_smccc_res res; + const u32 funcs[] = { + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + }; + + for (i = 0; i < ARRAY_SIZE(funcs); ++i) { + if (!kvm_arm_hyp_service_available(funcs[i])) + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 > PAGE_SIZE) /* Includes error codes */ + return; + + pkvm_granule = res.a0; +} diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 083f85653716..16b6dcc54e02 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,6 +115,7 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -137,6 +138,12 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_PTP) +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From e7bafbf7177750e6643941473b343ed72fc5a100 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:46 +0100 Subject: [PATCH 38/94] arm64: mm: Add top-level dispatcher for internal mem_encrypt API Implementing the internal mem_encrypt API for arm64 depends entirely on the Confidential Computing environment in which the kernel is running. Introduce a simple dispatcher so that backend hooks can be registered depending upon the environment in which the kernel finds itself. 
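A hedged sketch of how a backend is expected to plug in (names here are
hypothetical; the first real user is the pKVM guest driver added later in
this series, which registers its ops from the hypervisor-services init path
rather than from an initcall):

  static int my_cc_encrypt(unsigned long addr, int numpages)
  {
          /* revoke the host's access to [addr, addr + numpages * PAGE_SIZE) */
          return 0;
  }

  static int my_cc_decrypt(unsigned long addr, int numpages)
  {
          /* share [addr, addr + numpages * PAGE_SIZE) back with the host */
          return 0;
  }

  static const struct arm64_mem_crypt_ops my_cc_ops = {
          .encrypt = my_cc_encrypt,
          .decrypt = my_cc_decrypt,
  };

  static int __init my_cc_guest_init(void)
  {
          /* only after the CC environment has been positively detected */
          return arm64_mem_crypt_ops_register(&my_cc_ops);
  }
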
Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-4-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/mem_encrypt.h | 15 +++++++++ arch/arm64/include/asm/set_memory.h | 1 + arch/arm64/mm/Makefile | 2 +- arch/arm64/mm/mem_encrypt.c | 50 ++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/mem_encrypt.h create mode 100644 arch/arm64/mm/mem_encrypt.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..164858120191 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -34,6 +34,7 @@ config ARM64 select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..b0c9a86b13a4 --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +struct arm64_mem_crypt_ops { + int (*encrypt)(unsigned long addr, int numpages); + int (*decrypt)(unsigned long addr, int numpages); +}; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b781187..917761feeffd 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -3,6 +3,7 @@ #ifndef _ASM_ARM64_SET_MEMORY_H #define _ASM_ARM64_SET_MEMORY_H +#include #include bool can_set_direct_map(void); diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 60454256945b..2fc8c6dd0407 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 000000000000..ee3c0ab04384 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implementation of the memory encryption/decryption API. + * + * Since the low-level details of the operation depend on the + * Confidential Computing environment (e.g. pKVM, CCA, ...), this just + * acts as a top-level dispatcher to whatever hooks may have been + * registered. + * + * Author: Will Deacon + * Copyright (C) 2024 Google LLC + * + * "Hello, boils and ghouls!" 
+ */ + +#include +#include +#include +#include + +#include + +static const struct arm64_mem_crypt_ops *crypt_ops; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops) +{ + if (WARN_ON(crypt_ops)) + return -EBUSY; + + crypt_ops = ops; + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->encrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->decrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); From ebc59b120c588156feb7ce194a9636584ced18ba Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:47 +0100 Subject: [PATCH 39/94] drivers/virt: pkvm: Hook up mem_encrypt API using pKVM hypercalls If we detect the presence of pKVM's SHARE and UNSHARE hypercalls, then register a backend implementation of the mem_encrypt API so that things like DMA buffers can be shared appropriately with the host. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-5-will@kernel.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hypercalls.rst | 50 +++++++++++++++++ drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c | 55 +++++++++++++++++++ include/linux/arm-smccc.h | 14 +++++ 3 files changed, 119 insertions(+) diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index 16515eb42149..c42580e71bf8 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -66,3 +66,53 @@ Query the memory protection parameters for a pKVM protected virtual machine. | Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else | | | | | memory protection granule in bytes | +---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_SHARE`` +-------------------------------- + +Share a region of memory with the KVM host, granting it read, write and execute +permissions. The size of the region is equal to the memory protection granule +advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; pKVM protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000003 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to share | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_UNSHARE`` +---------------------------------- + +Revoke access permission from the KVM host to a memory region previously shared +with ``ARM_SMCCC_KVM_FUNC_MEM_SHARE``. The size of the region is equal to the +memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; pKVM protected guests only. | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000004 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to unshare | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ diff --git a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c index a5148701d2f1..8256cf68fd76 100644 --- a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c +++ b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c @@ -9,18 +9,72 @@ #include #include +#include #include #include static size_t pkvm_granule; +static int arm_smccc_do_one_page(u32 func_id, phys_addr_t phys) +{ + phys_addr_t end = phys + PAGE_SIZE; + + while (phys < end) { + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(func_id, phys, 0, 0, &res); + if (res.a0 != SMCCC_RET_SUCCESS) + return -EPERM; + + phys += pkvm_granule; + } + + return 0; +} + +static int __set_memory_range(u32 func_id, unsigned long start, int numpages) +{ + void *addr = (void *)start, *end = addr + numpages * PAGE_SIZE; + + while (addr < end) { + int err; + + err = arm_smccc_do_one_page(func_id, virt_to_phys(addr)); + if (err) + return err; + + addr += PAGE_SIZE; + } + + return 0; +} + +static int pkvm_set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_range(ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID, + addr, numpages); +} + +static int 
pkvm_set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_range(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID, + addr, numpages); +} + +static const struct arm64_mem_crypt_ops pkvm_crypt_ops = { + .encrypt = pkvm_set_memory_encrypted, + .decrypt = pkvm_set_memory_decrypted, +}; + void pkvm_init_hyp_services(void) { int i; struct arm_smccc_res res; const u32 funcs[] = { ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + ARM_SMCCC_KVM_FUNC_MEM_SHARE, + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE, }; for (i = 0; i < ARRAY_SIZE(funcs); ++i) { @@ -34,4 +88,5 @@ void pkvm_init_hyp_services(void) return; pkvm_granule = res.a0; + arm64_mem_crypt_ops_register(&pkvm_crypt_ops); } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 16b6dcc54e02..9cb7c95920b0 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -116,6 +116,8 @@ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -144,6 +146,18 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From c86fa3470c1026e9f63a93e8885ea51ef99fae35 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:48 +0100 Subject: [PATCH 40/94] arm64: mm: Add confidential computing hook to ioremap_prot() Confidential Computing environments such as pKVM and Arm's CCA distinguish between shared (i.e. emulated) and private (i.e. assigned) MMIO regions. Introduce a hook into our implementation of ioremap_prot() so that MMIO regions can be shared if necessary. Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-6-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 4 ++++ arch/arm64/mm/ioremap.c | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 41fd90895dfc..1ada23a6ec19 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -271,6 +271,10 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) * I/O memory mapping functions. 
*/ +typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, + pgprot_t *prot); +int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); + #define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 269f2f63ab7d..6cc0b7e7eb03 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,10 +3,22 @@ #include #include +static ioremap_prot_hook_t ioremap_prot_hook; + +int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) +{ + if (WARN_ON(ioremap_prot_hook)) + return -EBUSY; + + ioremap_prot_hook = hook; + return 0; +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; + pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) @@ -16,7 +28,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) return NULL; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + /* + * If a hook is registered (e.g. for confidential computing + * purposes), call that now and barf if it fails. + */ + if (unlikely(ioremap_prot_hook) && + WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) { + return NULL; + } + + return generic_ioremap_prot(phys_addr, size, pgprot); } EXPORT_SYMBOL(ioremap_prot); From 0f12694958001c96bda811473fdb23f333c6d3ca Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:49 +0100 Subject: [PATCH 41/94] drivers/virt: pkvm: Intercept ioremap using pKVM MMIO_GUARD hypercall Hook up pKVM's MMIO_GUARD hypercall so that ioremap() and friends will register the target physical address as MMIO with the hypervisor, allowing guest exits to that page to be emulated by the host with full syndrome information. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-7-will@kernel.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hypercalls.rst | 26 ++++++++++++++ drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c | 35 +++++++++++++++++++ include/linux/arm-smccc.h | 7 ++++ 3 files changed, 68 insertions(+) diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index c42580e71bf8..af7bc2c2e0cb 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -116,3 +116,29 @@ memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. | | | +---------------------------------------------+ | | | | ``INVALID_PARAMETER (-3)`` | +---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MMIO_GUARD`` +---------------------------------- + +Request that a given memory region is handled as MMIO by the hypervisor, +allowing accesses to this region to be emulated by the KVM host. The size of the +region is equal to the memory protection granule advertised by +``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; pKVM protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000007 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of MMIO memory region | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ diff --git a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c index 8256cf68fd76..56a3859dda8a 100644 --- a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c +++ b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c @@ -9,8 +9,10 @@ #include #include +#include #include #include +#include #include @@ -67,6 +69,36 @@ static const struct arm64_mem_crypt_ops pkvm_crypt_ops = { .decrypt = pkvm_set_memory_decrypted, }; +static int mmio_guard_ioremap_hook(phys_addr_t phys, size_t size, + pgprot_t *prot) +{ + phys_addr_t end; + pteval_t protval = pgprot_val(*prot); + + /* + * We only expect MMIO emulation for regions mapped with device + * attributes. + */ + if (protval != PROT_DEVICE_nGnRE && protval != PROT_DEVICE_nGnRnE) + return 0; + + phys = PAGE_ALIGN_DOWN(phys); + end = phys + PAGE_ALIGN(size); + + while (phys < end) { + const int func_id = ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_FUNC_ID; + int err; + + err = arm_smccc_do_one_page(func_id, phys); + if (err) + return err; + + phys += PAGE_SIZE; + } + + return 0; +} + void pkvm_init_hyp_services(void) { int i; @@ -89,4 +121,7 @@ void pkvm_init_hyp_services(void) pkvm_granule = res.a0; arm64_mem_crypt_ops_register(&pkvm_crypt_ops); + + if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD)) + arm64_ioremap_prot_hook_register(&mmio_guard_ioremap_hook); } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 9cb7c95920b0..e93c1f7cea70 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -118,6 +118,7 @@ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD 7 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -158,6 +159,12 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From 21be9f7110d4c044c2b49bafbd7246335f236221 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:50 +0100 Subject: [PATCH 42/94] arm64: smccc: Reserve block of KVM "vendor" services for pKVM hypercalls pKVM relies on hypercalls to expose services such as memory sharing to protected guests. Tentatively allocate a block of 58 hypercalls (i.e. 
fill the remaining space in the first 64 function IDs) for pKVM usage, as future extensions such as pvIOMMU support, range-based memory sharing and validation of assigned devices will require additional services. Suggested-by: Marc Zyngier Link: https://lore.kernel.org/r/86a5h5yg5y.wl-maz@kernel.org Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-8-will@kernel.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index e93c1f7cea70..f59099a213d0 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,10 +115,70 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +/* Start of pKVM hypercall range */ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_5 5 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_6 6 #define ARM_SMCCC_KVM_FUNC_MMIO_GUARD 7 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_8 8 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_9 9 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_10 10 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_11 11 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_12 12 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_13 13 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_14 14 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_15 15 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_16 16 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_17 17 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_18 18 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_19 19 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_20 20 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_21 21 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_22 22 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_23 23 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_24 24 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_25 25 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_26 26 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_27 27 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_28 28 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_29 29 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_30 30 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_31 31 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_32 32 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_33 33 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_34 34 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_35 35 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_36 36 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_37 37 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_38 38 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_39 39 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_40 40 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_41 41 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_42 42 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_43 43 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_44 44 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_45 45 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_46 46 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_47 47 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_48 48 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_49 49 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_50 50 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_51 51 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_52 52 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_53 53 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_54 54 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_55 55 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_56 56 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_57 57 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_58 58 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_59 59 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_60 60 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_61 61 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62 62 +#define 
ARM_SMCCC_KVM_FUNC_PKVM_RESV_63 63 +/* End of pKVM hypercall range */ #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 From 878c05e8ef846bd1e402ba662d12e575a073e070 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:47 +0100 Subject: [PATCH 43/94] arm64: disable trapping of POR_EL0 to EL2 Allow EL0 or EL1 to access POR_EL0 without being trapped to EL2. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-5-joey.gouly@arm.com [will: Rename Lset_poe_fgt to Lskip_pie_fgt to ease merge with for-next/misc] Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index fd87c4b8f984..1173aba9c6a8 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -185,12 +185,20 @@ .Lset_pie_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_pie_fgt_\@ /* Disable trapping of PIR_EL1 / PIRE0_EL1 */ orr x0, x0, #HFGxTR_EL2_nPIR_EL1 orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1 +.Lskip_pie_fgt_\@: + mrs_s x1, SYS_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_S1POE_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable trapping of POR_EL0 */ + orr x0, x0, #HFGxTR_EL2_nPOR_EL0 + .Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 From 3496f69391eee225244f0d3f0404142a80b710f5 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:48 +0100 Subject: [PATCH 44/94] arm64: cpufeature: add Permission Overlay Extension cpucap This indicates if the system supports POE. This is a CPUCAP_BOOT_CPU_FEATURE as the boot CPU will enable POE if it has it, so secondary CPUs must also have this feature. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-6-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 9 +++++++++ arch/arm64/tools/cpucaps | 1 + 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646ecd3069fd..2daf5597cd65 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2870,6 +2870,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_nv1, ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) }, +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index ac3429d892b9..eedb5acc21ed 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -45,6 +45,7 @@ HAS_MOPS HAS_NESTED_VIRT HAS_PAN HAS_S1PIE +HAS_S1POE HAS_RAS_EXTN HAS_RNG HAS_SB From 160a8e13de6c36270e8c6537b8a944f4e73d2362 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:49 +0100 Subject: [PATCH 45/94] arm64: context switch POR_EL0 register POR_EL0 is a register that can be modified by userspace directly, so it must be context switched. 
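For reference (this example is not part of the patch), EL0 can read and write the register directly with plain MRS/MSR accesses; a minimal userspace accessor, using the generic S3_3_C10_C2_4 encoding in case the toolchain does not yet know the POR_EL0 name, might look like:

  /* illustrative only: POR_EL0 is op0=3, op1=3, CRn=10, CRm=2, op2=4 */
  static inline unsigned long read_por_el0(void)
  {
          unsigned long val;

          asm volatile("mrs %0, S3_3_C10_C2_4" : "=r" (val));
          return val;
  }

  static inline void write_por_el0(unsigned long val)
  {
          asm volatile("msr S3_3_C10_C2_4, %0" : : "r" (val));
  }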
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-7-joey.gouly@arm.com [will: Dropped unnecessary isb()s] Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/process.c | 24 ++++++++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 558434267271..3d261cc123c1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -832,6 +832,12 @@ static inline bool system_supports_lpa2(void) return cpus_have_final_cap(ARM64_HAS_LPA2); } +static inline bool system_supports_poe(void) +{ + return IS_ENABLED(CONFIG_ARM64_POE) && + alternative_has_cap_unlikely(ARM64_HAS_S1POE); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index f77371232d8c..e6376f979273 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -184,6 +184,7 @@ struct thread_struct { u64 sctlr_user; u64 svcr; u64 tpidr2_el0; + u64 por_el0; }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4a9ea103817e..494e9efd856f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1077,6 +1077,9 @@ #define POE_RXW UL(0x7) #define POE_MASK UL(0xf) +/* Initial value for Permission Overlay Extension for EL0 */ +#define POR_EL0_INIT POE_RXW + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4ae31b7af6c3..f365b033a649 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -271,12 +271,21 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); } void arch_release_task_struct(struct task_struct *tsk) @@ -371,6 +380,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (system_supports_tpidr2()) p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -495,6 +507,17 @@ static void erratum_1418040_new_exec(void) preempt_enable(); } +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + } +} + /* * __switch_to() checks current->thread.sctlr_user as an optimisation. 
Therefore * this function must be called with preemption disabled and the update to @@ -530,6 +553,7 @@ struct task_struct *__switch_to(struct task_struct *prev, ssbs_thread_switch(next); erratum_1418040_thread_switch(next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case From b86c9bea634971565f15dc95c1b8752b14651c25 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:50 +0100 Subject: [PATCH 46/94] KVM: arm64: Save/restore POE registers Define the new system registers that POE introduces and context switch them. Signed-off-by: Joey Gouly Cc: Marc Zyngier Cc: Oliver Upton Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240822151113.1479789-8-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_host.h | 4 ++++ arch/arm64/include/asm/vncr_mapping.h | 1 + arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 27 ++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 19 +++++++++++++-- 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a33f5996ca9f..5c9de7692201 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,6 +446,8 @@ enum vcpu_sysreg { GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ + POR_EL0, /* Permission Overlay Register 0 (EL0) */ + /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -517,6 +519,8 @@ enum vcpu_sysreg { VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ + VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */ + VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index df2c47c55972..06f8ec0906a6 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -52,6 +52,7 @@ #define VNCR_PIRE0_EL1 0x290 #define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 +#define VNCR_POR_EL1 0x2A8 #define VNCR_ICH_LR0_EL2 0x400 #define VNCR_ICH_LR1_EL2 0x408 #define VNCR_ICH_LR2_EL2 0x410 diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 4c0fdabaf8ae..1579a3c08a36 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -16,9 +16,15 @@ #include #include +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. 
+ if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -66,6 +72,17 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP); } +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -80,6 +97,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); @@ -120,6 +140,10 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -158,6 +182,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436..e7208b59ea12 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2255,6 +2255,15 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, return true; } +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + return 0; + + return REG_HIDDEN; +} + /* * Architected system registers. 
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2492,6 +2501,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, @@ -2578,6 +2589,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, @@ -4568,8 +4581,6 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nPOR_EL1 | - HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1 | HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK); @@ -4604,6 +4615,10 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | + HFGxTR_EL2_nPOR_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); From 55f4b215fb60f81e97d716fae42f967e435ce156 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:52 +0100 Subject: [PATCH 47/94] KVM: arm64: use `at s1e1a` for POE FEAT_ATS1E1A introduces a new instruction: `at s1e1a`. This is an address translation, without permission checks. POE allows read permissions to be removed from S1 by the guest. This means that an `at` instruction could fail, and not get the IPA. Switch to using `at s1e1a` so that KVM can get the IPA regardless of S1 permissions. Signed-off-by: Joey Gouly Cc: Marc Zyngier Cc: Oliver Upton Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240822151113.1479789-10-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/kvm/hyp/include/hyp/fault.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 487c06099d6f..17df94570f03 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -14,6 +14,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { + int ret; u64 par, tmp; /* @@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at(OP_AT_S1E1R, far)) + ret = system_supports_poe() ? 
__kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ From 70ed7238297fb53111e0647e2ec7990ddcbbbb45 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:53 +0100 Subject: [PATCH 48/94] KVM: arm64: Sanitise ID_AA64MMFR3_EL1 Add the missing sanitisation of ID_AA64MMFR3_EL1, making sure we solely expose S1POE and TCRX (we currently don't support anything else). [joey: Took Marc's patch for S1PIE, and changed it for S1POE] Signed-off-by: Marc Zyngier Signed-off-by: Joey Gouly Link: https://lore.kernel.org/r/20240822151113.1479789-11-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/kvm/sys_regs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e7208b59ea12..0f13378e761c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1556,6 +1556,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; + break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; @@ -2427,7 +2430,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), - ID_SANITISED(ID_AA64MMFR3_EL1), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), From 487355f111f98a74d86007c4df0ba9f0f9edc172 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:13 +0100 Subject: [PATCH 49/94] KVM: selftests: get-reg-list: add Permission Overlay registers Add new system registers: - POR_EL1 - POR_EL0 Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-31-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d721760..ac661ebf6859 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -40,6 +40,18 @@ static struct feature_id_reg feat_id_regs[] = { ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ 4, 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 } }; @@ -468,6 +480,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ @@ -475,6 +488,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 
0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ From 12930e3a86adbcee6b360d659645bce769d0fea1 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:44 +0100 Subject: [PATCH 50/94] powerpc/mm: add ARCH_PKEY_BITS to Kconfig The new config option specifies how many bits are in each PKEY. Signed-off-by: Joey Gouly Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Aneesh Kumar K.V Cc: Naveen N. Rao Cc: linuxppc-dev@lists.ozlabs.org Acked-by: Michael Ellerman Link: https://lore.kernel.org/r/20240822151113.1479789-2-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/powerpc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7b09b064a8a..8a4ee57cd4ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1026,6 +1026,10 @@ config PPC_MEM_KEYS If unsure, say y. +config ARCH_PKEY_BITS + int + default 5 + config PPC_SECURE_BOOT prompt "Enable secure boot support" bool From 5626f8d45e0951f418cfc06ad8be71e3f51e585f Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:45 +0100 Subject: [PATCH 51/94] x86/mm: add ARCH_PKEY_BITS to Kconfig The new config option specifies how many bits are in each PKEY. Signed-off-by: Joey Gouly Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: x86@kernel.org Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-3-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 007bab9f2a0e..683c0a64efe2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1889,6 +1889,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +config ARCH_PKEY_BITS + int + default 4 + choice prompt "TSX enable mode" depends on CPU_SUP_INTEL From 9f82f15ddfdd60bb9820f09737333b2345e22ab3 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:46 +0100 Subject: [PATCH 52/94] mm: use ARCH_PKEY_BITS to define VM_PKEY_BITN Use the new CONFIG_ARCH_PKEY_BITS to simplify setting these bits for different architectures. 
Signed-off-by: Joey Gouly Cc: Andrew Morton Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Acked-by: Dave Hansen Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-4-joey.gouly@arm.com Signed-off-by: Will Deacon --- fs/proc/task_mmu.c | 2 ++ include/linux/mm.h | 16 ++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5f171ad7b436..2c5f4814aef9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -976,7 +976,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", +#endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..fb6ccd93f589 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -330,12 +330,16 @@ extern unsigned int kobjsize(const void *objp); #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 -#ifdef CONFIG_PPC +# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 +# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 +# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 +# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#if CONFIG_ARCH_PKEY_BITS > 3 +# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#else +# define VM_PKEY_BIT3 0 +#endif +#if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 From bf83dae90fbc01d66477a3440eaad07da6657fdc Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:54 +0100 Subject: [PATCH 53/94] arm64: enable the Permission Overlay Extension for EL0 Expose a HWCAP and ID_AA64MMFR3_EL1_S1POE to userspace, so they can be used to check if the CPU supports the feature. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-12-joey.gouly@arm.com Signed-off-by: Will Deacon --- Documentation/arch/arm64/elf_hwcaps.rst | 2 ++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 14 ++++++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 19 insertions(+) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 448c1664879b..694f67fa07d1 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -365,6 +365,8 @@ HWCAP2_SME_SF8DP2 HWCAP2_SME_SF8DP4 Functionality implied by ID_AA64SMFR0_EL1.SF8DP4 == 0b1. +HWCAP2_POE + Functionality implied by ID_AA64MMFR3_EL1.S1POE == 0b0001. 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 4edd3b61df11..a775adddecf2 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -157,6 +157,7 @@ #define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) #define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) +#define KERNEL_HWCAP_POE __khwcap2_feature(POE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 285610e626f5..055381b2c615 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -122,5 +122,6 @@ #define HWCAP2_SME_SF8FMA (1UL << 60) #define HWCAP2_SME_SF8DP4 (1UL << 61) #define HWCAP2_SME_SF8DP2 (1UL << 62) +#define HWCAP2_POE (1UL << 63) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2daf5597cd65..718728a85430 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -466,6 +466,8 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, @@ -2348,6 +2350,14 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2876,6 +2886,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_S1POE, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, #endif @@ -3043,6 +3054,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 09eeaa24d456..b9db812082b3 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -143,6 +143,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", }; #ifdef CONFIG_COMPAT From facaa1373c9aabf8e34109a9cb205ad0f3a8584e Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:55 +0100 Subject: [PATCH 54/94] arm64: re-order MTE VM_ flags VM_PKEY_BIT[012] will use VM_HIGH_ARCH_[012], move the MTE VM flags to accommodate this. 
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-13-joey.gouly@arm.com Signed-off-by: Will Deacon --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fb6ccd93f589..406512c16471 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -378,8 +378,8 @@ extern unsigned int kobjsize(const void *objp); #endif #if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ +# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE From b66db4f3ccde2fe960ff2d7bb64fe8933e2db7b3 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:56 +0100 Subject: [PATCH 55/94] arm64: add POIndex defines The 3-bit POIndex is stored in the PTE at bits 60..62. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-14-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-hwdef.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1f60aa1bc750..3f0c3f5c5cef 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -199,6 +199,16 @@ #define PTE_PI_IDX_2 53 /* PXN */ #define PTE_PI_IDX_3 54 /* UXN */ +/* + * POIndex[2:0] encoding (Permission Overlay Extension) + */ +#define PTE_PO_IDX_0 (_AT(pteval_t, 1) << 60) +#define PTE_PO_IDX_1 (_AT(pteval_t, 1) << 61) +#define PTE_PO_IDX_2 (_AT(pteval_t, 1) << 62) + +#define PTE_PO_IDX_MASK GENMASK_ULL(62, 60) + + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ From b3c03fe13766f0455c4c77817a2aa385ed89937d Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:57 +0100 Subject: [PATCH 56/94] arm64: convert protection key into vm_flags and pgprot values Modify arch_calc_vm_prot_bits() and vm_get_page_prot() such that the pkey value is set in the vm_flags and then into the pgprot value. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20240822151113.1479789-15-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mman.h | 10 +++++++++- arch/arm64/mm/mmap.c | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 5966ee4a6154..52791715f6e6 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -7,7 +7,7 @@ #include static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, - unsigned long pkey __always_unused) + unsigned long pkey) { unsigned long ret = 0; @@ -17,6 +17,14 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, if (system_supports_mte() && (prot & PROT_MTE)) ret |= VM_MTE; +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0; + ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0; + ret |= pkey & BIT(2) ? 
VM_PKEY_BIT2 : 0; + } +#endif + return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 642bdf908b22..7e3ad97e27d8 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -102,6 +102,17 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + if (vm_flags & VM_PKEY_BIT0) + prot |= PTE_PO_IDX_0; + if (vm_flags & VM_PKEY_BIT1) + prot |= PTE_PO_IDX_1; + if (vm_flags & VM_PKEY_BIT2) + prot |= PTE_PO_IDX_2; + } +#endif + return __pgprot(prot); } EXPORT_SYMBOL(vm_get_page_prot); From 6580a36dd75acbf9c9a6f040d07dc8a9da329ac9 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:58 +0100 Subject: [PATCH 57/94] arm64: mask out POIndex when modifying a PTE When a PTE is modified, the POIndex must be masked off so that it can be modified. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-16-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7a4f5604be3f..1d0f18d30e1e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1103,7 +1103,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | - PTE_GP | PTE_ATTRINDX_MASK; + PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK; + /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); From 7f0ab607630790fa09532dca6202683a0dac19b9 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:59 +0100 Subject: [PATCH 58/94] arm64: handle PKEY/POE faults If a memory fault occurs that is due to an overlay/pkey fault, report that to userspace with a SEGV_PKUERR. 
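For illustration (not part of this patch), a userspace SIGSEGV handler can tell an overlay/pkey fault apart from an ordinary mapping fault by checking si_code and si_pkey; a minimal sketch, assuming a libc that exposes the si_pkey field (recent glibc does):

  #include <signal.h>
  #include <unistd.h>

  static void segv_handler(int sig, siginfo_t *info, void *ucontext)
  {
          /*
           * SEGV_PKUERR: the access was blocked by the overlay. si_pkey
           * is the pkey of the faulting VMA which, as the new comment in
           * the fault handler explains, may differ from the pkey in the
           * PTE that actually faulted.
           */
          if (info->si_code == SEGV_PKUERR)
                  _exit(42 + info->si_pkey);
          _exit(1);
  }

  /* installed with: struct sigaction sa = { .sa_sigaction = segv_handler,
     .sa_flags = SA_SIGINFO }; sigaction(SIGSEGV, &sa, NULL); */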
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-17-joey.gouly@arm.com [will: Add ESR.FSC check to data abort handler] Signed-off-by: Will Deacon --- arch/arm64/include/asm/traps.h | 1 + arch/arm64/kernel/traps.c | 6 ++++ arch/arm64/mm/fault.c | 55 +++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index eefe766d6161..d780d1bd2eac 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -25,6 +25,7 @@ try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9e22683aa921..9a11bb0db284 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -273,6 +273,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far, force_sig_fault(signo, code, (void __user *)far); } +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); +} + void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 451ba7cbd5ad..8b281cf308b3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -486,6 +487,23 @@ static void do_bad_area(unsigned long far, unsigned long esr, } } +static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, + unsigned int mm_flags) +{ + unsigned long iss2 = ESR_ELx_ISS2(esr); + + if (!system_supports_poe()) + return false; + + if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) + return true; + + return !arch_vma_access_permitted(vma, + mm_flags & FAULT_FLAG_WRITE, + mm_flags & FAULT_FLAG_INSTRUCTION, + false); +} + static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -511,6 +529,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; int si_code; + int pkey = -1; if (kprobe_page_fault(regs, esr)) return 0; @@ -575,6 +594,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto bad_area; } + + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + vma_end_read(vma); + fault = 0; + si_code = SEGV_PKUERR; + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); @@ -610,7 +639,16 @@ retry: goto bad_area; } + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + mmap_read_unlock(mm); + fault = 0; + si_code = SEGV_PKUERR; + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags, regs); + 
/* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) @@ -669,8 +707,23 @@ bad_area: arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { + /* + * The pkey value that we return to userspace can be different + * from the pkey that caused the fault. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set POR_EL0 to deny access to pkey=4, touches, page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_lock, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ /* Something tried to access memory that out of memory map */ - arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); + if (si_code == SEGV_PKUERR) + arm64_force_sig_fault_pkey(far, inf->name, pkey); + else + arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); } return 0; From fc2d9cd33040630f9d6ff819f1f326d51b354429 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:00 +0100 Subject: [PATCH 59/94] arm64: add pte_access_permitted_no_overlay() We do not want take POE into account when clearing the MTE tags. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-18-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1d0f18d30e1e..61a674942a6b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -156,8 +156,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * not set) must return false. PROT_NONE mappings do not have the * PTE_VALID bit set. */ -#define pte_access_permitted(pte, write) \ +#define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) +#define pte_access_permitted(pte, write) \ + pte_access_permitted_no_overlay(pte, write) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ @@ -373,10 +375,11 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * If the PTE would provide user space access to the tags associated * with it then ensure that the MTE tags are synchronised. Although - * pte_access_permitted() returns false for exec only mappings, they - * don't expose tags (instruction fetches don't check tags). + * pte_access_permitted_no_overlay() returns false for exec only + * mappings, they don't expose tags (instruction fetches don't check + * tags). */ - if (system_supports_mte() && pte_access_permitted(pte, false) && + if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) && !pte_special(pte) && pte_tagged(pte)) mte_sync_tags(pte, nr_pages); } From 7f955be9f887d3ce77afb61ea74d907f06fe6f1e Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:01 +0100 Subject: [PATCH 60/94] arm64: implement PKEYS support Implement the PKEYS interface, using the Permission Overlay Extension. 
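For context, an illustrative userspace sequence exercising this interface through the generic pkey syscalls (error checking omitted; the pkey_alloc()/pkey_mprotect()/pkey_free() wrappers are provided by glibc's <sys/mman.h> with _GNU_SOURCE) might be:

  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          long page = sysconf(_SC_PAGESIZE);
          char *map = mmap(NULL, page, PROT_READ | PROT_WRITE,
                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

          /* allocate a key whose initial POR_EL0 field denies writes */
          int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

          /* tag the mapping with the key (sets the POIndex bits in its PTEs) */
          pkey_mprotect(map, page, PROT_READ | PROT_WRITE, pkey);

          map[0] = 1;     /* expected to fault with SEGV_PKUERR */

          pkey_free(pkey);
          return 0;
  }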
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-19-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/mmu_context.h | 46 +++++++++++- arch/arm64/include/asm/pgtable.h | 22 +++++- arch/arm64/include/asm/pkeys.h | 108 +++++++++++++++++++++++++++ arch/arm64/include/asm/por.h | 33 ++++++++ arch/arm64/include/uapi/asm/mman.h | 9 +++ arch/arm64/mm/mmu.c | 45 +++++++++++ 7 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/pkeys.h create mode 100644 arch/arm64/include/asm/por.h diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 65977c7783c5..983afeb4eba5 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -25,6 +25,7 @@ typedef struct { refcount_t pinned; void *vdso; unsigned long flags; + u8 pkey_allocation_map; } mm_context_t; /* diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index bd19f4c758b7..7c09d47e09cb 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -15,12 +15,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); + + /* pkey 0 is the default, so always reserve it. */ + mm->context.pkey_allocation_map = BIT(0); + return 0; } +static inline void arch_dup_pkeys(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + /* Duplicate the oldmm pkey state in mm: */ + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; +} + +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + arch_dup_pkeys(oldmm, mm); + + return 0; +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) @@ -267,6 +294,23 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) return -1UL >> 8; } +/* + * Only enforce protection keys on the current process, because there is no + * user context to access POR_EL0 for another address space. + */ +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool execute, bool foreign) +{ + if (!system_supports_poe()) + return true; + + /* allow access if the VMA is not one from this process */ + if (foreign || vma_is_foreign(vma)) + return true; + + return por_el0_allows_pkey(vma_pkey(vma), write, execute); +} + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 61a674942a6b..96c2b0b07c4c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -149,6 +150,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_accessible(mm, pte) \ (mm_tlb_flush_pending(mm) ? 
pte_present(pte) : pte_valid(pte)) +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) +{ + u64 por; + + if (!system_supports_poe()) + return true; + + por = read_sysreg_s(SYS_POR_EL0); + + if (write) + return por_elx_allows_write(por, pkey); + + if (execute) + return por_elx_allows_exec(por, pkey); + + return por_elx_allows_read(por, pkey); +} + /* * p??_access_permitted() is true for valid user mappings (PTE_USER * bit set, subject to the write permission check). For execute-only @@ -159,7 +178,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) #define pte_access_permitted(pte, write) \ - pte_access_permitted_no_overlay(pte, write) + (pte_access_permitted_no_overlay(pte, write) && \ + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h new file mode 100644 index 000000000000..32c352bb36b9 --- /dev/null +++ b/arch/arm64/include/asm/pkeys.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + * + * Based on arch/x86/include/asm/pkeys.h + */ + +#ifndef _ASM_ARM64_PKEYS_H +#define _ASM_ARM64_PKEYS_H + +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) + +#define arch_max_pkey() 8 + +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + +static inline bool arch_pkeys_enabled(void) +{ + return false; +} + +static inline int vma_pkey(struct vm_area_struct *vma) +{ + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; +} + +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (pkey != -1) + return pkey; + + return vma_pkey(vma); +} + +static inline int execute_only_pkey(struct mm_struct *mm) +{ + // Execute-only mappings are handled by EPAN/FEAT_PAN3. + WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); + + return -1; +} + +#define mm_pkey_allocation_map(mm) (mm)->context.pkey_allocation_map +#define mm_set_pkey_allocated(mm, pkey) do { \ + mm_pkey_allocation_map(mm) |= (1U << pkey); \ +} while (0) +#define mm_set_pkey_free(mm, pkey) do { \ + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ +} while (0) + +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. + */ + if (pkey < 0 || pkey >= arch_max_pkey()) + return false; + + return mm_pkey_allocation_map(mm) & (1U << pkey); +} + +/* + * Returns a positive, 3-bit key on success, or -1 on failure. + */ +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + /* + * Note: this is the one and only place we make sure + * that the pkey is valid as far as the hardware is + * concerned. The rest of the kernel trusts that + * only good, valid pkeys come out of here. + */ + u8 all_pkeys_mask = GENMASK(arch_max_pkey() - 1, 0); + int ret; + + if (!arch_pkeys_enabled()) + return -1; + + /* + * Are we out of pkeys? We must handle this specially + * because ffz() behavior is undefined if there are no + * zeros. 
+ */ + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) + return -1; + + ret = ffz(mm_pkey_allocation_map(mm)); + + mm_set_pkey_allocated(mm, ret); + + return ret; +} + +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + if (!mm_pkey_is_allocated(mm, pkey)) + return -EINVAL; + + mm_set_pkey_free(mm, pkey); + + return 0; +} + +#endif /* _ASM_ARM64_PKEYS_H */ diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h new file mode 100644 index 000000000000..e06e9f473675 --- /dev/null +++ b/arch/arm64/include/asm/por.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + */ + +#ifndef _ASM_ARM64_POR_H +#define _ASM_ARM64_POR_H + +#define POR_BITS_PER_PKEY 4 +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf) + +static inline bool por_elx_allows_read(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_R; +} + +static inline bool por_elx_allows_write(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_W; +} + +static inline bool por_elx_allows_exec(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_X; +} + +#endif /* _ASM_ARM64_POR_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h index 1e6482a838e1..e7e0c8216243 100644 --- a/arch/arm64/include/uapi/asm/mman.h +++ b/arch/arm64/include/uapi/asm/mman.h @@ -7,4 +7,13 @@ #define PROT_BTI 0x10 /* BTI guarded page */ #define PROT_MTE 0x20 /* Normal Tagged mapping */ +/* Override any generic PKEY permission defines */ +#define PKEY_DISABLE_EXECUTE 0x4 +#define PKEY_DISABLE_READ 0x8 +#undef PKEY_ACCESS_MASK +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE |\ + PKEY_DISABLE_READ |\ + PKEY_DISABLE_EXECUTE) + #endif /* ! _UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 353ea5dc32b8..e55b02fbddc8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1549,3 +1550,47 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) cpu_uninstall_idmap(); } + +#ifdef CONFIG_ARCH_HAS_PKEYS +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +{ + u64 new_por = POE_RXW; + u64 old_por; + u64 pkey_shift; + + if (!system_supports_poe()) + return -ENOSPC; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. 
+ */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; + + /* Set the bits we need in POR: */ + new_por = POE_RXW; + if (init_val & PKEY_DISABLE_WRITE) + new_por &= ~POE_W; + if (init_val & PKEY_DISABLE_ACCESS) + new_por &= ~POE_RW; + if (init_val & PKEY_DISABLE_READ) + new_por &= ~POE_R; + if (init_val & PKEY_DISABLE_EXECUTE) + new_por &= ~POE_X; + + /* Shift the bits in to the correct place in POR for pkey: */ + pkey_shift = pkey * POR_BITS_PER_PKEY; + new_por <<= pkey_shift; + + /* Get old POR and mask off any old bits in place: */ + old_por = read_sysreg_s(SYS_POR_EL0); + old_por &= ~(POE_MASK << pkey_shift); + + /* Write old part along with new part: */ + write_sysreg_s(old_por | new_por, SYS_POR_EL0); + + return 0; +} +#endif From 9160f7e909e179f333c2578d3032978e7a60b270 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:02 +0100 Subject: [PATCH 61/94] arm64: add POE signal support Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Mark Brown Acked-by: Szabolcs Nagy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-20-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/uapi/asm/sigcontext.h | 7 +++ arch/arm64/kernel/signal.c | 62 ++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8a45b7a411e0..e4cba8a6c9a2 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -98,6 +98,13 @@ struct esr_context { __u64 esr; }; +#define POE_MAGIC 0x504f4530 + +struct poe_context { + struct _aarch64_ctx head; + __u64 por_el0; +}; + /* * extra_context: describes extra space in the signal frame for * additional structures that don't fit in sigcontext.__reserved[]. 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a77f4976e11..561986947530 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -61,6 +61,7 @@ struct rt_sigframe_user_layout { unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -185,6 +186,8 @@ struct user_ctxs { u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -258,6 +261,32 @@ static int restore_fpmr_context(struct user_ctxs *user) return err; } +static int preserve_poe_context(struct poe_context __user *ctx) +{ + int err = 0; + + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err); + + return err; +} + +static int restore_poe_context(struct user_ctxs *user) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + write_sysreg_s(por_el0, SYS_POR_EL0); + + return err; +} + #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) @@ -621,6 +650,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->za = NULL; user->zt = NULL; user->fpmr = NULL; + user->poe = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -671,6 +701,17 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; @@ -857,6 +898,9 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); + if (err == 0 && system_supports_poe() && user.poe) + err = restore_poe_context(&user); + return err; } @@ -980,6 +1024,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -1042,6 +1093,14 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } + if (system_supports_poe() && err == 0 && user->poe_offset) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx); + } + + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, sme_smstop(); } + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else From 17519819926211e6b2834e00e4554bec0daf22ac Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:03 +0100 Subject: [PATCH 62/94] arm64/ptrace: add support for FEAT_POE Add a regset for POE containing POR_EL0. 
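For illustration (not part of this patch), a debugger would read the new regset with PTRACE_GETREGSET; a minimal sketch, defining NT_ARM_POE locally in case the installed UAPI headers predate this change:

  #include <elf.h>
  #include <stdint.h>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>

  #ifndef NT_ARM_POE
  #define NT_ARM_POE 0x40f
  #endif

  /* the tracee must already be stopped under ptrace */
  static uint64_t peek_por_el0(pid_t pid)
  {
          uint64_t por = 0;
          struct iovec iov = { .iov_base = &por, .iov_len = sizeof(por) };

          ptrace(PTRACE_GETREGSET, pid, (void *)NT_ARM_POE, &iov);
          return por;
  }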
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-21-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 46 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 47 insertions(+) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 0d022599eb61..b756578aeaee 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1440,6 +1440,39 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1469,6 +1502,9 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1628,6 +1664,16 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + .core_note_type = NT_ARM_POE, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b54b313bcf07..81762ff3c99e 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -441,6 +441,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ From d0d6e7e0812b24fc3220745d946ddb06a7a911dd Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:04 +0100 Subject: [PATCH 63/94] arm64: enable POE and PIE to coexist Permission Indirection Extension and Permission Overlay Extension can be enabled independently. When PIE is disabled and POE is enabled, the permissions set by POR_EL0 will be applied on top of the permissions set in the PTE. When both PIE and POE are enabled, the permissions set by POR_EL0 will be applied on top of the permissions set by the PIRE0_EL1 register. However PIRE0_EL1 has encodings that specifically enable and disable the overlay from applying. For example: 0001 Read, Overlay applied. 1000 Read, Overlay not applied. Switch to using the 'Overlay applied' encodings in PIRE0_EL1, so that PIE and POE can coexist. 
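As a rough illustration of the intended combination (not part of this patch): with the 'Overlay applied' encodings used below, a page whose base permission is RW and whose pkey has an R-only POR_EL0 field is effectively read-only at EL0, and a write to it takes an overlay permission fault; an 'Overlay not applied' encoding, such as the 1000 example above, would instead make the page ignore POR_EL0 entirely.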
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-22-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index b11cfb9fdd37..2a11d0c10760 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -154,10 +154,10 @@ static inline bool __pure lpa2_is_enabled(void) #define PIE_E0 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW)) + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ From 4afd00641b220170143f2f7f4b42b26a0abe49b2 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:05 +0100 Subject: [PATCH 64/94] arm64: enable PKEY support for CPUs with S1POE Now that PKEYs support has been implemented, enable it for CPUs that support S1POE. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-23-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pkeys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h index 32c352bb36b9..19eb1b12b7fc 100644 --- a/arch/arm64/include/asm/pkeys.h +++ b/arch/arm64/include/asm/pkeys.h @@ -17,7 +17,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool arch_pkeys_enabled(void) { - return false; + return system_supports_poe(); } static inline int vma_pkey(struct vm_area_struct *vma) From b9b9d72de32bcb63ed4d9761907a3e5f352c6f9a Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:06 +0100 Subject: [PATCH 65/94] arm64: add Permission Overlay Extension Kconfig Now that support for POE and Protection Keys has been implemented, add a config to allow users to actually enable it. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-24-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..35dfc6275328 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2137,6 +2137,29 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +menu "ARMv8.9 architectural features" + +config ARM64_POE + prompt "Permission Overlay Extension" + def_bool y + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS + help + The Permission Overlay Extension is used to implement Memory + Protection Keys. 
Memory Protection Keys provides a mechanism for + enforcing page-based protections, but without requiring modification + of the page tables when an application changes protection domains. + + For details, see Documentation/core-api/protection-keys.rst + + If unsure, say y. + +config ARCH_PKEY_BITS + int + default 3 + +endmenu # "ARMv8.9 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y From 6354a0184c542f2b8fade9cb0eb843acd3310191 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:07 +0100 Subject: [PATCH 66/94] kselftest/arm64: move get_header() Put this function in the header so that it can be used by other tests, without needing to link to testcases.c. This will be used by selftest/mm/protection_keys.c Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-25-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../arm64/signal/testcases/testcases.c | 23 ----------------- .../arm64/signal/testcases/testcases.h | 25 +++++++++++++++++-- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 674b88cc8c39..e4331440fed0 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -6,29 +6,6 @@ #include "testcases.h" -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset) -{ - size_t offs = 0; - struct _aarch64_ctx *found = NULL; - - if (!head || resv_sz < HDR_SZ) - return found; - - while (offs <= resv_sz - HDR_SZ && - head->magic != magic && head->magic) { - offs += head->size; - head = GET_RESV_NEXT_HEAD(head); - } - if (head->magic == magic) { - found = head; - if (offset) - *offset = offs; - } - - return found; -} - bool validate_extra_context(struct extra_context *extra, char **err, void **extra_data, size_t *extra_size) { diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 7727126347e0..3185e6875694 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -88,8 +88,29 @@ struct fake_sigframe { bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err); -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset); +static inline struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, + size_t resv_sz, size_t *offset) +{ + size_t offs = 0; + struct _aarch64_ctx *found = NULL; + + if (!head || resv_sz < HDR_SZ) + return found; + + while (offs <= resv_sz - HDR_SZ && + head->magic != magic && head->magic) { + offs += head->size; + head = GET_RESV_NEXT_HEAD(head); + } + if (head->magic == magic) { + found = head; + if (offset) + *offset = offs; + } + + return found; +} + static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head, size_t resv_sz, From 41bbcf7b4b046b4e7190c1866625aed0fe6f69f6 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:08 +0100 Subject: [PATCH 67/94] selftests: mm: move fpregs printing arm64's fpregs are not at a constant offset from sigcontext. 
Since this is not an important part of the test, don't print the fpregs pointer on arm64. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-26-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/mm/pkey-powerpc.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 ++ tools/testing/selftests/mm/protection_keys.c | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index ae5df26104e5..6275d0f474b3 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -9,6 +9,7 @@ #endif #define REG_IP_IDX PT_NIP #define REG_TRAPNO PT_TRAP +#define MCONTEXT_FPREGS #define gregs gp_regs #define fpregs fp_regs #define si_pkey_offset 0x20 diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 814758e109c0..b9170a26bfcb 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_FPREGS + #ifndef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x1 #endif diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..4337106a985e 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -314,7 +314,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@ -330,7 +332,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@ -359,7 +363,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #endif /* arch */ dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || From f5b5ea51f78f2ebd94d5a77702bbe5eee8924b50 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:09 +0100 Subject: [PATCH 68/94] selftests: mm: make protection_keys test work on arm64 The encoding of the pkey register differs on arm64, than on x86/ppc. On those platforms, a bit in the register is used to disable permissions, for arm64, a bit enabled in the register indicates that the permission is allowed. This drops two asserts of the form: assert(read_pkey_reg() <= orig_pkey_reg); Because on arm64 this doesn't hold, due to the encoding. The pkey must be reset to both access allow and write allow in the signal handler. pkey_access_allow() works currently for PowerPC as the PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE have overlapping bits set. Access to the uc_mcontext is abstracted, as arm64 has a different structure. 
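For reference, a minimal sketch of the encoding difference described above (illustrative only; the arm64 4-bits-per-key layout and the POE_RX/POE_NONE style values are taken from the pkey-arm64.h hunk below, while the x86 PKRU layout of one access-disable plus one write-disable bit per key is stated here only for contrast):

  #include <stdint.h>

  /* x86 PKRU: a set bit *disables* a permission (2 bits per key). */
  static inline uint32_t pkru_key1_readonly(uint32_t pkru)
  {
  	return pkru | (1u << (2 * 1 + 1));	/* set write-disable for key 1 */
  }

  /* arm64 POR_EL0: the 4-bit field value *grants* permissions. */
  static inline uint64_t por_el0_key1_readonly(uint64_t por)
  {
  	por &= ~(0xfULL << (4 * 1));		/* clear key 1's field */
  	return por | (0x3ULL << (4 * 1));	/* POE_RX: read/exec allowed, no write */
  }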
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-27-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../arm64/signal/testcases/testcases.h | 3 + tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/pkey-arm64.h | 139 ++++++++++++++++++ tools/testing/selftests/mm/pkey-helpers.h | 8 + tools/testing/selftests/mm/pkey-powerpc.h | 2 + tools/testing/selftests/mm/pkey-x86.h | 2 + tools/testing/selftests/mm/protection_keys.c | 103 +++++++++++-- 7 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey-arm64.h diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 3185e6875694..9872b8912714 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -26,6 +26,9 @@ #define HDR_SZ \ sizeof(struct _aarch64_ctx) +#define GET_UC_RESV_HEAD(uc) \ + (struct _aarch64_ctx *)(&(uc->uc_mcontext.__reserved)) + #define GET_SF_RESV_HEAD(sf) \ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7b8a5def54a1..b474ac7be8a5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -104,7 +104,7 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h new file mode 100644 index 000000000000..580e1b0bb38e --- /dev/null +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. 
+ */ + +#ifndef _PKEYS_ARM64_H +#define _PKEYS_ARM64_H + +#include "vm_util.h" +/* for signal frame parsing */ +#include "../arm64/signal/testcases/testcases.h" + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 288 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 289 +# define SYS_pkey_free 290 +#endif +#define MCONTEXT_IP(mc) mc.pc +#define MCONTEXT_TRAPNO(mc) -1 + +#define PKEY_MASK 0xf + +#define POE_NONE 0x0 +#define POE_X 0x2 +#define POE_RX 0x3 +#define POE_RWX 0x7 + +#define NR_PKEYS 8 +#define NR_RESERVED_PKEYS 1 /* pkey-0 */ + +#define PKEY_ALLOW_ALL 0x77777777 + +#define PKEY_BITS_PER_PKEY 4 +#define PAGE_SIZE sysconf(_SC_PAGESIZE) +#undef HPAGE_SIZE +#define HPAGE_SIZE default_huge_page_size() + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg = 0; + + // POR_EL0 + asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 por = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + // POR_EL0 + asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#define set_pkey_bits set_pkey_bits +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + u64 new_val = POE_RWX; + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + + if (flags & PKEY_DISABLE_ACCESS) + new_val = POE_X; + else if (flags & PKEY_DISABLE_WRITE) + new_val = POE_RX; + + /* OR in new bits for pkey */ + reg |= new_val << shift; + + return reg; +} + +#define get_pkey_bits get_pkey_bits +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest four, then + * mask off all the other higher bits + */ + u32 perm = (reg >> shift) & PKEY_MASK; + + if (perm == POE_X) + return PKEY_DISABLE_ACCESS; + if (perm == POE_RX) + return PKEY_DISABLE_WRITE; + return 0; +} + +static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + struct poe_context *poe_ctx = + (struct poe_context *) get_header(ctx, POE_MAGIC, + sizeof(uctxt->uc_mcontext), NULL); + if (poe_ctx) + poe_ctx->por_el0 = pkey; +} + +#endif /* _PKEYS_ARM64_H */ diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..15608350fc01 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -91,12 +91,17 @@ void record_pkey_malloc(void *ptr, long size, int prot); #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */ +#ifndef PKEY_MASK #define PKEY_MASK 
(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif +#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@ -106,7 +111,9 @@ static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif +#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@ -116,6 +123,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) */ return ((reg >> shift) & PKEY_MASK); } +#endif extern u64 shadow_pkey_reg; diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index 6275d0f474b3..3d0c0bdae5bc 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -8,6 +8,8 @@ # define SYS_pkey_free 385 #endif #define REG_IP_IDX PT_NIP +#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO] #define REG_TRAPNO PT_TRAP #define MCONTEXT_FPREGS #define gregs gp_regs diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index b9170a26bfcb..5f28e26a2511 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO] #define MCONTEXT_FPREGS #ifndef PKEY_DISABLE_ACCESS diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 4337106a985e..0789981b72b9 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -147,7 +147,7 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@ -212,7 +212,6 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -242,8 +241,6 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -253,7 +250,6 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -273,8 +269,6 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) @@ -330,8 +324,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); #ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; #endif @@ -395,6 +389,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@ -908,7 +904,9 @@ void expected_pkey_fault(int pkey) * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@ -1498,6 +1496,11 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1671,6 +1674,84 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif +#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. 
*/ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1706,7 +1787,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif }; From fabf056278b4ccddea4a944a1635fee44033b71f Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:10 +0100 Subject: [PATCH 69/94] kselftest/arm64: add HWCAP test for FEAT_S1POE Check that when POE is enabled, the POR_EL0 register is accessible. 
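As an illustration, a userspace check along the same lines might look like the sketch below (assuming kernel headers that already carry the HWCAP2_POE definition from this series; S3_3_C10_C2_4 is the POR_EL0 encoding used in the test):

  #include <stdint.h>
  #include <stdio.h>
  #include <sys/auxv.h>
  #include <asm/hwcap.h>	/* HWCAP2_POE */

  int main(void)
  {
  	uint64_t por;

  	if (!(getauxval(AT_HWCAP2) & HWCAP2_POE)) {
  		puts("FEAT_S1POE not advertised");
  		return 0;
  	}

  	/* Only read POR_EL0 once the hwcap says it is safe to do so */
  	asm volatile("mrs %0, S3_3_C10_C2_4" : "=r"(por));
  	printf("POR_EL0 = 0x%016llx\n", (unsigned long long)por);
  	return 0;
  }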
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-28-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index d8909b2b535a..f2d6007a2b98 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -156,6 +156,12 @@ static void pmull_sigill(void) asm volatile(".inst 0x0ee0e000" : : : ); } +static void poe_sigill(void) +{ + /* mrs x0, POR_EL0 */ + asm volatile("mrs x0, S3_3_C10_C2_4" : : : "x0"); +} + static void rng_sigill(void) { asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0"); @@ -601,6 +607,14 @@ static const struct hwcap_data { .cpuinfo = "pmull", .sigill_fn = pmull_sigill, }, + { + .name = "POE", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_POE, + .cpuinfo = "poe", + .sigill_fn = poe_sigill, + .sigill_reliable = true, + }, { .name = "RNG", .at_hwcap = AT_HWCAP2, From d3c6e5b1093ac68ef66e3c78564e8fb958180bac Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:11 +0100 Subject: [PATCH 70/94] kselftest/arm64: parse POE_MAGIC in a signal frame Teach the signal frame parsing about the new POE frame, avoids warning when it is generated. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-29-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e4331440fed0..e6daa94fcd2e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -161,6 +161,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (head->size != sizeof(struct esr_context)) *err = "Bad size for esr_context"; break; + case POE_MAGIC: + if (head->size != sizeof(struct poe_context)) + *err = "Bad size for poe_context"; + break; case TPIDR2_MAGIC: if (head->size != sizeof(struct tpidr2_context)) *err = "Bad size for tpidr2_context"; From 6a428d63717add52bba4175a7fde54d1f9d166e0 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:12 +0100 Subject: [PATCH 71/94] kselftest/arm64: Add test case for POR_EL0 signal frame records Ensure that we get signal context for POR_EL0 if and only if POE is present on the system. Copied from the TPIDR2 test. 
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20240822151113.1479789-30-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../testing/selftests/arm64/signal/.gitignore | 1 + .../arm64/signal/testcases/poe_siginfo.c | 86 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 1ce5b5eac386..b2f2bfd5c6aa 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -2,6 +2,7 @@ mangle_* fake_sigreturn_* fpmr_* +poe_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c new file mode 100644 index 000000000000..36bd9940ee05 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Arm Limited + * + * Verify that the POR_EL0 register context in signal frames is set up as + * expected. + */ + +#include +#include +#include +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile( + "mrs %0, " SYS_POR_EL0 "\n" + : "=r"(val) + : + : ); + + return val; +} + +int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct poe_context *poe_ctx; + size_t offset; + bool in_sigframe; + bool have_poe; + __u64 orig_poe; + + have_poe = getauxval(AT_HWCAP2) & HWCAP2_POE; + if (have_poe) + orig_poe = get_por_el0(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + poe_ctx = (struct poe_context *) + get_header(head, POE_MAGIC, td->live_sz, &offset); + + in_sigframe = poe_ctx != NULL; + + fprintf(stderr, "POR_EL0 sigframe %s on system %s POE\n", + in_sigframe ? "present" : "absent", + have_poe ? "with" : "without"); + + td->pass = (in_sigframe == have_poe); + + /* + * Check that the value we read back was the one present at + * the time that the signal was triggered. + */ + if (have_poe && poe_ctx) { + if (poe_ctx->por_el0 != orig_poe) { + fprintf(stderr, "POR_EL0 in frame is %llx, was %llx\n", + poe_ctx->por_el0, orig_poe); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "POR_EL0", + .descr = "Validate that POR_EL0 is present as expected", + .timeout = 3, + .run = poe_present, +}; From e79634b53e398966c49f803c49701bc74dc3ccf8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:51:57 +0100 Subject: [PATCH 72/94] perf/arm-cmn: Refactor node ID handling. Again. The scope of the "extra device ports" configuration is not made clear by the CMN documentation - so far we've assumed it applies globally, based on the sole example which suggests as much. However it transpires that this is incorrect, and the format does in fact vary based on each individual XP's port configuration. 
As a consequence, we're currenly liable to decode the port/device indices from a node ID incorrectly, thus program the wrong event source in the DTM leading to bogus event counts, and also show device topology on the wrong ports in debugfs. To put this right, rework node IDs yet again to carry around the additional data necessary to decode them properly per-XP. At this point the notion of fully decomposing an ID becomes more impractical than it's worth, so unabstracting the XY mesh coordinates (where 2/3 users were just debug anyway) ends up leaving things a bit simpler overall. Fixes: 60d1504070c2 ("perf/arm-cmn: Support new IP features") Acked-by: Mark Rutland Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/5195f990152fc37adba5fbf5929a6b11063d9f09.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 94 ++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index c932d9d355cf..b59ae8513dce 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -24,14 +24,6 @@ #define CMN_NI_NODE_ID GENMASK_ULL(31, 16) #define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) -#define CMN_NODEID_DEVID(reg) ((reg) & 3) -#define CMN_NODEID_EXT_DEVID(reg) ((reg) & 1) -#define CMN_NODEID_PID(reg) (((reg) >> 2) & 1) -#define CMN_NODEID_EXT_PID(reg) (((reg) >> 1) & 3) -#define CMN_NODEID_1x1_PID(reg) (((reg) >> 2) & 7) -#define CMN_NODEID_X(reg, bits) ((reg) >> (3 + (bits))) -#define CMN_NODEID_Y(reg, bits) (((reg) >> 3) & ((1U << (bits)) - 1)) - #define CMN_CHILD_INFO 0x0080 #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) @@ -280,8 +272,11 @@ struct arm_cmn_node { u16 id, logid; enum cmn_node_type type; + /* XP properties really, but replicated to children for convenience */ u8 dtm; s8 dtc; + u8 portid_bits:4; + u8 deviceid_bits:4; /* DN/HN-F/CXHA */ struct { u8 val : 4; @@ -357,49 +352,33 @@ struct arm_cmn { static int arm_cmn_hp_state; struct arm_cmn_nodeid { - u8 x; - u8 y; u8 port; u8 dev; }; static int arm_cmn_xyidbits(const struct arm_cmn *cmn) { - return fls((cmn->mesh_x - 1) | (cmn->mesh_y - 1) | 2); + return fls((cmn->mesh_x - 1) | (cmn->mesh_y - 1)); } -static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) +static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn_node *dn) { struct arm_cmn_nodeid nid; - if (cmn->num_xps == 1) { - nid.x = 0; - nid.y = 0; - nid.port = CMN_NODEID_1x1_PID(id); - nid.dev = CMN_NODEID_DEVID(id); - } else { - int bits = arm_cmn_xyidbits(cmn); - - nid.x = CMN_NODEID_X(id, bits); - nid.y = CMN_NODEID_Y(id, bits); - if (cmn->ports_used & 0xc) { - nid.port = CMN_NODEID_EXT_PID(id); - nid.dev = CMN_NODEID_EXT_DEVID(id); - } else { - nid.port = CMN_NODEID_PID(id); - nid.dev = CMN_NODEID_DEVID(id); - } - } + nid.dev = dn->id & ((1U << dn->deviceid_bits) - 1); + nid.port = (dn->id >> dn->deviceid_bits) & ((1U << dn->portid_bits) - 1); return nid; } static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) { - struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); - int xp_idx = cmn->mesh_x * nid.y + nid.x; + int id = dn->id >> (dn->portid_bits + dn->deviceid_bits); + int bits = arm_cmn_xyidbits(cmn); + int x = id >> bits; + int y = id & ((1U << bits) - 1); - return cmn->xps + xp_idx; + return cmn->xps + cmn->mesh_x * y + x; } static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, enum cmn_node_type 
type) @@ -485,13 +464,13 @@ static const char *arm_cmn_device_type(u8 type) } } -static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) +static void arm_cmn_show_logid(struct seq_file *s, const struct arm_cmn_node *xp, int p, int d) { struct arm_cmn *cmn = s->private; struct arm_cmn_node *dn; + u16 id = xp->id | d | (p << xp->deviceid_bits); for (dn = cmn->dns; dn->type; dn++) { - struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); int pad = dn->logid < 10; if (dn->type == CMN_TYPE_XP) @@ -500,7 +479,7 @@ static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) if (dn->type < CMN_TYPE_HNI) continue; - if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d) + if (dn->id != id) continue; seq_printf(s, " %*c#%-*d |", pad + 1, ' ', 3 - pad, dn->logid); @@ -521,6 +500,7 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) y = cmn->mesh_y; while (y--) { int xp_base = cmn->mesh_x * y; + struct arm_cmn_node *xp = cmn->xps + xp_base; u8 port[CMN_MAX_PORTS][CMN_MAX_DIMENSION]; for (x = 0; x < cmn->mesh_x; x++) @@ -528,16 +508,14 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) seq_printf(s, "\n%-2d |", y); for (x = 0; x < cmn->mesh_x; x++) { - struct arm_cmn_node *xp = cmn->xps + xp_base + x; - for (p = 0; p < CMN_MAX_PORTS; p++) - port[p][x] = arm_cmn_device_connect_info(cmn, xp, p); + port[p][x] = arm_cmn_device_connect_info(cmn, xp + x, p); seq_printf(s, " XP #%-3d|", xp_base + x); } seq_puts(s, "\n |"); for (x = 0; x < cmn->mesh_x; x++) { - s8 dtc = cmn->xps[xp_base + x].dtc; + s8 dtc = xp[x].dtc; if (dtc < 0) seq_puts(s, " DTC ?? |"); @@ -554,10 +532,10 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) seq_puts(s, arm_cmn_device_type(port[p][x])); seq_puts(s, "\n 0|"); for (x = 0; x < cmn->mesh_x; x++) - arm_cmn_show_logid(s, x, y, p, 0); + arm_cmn_show_logid(s, xp + x, p, 0); seq_puts(s, "\n 1|"); for (x = 0; x < cmn->mesh_x; x++) - arm_cmn_show_logid(s, x, y, p, 1); + arm_cmn_show_logid(s, xp + x, p, 1); } seq_puts(s, "\n-----+"); } @@ -1815,10 +1793,7 @@ static int arm_cmn_event_init(struct perf_event *event) } if (!hw->num_dns) { - struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid); - - dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n", - nodeid, nid.x, nid.y, nid.port, nid.dev, type); + dev_dbg(cmn->dev, "invalid node 0x%x type 0x%x\n", nodeid, type); return -EINVAL; } @@ -1921,7 +1896,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) arm_cmn_claim_wp_idx(dtm, event, d, wp_idx, i); writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { - struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + struct arm_cmn_nodeid nid = arm_cmn_nid(dn); if (cmn->multi_dtm) nid.port %= 2; @@ -2168,10 +2143,12 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) continue; xp = arm_cmn_node_to_xp(cmn, dn); + dn->portid_bits = xp->portid_bits; + dn->deviceid_bits = xp->deviceid_bits; dn->dtc = xp->dtc; dn->dtm = xp->dtm; if (cmn->multi_dtm) - dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; + dn->dtm += arm_cmn_nid(dn).port / 2; if (dn->type == CMN_TYPE_DTC) { int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); @@ -2341,18 +2318,27 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) arm_cmn_init_dtm(dtm++, xp, 0); /* * Keeping track of connected ports will let us filter out - * unnecessary XP events easily. 
We can also reliably infer the - * "extra device ports" configuration for the node ID format - * from this, since in that case we will see at least one XP - * with port 2 connected, for the HN-D. + * unnecessary XP events easily, and also infer the per-XP + * part of the node ID format. */ for (int p = 0; p < CMN_MAX_PORTS; p++) if (arm_cmn_device_connect_info(cmn, xp, p)) xp_ports |= BIT(p); - if (cmn->multi_dtm && (xp_ports & 0xc)) + if (cmn->num_xps == 1) { + xp->portid_bits = 3; + xp->deviceid_bits = 2; + } else if (xp_ports > 0x3) { + xp->portid_bits = 2; + xp->deviceid_bits = 1; + } else { + xp->portid_bits = 1; + xp->deviceid_bits = 2; + } + + if (cmn->multi_dtm && (xp_ports > 0x3)) arm_cmn_init_dtm(dtm++, xp, 1); - if (cmn->multi_dtm && (xp_ports & 0x30)) + if (cmn->multi_dtm && (xp_ports > 0xf)) arm_cmn_init_dtm(dtm++, xp, 2); cmn->ports_used |= xp_ports; From 88b63a82c84ed9bbcbdefb10cb6f75dd1dd04887 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:51:58 +0100 Subject: [PATCH 73/94] perf/arm-cmn: Fix CCLA register offset Apparently pmu_event_sel is offset by 8 for all CCLA nodes, not just the CCLA_RNI combination type. Fixes: 23760a014417 ("perf/arm-cmn: Add CMN-700 support") Acked-by: Mark Rutland Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/6e7bb06fef6046f83e7647aad0e5be544139763f.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index b59ae8513dce..4e2338afe669 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -70,7 +70,8 @@ /* Technically this is 4 bits wide on DNs, but we only use 2 there anyway */ #define CMN__PMU_OCCUP1_ID GENMASK_ULL(34, 32) -/* HN-Ps are weird... */ +/* Some types are designed to coexist with another device in the same node */ +#define CMN_CCLA_PMU_EVENT_SEL 0x008 #define CMN_HNP_PMU_EVENT_SEL 0x008 /* DTMs live in the PMU space of XP registers */ @@ -2393,10 +2394,13 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_CXHA: case CMN_TYPE_CCRA: case CMN_TYPE_CCHA: - case CMN_TYPE_CCLA: case CMN_TYPE_HNS: dn++; break; + case CMN_TYPE_CCLA: + dn->pmu_base += CMN_CCLA_PMU_EVENT_SEL; + dn++; + break; /* Nothing to see here */ case CMN_TYPE_MPAM_S: case CMN_TYPE_MPAM_NS: @@ -2414,7 +2418,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_HNP: case CMN_TYPE_CCLA_RNI: dn[1] = dn[0]; - dn[0].pmu_base += CMN_HNP_PMU_EVENT_SEL; + dn[0].pmu_base += CMN_CCLA_PMU_EVENT_SEL; dn[1].type = arm_cmn_subtype(dn->type); dn += 2; break; From 359414b33e00bae91e4eabf3e4ef8e76024c7673 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:51:59 +0100 Subject: [PATCH 74/94] perf/arm-cmn: Ensure dtm_idx is big enough While CMN_MAX_DIMENSION was bumped to 12 for CMN-650, that only supports up to a 10x10 mesh, so bumping dtm_idx to 256 bits at the time worked out OK in practice. However CMN-700 did finally support up to 144 XPs, and thus needs a worst-case 288 bits of dtm_idx for an aggregated XP event on a maxed-out config. Oops. 
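For reference, the worst-case arithmetic (illustrative only, using the constants this patch adds; u64 and DIV_ROUND_UP are the usual kernel definitions):

  #define CMN_MAX_DIMENSION		12
  #define CMN_MAX_XPS			(CMN_MAX_DIMENSION * CMN_MAX_DIMENSION)	/* 144 */
  #define CMN_MAX_NODES_PER_EVENT	CMN_MAX_XPS

  /*
   * 2 dtm_idx bits per node: 144 * 2 = 288 bits, and
   * DIV_ROUND_UP(288, 64) = 5 words, whereas the old fixed
   * u64 dtm_idx[4] only held 256 bits.
   */
  u64 dtm_idx[DIV_ROUND_UP(CMN_MAX_NODES_PER_EVENT * 2, 64)];	/* 5 entries */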
Fixes: 23760a014417 ("perf/arm-cmn: Add CMN-700 support") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e771b358526a0d7fc06efee2c3a2fdc0c9f51d44.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 4e2338afe669..48863b31ccfb 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -35,6 +35,9 @@ #define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) #define CMN_MAX_DTMS (CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4) +/* Currently XPs are the node type we can have most of; others top out at 128 */ +#define CMN_MAX_NODES_PER_EVENT CMN_MAX_XPS + /* The CFG node has various info besides the discovery tree */ #define CMN_CFGM_PERIPH_ID_01 0x0008 #define CMN_CFGM_PID0_PART_0 GENMASK_ULL(7, 0) @@ -564,7 +567,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} struct arm_cmn_hw_event { struct arm_cmn_node *dn; - u64 dtm_idx[4]; + u64 dtm_idx[DIV_ROUND_UP(CMN_MAX_NODES_PER_EVENT * 2, 64)]; s8 dtc_idx[CMN_MAX_DTCS]; u8 num_dns; u8 dtm_offset; From ff436cee694ee8bc4173f2d42622ee7c17a085d3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:52:00 +0100 Subject: [PATCH 75/94] perf/arm-cmn: Improve build-time assertion These days we can use static_assert() in the logical place rather than jamming a BUILD_BUG_ON() into the nearest function scope. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/224ee8286f299100f1c768edb254edc898539f50.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 48863b31ccfb..5b0edeb69394 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -581,6 +581,7 @@ struct arm_cmn_hw_event { bool wide_sel; enum cmn_filter_select filter_sel; }; +static_assert(sizeof(struct arm_cmn_hw_event) <= offsetof(struct hw_perf_event, target)); #define for_each_hw_dn(hw, dn, i) \ for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) @@ -591,7 +592,6 @@ struct arm_cmn_hw_event { static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event) { - BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target)); return (struct arm_cmn_hw_event *)&event->hw; } From c5b15ddf11a8a82f5e9ccd9b44f7b765c59bffdd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:52:01 +0100 Subject: [PATCH 76/94] perf/arm-cmn: Make cycle counts less surprising By default, CMN has automatic clock-gating with the implication that a DTC's cycle counter may not increment while the DTC is sufficiently idle. Given that we may have up to 4 DTCs to choose from when scheduling a cycles event, this may potentially lead to surprising results if trying to measure metrics based on activity in a different DTC domain from where cycles end up being counted. Furthermore, since the details of internal clock gating are not documented, we can't even reason about what "active" cycles for a DTC actually mean relative to the activity of other nodes within the same nominal DTC domain. Make the reasonable assumption that if the user wants to count cycles, they almost certainly want to count all of the cycles, and disable clock gating while a DTC's cycle counter is in use. 
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/c47cfdc09e907b1d7753d142a7e659982cceb246.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 5b0edeb69394..2205c183ec1b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -119,6 +119,7 @@ /* The DTC node is where the magic happens */ #define CMN_DT_DTC_CTL 0x0a00 #define CMN_DT_DTC_CTL_DT_EN BIT(0) +#define CMN_DT_DTC_CTL_CG_DISABLE BIT(10) /* DTC counters are paired in 64-bit registers on a 16-byte stride. Yuck */ #define _CMN_DT_CNT_REG(n) ((((n) / 2) * 4 + (n) % 2) * 4) @@ -1546,9 +1547,12 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) int i; if (type == CMN_TYPE_DTC) { - i = hw->dtc_idx[0]; - writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR); - cmn->dtc[i].cc_active = true; + struct arm_cmn_dtc *dtc = cmn->dtc + hw->dtc_idx[0]; + + writel_relaxed(CMN_DT_DTC_CTL_DT_EN | CMN_DT_DTC_CTL_CG_DISABLE, + dtc->base + CMN_DT_DTC_CTL); + writeq_relaxed(CMN_CC_INIT, dtc->base + CMN_DT_PMCCNTR); + dtc->cc_active = true; } else if (type == CMN_TYPE_WP) { u64 val = CMN_EVENT_WP_VAL(event); u64 mask = CMN_EVENT_WP_MASK(event); @@ -1577,8 +1581,10 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int i; if (type == CMN_TYPE_DTC) { - i = hw->dtc_idx[0]; - cmn->dtc[i].cc_active = false; + struct arm_cmn_dtc *dtc = cmn->dtc + hw->dtc_idx[0]; + + dtc->cc_active = false; + writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL); } else if (type == CMN_TYPE_WP) { for_each_hw_dn(hw, dn, i) { void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); From 67acca3504eade75e03ea3ae87a329df6b10ae02 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:52:02 +0100 Subject: [PATCH 77/94] perf/arm-cmn: Refactor DTC PMU register access Annoyingly, we're soon going to have to cope with PMU registers moving about. This will mostly be straightforward, except for the hard-coding of CMN_PMU_OFFSET for the DTC PMU registers. As a first step, refactor those accessors to allow for encapsulating a variable offset without making a big mess all over. As a bonus, we can repack the arm_cmn_dtc structure to accommodate the new pointer without growing any larger, since irq_friend only encodes a range of +/-3. Acked-by: Mark Rutland Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/fc677576fae7b5b55780e5b245a4ef6ea1b30daf.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 64 ++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 2205c183ec1b..ac7dd4c352e8 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -123,24 +123,24 @@ /* DTC counters are paired in 64-bit registers on a 16-byte stride. 
Yuck */ #define _CMN_DT_CNT_REG(n) ((((n) / 2) * 4 + (n) % 2) * 4) -#define CMN_DT_PMEVCNT(n) (CMN_PMU_OFFSET + _CMN_DT_CNT_REG(n)) -#define CMN_DT_PMCCNTR (CMN_PMU_OFFSET + 0x40) +#define CMN_DT_PMEVCNT(dtc, n) ((dtc)->pmu_base + _CMN_DT_CNT_REG(n)) +#define CMN_DT_PMCCNTR(dtc) ((dtc)->pmu_base + 0x40) -#define CMN_DT_PMEVCNTSR(n) (CMN_PMU_OFFSET + 0x50 + _CMN_DT_CNT_REG(n)) -#define CMN_DT_PMCCNTRSR (CMN_PMU_OFFSET + 0x90) +#define CMN_DT_PMEVCNTSR(dtc, n) ((dtc)->pmu_base + 0x50 + _CMN_DT_CNT_REG(n)) +#define CMN_DT_PMCCNTRSR(dtc) ((dtc)->pmu_base + 0x90) -#define CMN_DT_PMCR (CMN_PMU_OFFSET + 0x100) +#define CMN_DT_PMCR(dtc) ((dtc)->pmu_base + 0x100) #define CMN_DT_PMCR_PMU_EN BIT(0) #define CMN_DT_PMCR_CNTR_RST BIT(5) #define CMN_DT_PMCR_OVFL_INTR_EN BIT(6) -#define CMN_DT_PMOVSR (CMN_PMU_OFFSET + 0x118) -#define CMN_DT_PMOVSR_CLR (CMN_PMU_OFFSET + 0x120) +#define CMN_DT_PMOVSR(dtc) ((dtc)->pmu_base + 0x118) +#define CMN_DT_PMOVSR_CLR(dtc) ((dtc)->pmu_base + 0x120) -#define CMN_DT_PMSSR (CMN_PMU_OFFSET + 0x128) +#define CMN_DT_PMSSR(dtc) ((dtc)->pmu_base + 0x128) #define CMN_DT_PMSSR_SS_STATUS(n) BIT(n) -#define CMN_DT_PMSRR (CMN_PMU_OFFSET + 0x130) +#define CMN_DT_PMSRR(dtc) ((dtc)->pmu_base + 0x130) #define CMN_DT_PMSRR_SS_REQ BIT(0) #define CMN_DT_NUM_COUNTERS 8 @@ -307,8 +307,9 @@ struct arm_cmn_dtm { struct arm_cmn_dtc { void __iomem *base; + void __iomem *pmu_base; int irq; - int irq_friend; + s8 irq_friend; bool cc_active; struct perf_event *counters[CMN_DT_NUM_COUNTERS]; @@ -412,10 +413,15 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) }; } +static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) +{ + return CMN_PMU_OFFSET; +} + static u32 arm_cmn_device_connect_info(const struct arm_cmn *cmn, const struct arm_cmn_node *xp, int port) { - int offset = CMN_MXP__CONNECT_INFO(port); + int offset = CMN_MXP__CONNECT_INFO(port) - arm_cmn_pmu_offset(cmn, xp); if (port >= 2) { if (cmn->part == PART_CMN600 || cmn->part == PART_CMN650) @@ -428,7 +434,7 @@ static u32 arm_cmn_device_connect_info(const struct arm_cmn *cmn, offset += CI700_CONNECT_INFO_P2_5_OFFSET; } - return readl_relaxed(xp->pmu_base - CMN_PMU_OFFSET + offset); + return readl_relaxed(xp->pmu_base + offset); } static struct dentry *arm_cmn_debugfs; @@ -1398,7 +1404,7 @@ static u32 arm_cmn_wp_config(struct perf_event *event, int wp_idx) static void arm_cmn_set_state(struct arm_cmn *cmn, u32 state) { if (!cmn->state) - writel_relaxed(0, cmn->dtc[0].base + CMN_DT_PMCR); + writel_relaxed(0, CMN_DT_PMCR(&cmn->dtc[0])); cmn->state |= state; } @@ -1407,7 +1413,7 @@ static void arm_cmn_clear_state(struct arm_cmn *cmn, u32 state) cmn->state &= ~state; if (!cmn->state) writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, - cmn->dtc[0].base + CMN_DT_PMCR); + CMN_DT_PMCR(&cmn->dtc[0])); } static void arm_cmn_pmu_enable(struct pmu *pmu) @@ -1442,18 +1448,19 @@ static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, static u64 arm_cmn_read_cc(struct arm_cmn_dtc *dtc) { - u64 val = readq_relaxed(dtc->base + CMN_DT_PMCCNTR); + void __iomem *pmccntr = CMN_DT_PMCCNTR(dtc); + u64 val = readq_relaxed(pmccntr); - writeq_relaxed(CMN_CC_INIT, dtc->base + CMN_DT_PMCCNTR); + writeq_relaxed(CMN_CC_INIT, pmccntr); return (val - CMN_CC_INIT) & ((CMN_CC_INIT << 1) - 1); } static u32 arm_cmn_read_counter(struct arm_cmn_dtc *dtc, int idx) { - u32 val, pmevcnt = CMN_DT_PMEVCNT(idx); + void __iomem *pmevcnt = CMN_DT_PMEVCNT(dtc, idx); + u32 val = readl_relaxed(pmevcnt); - val 
= readl_relaxed(dtc->base + pmevcnt); - writel_relaxed(CMN_COUNTER_INIT, dtc->base + pmevcnt); + writel_relaxed(CMN_COUNTER_INIT, pmevcnt); return val - CMN_COUNTER_INIT; } @@ -1464,7 +1471,7 @@ static void arm_cmn_init_counter(struct perf_event *event) u64 count; for_each_hw_dtc_idx(hw, i, idx) { - writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx)); + writel_relaxed(CMN_COUNTER_INIT, CMN_DT_PMEVCNT(&cmn->dtc[i], idx)); cmn->dtc[i].counters[idx] = event; } @@ -1551,7 +1558,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) writel_relaxed(CMN_DT_DTC_CTL_DT_EN | CMN_DT_DTC_CTL_CG_DISABLE, dtc->base + CMN_DT_DTC_CTL); - writeq_relaxed(CMN_CC_INIT, dtc->base + CMN_DT_PMCCNTR); + writeq_relaxed(CMN_CC_INIT, CMN_DT_PMCCNTR(dtc)); dtc->cc_active = true; } else if (type == CMN_TYPE_WP) { u64 val = CMN_EVENT_WP_VAL(event); @@ -2028,7 +2035,7 @@ static irqreturn_t arm_cmn_handle_irq(int irq, void *dev_id) irqreturn_t ret = IRQ_NONE; for (;;) { - u32 status = readl_relaxed(dtc->base + CMN_DT_PMOVSR); + u32 status = readl_relaxed(CMN_DT_PMOVSR(dtc)); u64 delta; int i; @@ -2050,7 +2057,7 @@ static irqreturn_t arm_cmn_handle_irq(int irq, void *dev_id) } } - writel_relaxed(status, dtc->base + CMN_DT_PMOVSR_CLR); + writel_relaxed(status, CMN_DT_PMOVSR_CLR(dtc)); if (!dtc->irq_friend) return ret; @@ -2104,15 +2111,16 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id { struct arm_cmn_dtc *dtc = cmn->dtc + idx; - dtc->base = dn->pmu_base - CMN_PMU_OFFSET; + dtc->pmu_base = dn->pmu_base; + dtc->base = dtc->pmu_base - arm_cmn_pmu_offset(cmn, dn); dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx); if (dtc->irq < 0) return dtc->irq; writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL); - writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, dtc->base + CMN_DT_PMCR); - writeq_relaxed(0, dtc->base + CMN_DT_PMCCNTR); - writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR); + writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc)); + writeq_relaxed(0, CMN_DT_PMCCNTR(dtc)); + writel_relaxed(0x1ff, CMN_DT_PMOVSR_CLR(dtc)); return 0; } @@ -2200,7 +2208,7 @@ static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_c node->id = FIELD_GET(CMN_NI_NODE_ID, reg); node->logid = FIELD_GET(CMN_NI_LOGICAL_ID, reg); - node->pmu_base = cmn->base + offset + CMN_PMU_OFFSET; + node->pmu_base = cmn->base + offset + arm_cmn_pmu_offset(cmn, node); if (node->type == CMN_TYPE_CFG) level = 0; From a87ef537f961721e7f9786aae46e145723e5a0c2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:52:03 +0100 Subject: [PATCH 78/94] dt-bindings: perf: arm-cmn: Add CMN S3 The CMN S3 PMU is functionally still very similar to CMN-700, however while the register contents are compatible, many of them are moved to different offsets. While this is technically discoverable by a careful driver that understands the part number in the peripheral ID registers (which do at least remain in the same place), a new unique compatible seems warranted to avoid any surprises. 
CC: devicetree@vger.kernel.org Acked-by: Rob Herring (Arm) Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/2150e87f33284ba55cf6594def018a02bcf809fe.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/arm,cmn.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml index 2e51072e794a..0e9d665584e6 100644 --- a/Documentation/devicetree/bindings/perf/arm,cmn.yaml +++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml @@ -16,6 +16,7 @@ properties: - arm,cmn-600 - arm,cmn-650 - arm,cmn-700 + - arm,cmn-s3 - arm,ci-700 reg: From 0dc2f4963f7ef187b80d832d7d88f735a9dc99cb Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 2 Sep 2024 18:52:04 +0100 Subject: [PATCH 79/94] perf/arm-cmn: Support CMN S3 CMN S3 is the latest and greatest evolution for 2024, although most of the new features don't impact the PMU, so from our point of view it ends up looking a lot like CMN-700 r3 still. We have some new device types to ignore, a mildly irritating rearrangement of the register layouts, and a scary new configuration option that makes it potentially unsafe to even walk the full discovery tree, let alone attempt to use the PMU. Acked-by: Mark Rutland Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/2ec9eec5b6bf215a9886f3b69e3b00e4cd85095c.1725296395.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 119 ++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index ac7dd4c352e8..0266ff041e0b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -45,24 +45,28 @@ #define CMN_CFGM_PERIPH_ID_23 0x0010 #define CMN_CFGM_PID2_REVISION GENMASK_ULL(7, 4) -#define CMN_CFGM_INFO_GLOBAL 0x900 +#define CMN_CFGM_INFO_GLOBAL 0x0900 #define CMN_INFO_MULTIPLE_DTM_EN BIT_ULL(63) #define CMN_INFO_RSP_VC_NUM GENMASK_ULL(53, 52) #define CMN_INFO_DAT_VC_NUM GENMASK_ULL(51, 50) +#define CMN_INFO_DEVICE_ISO_ENABLE BIT_ULL(44) -#define CMN_CFGM_INFO_GLOBAL_1 0x908 +#define CMN_CFGM_INFO_GLOBAL_1 0x0908 #define CMN_INFO_SNP_VC_NUM GENMASK_ULL(3, 2) #define CMN_INFO_REQ_VC_NUM GENMASK_ULL(1, 0) /* XPs also have some local topology info which has uses too */ #define CMN_MXP__CONNECT_INFO(p) (0x0008 + 8 * (p)) -#define CMN__CONNECT_INFO_DEVICE_TYPE GENMASK_ULL(4, 0) +#define CMN__CONNECT_INFO_DEVICE_TYPE GENMASK_ULL(5, 0) #define CMN_MAX_PORTS 6 #define CI700_CONNECT_INFO_P2_5_OFFSET 0x10 /* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 +/* ...except when they don't :( */ +#define CMN_S3_DTM_OFFSET 0xa000 +#define CMN_S3_PMU_OFFSET 0xd900 /* For most nodes, this is all there is */ #define CMN_PMU_EVENT_SEL 0x000 @@ -195,10 +199,11 @@ enum cmn_model { CMN650 = 2, CMN700 = 4, CI700 = 8, + CMNS3 = 16, /* ...and then we can use bitmap tricks for commonality */ CMN_ANY = -1, NOT_CMN600 = -2, - CMN_650ON = CMN650 | CMN700, + CMN_650ON = CMN650 | CMN700 | CMNS3, }; /* Actual part numbers and revision IDs defined by the hardware */ @@ -207,6 +212,7 @@ enum cmn_part { PART_CMN650 = 0x436, PART_CMN700 = 0x43c, PART_CI700 = 0x43a, + PART_CMN_S3 = 0x43e, }; /* CMN-600 r0px shouldn't exist in silicon, thankfully */ @@ -258,6 +264,7 @@ enum cmn_node_type { CMN_TYPE_HNS = 0x200, CMN_TYPE_HNS_MPAM_S, CMN_TYPE_HNS_MPAM_NS, + CMN_TYPE_APB = 0x1000, /* Not a real 
node type */ CMN_TYPE_WP = 0x7770 }; @@ -408,6 +415,8 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) return CMN700; case PART_CI700: return CI700; + case PART_CMN_S3: + return CMNS3; default: return 0; }; @@ -415,6 +424,11 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) { + if (cmn->part == PART_CMN_S3) { + if (dn->type == CMN_TYPE_XP) + return CMN_S3_DTM_OFFSET; + return CMN_S3_PMU_OFFSET; + } return CMN_PMU_OFFSET; } @@ -468,9 +482,14 @@ static const char *arm_cmn_device_type(u8 type) case 0x17: return "RN-F_C_E|"; case 0x18: return " RN-F_E |"; case 0x19: return "RN-F_E_E|"; + case 0x1a: return " HN-S |"; + case 0x1b: return " LCN |"; case 0x1c: return " MTSX |"; case 0x1d: return " HN-V |"; case 0x1e: return " CCG |"; + case 0x20: return " RN-F_F |"; + case 0x21: return "RN-F_F_E|"; + case 0x22: return " SN-F_F |"; default: return " ???? |"; } } @@ -779,8 +798,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event) #define CMN_EVENT_CCRA(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, ccra_##_name, CMN_TYPE_CCRA, _event) -#define CMN_EVENT_CCHA(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, ccha_##_name, CMN_TYPE_CCHA, _event) +#define CMN_EVENT_CCHA(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, ccha_##_name, CMN_TYPE_CCHA, _event) #define CMN_EVENT_CCLA(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event) #define CMN_EVENT_CCLA_RNI(_name, _event) \ @@ -1138,42 +1157,43 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_CCRA(wdb_alloc, 0x59), CMN_EVENT_CCRA(ssb_alloc, 0x5a), - CMN_EVENT_CCHA(rddatbyp, 0x61), - CMN_EVENT_CCHA(chirsp_up_stall, 0x62), - CMN_EVENT_CCHA(chidat_up_stall, 0x63), - CMN_EVENT_CCHA(snppcrd_link0_stall, 0x64), - CMN_EVENT_CCHA(snppcrd_link1_stall, 0x65), - CMN_EVENT_CCHA(snppcrd_link2_stall, 0x66), - CMN_EVENT_CCHA(reqtrk_occ, 0x67), - CMN_EVENT_CCHA(rdb_occ, 0x68), - CMN_EVENT_CCHA(rdbyp_occ, 0x69), - CMN_EVENT_CCHA(wdb_occ, 0x6a), - CMN_EVENT_CCHA(snptrk_occ, 0x6b), - CMN_EVENT_CCHA(sdb_occ, 0x6c), - CMN_EVENT_CCHA(snphaz_occ, 0x6d), - CMN_EVENT_CCHA(reqtrk_alloc, 0x6e), - CMN_EVENT_CCHA(rdb_alloc, 0x6f), - CMN_EVENT_CCHA(rdbyp_alloc, 0x70), - CMN_EVENT_CCHA(wdb_alloc, 0x71), - CMN_EVENT_CCHA(snptrk_alloc, 0x72), - CMN_EVENT_CCHA(sdb_alloc, 0x73), - CMN_EVENT_CCHA(snphaz_alloc, 0x74), - CMN_EVENT_CCHA(pb_rhu_req_occ, 0x75), - CMN_EVENT_CCHA(pb_rhu_req_alloc, 0x76), - CMN_EVENT_CCHA(pb_rhu_pcie_req_occ, 0x77), - CMN_EVENT_CCHA(pb_rhu_pcie_req_alloc, 0x78), - CMN_EVENT_CCHA(pb_pcie_wr_req_occ, 0x79), - CMN_EVENT_CCHA(pb_pcie_wr_req_alloc, 0x7a), - CMN_EVENT_CCHA(pb_pcie_reg_req_occ, 0x7b), - CMN_EVENT_CCHA(pb_pcie_reg_req_alloc, 0x7c), - CMN_EVENT_CCHA(pb_pcie_rsvd_req_occ, 0x7d), - CMN_EVENT_CCHA(pb_pcie_rsvd_req_alloc, 0x7e), - CMN_EVENT_CCHA(pb_rhu_dat_occ, 0x7f), - CMN_EVENT_CCHA(pb_rhu_dat_alloc, 0x80), - CMN_EVENT_CCHA(pb_rhu_pcie_dat_occ, 0x81), - CMN_EVENT_CCHA(pb_rhu_pcie_dat_alloc, 0x82), - CMN_EVENT_CCHA(pb_pcie_wr_dat_occ, 0x83), - CMN_EVENT_CCHA(pb_pcie_wr_dat_alloc, 0x84), + CMN_EVENT_CCHA(CMN_ANY, rddatbyp, 0x61), + CMN_EVENT_CCHA(CMN_ANY, chirsp_up_stall, 0x62), + CMN_EVENT_CCHA(CMN_ANY, chidat_up_stall, 0x63), + CMN_EVENT_CCHA(CMN_ANY, snppcrd_link0_stall, 0x64), + CMN_EVENT_CCHA(CMN_ANY, snppcrd_link1_stall, 0x65), + CMN_EVENT_CCHA(CMN_ANY, snppcrd_link2_stall, 0x66), + CMN_EVENT_CCHA(CMN_ANY, reqtrk_occ, 0x67), + 
CMN_EVENT_CCHA(CMN_ANY, rdb_occ, 0x68), + CMN_EVENT_CCHA(CMN_ANY, rdbyp_occ, 0x69), + CMN_EVENT_CCHA(CMN_ANY, wdb_occ, 0x6a), + CMN_EVENT_CCHA(CMN_ANY, snptrk_occ, 0x6b), + CMN_EVENT_CCHA(CMN_ANY, sdb_occ, 0x6c), + CMN_EVENT_CCHA(CMN_ANY, snphaz_occ, 0x6d), + CMN_EVENT_CCHA(CMN_ANY, reqtrk_alloc, 0x6e), + CMN_EVENT_CCHA(CMN_ANY, rdb_alloc, 0x6f), + CMN_EVENT_CCHA(CMN_ANY, rdbyp_alloc, 0x70), + CMN_EVENT_CCHA(CMN_ANY, wdb_alloc, 0x71), + CMN_EVENT_CCHA(CMN_ANY, snptrk_alloc, 0x72), + CMN_EVENT_CCHA(CMN_ANY, db_alloc, 0x73), + CMN_EVENT_CCHA(CMN_ANY, snphaz_alloc, 0x74), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_req_occ, 0x75), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_req_alloc, 0x76), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_pcie_req_occ, 0x77), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_pcie_req_alloc, 0x78), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_wr_req_occ, 0x79), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_wr_req_alloc, 0x7a), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_reg_req_occ, 0x7b), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_reg_req_alloc, 0x7c), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_rsvd_req_occ, 0x7d), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_rsvd_req_alloc, 0x7e), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_dat_occ, 0x7f), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_dat_alloc, 0x80), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_pcie_dat_occ, 0x81), + CMN_EVENT_CCHA(CMN_ANY, pb_rhu_pcie_dat_alloc, 0x82), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_wr_dat_occ, 0x83), + CMN_EVENT_CCHA(CMN_ANY, pb_pcie_wr_dat_alloc, 0x84), + CMN_EVENT_CCHA(CMNS3, chirsp1_up_stall, 0x85), CMN_EVENT_CCLA(rx_cxs, 0x21), CMN_EVENT_CCLA(tx_cxs, 0x22), @@ -1779,7 +1799,8 @@ static int arm_cmn_event_init(struct perf_event *event) /* ...but the DTM may depend on which port we're watching */ if (cmn->multi_dtm) hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; - } else if (type == CMN_TYPE_XP && cmn->part == PART_CMN700) { + } else if (type == CMN_TYPE_XP && + (cmn->part == PART_CMN700 || cmn->part == PART_CMN_S3)) { hw->wide_sel = true; } @@ -2266,7 +2287,17 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_23); cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); + /* + * With the device isolation feature, if firmware has neglected to enable + * an XP port then we risk locking up if we try to access anything behind + * it; however we also have no way to tell from Non-Secure whether any + * given port is disabled or not, so the only way to win is not to play... 
+ */ reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL); + if (reg & CMN_INFO_DEVICE_ISO_ENABLE) { + dev_err(cmn->dev, "Device isolation enabled, not continuing due to risk of lockup\n"); + return -ENODEV; + } cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN; cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg); cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg); @@ -2425,6 +2456,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_CXLA: case CMN_TYPE_HNS_MPAM_S: case CMN_TYPE_HNS_MPAM_NS: + case CMN_TYPE_APB: break; /* * Split "optimised" combination nodes into separate @@ -2610,6 +2642,7 @@ static const struct of_device_id arm_cmn_of_match[] = { { .compatible = "arm,cmn-600", .data = (void *)PART_CMN600 }, { .compatible = "arm,cmn-650" }, { .compatible = "arm,cmn-700" }, + { .compatible = "arm,cmn-s3" }, { .compatible = "arm,ci-700" }, {} }; From 1db9716d44875d31acf29255710e82338560c177 Mon Sep 17 00:00:00 2001 From: Rong Qianfeng Date: Mon, 2 Sep 2024 10:39:35 +0800 Subject: [PATCH 80/94] arm64/mm: Delete __init region from memblock.reserved If CONFIG_ARCH_KEEP_MEMBLOCK is enabled, the memory information in memblock will be retained. We release the __init memory here, and we should also delete the corresponding region in memblock.reserved, which allows debugfs/memblock/reserved to display correct memory information. Signed-off-by: Rong Qianfeng Link: https://lore.kernel.org/r/20240902023940.43227-1-rongqianfeng@vivo.com Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b5ab6818f7f..aea834a9691a 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -414,6 +414,12 @@ void __init mem_init(void) void free_initmem(void) { + unsigned long aligned_begin = ALIGN_DOWN((u64)__init_begin, PAGE_SIZE); + unsigned long aligned_end = ALIGN((u64)__init_end, PAGE_SIZE); + + /* Delete __init region from memblock.reserved. */ + memblock_free((void *)aligned_begin, aligned_end - aligned_begin); + free_reserved_area(lm_alias(__init_begin), lm_alias(__init_end), POISON_FREE_INITMEM, "unused kernel"); From 7eced90b202d63cdc1b9b11b1353adb1389830f9 Mon Sep 17 00:00:00 2001 From: Fares Mehanna Date: Mon, 2 Sep 2024 16:33:08 +0000 Subject: [PATCH 81/94] arm64: trans_pgd: mark PTEs entries as valid to avoid dead kexec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reasons for PTEs in the kernel direct map to be marked invalid are not limited to kfence / debug pagealloc machinery. In particular, memfd_secret() also steals pages with set_direct_map_invalid_noflush(). When building the transitional page tables for kexec from the current kernel's page tables, those pages need to become regular writable pages, otherwise, if the relocation places kexec segments over such pages, a fault will occur during kexec, leading to host going dark during kexec. This patch addresses the kexec issue by marking any PTE as valid if it is not none. While this fixes the kexec crash, it does not address the security concern that if processes owning secret memory are not terminated before kexec, the secret content will be mapped in the new kernel without being scrubbed. Suggested-by: Jan H. 
Schönherr Signed-off-by: Fares Mehanna Link: https://lore.kernel.org/r/20240902163309.97113-1-faresx@amazon.de Signed-off-by: Will Deacon --- arch/arm64/mm/trans_pgd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 5139a28130c0..0f7b484cb2ff 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -42,14 +42,16 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * the temporary mappings we use during restore. */ __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if ((debug_pagealloc_enabled() || - is_kfence_address((void *)addr)) && !pte_none(pte)) { + } else if (!pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if * the page isn't in use by the resume kernel. It may have * been in use by the original kernel, in which case we need * to put it back in our copy to do the restore. * + * Other cases include kfence / vmalloc / memfd_secret which + * may call `set_direct_map_invalid_noflush()`. + * * Before marking this entry valid, check the pfn should * be mapped. */ From eeb8fdfcf0901578c26ecfb11e814f36bc9a92f5 Mon Sep 17 00:00:00 2001 From: D Scott Phillips Date: Tue, 3 Sep 2024 09:45:32 -0700 Subject: [PATCH 82/94] arm64: Expose the end of the linear map in PHYSMEM_END The memory hot-plug and resource management code needs to know the largest address which can fit in the linear map, so set PHYSMEM_END for that purpose. This fixes a crash at boot when amdgpu tries to create DEVICE_PRIVATE_MEMORY and is given a physical address by the resource management code which is outside the range which can have a `struct page` | Unable to handle kernel paging request at virtual address 000001ffa6000034 | user pgtable: 4k pages, 48-bit VAs, pgdp=000008000287c000 | [000001ffa6000034] pgd=0000000000000000, p4d=0000000000000000 | Call trace: | __init_zone_device_page.constprop.0+0x2c/0xa8 | memmap_init_zone_device+0xf0/0x210 | pagemap_range+0x1e0/0x410 | memremap_pages+0x18c/0x2e0 | devm_memremap_pages+0x30/0x90 | kgd2kfd_init_zone_device+0xf0/0x200 [amdgpu] | amdgpu_device_ip_init+0x674/0x888 [amdgpu] | amdgpu_device_init+0x7a4/0xea0 [amdgpu] | amdgpu_driver_load_kms+0x28/0x1c0 [amdgpu] | amdgpu_pci_probe+0x1a0/0x560 [amdgpu] Signed-off-by: D Scott Phillips Link: https://lore.kernel.org/r/20240903164532.3874988-1-scott@os.amperecomputing.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 54fb014eba05..0480c61dbb4f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,6 +110,8 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ +#define PHYSMEM_END __pa(PAGE_END - 1) + #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* From 70565f2be8807e5ea24dfb421197b881a02af5e2 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 5 Sep 2024 20:11:24 +1200 Subject: [PATCH 83/94] mm: arm64: document why pte is not advanced in contpte_ptep_set_access_flags() According to David and Ryan, there isn't a bug here, even though we don't advance the PTE entry, because __ptep_set_access_flags() only uses the access flags from the entry. However, we always check pte_same(pte, entry) using the first entry in __ptep_set_access_flags(). This means that the checks from 1 to nr - 1 are not comparing the same PTE indexes (thus, they always return false), which can be a bit confusing. 
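For illustration only (this paraphrases the loop touched by this patch; it is not new code), the pattern in question is:

    /*
     * Sketch of the documented pattern: ptep and addr advance across the
     * contpte block, while entry is deliberately left alone, because
     * __ptep_set_access_flags() only consumes the access flags from it.
     */
    for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
            __ptep_set_access_flags(vma, addr, ptep, entry, 0);
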
To clarify the code, let's add some comments. Reviewed-by: Ryan Roberts Signed-off-by: Barry Song Cc: Ard Biesheuvel Cc: John Hubbard Cc: Mark Rutland Cc: Catalin Marinas Cc: David Hildenbrand Cc: Will Deacon Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240905081124.9576-1-21cnbao@gmail.com Signed-off-by: Will Deacon --- arch/arm64/mm/contpte.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index a3edced29ac1..55107d27d3f8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -421,6 +421,12 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + /* + * We are not advancing entry because __ptep_set_access_flags() + * only consumes access flags from entry. And since we have checked + * for the whole contpte block and returned early, pte_same() + * within __ptep_set_access_flags() is likely false. + */ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) __ptep_set_access_flags(vma, addr, ptep, entry, 0); From c02e7c5c6da8c637fec60158b0d4b330841de5ce Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 5 Sep 2024 16:29:35 +0100 Subject: [PATCH 84/94] arm64/mm: use lm_alias() with addresses passed to memblock_free() The pointer argument to memblock_free() needs to be a linear map address, but in mem_init() we pass __init_begin/__init_end, which is a kernel image address. This results in warnings when building with CONFIG_DEBUG_VIRTUAL=y: virt_to_phys used for non-linear address: ffff800081270000 (set_reset_devices+0x0/0x10) WARNING: CPU: 0 PID: 1 at arch/arm64/mm/physaddr.c:12 __virt_to_phys+0x54/0x70 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc6-next-20240905 #5810 b1ebb0ad06653f35ce875413d5afad24668df3f3 Hardware name: FVP Base RevC (DT) pstate: 2161402005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : __virt_to_phys+0x54/0x70 lr : __virt_to_phys+0x54/0x70 sp : ffff80008169be20 ... Call trace: __virt_to_phys+0x54/0x70 memblock_free+0x18/0x30 free_initmem+0x3c/0x9c kernel_init+0x30/0x1cc ret_from_fork+0x10/0x20 Fix this by having mem_init() convert the pointers via lm_alias(). Fixes: 1db9716d4487 ("arm64/mm: Delete __init region from memblock.reserved") Signed-off-by: Joey Gouly Suggested-by: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Rong Qianfeng Reviewed-by: Mark Rutland Link: https://lore.kernel.org/r/20240905152935.4156469-1-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index aea834a9691a..a0400b9aa814 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -414,14 +414,16 @@ void __init mem_init(void) void free_initmem(void) { - unsigned long aligned_begin = ALIGN_DOWN((u64)__init_begin, PAGE_SIZE); - unsigned long aligned_end = ALIGN((u64)__init_end, PAGE_SIZE); + void *lm_init_begin = lm_alias(__init_begin); + void *lm_init_end = lm_alias(__init_end); + + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE)); /* Delete __init region from memblock.reserved. 
*/ - memblock_free((void *)aligned_begin, aligned_end - aligned_begin); + memblock_free(lm_init_begin, lm_init_end - lm_init_begin); - free_reserved_area(lm_alias(__init_begin), - lm_alias(__init_end), + free_reserved_area(lm_init_begin, lm_init_end, POISON_FREE_INITMEM, "unused kernel"); /* * Unmap the __init region but leave the VM area in place. This From f04b611e66503336bbdac04eb5a76d62932ce2e3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 4 Sep 2024 19:41:54 +0100 Subject: [PATCH 85/94] perf/arm-cmn: Clean up unnecessary NUMA_NO_NODE check Checking for NUMA_NO_NODE is a misleading and, on reflection, entirely unnecessary micro-optimisation. If it ever did happen that an incoming CPU has no NUMA affinity while the current CPU does, a questionably- useful PMU migration isn't the biggest thing wrong with that picture... Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/00634da33c21269a00844140afc7cc3a2ac1eb4d.1725474584.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 0266ff041e0b..62d4782da7e4 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2023,7 +2023,7 @@ static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_nod cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); node = dev_to_node(cmn->dev); - if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node) + if (cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node) arm_cmn_migrate(cmn, cpu); return 0; } From f32efa3e4bba5b3432d7932dc89bd2e36c5c0f49 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 4 Sep 2024 19:41:55 +0100 Subject: [PATCH 86/94] perf/arm-cmn: Improve format attr printing Take full advantage of our formats being stored in bitfield form, and make the printing even more robust and simple by letting printk do all the hard work of formatting bitlists. Signed-off-by: Robin Murphy Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/50459f2d48fc62310a566863dbf8a7c14361d363.1725474584.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 62d4782da7e4..397a46410f7c 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1280,15 +1280,11 @@ static ssize_t arm_cmn_format_show(struct device *dev, struct device_attribute *attr, char *buf) { struct arm_cmn_format_attr *fmt = container_of(attr, typeof(*fmt), attr); - int lo = __ffs(fmt->field), hi = __fls(fmt->field); - - if (lo == hi) - return sysfs_emit(buf, "config:%d\n", lo); if (!fmt->config) - return sysfs_emit(buf, "config:%d-%d\n", lo, hi); + return sysfs_emit(buf, "config:%*pbl\n", 64, &fmt->field); - return sysfs_emit(buf, "config%d:%d-%d\n", fmt->config, lo, hi); + return sysfs_emit(buf, "config%d:%*pbl\n", fmt->config, 64, &fmt->field); } #define _CMN_FORMAT_ATTR(_name, _cfg, _fld) \ From abbe74dd105b45330dd269530060d30441d45b95 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 4 Sep 2024 18:34:02 +0100 Subject: [PATCH 87/94] dt-bindings/perf: Add Arm NI-700 PMU Add an initial binding for the Arm NI-700 interconnect PMU. As with the Arm CMN family, there are already future NI products on the roadmap, so the overall binding is named generically just in case any non-discoverable incompatibility between generations crops up. 
Cc: devicetree@vger.kernel.org Reviewed-by: Rob Herring (Arm) Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/5f86237580219116de37e5e54d8b7eb0c9ed580d.1725470837.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- .../devicetree/bindings/perf/arm,ni.yaml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/arm,ni.yaml diff --git a/Documentation/devicetree/bindings/perf/arm,ni.yaml b/Documentation/devicetree/bindings/perf/arm,ni.yaml new file mode 100644 index 000000000000..d66fffa256d5 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/arm,ni.yaml @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/perf/arm,ni.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Arm NI (Network-on-Chip Interconnect) Performance Monitors + +maintainers: + - Robin Murphy + +properties: + compatible: + const: arm,ni-700 + + reg: + items: + - description: Complete configuration register space + + interrupts: + minItems: 1 + maxItems: 32 + description: Overflow interrupts, one per clock domain, in order of domain ID + +required: + - compatible + - reg + - interrupts + +additionalProperties: false From 4d5a7680f2b4d0c2955e1d9f9a594b050d637436 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 4 Sep 2024 18:34:03 +0100 Subject: [PATCH 88/94] perf: Add driver for Arm NI-700 interconnect PMU The Arm NI-700 Network-on-Chip Interconnect has a relatively straightforward design with a hierarchy of voltage, power, and clock domains, where each clock domain then contains a number of interface units and a PMU which can monitor events thereon. As such, it begets a relatively straightforward driver to interface those PMUs with perf. Even more so than with arm-cmn, users will require detailed knowledge of the wider system topology in order to meaningfully analyse anything, since the interconnect itself cannot know what lies beyond the boundary of each inscrutably-numbered interface. Given that, for now they are also expected to refer to the NI-700 documentation for the relevant event IDs to provide as well. An identifier is implemented so we can come back and add jevents if anyone really wants to. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/9933058d0ab8138c78a61cd6852ea5d5ff48e393.1725470837.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/arm-ni.rst | 17 + Documentation/admin-guide/perf/index.rst | 1 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/arm-ni.c | 781 ++++++++++++++++++++++ 5 files changed, 807 insertions(+) create mode 100644 Documentation/admin-guide/perf/arm-ni.rst create mode 100644 drivers/perf/arm-ni.c diff --git a/Documentation/admin-guide/perf/arm-ni.rst b/Documentation/admin-guide/perf/arm-ni.rst new file mode 100644 index 000000000000..d26a8f697c36 --- /dev/null +++ b/Documentation/admin-guide/perf/arm-ni.rst @@ -0,0 +1,17 @@ +==================================== +Arm Network-on Chip Interconnect PMU +==================================== + +NI-700 and friends implement a distinct PMU for each clock domain within the +interconnect. Correspondingly, the driver exposes multiple PMU devices named +arm_ni__cd_, where is an (arbitrary) instance identifier and is +the clock domain ID within that particular instance. 
If multiple NI instances +exist within a system, the PMU devices can be correlated with the underlying +hardware instance via sysfs parentage. + +Each PMU exposes base event aliases for the interface types present in its clock +domain. These require qualifying with the "eventid" and "nodeid" parameters +to specify the event code to count and the interface at which to count it +(per the configured hardware ID as reflected in the xxNI_NODE_INFO register). +The exception is the "cycles" alias for the PMU cycle counter, which is encoded +with the PMU node type and needs no further qualification. diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 7eb3dcd6f4da..8502bc174640 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -16,6 +16,7 @@ Performance monitor support starfive_starlink_pmu arm-ccn arm-cmn + arm-ni xgene-pmu arm_dsu_pmu thunderx2-pmu diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index aa9530b4064f..bab8ba64162f 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -48,6 +48,13 @@ config ARM_CMN Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh Network interconnect. +config ARM_NI + tristate "Arm NI-700 PMU support" + depends on ARM64 || COMPILE_TEST + help + Support for PMU events monitoring on the Arm NI-700 Network-on-Chip + interconnect and family. + config ARM_PMU depends on ARM || ARM64 bool "ARM PMU framework" diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index d43df81d52f7..8268f38e42c5 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_ARM_CCI_PMU) += arm-cci.o obj-$(CONFIG_ARM_CCN) += arm-ccn.o obj-$(CONFIG_ARM_CMN) += arm-cmn.o obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o +obj-$(CONFIG_ARM_NI) += arm-ni.o obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o obj-$(CONFIG_ARM_PMUV3) += arm_pmuv3.o diff --git a/drivers/perf/arm-ni.c b/drivers/perf/arm-ni.c new file mode 100644 index 000000000000..b72df3aea93e --- /dev/null +++ b/drivers/perf/arm-ni.c @@ -0,0 +1,781 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2022-2024 Arm Limited +// NI-700 Network-on-Chip PMU driver + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Common registers */ +#define NI_NODE_TYPE 0x000 +#define NI_NODE_TYPE_NODE_ID GENMASK(31, 16) +#define NI_NODE_TYPE_NODE_TYPE GENMASK(15, 0) + +#define NI_CHILD_NODE_INFO 0x004 +#define NI_CHILD_PTR(n) (0x008 + (n) * 4) + +#define NI700_PMUSELA 0x00c + +/* Config node */ +#define NI_PERIPHERAL_ID0 0xfe0 +#define NI_PIDR0_PART_7_0 GENMASK(7, 0) +#define NI_PERIPHERAL_ID1 0xfe4 +#define NI_PIDR1_PART_11_8 GENMASK(3, 0) +#define NI_PERIPHERAL_ID2 0xfe8 +#define NI_PIDR2_VERSION GENMASK(7, 4) + +/* PMU node */ +#define NI_PMEVCNTR(n) (0x008 + (n) * 8) +#define NI_PMCCNTR_L 0x0f8 +#define NI_PMCCNTR_U 0x0fc +#define NI_PMEVTYPER(n) (0x400 + (n) * 4) +#define NI_PMEVTYPER_NODE_TYPE GENMASK(12, 9) +#define NI_PMEVTYPER_NODE_ID GENMASK(8, 0) +#define NI_PMCNTENSET 0xc00 +#define NI_PMCNTENCLR 0xc20 +#define NI_PMINTENSET 0xc40 +#define NI_PMINTENCLR 0xc60 +#define NI_PMOVSCLR 0xc80 +#define NI_PMOVSSET 0xcc0 +#define NI_PMCFGR 0xe00 +#define NI_PMCR 0xe04 +#define NI_PMCR_RESET_CCNT BIT(2) +#define NI_PMCR_RESET_EVCNT BIT(1) +#define NI_PMCR_ENABLE BIT(0) + +#define NI_NUM_COUNTERS 8 +#define NI_CCNT_IDX 31 + +/* Event attributes */ +#define 
NI_CONFIG_TYPE GENMASK_ULL(15, 0) +#define NI_CONFIG_NODEID GENMASK_ULL(31, 16) +#define NI_CONFIG_EVENTID GENMASK_ULL(47, 32) + +#define NI_EVENT_TYPE(event) FIELD_GET(NI_CONFIG_TYPE, (event)->attr.config) +#define NI_EVENT_NODEID(event) FIELD_GET(NI_CONFIG_NODEID, (event)->attr.config) +#define NI_EVENT_EVENTID(event) FIELD_GET(NI_CONFIG_EVENTID, (event)->attr.config) + +enum ni_part { + PART_NI_700 = 0x43b, + PART_NI_710AE = 0x43d, +}; + +enum ni_node_type { + NI_GLOBAL, + NI_VOLTAGE, + NI_POWER, + NI_CLOCK, + NI_ASNI, + NI_AMNI, + NI_PMU, + NI_HSNI, + NI_HMNI, + NI_PMNI, +}; + +struct arm_ni_node { + void __iomem *base; + enum ni_node_type type; + u16 id; + u32 num_components; +}; + +struct arm_ni_unit { + void __iomem *pmusela; + enum ni_node_type type; + u16 id; + bool ns; + union { + __le64 pmusel; + u8 event[8]; + }; +}; + +struct arm_ni_cd { + void __iomem *pmu_base; + u16 id; + int num_units; + int irq; + int cpu; + struct hlist_node cpuhp_node; + struct pmu pmu; + struct arm_ni_unit *units; + struct perf_event *evcnt[NI_NUM_COUNTERS]; + struct perf_event *ccnt; +}; + +struct arm_ni { + struct device *dev; + void __iomem *base; + enum ni_part part; + int id; + int num_cds; + struct arm_ni_cd cds[] __counted_by(num_cds); +}; + +#define cd_to_ni(cd) container_of((cd), struct arm_ni, cds[(cd)->id]) +#define pmu_to_cd(p) container_of((p), struct arm_ni_cd, pmu) + +#define cd_for_each_unit(cd, u) \ + for (struct arm_ni_unit *u = cd->units; u < cd->units + cd->num_units; u++) + +static int arm_ni_hp_state; + +struct arm_ni_event_attr { + struct device_attribute attr; + enum ni_node_type type; +}; + +#define NI_EVENT_ATTR(_name, _type) \ + (&((struct arm_ni_event_attr[]) {{ \ + .attr = __ATTR(_name, 0444, arm_ni_event_show, NULL), \ + .type = _type, \ + }})[0].attr.attr) + +static ssize_t arm_ni_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_ni_event_attr *eattr = container_of(attr, typeof(*eattr), attr); + + if (eattr->type == NI_PMU) + return sysfs_emit(buf, "type=0x%x\n", eattr->type); + + return sysfs_emit(buf, "type=0x%x,eventid=?,nodeid=?\n", eattr->type); +} + +static umode_t arm_ni_event_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct arm_ni_cd *cd = pmu_to_cd(dev_get_drvdata(dev)); + struct arm_ni_event_attr *eattr; + + eattr = container_of(attr, typeof(*eattr), attr.attr); + + cd_for_each_unit(cd, unit) { + if (unit->type == eattr->type && unit->ns) + return attr->mode; + } + + return 0; +} + +static struct attribute *arm_ni_event_attrs[] = { + NI_EVENT_ATTR(asni, NI_ASNI), + NI_EVENT_ATTR(amni, NI_AMNI), + NI_EVENT_ATTR(cycles, NI_PMU), + NI_EVENT_ATTR(hsni, NI_HSNI), + NI_EVENT_ATTR(hmni, NI_HMNI), + NI_EVENT_ATTR(pmni, NI_PMNI), + NULL +}; + +static const struct attribute_group arm_ni_event_attrs_group = { + .name = "events", + .attrs = arm_ni_event_attrs, + .is_visible = arm_ni_event_attr_is_visible, +}; + +struct arm_ni_format_attr { + struct device_attribute attr; + u64 field; +}; + +#define NI_FORMAT_ATTR(_name, _fld) \ + (&((struct arm_ni_format_attr[]) {{ \ + .attr = __ATTR(_name, 0444, arm_ni_format_show, NULL), \ + .field = _fld, \ + }})[0].attr.attr) + +static ssize_t arm_ni_format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_ni_format_attr *fmt = container_of(attr, typeof(*fmt), attr); + + return sysfs_emit(buf, "config:%*pbl\n", 64, &fmt->field); +} + +static struct attribute *arm_ni_format_attrs[] = 
{ + NI_FORMAT_ATTR(type, NI_CONFIG_TYPE), + NI_FORMAT_ATTR(nodeid, NI_CONFIG_NODEID), + NI_FORMAT_ATTR(eventid, NI_CONFIG_EVENTID), + NULL +}; + +static const struct attribute_group arm_ni_format_attrs_group = { + .name = "format", + .attrs = arm_ni_format_attrs, +}; + +static ssize_t arm_ni_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_ni_cd *cd = pmu_to_cd(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(cd->cpu)); +} + +static struct device_attribute arm_ni_cpumask_attr = + __ATTR(cpumask, 0444, arm_ni_cpumask_show, NULL); + +static ssize_t arm_ni_identifier_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_ni *ni = cd_to_ni(pmu_to_cd(dev_get_drvdata(dev))); + u32 reg = readl_relaxed(ni->base + NI_PERIPHERAL_ID2); + int version = FIELD_GET(NI_PIDR2_VERSION, reg); + + return sysfs_emit(buf, "%03x%02x\n", ni->part, version); +} + +static struct device_attribute arm_ni_identifier_attr = + __ATTR(identifier, 0444, arm_ni_identifier_show, NULL); + +static struct attribute *arm_ni_other_attrs[] = { + &arm_ni_cpumask_attr.attr, + &arm_ni_identifier_attr.attr, + NULL +}; + +static const struct attribute_group arm_ni_other_attr_group = { + .attrs = arm_ni_other_attrs, + NULL +}; + +static const struct attribute_group *arm_ni_attr_groups[] = { + &arm_ni_event_attrs_group, + &arm_ni_format_attrs_group, + &arm_ni_other_attr_group, + NULL +}; + +static void arm_ni_pmu_enable(struct pmu *pmu) +{ + writel_relaxed(NI_PMCR_ENABLE, pmu_to_cd(pmu)->pmu_base + NI_PMCR); +} + +static void arm_ni_pmu_disable(struct pmu *pmu) +{ + writel_relaxed(0, pmu_to_cd(pmu)->pmu_base + NI_PMCR); +} + +struct arm_ni_val { + unsigned int evcnt; + unsigned int ccnt; +}; + +static bool arm_ni_val_count_event(struct perf_event *evt, struct arm_ni_val *val) +{ + if (is_software_event(evt)) + return true; + + if (NI_EVENT_TYPE(evt) == NI_PMU) { + val->ccnt++; + return val->ccnt <= 1; + } + + val->evcnt++; + return val->evcnt <= NI_NUM_COUNTERS; +} + +static int arm_ni_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct arm_ni_val val = { 0 }; + + if (leader == event) + return 0; + + arm_ni_val_count_event(event, &val); + if (!arm_ni_val_count_event(leader, &val)) + return -EINVAL; + + for_each_sibling_event(sibling, leader) { + if (!arm_ni_val_count_event(sibling, &val)) + return -EINVAL; + } + return 0; +} + +static int arm_ni_event_init(struct perf_event *event) +{ + struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (is_sampling_event(event)) + return -EINVAL; + + event->cpu = cd->cpu; + if (NI_EVENT_TYPE(event) == NI_PMU) + return arm_ni_validate_group(event); + + cd_for_each_unit(cd, unit) { + if (unit->type == NI_EVENT_TYPE(event) && + unit->id == NI_EVENT_NODEID(event) && unit->ns) { + event->hw.config_base = (unsigned long)unit; + return arm_ni_validate_group(event); + } + } + return -EINVAL; +} + +static u64 arm_ni_read_ccnt(struct arm_ni_cd *cd) +{ + u64 l, u_old, u_new; + int retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */ + + u_new = readl_relaxed(cd->pmu_base + NI_PMCCNTR_U); + do { + u_old = u_new; + l = readl_relaxed(cd->pmu_base + NI_PMCCNTR_L); + u_new = readl_relaxed(cd->pmu_base + NI_PMCCNTR_U); + } while (u_new != u_old && --retries); + WARN_ON(!retries); + + return (u_new << 32) | l; +} + +static void arm_ni_event_read(struct perf_event *event) +{ + 
struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + struct hw_perf_event *hw = &event->hw; + u64 count, prev; + bool ccnt = hw->idx == NI_CCNT_IDX; + + do { + prev = local64_read(&hw->prev_count); + if (ccnt) + count = arm_ni_read_ccnt(cd); + else + count = readl_relaxed(cd->pmu_base + NI_PMEVCNTR(hw->idx)); + } while (local64_cmpxchg(&hw->prev_count, prev, count) != prev); + + count -= prev; + if (!ccnt) + count = (u32)count; + local64_add(count, &event->count); +} + +static void arm_ni_event_start(struct perf_event *event, int flags) +{ + struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + + writel_relaxed(1U << event->hw.idx, cd->pmu_base + NI_PMCNTENSET); +} + +static void arm_ni_event_stop(struct perf_event *event, int flags) +{ + struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + + writel_relaxed(1U << event->hw.idx, cd->pmu_base + NI_PMCNTENCLR); + if (flags & PERF_EF_UPDATE) + arm_ni_event_read(event); +} + +static void arm_ni_init_ccnt(struct arm_ni_cd *cd) +{ + local64_set(&cd->ccnt->hw.prev_count, S64_MIN); + lo_hi_writeq_relaxed(S64_MIN, cd->pmu_base + NI_PMCCNTR_L); +} + +static void arm_ni_init_evcnt(struct arm_ni_cd *cd, int idx) +{ + local64_set(&cd->evcnt[idx]->hw.prev_count, S32_MIN); + writel_relaxed(S32_MIN, cd->pmu_base + NI_PMEVCNTR(idx)); +} + +static int arm_ni_event_add(struct perf_event *event, int flags) +{ + struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + struct hw_perf_event *hw = &event->hw; + struct arm_ni_unit *unit; + enum ni_node_type type = NI_EVENT_TYPE(event); + u32 reg; + + if (type == NI_PMU) { + if (cd->ccnt) + return -ENOSPC; + hw->idx = NI_CCNT_IDX; + cd->ccnt = event; + arm_ni_init_ccnt(cd); + } else { + hw->idx = 0; + while (cd->evcnt[hw->idx]) { + if (++hw->idx == NI_NUM_COUNTERS) + return -ENOSPC; + } + cd->evcnt[hw->idx] = event; + unit = (void *)hw->config_base; + unit->event[hw->idx] = NI_EVENT_EVENTID(event); + arm_ni_init_evcnt(cd, hw->idx); + lo_hi_writeq_relaxed(le64_to_cpu(unit->pmusel), unit->pmusela); + + reg = FIELD_PREP(NI_PMEVTYPER_NODE_TYPE, type) | + FIELD_PREP(NI_PMEVTYPER_NODE_ID, NI_EVENT_NODEID(event)); + writel_relaxed(reg, cd->pmu_base + NI_PMEVTYPER(hw->idx)); + } + if (flags & PERF_EF_START) + arm_ni_event_start(event, 0); + return 0; +} + +static void arm_ni_event_del(struct perf_event *event, int flags) +{ + struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + struct hw_perf_event *hw = &event->hw; + + arm_ni_event_stop(event, PERF_EF_UPDATE); + + if (hw->idx == NI_CCNT_IDX) + cd->ccnt = NULL; + else + cd->evcnt[hw->idx] = NULL; +} + +static irqreturn_t arm_ni_handle_irq(int irq, void *dev_id) +{ + struct arm_ni_cd *cd = dev_id; + irqreturn_t ret = IRQ_NONE; + u32 reg = readl_relaxed(cd->pmu_base + NI_PMOVSCLR); + + if (reg & (1U << NI_CCNT_IDX)) { + ret = IRQ_HANDLED; + if (!(WARN_ON(!cd->ccnt))) { + arm_ni_event_read(cd->ccnt); + arm_ni_init_ccnt(cd); + } + } + for (int i = 0; i < NI_NUM_COUNTERS; i++) { + if (!(reg & (1U << i))) + continue; + ret = IRQ_HANDLED; + if (!(WARN_ON(!cd->evcnt[i]))) { + arm_ni_event_read(cd->evcnt[i]); + arm_ni_init_evcnt(cd, i); + } + } + writel_relaxed(reg, cd->pmu_base + NI_PMOVSCLR); + return ret; +} + +static int arm_ni_init_cd(struct arm_ni *ni, struct arm_ni_node *node, u64 res_start) +{ + struct arm_ni_cd *cd = ni->cds + node->id; + const char *name; + int err; + + cd->id = node->id; + cd->num_units = node->num_components; + cd->units = devm_kcalloc(ni->dev, cd->num_units, sizeof(*(cd->units)), GFP_KERNEL); + if (!cd->units) + return -ENOMEM; + + for (int i = 0; i < cd->num_units; i++) { + u32 reg 
= readl_relaxed(node->base + NI_CHILD_PTR(i)); + void __iomem *unit_base = ni->base + reg; + struct arm_ni_unit *unit = cd->units + i; + + reg = readl_relaxed(unit_base + NI_NODE_TYPE); + unit->type = FIELD_GET(NI_NODE_TYPE_NODE_TYPE, reg); + unit->id = FIELD_GET(NI_NODE_TYPE_NODE_ID, reg); + + switch (unit->type) { + case NI_PMU: + reg = readl_relaxed(unit_base + NI_PMCFGR); + if (!reg) { + dev_info(ni->dev, "No access to PMU %d\n", cd->id); + devm_kfree(ni->dev, cd->units); + return 0; + } + unit->ns = true; + cd->pmu_base = unit_base; + break; + case NI_ASNI: + case NI_AMNI: + case NI_HSNI: + case NI_HMNI: + case NI_PMNI: + unit->pmusela = unit_base + NI700_PMUSELA; + writel_relaxed(1, unit->pmusela); + if (readl_relaxed(unit->pmusela) != 1) + dev_info(ni->dev, "No access to node 0x%04x%04x\n", unit->id, unit->type); + else + unit->ns = true; + break; + default: + /* + * e.g. FMU - thankfully bits 3:2 of FMU_ERR_FR0 are RES0 so + * can't alias any of the leaf node types we're looking for. + */ + dev_dbg(ni->dev, "Mystery node 0x%04x%04x\n", unit->id, unit->type); + break; + } + } + + res_start += cd->pmu_base - ni->base; + if (!devm_request_mem_region(ni->dev, res_start, SZ_4K, dev_name(ni->dev))) { + dev_err(ni->dev, "Failed to request PMU region 0x%llx\n", res_start); + return -EBUSY; + } + + writel_relaxed(NI_PMCR_RESET_CCNT | NI_PMCR_RESET_EVCNT, + cd->pmu_base + NI_PMCR); + writel_relaxed(U32_MAX, cd->pmu_base + NI_PMCNTENCLR); + writel_relaxed(U32_MAX, cd->pmu_base + NI_PMOVSCLR); + writel_relaxed(U32_MAX, cd->pmu_base + NI_PMINTENSET); + + cd->irq = platform_get_irq(to_platform_device(ni->dev), cd->id); + if (cd->irq < 0) + return cd->irq; + + err = devm_request_irq(ni->dev, cd->irq, arm_ni_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + dev_name(ni->dev), cd); + if (err) + return err; + + cd->cpu = cpumask_local_spread(0, dev_to_node(ni->dev)); + cd->pmu = (struct pmu) { + .module = THIS_MODULE, + .parent = ni->dev, + .attr_groups = arm_ni_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = arm_ni_pmu_enable, + .pmu_disable = arm_ni_pmu_disable, + .event_init = arm_ni_event_init, + .add = arm_ni_event_add, + .del = arm_ni_event_del, + .start = arm_ni_event_start, + .stop = arm_ni_event_stop, + .read = arm_ni_event_read, + }; + + name = devm_kasprintf(ni->dev, GFP_KERNEL, "arm_ni_%d_cd_%d", ni->id, cd->id); + if (!name) + return -ENOMEM; + + err = cpuhp_state_add_instance_nocalls(arm_ni_hp_state, &cd->cpuhp_node); + if (err) + return err; + + err = perf_pmu_register(&cd->pmu, name, -1); + if (err) + cpuhp_state_remove_instance_nocalls(arm_ni_hp_state, &cd->cpuhp_node); + + return err; +} + +static void arm_ni_probe_domain(void __iomem *base, struct arm_ni_node *node) +{ + u32 reg = readl_relaxed(base + NI_NODE_TYPE); + + node->base = base; + node->type = FIELD_GET(NI_NODE_TYPE_NODE_TYPE, reg); + node->id = FIELD_GET(NI_NODE_TYPE_NODE_ID, reg); + node->num_components = readl_relaxed(base + NI_CHILD_NODE_INFO); +} + +static int arm_ni_probe(struct platform_device *pdev) +{ + struct arm_ni_node cfg, vd, pd, cd; + struct arm_ni *ni; + struct resource *res; + void __iomem *base; + static atomic_t id; + int num_cds; + u32 reg, part; + + /* + * We want to map the whole configuration space for ease of discovery, + * but the PMU pages are the only ones for which we can honestly claim + * exclusive ownership, so we'll request them explicitly once found. 
+ */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); + if (IS_ERR(base)) + return PTR_ERR(base); + + arm_ni_probe_domain(base, &cfg); + if (cfg.type != NI_GLOBAL) + return -ENODEV; + + reg = readl_relaxed(cfg.base + NI_PERIPHERAL_ID0); + part = FIELD_GET(NI_PIDR0_PART_7_0, reg); + reg = readl_relaxed(cfg.base + NI_PERIPHERAL_ID1); + part |= FIELD_GET(NI_PIDR1_PART_11_8, reg) << 8; + + switch (part) { + case PART_NI_700: + case PART_NI_710AE: + break; + default: + dev_WARN(&pdev->dev, "Unknown part number: 0x%03x, this may go badly\n", part); + break; + } + + num_cds = 0; + for (int v = 0; v < cfg.num_components; v++) { + reg = readl_relaxed(cfg.base + NI_CHILD_PTR(v)); + arm_ni_probe_domain(base + reg, &vd); + for (int p = 0; p < vd.num_components; p++) { + reg = readl_relaxed(vd.base + NI_CHILD_PTR(p)); + arm_ni_probe_domain(base + reg, &pd); + num_cds += pd.num_components; + } + } + + ni = devm_kzalloc(&pdev->dev, struct_size(ni, cds, num_cds), GFP_KERNEL); + if (!ni) + return -ENOMEM; + + ni->dev = &pdev->dev; + ni->base = base; + ni->num_cds = num_cds; + ni->part = part; + ni->id = atomic_fetch_inc(&id); + + for (int v = 0; v < cfg.num_components; v++) { + reg = readl_relaxed(cfg.base + NI_CHILD_PTR(v)); + arm_ni_probe_domain(base + reg, &vd); + for (int p = 0; p < vd.num_components; p++) { + reg = readl_relaxed(vd.base + NI_CHILD_PTR(p)); + arm_ni_probe_domain(base + reg, &pd); + for (int c = 0; c < pd.num_components; c++) { + int ret; + + reg = readl_relaxed(pd.base + NI_CHILD_PTR(c)); + arm_ni_probe_domain(base + reg, &cd); + ret = arm_ni_init_cd(ni, &cd, res->start); + if (ret) + return ret; + } + } + } + + return 0; +} + +static void arm_ni_remove(struct platform_device *pdev) +{ + struct arm_ni *ni = platform_get_drvdata(pdev); + + for (int i = 0; i < ni->num_cds; i++) { + struct arm_ni_cd *cd = ni->cds + i; + + if (!cd->pmu_base) + continue; + + writel_relaxed(0, cd->pmu_base + NI_PMCR); + writel_relaxed(U32_MAX, cd->pmu_base + NI_PMINTENCLR); + perf_pmu_unregister(&cd->pmu); + cpuhp_state_remove_instance_nocalls(arm_ni_hp_state, &cd->cpuhp_node); + } +} + +#ifdef CONFIG_OF +static const struct of_device_id arm_ni_of_match[] = { + { .compatible = "arm,ni-700" }, + {} +}; +MODULE_DEVICE_TABLE(of, arm_ni_of_match); +#endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id arm_ni_acpi_match[] = { + { "ARMHCB70" }, + {} +}; +MODULE_DEVICE_TABLE(acpi, arm_ni_acpi_match); +#endif + +static struct platform_driver arm_ni_driver = { + .driver = { + .name = "arm-ni", + .of_match_table = of_match_ptr(arm_ni_of_match), + .acpi_match_table = ACPI_PTR(arm_ni_acpi_match), + }, + .probe = arm_ni_probe, + .remove = arm_ni_remove, +}; + +static void arm_ni_pmu_migrate(struct arm_ni_cd *cd, unsigned int cpu) +{ + perf_pmu_migrate_context(&cd->pmu, cd->cpu, cpu); + irq_set_affinity(cd->irq, cpumask_of(cpu)); + cd->cpu = cpu; +} + +static int arm_ni_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct arm_ni_cd *cd; + int node; + + cd = hlist_entry_safe(cpuhp_node, struct arm_ni_cd, cpuhp_node); + node = dev_to_node(cd_to_ni(cd)->dev); + if (cpu_to_node(cd->cpu) != node && cpu_to_node(cpu) == node) + arm_ni_pmu_migrate(cd, cpu); + return 0; +} + +static int arm_ni_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct arm_ni_cd *cd; + unsigned int target; + int node; + + cd = hlist_entry_safe(cpuhp_node, struct arm_ni_cd, cpuhp_node); + if (cpu != cd->cpu) + return 0; + + 
node = dev_to_node(cd_to_ni(cd)->dev); + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target < nr_cpu_ids) + arm_ni_pmu_migrate(cd, target); + return 0; +} + +static int __init arm_ni_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/arm/ni:online", + arm_ni_pmu_online_cpu, + arm_ni_pmu_offline_cpu); + if (ret < 0) + return ret; + + arm_ni_hp_state = ret; + + ret = platform_driver_register(&arm_ni_driver); + if (ret) + cpuhp_remove_multi_state(arm_ni_hp_state); + return ret; +} + +static void __exit arm_ni_exit(void) +{ + platform_driver_unregister(&arm_ni_driver); + cpuhp_remove_multi_state(arm_ni_hp_state); +} + +module_init(arm_ni_init); +module_exit(arm_ni_exit); + +MODULE_AUTHOR("Robin Murphy "); +MODULE_DESCRIPTION("Arm NI-700 PMU driver"); +MODULE_LICENSE("GPL v2"); From 91df34ef2d88e4208c3ad53c439e9d6dbc36bb55 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 4 Sep 2024 18:34:04 +0100 Subject: [PATCH 89/94] MAINTAINERS: List Arm interconnect PMUs as supported Whatever I may or may not have hoped for, looking after these drivers seems to have firmly stuck as one of the responsibilities of the job Arm pays me for, and I would still like to be aware of any other patches, so make it official. CC: Ilkka Koskinen CC: Jing Zhang Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/22ef1687ff3aa9da49b4577b3a179ccc055433ae.1725470837.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0..1e1cd7414f06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1737,6 +1737,17 @@ F: drivers/mtd/maps/physmap-versatile.* F: drivers/power/reset/arm-versatile-reboot.c F: drivers/soc/versatile/ +ARM INTERCONNECT PMU DRIVERS +M: Robin Murphy +S: Supported +F: Documentation/admin-guide/perf/arm-cmn.rst +F: Documentation/admin-guide/perf/arm-ni.rst +F: Documentation/devicetree/bindings/perf/arm,cmn.yaml +F: Documentation/devicetree/bindings/perf/arm,ni.yaml +F: drivers/perf/arm-cmn.c +F: drivers/perf/arm-ni.c +F: tools/perf/pmu-events/arch/arm64/arm/cmn/ + ARM KOMEDA DRM-KMS DRIVER M: Liviu Dudau S: Supported From 5967a19f1c2ffb530f5d4589ddc4b4afbb6c7bd4 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Fri, 6 Sep 2024 12:15:39 -0700 Subject: [PATCH 90/94] perf: arm_pmuv3: Use BR_RETIRED for HW branch event if enabled The PMU driver attempts to use PC_WRITE_RETIRED for the HW branch event, if enabled. However, PC_WRITE_RETIRED counts only taken branches, whereas BR_RETIRED counts also non-taken ones. Furthermore, perf uses HW branch event to calculate branch misses ratio, implying BR_RETIRED is the correct event to count. We keep PC_WRITE_RETIRED still as an option in case BR_RETIRED isn't implemented. 
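In effect (an illustrative restatement of the hunk below, not additional code), the fallback order for PERF_COUNT_HW_BRANCH_INSTRUCTIONS becomes:

    /* Prefer BR_RETIRED, which counts both taken and not-taken branches;
     * fall back to PC_WRITE_RETIRED (taken branches only) if BR_RETIRED
     * is not implemented, otherwise report the event as unsupported.
     */
    if (test_bit(ARMV8_PMUV3_PERFCTR_BR_RETIRED, armpmu->pmceid_bitmap))
            return ARMV8_PMUV3_PERFCTR_BR_RETIRED;

    if (test_bit(ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, armpmu->pmceid_bitmap))
            return ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED;

    return HW_OP_UNSUPPORTED;
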
Signed-off-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20240906191539.4847-1-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 4d000532a07f..0afe02f879b4 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1086,14 +1086,14 @@ static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu, if (event->attr.type == PERF_TYPE_HARDWARE && event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) { - if (test_bit(ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - armpmu->pmceid_bitmap)) - return ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED; - if (test_bit(ARMV8_PMUV3_PERFCTR_BR_RETIRED, armpmu->pmceid_bitmap)) return ARMV8_PMUV3_PERFCTR_BR_RETIRED; + if (test_bit(ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, + armpmu->pmceid_bitmap)) + return ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED; + return HW_OP_UNSUPPORTED; } From 10166c23f41367b6aaebd403af86caab22466c22 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 10 Sep 2024 11:50:04 +0100 Subject: [PATCH 91/94] arm64: pkeys: remove redundant WARN FEAT_PAN3 is present if FEAT_S1POE is, this WARN() was to represent that. However execute_only_pkey() is always called by mmap(), even on a CPU without POE support. Rather than making the WARN() conditional, just delete it. Reported-by: Naresh Kamboju Link: https://lore.kernel.org/linux-arm-kernel/CA+G9fYvarKEPN3u1Ogw2pcw4h6r3OMzg+5qJpYkAXRunAEF_0Q@mail.gmail.com/ Signed-off-by: Joey Gouly Cc: Will Deacon Cc: Catalin Marinas Link: https://lore.kernel.org/r/20240910105004.706981-1-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pkeys.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h index 19eb1b12b7fc..0ca5f83ce148 100644 --- a/arch/arm64/include/asm/pkeys.h +++ b/arch/arm64/include/asm/pkeys.h @@ -37,8 +37,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, static inline int execute_only_pkey(struct mm_struct *mm) { // Execute-only mappings are handled by EPAN/FEAT_PAN3. - WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); - return -1; } From b6db3eb6c373b97d9e433530d748590421bbeea7 Mon Sep 17 00:00:00 2001 From: Anastasia Belova Date: Tue, 10 Sep 2024 11:50:16 +0300 Subject: [PATCH 92/94] arm64: esr: Define ESR_ELx_EC_* constants as UL Add explicit casting to prevent expantion of 32th bit of u32 into highest half of u64 in several places. For example, in inject_abt64: ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT = 0x24 << 26. This operation's result is int with 1 in 32th bit. While casting this value into u64 (esr is u64) 1 fills 32 highest bits. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
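A standalone sketch of the failure mode described above (a hypothetical user-space demonstration, not kernel code; the variable names are made up): shifting the int-typed constant sets bit 31, and the later conversion to a 64-bit value sign-extends it:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            /* 0x24 << 26 is evaluated as int, shifting a 1 into bit 31
             * (strictly undefined for signed int, in practice 0x90000000),
             * so converting the negative result to uint64_t fills the
             * upper 32 bits with ones.
             */
            uint64_t bad  = 0x24 << 26;
            /* With 64-bit unsigned arithmetic, as the UL() spelling in the
             * kernel header provides, there is no sign extension.
             */
            uint64_t good = UINT64_C(0x24) << 26;

            printf("bad  = 0x%016" PRIx64 "\n", bad);   /* 0xffffffff90000000 */
            printf("good = 0x%016" PRIx64 "\n", good);  /* 0x0000000090000000 */
            return 0;
    }
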
Cc: Fixes: aa8eff9bfbd5 ("arm64: KVM: fault injection into a guest") Signed-off-by: Anastasia Belova Acked-by: Marc Zyngier Link: https://lore.kernel.org/stable/20240910085016.32120-1-abelova%40astralinux.ru Link: https://lore.kernel.org/r/20240910085016.32120-1-abelova@astralinux.ru Signed-off-by: Will Deacon --- arch/arm64/include/asm/esr.h | 88 ++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 56c148890daf..2f3d56857a97 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -10,63 +10,63 @@ #include #include -#define ESR_ELx_EC_UNKNOWN (0x00) -#define ESR_ELx_EC_WFx (0x01) +#define ESR_ELx_EC_UNKNOWN UL(0x00) +#define ESR_ELx_EC_WFx UL(0x01) /* Unallocated EC: 0x02 */ -#define ESR_ELx_EC_CP15_32 (0x03) -#define ESR_ELx_EC_CP15_64 (0x04) -#define ESR_ELx_EC_CP14_MR (0x05) -#define ESR_ELx_EC_CP14_LS (0x06) -#define ESR_ELx_EC_FP_ASIMD (0x07) -#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ -#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +#define ESR_ELx_EC_CP15_32 UL(0x03) +#define ESR_ELx_EC_CP15_64 UL(0x04) +#define ESR_ELx_EC_CP14_MR UL(0x05) +#define ESR_ELx_EC_CP14_LS UL(0x06) +#define ESR_ELx_EC_FP_ASIMD UL(0x07) +#define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ -#define ESR_ELx_EC_CP14_64 (0x0C) -#define ESR_ELx_EC_BTI (0x0D) -#define ESR_ELx_EC_ILL (0x0E) +#define ESR_ELx_EC_CP14_64 UL(0x0C) +#define ESR_ELx_EC_BTI UL(0x0D) +#define ESR_ELx_EC_ILL UL(0x0E) /* Unallocated EC: 0x0F - 0x10 */ -#define ESR_ELx_EC_SVC32 (0x11) -#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ -#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +#define ESR_ELx_EC_SVC32 UL(0x11) +#define ESR_ELx_EC_HVC32 UL(0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 UL(0x13) /* EL2 and above */ /* Unallocated EC: 0x14 */ -#define ESR_ELx_EC_SVC64 (0x15) -#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ -#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ -#define ESR_ELx_EC_SYS64 (0x18) -#define ESR_ELx_EC_SVE (0x19) -#define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ +#define ESR_ELx_EC_SVC64 UL(0x15) +#define ESR_ELx_EC_HVC64 UL(0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 UL(0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 UL(0x18) +#define ESR_ELx_EC_SVE UL(0x19) +#define ESR_ELx_EC_ERET UL(0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ -#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ -#define ESR_ELx_EC_SME (0x1D) +#define ESR_ELx_EC_FPAC UL(0x1C) /* EL1 and above */ +#define ESR_ELx_EC_SME UL(0x1D) /* Unallocated EC: 0x1E */ -#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ -#define ESR_ELx_EC_IABT_LOW (0x20) -#define ESR_ELx_EC_IABT_CUR (0x21) -#define ESR_ELx_EC_PC_ALIGN (0x22) +#define ESR_ELx_EC_IMP_DEF UL(0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW UL(0x20) +#define ESR_ELx_EC_IABT_CUR UL(0x21) +#define ESR_ELx_EC_PC_ALIGN UL(0x22) /* Unallocated EC: 0x23 */ -#define ESR_ELx_EC_DABT_LOW (0x24) -#define ESR_ELx_EC_DABT_CUR (0x25) -#define ESR_ELx_EC_SP_ALIGN (0x26) -#define ESR_ELx_EC_MOPS (0x27) -#define ESR_ELx_EC_FP_EXC32 (0x28) +#define ESR_ELx_EC_DABT_LOW UL(0x24) +#define ESR_ELx_EC_DABT_CUR UL(0x25) +#define ESR_ELx_EC_SP_ALIGN UL(0x26) +#define ESR_ELx_EC_MOPS UL(0x27) +#define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ -#define ESR_ELx_EC_FP_EXC64 (0x2C) +#define ESR_ELx_EC_FP_EXC64 UL(0x2C) /* Unallocated EC: 0x2D - 0x2E */ 
-#define ESR_ELx_EC_SERROR (0x2F) -#define ESR_ELx_EC_BREAKPT_LOW (0x30) -#define ESR_ELx_EC_BREAKPT_CUR (0x31) -#define ESR_ELx_EC_SOFTSTP_LOW (0x32) -#define ESR_ELx_EC_SOFTSTP_CUR (0x33) -#define ESR_ELx_EC_WATCHPT_LOW (0x34) -#define ESR_ELx_EC_WATCHPT_CUR (0x35) +#define ESR_ELx_EC_SERROR UL(0x2F) +#define ESR_ELx_EC_BREAKPT_LOW UL(0x30) +#define ESR_ELx_EC_BREAKPT_CUR UL(0x31) +#define ESR_ELx_EC_SOFTSTP_LOW UL(0x32) +#define ESR_ELx_EC_SOFTSTP_CUR UL(0x33) +#define ESR_ELx_EC_WATCHPT_LOW UL(0x34) +#define ESR_ELx_EC_WATCHPT_CUR UL(0x35) /* Unallocated EC: 0x36 - 0x37 */ -#define ESR_ELx_EC_BKPT32 (0x38) +#define ESR_ELx_EC_BKPT32 UL(0x38) /* Unallocated EC: 0x39 */ -#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +#define ESR_ELx_EC_VECTOR32 UL(0x3A) /* EL2 only */ /* Unallocated EC: 0x3B */ -#define ESR_ELx_EC_BRK64 (0x3C) +#define ESR_ELx_EC_BRK64 UL(0x3C) /* Unallocated EC: 0x3D - 0x3F */ -#define ESR_ELx_EC_MAX (0x3F) +#define ESR_ELx_EC_MAX UL(0x3F) #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_WIDTH (6) From ecdd16df45649e344e38ec59a3022e13419a695a Mon Sep 17 00:00:00 2001 From: Min-Hua Chen Date: Wed, 11 Sep 2024 07:25:05 +0800 Subject: [PATCH 93/94] arm64: hibernate: Fix warning for cast from restricted gfp_t This patch fixes the following warning by adding __force to the cast: arch/arm64/kernel/hibernate.c:410:44: sparse: warning: cast from restricted gfp_t No functional change intended. Signed-off-by: Min-Hua Chen Link: https://lore.kernel.org/r/20240910232507.313555-1-minhuadotchen@gmail.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271e..7b11d84f533c 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -407,7 +407,7 @@ int swsusp_arch_resume(void) void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; /* From 2e091a805febb9a91cc7de2735d8d4ef0e640241 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Sep 2024 10:39:59 +0300 Subject: [PATCH 94/94] perf: arm-ni: Fix an NULL vs IS_ERR() bug The devm_ioremap() function never returns error pointers, it returns a NULL pointer if there is an error. Fixes: 4d5a7680f2b4 ("perf: Add driver for Arm NI-700 interconnect PMU") Signed-off-by: Dan Carpenter Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/04d6ccc3-6d31-4f0f-ab0f-7a88342cc09a@stanley.mountain Signed-off-by: Will Deacon --- drivers/perf/arm-ni.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-ni.c b/drivers/perf/arm-ni.c index b72df3aea93e..90fcfe693439 100644 --- a/drivers/perf/arm-ni.c +++ b/drivers/perf/arm-ni.c @@ -603,8 +603,8 @@ static int arm_ni_probe(struct platform_device *pdev) */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (IS_ERR(base)) - return PTR_ERR(base); + if (!base) + return -ENOMEM; arm_ni_probe_domain(base, &cfg); if (cfg.type != NI_GLOBAL)