Mirror of https://github.com/Dasharo/linux.git, synced 2026-03-06 15:25:10 -08:00
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
 "This tree contains various perf fixes on the kernel side, plus three
  hw/event-enablement late additions:

   - Intel Memory Bandwidth Monitoring events and handling
   - the AMD Accumulated Power Mechanism reporting facility
   - more IOMMU events

  ... and a final round of perf tooling updates/fixes"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
  perf llvm: Use strerror_r instead of the thread unsafe strerror one
  perf llvm: Use realpath to canonicalize paths
  perf tools: Unexport some methods unused outside strbuf.c
  perf probe: No need to use formatting strbuf method
  perf help: Use asprintf instead of adhoc equivalents
  perf tools: Remove unused perf_pathdup, xstrdup functions
  perf tools: Do not include stringify.h from the kernel sources
  tools include: Copy linux/stringify.h from the kernel
  tools lib traceevent: Remove redundant CPU output
  perf tools: Remove needless 'extern' from function prototypes
  perf tools: Simplify die() mechanism
  perf tools: Remove unused DIE_IF macro
  perf script: Remove lots of unused arguments
  perf thread: Rename perf_event__preprocess_sample_addr to thread__resolve
  perf machine: Rename perf_event__preprocess_sample to machine__resolve
  perf tools: Add cpumode to struct perf_sample
  perf tests: Forward the perf_sample in the dwarf unwind test
  perf tools: Remove misplaced __maybe_unused
  perf list: Fix documentation of :ppp
  perf bench numa: Fix assertion for nodes bitfield
  ...
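The headline addition is the AMD Accumulated Power Mechanism driver (arch/x86/events/amd/power.c, shown in full below). Once loaded it registers a PMU named "power" with a single power-pkg event, and the event's .unit/.scale sysfs attributes let perf print milliwatts directly. A minimal usage sketch (illustrative invocation and output, assuming a Family 15h CPU with CPUID Fn8000_0007_EDX[12] set and the driver loaded):

    $ perf stat -a -e power/power-pkg/ -- sleep 1

     Performance counter stats for 'system wide':

             <avg> mWatts power/power-pkg/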
arch/x86/Kconfig
@@ -1210,6 +1210,15 @@ config MICROCODE_OLD_INTERFACE
 	def_bool y
 	depends on MICROCODE
 
+config PERF_EVENTS_AMD_POWER
+	depends on PERF_EVENTS && CPU_SUP_AMD
+	tristate "AMD Processor Power Reporting Mechanism"
+	---help---
+	  Provide power reporting mechanism support for AMD processors.
+	  Currently, it leverages X86_FEATURE_ACC_POWER
+	  (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+	  average power consumption on Family 15h processors.
+
 config X86_MSR
 	tristate "/dev/cpu/*/msr - Model-specific register support"
 	---help---
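The new option can be built in or as a module; a typical config fragment enabling it (the two dependencies are the ones named in the depends line above):

    CONFIG_PERF_EVENTS=y
    CONFIG_CPU_SUP_AMD=y
    CONFIG_PERF_EVENTS_AMD_POWER=m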
arch/x86/events/Makefile
@@ -1,6 +1,7 @@
 obj-y					+= core.o
 
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd/core.o amd/uncore.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER)	+= amd/power.o
 obj-$(CONFIG_X86_LOCAL_APIC)		+= amd/ibs.o msr.o
 ifdef CONFIG_AMD_IOMMU
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd/iommu.o
arch/x86/events/amd/ibs.c
@@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags)
 	hwc->state = 0;
 
 	perf_ibs_set_period(perf_ibs, hwc, &period);
+	/*
+	 * Set STARTED before enabling the hardware, such that
+	 * a subsequent NMI must observe it. Then clear STOPPING
+	 * such that we don't consume NMIs by accident.
+	 */
 	set_bit(IBS_STARTED, pcpu->state);
+	clear_bit(IBS_STOPPING, pcpu->state);
 	perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 
 	perf_event_update_userpage(event);
@@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	u64 config;
 	int stopping;
 
-	stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+	stopping = test_bit(IBS_STARTED, pcpu->state);
 
 	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
 		return;
@@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	rdmsrl(hwc->config_base, config);
 
 	if (stopping) {
+		/*
+		 * Set STOPPING before disabling the hardware, such that it
+		 * must be visible to NMIs the moment we clear the EN bit,
+		 * at which point we can generate an !VALID sample which
+		 * we need to consume.
+		 */
 		set_bit(IBS_STOPPING, pcpu->state);
 		perf_ibs_disable_event(perf_ibs, hwc, config);
+		/*
+		 * Clear STARTED after disabling the hardware; if it were
+		 * cleared before an NMI hitting after the clear but before
+		 * clearing the EN bit might think it a spurious NMI and not
+		 * handle it.
+		 *
+		 * Clearing it after, however, creates the problem of the NMI
+		 * handler seeing STARTED but not having a valid sample.
+		 */
+		clear_bit(IBS_STARTED, pcpu->state);
 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 		hwc->state |= PERF_HES_STOPPED;
 	}
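The two comments added above encode a small ordering protocol between pmu::stop() and the NMI handler; laid out as an interleaving it reads roughly like this (a sketch of the reasoning in the comments, not part of the patch):

    stop() CPU                          NMI handler
    ----------                          -----------
    set_bit(IBS_STOPPING)
    disable hw (clear EN bit)     ->    one final !VALID sample may fire
    clear_bit(IBS_STARTED)              sees IBS_STARTED still set (real
                                        sample), or claims the spurious NMI
                                        via test_and_clear_bit(IBS_STOPPING)

Clearing STARTED only after the hardware is off means an NMI arriving in that window never mistakes itself for spurious; the cost, named in the comment, is that the handler can see STARTED without a valid sample, which the goto fail path in the next hunk absorbs.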
@@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	u64 *buf, *config, period;
 
 	if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
 		/*
 		 * Catch spurious interrupts after stopping IBS: After
 		 * disabling IBS there could be still incoming NMIs
 		 * with samples that even have the valid bit cleared.
 		 * Mark all this NMIs as handled.
 		 */
-		return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+		if (test_and_clear_bit(IBS_STOPPING, pcpu->state))
+			return 1;
+
+		return 0;
 	}
 
 	msr = hwc->config_base;
 	buf = ibs_data.regs;
 	rdmsrl(msr, *buf);
 	if (!(*buf++ & perf_ibs->valid_mask))
-		return 0;
+		goto fail;
 
 	config = &ibs_data.regs[0];
 	perf_ibs_event_update(perf_ibs, event, config);
@@ -599,7 +625,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	throttle = perf_event_overflow(event, &data, &regs);
 out:
 	if (throttle)
-		perf_ibs_disable_event(perf_ibs, hwc, *config);
+		perf_ibs_stop(event, 0);
 	else
 		perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 
@@ -611,6 +637,7 @@ out:
 static int
 perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
+	u64 stamp = sched_clock();
 	int handled = 0;
 
 	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
@@ -619,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 	if (handled)
 		inc_irq_stat(apic_perf_irqs);
 
+	perf_sample_event_took(sched_clock() - stamp);
+
 	return handled;
 }
 NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
arch/x86/events/amd/iommu.c
@@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
 	AMD_IOMMU_EVENT_DESC(cmd_processed,        "csource=0x11"),
 	AMD_IOMMU_EVENT_DESC(cmd_processed_inv,    "csource=0x12"),
 	AMD_IOMMU_EVENT_DESC(tlb_inv,              "csource=0x13"),
+	AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"),
+	AMD_IOMMU_EVENT_DESC(vapic_int_non_guest,  "csource=0x15"),
+	AMD_IOMMU_EVENT_DESC(vapic_int_guest,      "csource=0x16"),
+	AMD_IOMMU_EVENT_DESC(smi_recv,             "csource=0x17"),
+	AMD_IOMMU_EVENT_DESC(smi_blk,              "csource=0x18"),
 	{ /* end: all zeroes */ },
 };
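Each AMD_IOMMU_EVENT_DESC entry surfaces as a named event under the IOMMU PMU's sysfs events directory, so the five new counters (csource 0x14-0x18) become directly usable from tooling; for example (illustrative, and assuming the PMU is registered under the name amd_iommu):

    $ perf stat -a -e 'amd_iommu/smi_recv/' -- sleep 1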
arch/x86/events/amd/power.c (new file, 353 lines)
@@ -0,0 +1,353 @@
/*
 * Performance events - AMD Processor Power Reporting Mechanism
 *
 * Copyright (C) 2016 Advanced Micro Devices, Inc.
 *
 * Author: Huang Rui <ray.huang@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "../perf_event.h"

#define MSR_F15H_CU_PWR_ACCUMULATOR     0xc001007a
#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
#define MSR_F15H_PTSC                   0xc0010280

/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
#define AMD_POWER_EVENT_MASK	0xFFULL

/*
 * Accumulated power status counters.
 */
#define AMD_POWER_EVENTSEL_PKG	1

/*
 * The ratio of compute unit power accumulator sample period to the
 * PTSC period.
 */
static unsigned int cpu_pwr_sample_ratio;

/* Maximum accumulated power of a compute unit. */
static u64 max_cu_acc_power;

static struct pmu pmu_class;

/*
 * Accumulated power represents the sum of each compute unit's (CU) power
 * consumption. On any core of each CU we read the total accumulated power from
 * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
 * which are picked to measure the power for the CUs they belong to.
 */
static cpumask_t cpu_mask;

static void event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
	u64 delta, tdelta;

	prev_pwr_acc = hwc->pwr_acc;
	prev_ptsc = hwc->ptsc;
	rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
	rdmsrl(MSR_F15H_PTSC, new_ptsc);

	/*
	 * Calculate the CU power consumption over a time period, the unit of
	 * final value (delta) is micro-Watts. Then add it to the event count.
	 */
	if (new_pwr_acc < prev_pwr_acc) {
		delta = max_cu_acc_power + new_pwr_acc;
		delta -= prev_pwr_acc;
	} else
		delta = new_pwr_acc - prev_pwr_acc;

	delta *= cpu_pwr_sample_ratio * 1000;
	tdelta = new_ptsc - prev_ptsc;

	do_div(delta, tdelta);
	local64_add(delta, &event->count);
}

static void __pmu_event_start(struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	rdmsrl(MSR_F15H_PTSC, event->hw.ptsc);
	rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
}

static void pmu_event_start(struct perf_event *event, int mode)
{
	__pmu_event_start(event);
}

static void pmu_event_stop(struct perf_event *event, int mode)
{
	struct hw_perf_event *hwc = &event->hw;

	/* Mark event as deactivated and stopped. */
	if (!(hwc->state & PERF_HES_STOPPED))
		hwc->state |= PERF_HES_STOPPED;

	/* Check if software counter update is necessary. */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static int pmu_event_add(struct perf_event *event, int mode)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__pmu_event_start(event);

	return 0;
}

static void pmu_event_del(struct perf_event *event, int flags)
{
	pmu_event_stop(event, PERF_EF_UPDATE);
}

static int pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;

	/* Only look at AMD power events. */
	if (event->attr.type != pmu_class.type)
		return -ENOENT;

	/* Unsupported modes and filters. */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    /* no sampling */
	    event->attr.sample_period)
		return -EINVAL;

	if (cfg != AMD_POWER_EVENTSEL_PKG)
		return -EINVAL;

	return 0;
}

static void pmu_event_read(struct perf_event *event)
{
	event_update(event);
}

static ssize_t
get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);

static struct attribute *pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group pmu_attr_group = {
	.attrs = pmu_attrs,
};

/*
 * Currently it only supports to report the power of each
 * processor/package.
 */
EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");

EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");

/* Convert the count from micro-Watts to milli-Watts. */
EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");

static struct attribute *events_attr[] = {
	EVENT_PTR(power_pkg),
	EVENT_PTR(power_pkg_unit),
	EVENT_PTR(power_pkg_scale),
	NULL,
};

static struct attribute_group pmu_events_group = {
	.name	= "events",
	.attrs	= events_attr,
};

PMU_FORMAT_ATTR(event, "config:0-7");

static struct attribute *formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group pmu_format_group = {
	.name	= "format",
	.attrs	= formats_attr,
};

static const struct attribute_group *attr_groups[] = {
	&pmu_attr_group,
	&pmu_format_group,
	&pmu_events_group,
	NULL,
};

static struct pmu pmu_class = {
	.attr_groups	= attr_groups,
	/* system-wide only */
	.task_ctx_nr	= perf_invalid_context,
	.event_init	= pmu_event_init,
	.add		= pmu_event_add,
	.del		= pmu_event_del,
	.start		= pmu_event_start,
	.stop		= pmu_event_stop,
	.read		= pmu_event_read,
};

static void power_cpu_exit(int cpu)
{
	int target;

	if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
		return;

	/*
	 * Find a new CPU on the same compute unit, if was set in cpumask
	 * and still some CPUs on compute unit. Then migrate event and
	 * context to new CPU.
	 */
	target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
	if (target < nr_cpumask_bits) {
		cpumask_set_cpu(target, &cpu_mask);
		perf_pmu_migrate_context(&pmu_class, cpu, target);
	}
}

static void power_cpu_init(int cpu)
{
	int target;

	/*
	 * 1) If any CPU is set at cpu_mask in the same compute unit, do
	 * nothing.
	 * 2) If no CPU is set at cpu_mask in the same compute unit,
	 * set current STARTING CPU.
	 *
	 * Note: if there is a CPU aside of the new one already in the
	 * sibling mask, then it is also in cpu_mask.
	 */
	target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
	if (target >= nr_cpumask_bits)
		cpumask_set_cpu(cpu, &cpu_mask);
}

static int
power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_FAILED:
	case CPU_STARTING:
		power_cpu_init(cpu);
		break;
	case CPU_DOWN_PREPARE:
		power_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block power_cpu_notifier_nb = {
	.notifier_call = power_cpu_notifier,
	.priority = CPU_PRI_PERF,
};

static const struct x86_cpu_id cpu_match[] = {
	{ .vendor = X86_VENDOR_AMD, .family = 0x15 },
	{},
};

static int __init amd_power_pmu_init(void)
{
	int cpu, target, ret;

	if (!x86_match_cpu(cpu_match))
		return 0;

	if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
		return -ENODEV;

	cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);

	if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
		pr_err("Failed to read max compute unit power accumulator MSR\n");
		return -ENODEV;
	}

	cpu_notifier_register_begin();

	/* Choose one online core of each compute unit. */
	for_each_online_cpu(cpu) {
		target = cpumask_first(topology_sibling_cpumask(cpu));
		if (!cpumask_test_cpu(target, &cpu_mask))
			cpumask_set_cpu(target, &cpu_mask);
	}

	ret = perf_pmu_register(&pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_warn("AMD Power PMU registration failed\n");
		goto out;
	}

	__register_cpu_notifier(&power_cpu_notifier_nb);

	pr_info("AMD Power PMU detected\n");

out:
	cpu_notifier_register_done();

	return ret;
}
module_init(amd_power_pmu_init);

static void __exit amd_power_pmu_exit(void)
{
	cpu_notifier_register_begin();
	__unregister_cpu_notifier(&power_cpu_notifier_nb);
	cpu_notifier_register_done();

	perf_pmu_unregister(&pmu_class);
}
module_exit(amd_power_pmu_exit);

MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
MODULE_LICENSE("GPL v2");
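The arithmetic in event_update() above reduces to

    delta_uW = (pwr_acc_delta * cpu_pwr_sample_ratio * 1000) / ptsc_delta

with accumulator wrap-around handled through max_cu_acc_power. A worked sketch with made-up numbers: if the power accumulator advanced by 5000 units while the PTSC advanced by 1000000 ticks, and cpu_pwr_sample_ratio (CPUID 0x80000007 ECX) is 100, the event count grows by 5000 * 100 * 1000 / 1000000 = 500, i.e. 500 micro-Watts, which the power-pkg.scale attribute (1.000000e-3) then presents to userspace as 0.5 mWatts.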
arch/x86/events/core.c
@@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
 	return new;
 }
 
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-			  char *page)
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
 {
 	struct perf_pmu_events_attr *pmu_attr = \
 		container_of(attr, struct perf_pmu_events_attr, attr);
@@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 
 	return x86_pmu.events_sysfs_show(page, config);
 }
+EXPORT_SYMBOL_GPL(events_sysfs_show);
 
 EVENT_ATTR(cpu-cycles,		CPU_CYCLES	);
 EVENT_ATTR(instructions,	INSTRUCTIONS	);
File diff suppressed because it is too large.
arch/x86/events/intel/ds.c
@@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void)
 	 * We will overwrite the from and to address before we output
 	 * the sample.
 	 */
+	rcu_read_lock();
 	perf_prepare_sample(&header, &data, event, &regs);
 
 	if (perf_output_begin(&handle, event, header.size *
 			      (top - base - skip)))
-		return 1;
+		goto unlock;
 
 	for (at = base; at < top; at++) {
 		/* Filter out any records that contain kernel addresses. */
@@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void)
 	/* There's new data available. */
 	event->hw.interrupts++;
 	event->pending_kill = POLL_IN;
+unlock:
+	rcu_read_unlock();
 
 	return 1;
 }
arch/x86/events/intel/rapl.c
@@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void)
 		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
 		break;
 	case 63: /* Haswell-Server */
+	case 79: /* Broadwell-Server */
 		apply_quirk = true;
 		rapl_cntr_mask = RAPL_IDX_SRV;
 		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
@@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void)
 	case 60: /* Haswell */
 	case 69: /* Haswell-Celeron */
 	case 61: /* Broadwell */
+	case 71: /* Broadwell-H */
 		rapl_cntr_mask = RAPL_IDX_HSW;
 		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
 		break;
arch/x86/events/intel/uncore_snbep.c
@@ -46,7 +46,6 @@
 				(SNBEP_PMON_CTL_EV_SEL_MASK | \
 				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
 				 SNBEP_PMON_CTL_EDGE_DET | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT | \
 				 SNBEP_PMON_CTL_INVERT | \
 				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
 				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -148,7 +147,6 @@
 /* IVBEP PCU */
 #define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
 				(SNBEP_PMON_CTL_EV_SEL_MASK | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT | \
 				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
 				 SNBEP_PMON_CTL_EDGE_DET | \
 				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
@@ -258,7 +256,6 @@
 				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
 				 SNBEP_PMON_CTL_EDGE_DET | \
 				 SNBEP_CBO_PMON_CTL_TID_EN | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT | \
 				 SNBEP_PMON_CTL_INVERT | \
 				 KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
 				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = {
 };
 
 static struct attribute *snbep_uncore_pcu_formats_attr[] = {
-	&format_attr_event_ext.attr,
+	&format_attr_event.attr,
 	&format_attr_occ_sel.attr,
 	&format_attr_edge.attr,
 	&format_attr_inv.attr,
@@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
 };
 
 static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
-	&format_attr_event_ext.attr,
+	&format_attr_event.attr,
 	&format_attr_occ_sel.attr,
 	&format_attr_edge.attr,
 	&format_attr_thresh5.attr,
arch/x86/include/asm/cpufeatures.h
@@ -94,7 +94,7 @@
 #define X86_FEATURE_REP_GOOD	( 3*32+16) /* rep microcode works well */
 #define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-/* free, was #define X86_FEATURE_11AP	( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_ACC_POWER	( 3*32+19) /* AMD Accumulated Power Mechanism */
 #define X86_FEATURE_NOPL	( 3*32+20) /* The NOPL (0F 1F) instructions */
 #define X86_FEATURE_ALWAYS	( 3*32+21) /* "" Always-present feature */
 #define X86_FEATURE_XTOPOLOGY	( 3*32+22) /* cpu topology enum extensions */
@@ -245,6 +245,8 @@
 
 /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
 #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
 
 /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
 #define X86_FEATURE_CLZERO	(13*32+0) /* CLZERO instruction */
arch/x86/kernel/cpu/amd.c
@@ -309,7 +309,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-		nodes_per_socket = ((ecx >> 8) & 7) + 1;
 		node_id = ecx & 7;
 
 		/* get compute unit information */
@@ -320,7 +319,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		nodes_per_socket = ((value >> 3) & 7) + 1;
 		node_id = value & 7;
 	} else
 		return;
@@ -522,6 +520,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 	if (cpu_has(c, X86_FEATURE_MWAITX))
 		use_mwaitx_delay();
+
+	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+		u32 ecx;
+
+		ecx = cpuid_ecx(0x8000001e);
+		nodes_per_socket = ((ecx >> 8) & 7) + 1;
+	} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+		u64 value;
+
+		rdmsrl(MSR_FAM10H_NODE_ID, value);
+		nodes_per_socket = ((value >> 3) & 7) + 1;
+	}
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
@@ -539,6 +549,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_sched_clock_stable();
 	}
 
+	/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+	if (c->x86_power & BIT(12))
+		set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 #else
arch/x86/kernel/cpu/common.c
@@ -692,7 +692,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
 		c->x86_capability[CPUID_F_1_EDX] = edx;
 
-		if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+		if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
+		      ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
+			(cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
 			c->x86_cache_max_rmid = ecx;
 			c->x86_cache_occ_scale = ebx;
 		}
include/linux/perf_event.h
@@ -121,6 +121,7 @@ struct hw_perf_event {
 		struct { /* intel_cqm */
 			int			cqm_state;
 			u32			cqm_rmid;
+			int			is_group_event;
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
@@ -128,6 +129,10 @@ struct hw_perf_event {
 		struct { /* itrace */
 			int			itrace_started;
 		};
+		struct { /* amd_power */
+			u64	pwr_acc;
+			u64	ptsc;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
kernel/events/core.c
@@ -376,8 +376,11 @@ static void update_perf_cpu_limits(void)
 	u64 tmp = perf_sample_period_ns;
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
-	do_div(tmp, 100);
-	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+	tmp = div_u64(tmp, 100);
+	if (!tmp)
+		tmp = 1;
+
+	WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -409,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
 
-	update_perf_cpu_limits();
+	if (sysctl_perf_cpu_time_max_percent == 100) {
+		printk(KERN_WARNING
+		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+		WRITE_ONCE(perf_sample_allowed_ns, 0);
+	} else {
+		update_perf_cpu_limits();
+	}
 
 	return 0;
 }
@@ -423,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
+static u64 __report_avg;
+static u64 __report_allowed;
+
 static void perf_duration_warn(struct irq_work *w)
 {
-	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-	u64 avg_local_sample_len;
-	u64 local_samples_len;
-
-	local_samples_len = __this_cpu_read(running_sample_length);
-	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
 	printk_ratelimited(KERN_WARNING
-		"perf interrupt took too long (%lld > %lld), lowering "
-		"kernel.perf_event_max_sample_rate to %d\n",
-		avg_local_sample_len, allowed_ns >> 1,
-		sysctl_perf_event_sample_rate);
+		"perf: interrupt took too long (%lld > %lld), lowering "
+		"kernel.perf_event_max_sample_rate to %d\n",
+		__report_avg, __report_allowed,
+		sysctl_perf_event_sample_rate);
 }
 
 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
-	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-	u64 avg_local_sample_len;
-	u64 local_samples_len;
+	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+	u64 running_len;
+	u64 avg_len;
+	u32 max;
 
-	if (allowed_ns == 0)
+	if (max_len == 0)
 		return;
 
-	/* decay the counter by 1 average sample */
-	local_samples_len = __this_cpu_read(running_sample_length);
-	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
-	local_samples_len += sample_len_ns;
-	__this_cpu_write(running_sample_length, local_samples_len);
+	/* Decay the counter by 1 average sample. */
+	running_len = __this_cpu_read(running_sample_length);
+	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+	running_len += sample_len_ns;
+	__this_cpu_write(running_sample_length, running_len);
 
 	/*
-	 * note: this will be biased artifically low until we have
-	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+	 * Note: this will be biased artifically low until we have
+	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 	 * from having to maintain a count.
 	 */
-	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
-	if (avg_local_sample_len <= allowed_ns)
+	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+	if (avg_len <= max_len)
 		return;
 
-	if (max_samples_per_tick <= 1)
-		return;
+	__report_avg = avg_len;
+	__report_allowed = max_len;
 
-	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
-	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+	/*
+	 * Compute a throttle threshold 25% below the current duration.
+	 */
+	avg_len += avg_len / 4;
+	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+	if (avg_len < max)
+		max /= (u32)avg_len;
+	else
+		max = 1;
+
+	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+	WRITE_ONCE(max_samples_per_tick, max);
+
+	sysctl_perf_event_sample_rate = max * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	update_perf_cpu_limits();
 
 	if (!irq_work_queue(&perf_duration_work)) {
-		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
-			     avg_local_sample_len, allowed_ns >> 1,
+			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
 	}
 }
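The rewritten perf_sample_event_took() above sizes the new sample budget directly from the measured NMI cost instead of repeatedly halving max_samples_per_tick. A worked sketch with illustrative numbers: with HZ=1000 (TICK_NSEC = 1000000 ns), sysctl_perf_cpu_time_max_percent = 25 and a measured avg_len of 10000 ns, the per-tick budget is (1000000 / 100) * 25 = 250000 ns; avg_len is padded by 25% to 12500 ns, so max = 250000 / 12500 = 20 samples per tick, and the sample rate is re-clamped to 20 * HZ = 20000 Hz in a single step.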
@@ -4210,6 +4225,14 @@ static void __perf_event_period(struct perf_event *event,
 	active = (event->state == PERF_EVENT_STATE_ACTIVE);
 	if (active) {
 		perf_pmu_disable(ctx->pmu);
+		/*
+		 * We could be throttled; unthrottle now to avoid the tick
+		 * trying to unthrottle while we already re-started the event.
+		 */
+		if (event->hw.interrupts == MAX_INTERRUPTS) {
+			event->hw.interrupts = 0;
+			perf_log_throttle(event, 1);
+		}
 		event->pmu->stop(event, PERF_EF_UPDATE);
 	}
@@ -9426,10 +9449,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
+		/*
+		 * This must be done before the CPU comes alive, because the
+		 * moment we can run tasks we can encounter (software) events.
+		 *
+		 * Specifically, someone can have inherited events on kthreadd
+		 * or a pre-existing worker thread that gets re-bound.
+		 */
 		perf_event_init_cpu(cpu);
 		break;
 
 	case CPU_DOWN_PREPARE:
+		/*
+		 * This must be done before the CPU dies because after that an
+		 * active event might want to IPI the CPU and that'll not work
+		 * so great for dead CPUs.
+		 *
+		 * XXX smp_call_function_single() return -ENXIO without a warn
+		 * so we could possibly deal with this.
+		 *
+		 * This is safe against new events arriving because
+		 * sys_perf_event_open() serializes against hotplug using
+		 * get_online_cpus().
+		 */
 		perf_event_exit_cpu(cpu);
 		break;
 	default:
kernel/events/ring_buffer.c
@@ -746,8 +746,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 
 	rb->user_page = all_buf;
 	rb->data_pages[0] = all_buf + PAGE_SIZE;
-	rb->page_order = ilog2(nr_pages);
-	rb->nr_pages = !!nr_pages;
+	if (nr_pages) {
+		rb->nr_pages = 1;
+		rb->page_order = ilog2(nr_pages);
+	}
 
 	ring_buffer_init(rb, watermark, flags);
 
tools/include/linux/stringify.h (new file, 12 lines)
@@ -0,0 +1,12 @@
#ifndef __LINUX_STRINGIFY_H
#define __LINUX_STRINGIFY_H

/* Indirect stringification. Doing two levels allows the parameter to be a
 * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
 * converts to "bar".
 */

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

#endif /* !__LINUX_STRINGIFY_H */
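The two-level expansion is the whole point of the header: # applied directly suppresses macro expansion of its argument, while the extra indirection expands it first. A minimal standalone sketch of the difference (hypothetical FOO/DIRECT names; assumes tools/include is on the compiler's include path):

    #include <stdio.h>
    #include <linux/stringify.h>

    #define FOO bar
    #define DIRECT(x) #x   /* one level: stringifies the token as written */

    int main(void)
    {
            printf("%s\n", DIRECT(FOO));      /* prints "FOO" - no expansion first */
            printf("%s\n", __stringify(FOO)); /* prints "bar" - FOO expands, then # */
            return 0;
    }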
@@ -1,5 +1,5 @@
 include ../../scripts/Makefile.include
-include ../../perf/config/utilities.mak	# QUIET_CLEAN
+include ../../scripts/utilities.mak		# QUIET_CLEAN
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(shell pwd)))
@@ -1,5 +1,5 @@
 include ../../scripts/Makefile.include
-include ../../perf/config/utilities.mak	# QUIET_CLEAN
+include ../../scripts/utilities.mak		# QUIET_CLEAN
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(shell pwd)))
tools/lib/traceevent/event-parse.c
@@ -5427,10 +5427,8 @@ void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
 	}
 
 	if (pevent->latency_format) {
-		trace_seq_printf(s, " %3d", record->cpu);
 		pevent_data_lat_fmt(pevent, s, record);
-	} else
-		trace_seq_printf(s, " [%03d]", record->cpu);
+	}
 
 	if (use_usec_format) {
 		if (pevent->flags & PEVENT_NSEC_OUTPUT) {
Some files were not shown because too many files have changed in this diff.