Mirror of https://github.com/Dasharo/linux.git (synced 2026-03-06 15:25:10 -08:00)
Merge tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "Fair scheduler (SCHED_OTHER) improvements:

   - Remove the old and now unused SIS_PROP code & option

   - Scan cluster before LLC in the wake-up path

   - Use candidate prev/recent_used CPU if scanning failed for cluster
     wakeup

  NUMA scheduling improvements:

   - Improve the VMA access-PID code to better skip/scan VMAs

   - Extend tracing to cover VMA-skipping decisions

   - Improve/fix the recently introduced sched_numa_find_nth_cpu() code

   - Generalize numa_map_to_online_node()

  Energy scheduling improvements:

   - Remove the EM_MAX_COMPLEXITY limit

   - Add tracepoints to track energy computation

   - Make the behavior of the 'sched_energy_aware' sysctl more
     consistent

   - Consolidate and clean up access to a CPU's max compute capacity

   - Fix uclamp code corner cases

  RT scheduling improvements:

   - Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates

   - Drive the ->rto_mask with rt_rq->pushable_tasks updates

  Scheduler scalability improvements:

   - Rate-limit updates to tg->load_avg

   - On x86 disable IBRS when CPU is offline to improve single-threaded
     performance

   - Micro-optimize in_task() and in_interrupt()

   - Micro-optimize the PSI code

   - Avoid updating PSI triggers and ->rtpoll_total when there are no
     state changes

  Core scheduler infrastructure improvements:

   - Use saved_state to reduce some spurious freezer wakeups

   - Bring in a handful of fast-headers improvements to scheduler
     headers

   - Make the scheduler UAPI headers more widely usable by user-space

   - Simplify the control flow of scheduler syscalls by using lock
     guards

   - Fix sched_setaffinity() vs. CPU hotplug race

  Scheduler debuggability improvements:

   - Disallow writing invalid values to sched_rt_period_us

   - Fix a race in the rq-clock debugging code triggering warnings

   - Fix a warning in the bandwidth distribution code

   - Micro-optimize in_atomic_preempt_off() checks

   - Enforce that the tasklist_lock is held in for_each_thread()

   - Print the TGID in sched_show_task()

   - Remove the /proc/sys/kernel/sched_child_runs_first sysctl

  ... and misc cleanups & fixes"

* tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
  sched/fair: Remove SIS_PROP
  sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup
  sched/fair: Scan cluster before scanning LLC in wake-up path
  sched: Add cpus_share_resources API
  sched/core: Fix RQCF_ACT_SKIP leak
  sched/fair: Remove unused 'curr' argument from pick_next_entity()
  sched/nohz: Update comments about NEWILB_KICK
  sched/fair: Remove duplicate #include
  sched/psi: Update poll => rtpoll in relevant comments
  sched: Make PELT acronym definition searchable
  sched: Fix stop_one_cpu_nowait() vs hotplug
  sched/psi: Bail out early from irq time accounting
  sched/topology: Rename 'DIE' domain to 'PKG'
  sched/psi: Delete the 'update_total' function parameter from update_triggers()
  sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes
  sched/headers: Remove comment referring to rq::cpu_load, since this has been removed
  sched/numa: Complete scanning of inactive VMAs when there is no alternative
  sched/numa: Complete scanning of partial VMAs regardless of PID activity
  sched/numa: Move up the access pid reset logic
  sched/numa: Trace decisions related to skipping VMAs
  ...

@@ -170,7 +170,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the
 ``MWAIT`` instruction is not allowed to be used, so the initialization of
 ``intel_idle`` will fail.

-Apart from that there are four module parameters recognized by ``intel_idle``
+Apart from that there are five module parameters recognized by ``intel_idle``
 itself that can be set via the kernel command line (they cannot be updated via
 sysfs, so that is the only way to change their values).

@@ -216,6 +216,21 @@ are ignored).
 The idle states disabled this way can be enabled (on a per-CPU basis) from user
 space via ``sysfs``.

+The ``ibrs_off`` module parameter is a boolean flag (defaults to
+false). If set, it is used to control if IBRS (Indirect Branch Restricted
+Speculation) should be turned off when the CPU enters an idle state.
+This flag does not affect CPUs that use Enhanced IBRS which can remain
+on with little performance impact.
+
+For some CPUs, IBRS will be selected as mitigation for Spectre v2 and Retbleed
+security vulnerabilities by default. Leaving the IBRS mode on while idling may
+have a performance impact on its sibling CPU. The IBRS mode will be turned off
+by default when the CPU enters into a deep idle state, but not in some
+shallower ones. Setting the ``ibrs_off`` module parameter will force the IBRS
+mode to off when the CPU is in any one of the available idle states. This may
+help performance of a sibling CPU at the expense of a slightly higher wakeup
+latency for the idle CPU.
+
 .. _intel-idle-core-and-package-idle-states:

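As a usage note (an illustration, not text from the patch): like the other ``intel_idle`` parameters described above, the new flag is set on the kernel command line, e.g. ``intel_idle.ibrs_off=1``, and cannot be changed at run time via sysfs.
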
@@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is,
 platforms with asymmetric CPU topologies and having an Energy
 Model available). If your platform happens to meet the
 requirements for EAS but you do not want to use it, change
-this value to 0.
+this value to 0. On Non-EAS platforms, write operation fails and
+read doesn't return anything.

 task_delayacct
 ===============

@@ -39,14 +39,15 @@ per Hz, leading to::
 -------------------

 Two different capacity values are used within the scheduler. A CPU's
-``capacity_orig`` is its maximum attainable capacity, i.e. its maximum
-attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to
-which some loss of available performance (e.g. time spent handling IRQs) is
-subtracted.
+``original capacity`` is its maximum attainable capacity, i.e. its maximum
+attainable performance level. This original capacity is returned by
+the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original
+capacity`` to which some loss of available performance (e.g. time spent
+handling IRQs) is subtracted.

 Note that a CPU's ``capacity`` is solely intended to be used by the CFS class,
-while ``capacity_orig`` is class-agnostic. The rest of this document will use
-the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of
+while ``original capacity`` is class-agnostic. The rest of this document will use
+the term ``capacity`` interchangeably with ``original capacity`` for the sake of
 brevity.

 1.3 Platform examples

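A worked illustration of the relationship described above (numbers are hypothetical, not from the patch): if arch_scale_cpu_capacity() reports an original capacity of 1024 and the CPU loses roughly 10% of its time to IRQ handling, the ``capacity`` left for the CFS class is about 1024 - 102 = 922.
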
@@ -359,32 +359,9 @@ in milli-Watts or in an 'abstract scale'.
 6.3 - Energy Model complexity
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-The task wake-up path is very latency-sensitive. When the EM of a platform is
-too complex (too many CPUs, too many performance domains, too many performance
-states, ...), the cost of using it in the wake-up path can become prohibitive.
-The energy-aware wake-up algorithm has a complexity of:
-
-	C = Nd * (Nc + Ns)
-
-with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
-total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
-
-A complexity check is performed at the root domain level, when scheduling
-domains are built. EAS will not start on a root domain if its C happens to be
-higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
-time of writing).
-
-If you really want to use EAS but the complexity of your platform's Energy
-Model is too high to be used with a single root domain, you're left with only
-two possible options:
-
-  1. split your system into separate, smaller, root domains using exclusive
-     cpusets and enable EAS locally on each of them. This option has the
-     benefit to work out of the box but the drawback of preventing load
-     balance between root domains, which can result in an unbalanced system
-     overall;
-  2. submit patches to reduce the complexity of the EAS wake-up algorithm,
-     hence enabling it to cope with larger EMs in reasonable time.
+EAS does not impose any complexity limit on the number of PDs/OPPs/CPUs but
+restricts the number of CPUs to EM_MAX_NUM_CPUS to prevent overflows during
+the energy estimation.


 6.4 - Schedutil governor

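For context on the removed check (a worked example using the old formula quoted above, with made-up numbers): a platform with Nd = 4 performance domains, Nc = 16 CPUs and 8 OPPs per domain (Ns = 32) gave C = 4 * (16 + 32) = 192, well under the former EM_MAX_COMPLEXITY threshold of 2048. After this change only the EM_MAX_NUM_CPUS bound on the CPU count remains.
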
@@ -39,10 +39,10 @@ Most notable:
 1.1 The problem
 ---------------

-Realtime scheduling is all about determinism, a group has to be able to rely on
+Real-time scheduling is all about determinism, a group has to be able to rely on
 the amount of bandwidth (eg. CPU time) being constant. In order to schedule
-multiple groups of realtime tasks, each group must be assigned a fixed portion
-of the CPU time available. Without a minimum guarantee a realtime group can
+multiple groups of real-time tasks, each group must be assigned a fixed portion
+of the CPU time available. Without a minimum guarantee a real-time group can
 obviously fall short. A fuzzy upper limit is of no use since it cannot be
 relied upon. Which leaves us with just the single fixed portion.
@@ -50,14 +50,14 @@ relied upon. Which leaves us with just the single fixed portion.
 ----------------

 CPU time is divided by means of specifying how much time can be spent running
-in a given period. We allocate this "run time" for each realtime group which
-the other realtime groups will not be permitted to use.
+in a given period. We allocate this "run time" for each real-time group which
+the other real-time groups will not be permitted to use.

-Any time not allocated to a realtime group will be used to run normal priority
+Any time not allocated to a real-time group will be used to run normal priority
 tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
 SCHED_OTHER.

-Let's consider an example: a frame fixed realtime renderer must deliver 25
+Let's consider an example: a frame fixed real-time renderer must deliver 25
 frames a second, which yields a period of 0.04s per frame. Now say it will also
 have to play some music and respond to input, leaving it with around 80% CPU
 time dedicated for the graphics. We can then give this group a run time of 0.8
@@ -70,7 +70,7 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
 of 0.00015s.

 The remaining CPU time will be used for user input and other tasks. Because
-realtime tasks have explicitly allocated the CPU time they need to perform
+real-time tasks have explicitly allocated the CPU time they need to perform
 their tasks, buffer underruns in the graphics or audio can be eliminated.

 NOTE: the above example is not fully implemented yet. We still
@@ -87,18 +87,20 @@ lack an EDF scheduler to make non-uniform periods usable.
 The system wide settings are configured under the /proc virtual file system:

 /proc/sys/kernel/sched_rt_period_us:
-  The scheduling period that is equivalent to 100% CPU bandwidth
+  The scheduling period that is equivalent to 100% CPU bandwidth.

 /proc/sys/kernel/sched_rt_runtime_us:
-  A global limit on how much time realtime scheduling may use. Even without
-  CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
-  processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
-  available to all realtime groups.
+  A global limit on how much time real-time scheduling may use. This is always
+  less or equal to the period_us, as it denotes the time allocated from the
+  period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
+  this will limit time reserved to real-time processes. With
+  CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
+  real-time groups.

 * Time is specified in us because the interface is s32. This gives an
   operating range from 1us to about 35 minutes.
 * sched_rt_period_us takes values from 1 to INT_MAX.
-* sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
+* sched_rt_runtime_us takes values from -1 to sched_rt_period_us.
 * A run time of -1 specifies runtime == period, ie. no limit.
@@ -108,7 +110,7 @@ The system wide settings are configured under the /proc virtual file system:
 The default values for sched_rt_period_us (1000000 or 1s) and
 sched_rt_runtime_us (950000 or 0.95s). This gives 0.05s to be used by
 SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
-realtime tasks will not lock up the machine but leave a little time to recover
+real-time tasks will not lock up the machine but leave a little time to recover
 it. By setting runtime to -1 you'd get the old behaviour back.

 By default all bandwidth is assigned to the root group and new groups get the
@@ -116,10 +118,10 @@ period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
 want to assign bandwidth to another group, reduce the root group's bandwidth
 and assign some or all of the difference to another group.

-Realtime group scheduling means you have to assign a portion of total CPU
-bandwidth to the group before it will accept realtime tasks. Therefore you will
-not be able to run realtime tasks as any user other than root until you have
-done that, even if the user has the rights to run processes with realtime
+Real-time group scheduling means you have to assign a portion of total CPU
+bandwidth to the group before it will accept real-time tasks. Therefore you will
+not be able to run real-time tasks as any user other than root until you have
+done that, even if the user has the rights to run processes with real-time
 priority!

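The /proc interface documented above can also be inspected from user space. The following is a minimal sketch (not part of the patch) that reads the two files and prints the share of each period reserved for real-time tasks; with the defaults quoted above it reports 95%.

#include <stdio.h>

static long read_long(const char *path)
{
        FILE *f = fopen(path, "r");
        long val = -1;

        if (f) {
                if (fscanf(f, "%ld", &val) != 1)
                        val = -1;
                fclose(f);
        }
        return val;
}

int main(void)
{
        long period = read_long("/proc/sys/kernel/sched_rt_period_us");
        long runtime = read_long("/proc/sys/kernel/sched_rt_runtime_us");

        if (runtime == -1)
                printf("RT runtime unlimited (runtime == period)\n");
        else if (period > 0 && runtime >= 0)
                printf("RT share: %ld/%ld us (%.1f%%)\n",
                       runtime, period, 100.0 * runtime / period);
        return 0;
}
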
@@ -1051,7 +1051,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 #endif
 	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
 	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };

@@ -1595,7 +1595,7 @@ static void add_cpu_to_masks(int cpu)
 	/* Skip all CPUs already part of current CPU core mask */
 	cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));

-	/* If chip_id is -1; limit the cpu_core_mask to within DIE*/
+	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
 	if (chip_id == -1)
 		cpumask_and(mask, mask, cpu_cpu_mask(cpu));

@@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
 	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };

@@ -4,6 +4,7 @@

 #include <linux/thread_info.h>
 #include <asm/nospec-branch.h>
+#include <asm/msr.h>

 /*
  * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
@@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
 	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
 }

+/*
+ * This can be used in noinstr functions & should only be called in bare
+ * metal context.
+ */
+static __always_inline void __update_spec_ctrl(u64 val)
+{
+	__this_cpu_write(x86_spec_ctrl_current, val);
+	native_wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
 #ifdef CONFIG_SMP
 extern void speculative_store_bypass_ht_init(void);
 #else

@@ -87,6 +87,7 @@
 #include <asm/hw_irq.h>
 #include <asm/stackprotector.h>
 #include <asm/sev.h>
+#include <asm/spec-ctrl.h>

 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -640,13 +641,13 @@ static void __init build_sched_topology(void)
 	};
 #endif
 	/*
-	 * When there is NUMA topology inside the package skip the DIE domain
+	 * When there is NUMA topology inside the package skip the PKG domain
	 * since the NUMA domains will auto-magically create the right spanning
	 * domains based on the SLIT.
	 */
 	if (!x86_has_numa_in_package) {
 		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
+			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
 		};
 	}

@@ -1596,8 +1597,15 @@ void __noreturn hlt_play_dead(void)
 	native_halt();
 }

+/*
+ * native_play_dead() is essentially a __noreturn function, but it can't
+ * be marked as such as the compiler may complain about it.
+ */
 void native_play_dead(void)
 {
+	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+		__update_spec_ctrl(0);
+
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);

@@ -53,9 +53,8 @@
 #include <linux/moduleparam.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
-#include <asm/nospec-branch.h>
 #include <asm/mwait.h>
-#include <asm/msr.h>
+#include <asm/spec-ctrl.h>
 #include <asm/fpu/api.h>

 #define INTEL_IDLE_VERSION "0.5.1"
@@ -69,6 +68,7 @@ static int max_cstate = CPUIDLE_STATE_MAX - 1;
 static unsigned int disabled_states_mask __read_mostly;
 static unsigned int preferred_states_mask __read_mostly;
 static bool force_irq_on __read_mostly;
+static bool ibrs_off __read_mostly;

 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;

@@ -182,12 +182,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
 	int ret;

 	if (smt_active)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		__update_spec_ctrl(0);

 	ret = __intel_idle(dev, drv, index);

 	if (smt_active)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
+		__update_spec_ctrl(spec_ctrl);

 	return ret;
 }
@@ -1853,11 +1853,13 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 	}

 	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
-	    state->flags & CPUIDLE_FLAG_IBRS) {
+	    ((state->flags & CPUIDLE_FLAG_IBRS) || ibrs_off)) {
 		/*
 		 * IBRS mitigation requires that C-states are entered
 		 * with interrupts disabled.
 		 */
+		if (ibrs_off && (state->flags & CPUIDLE_FLAG_IRQ_ENABLE))
+			state->flags &= ~CPUIDLE_FLAG_IRQ_ENABLE;
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_ibrs;
 		return;
@@ -2176,3 +2178,9 @@ MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
  * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags.
  */
 module_param(force_irq_on, bool, 0444);
+/*
+ * Force the disabling of IBRS when X86_FEATURE_KERNEL_IBRS is on and
+ * CPUIDLE_FLAG_IRQ_ENABLE isn't set.
+ */
+module_param(ibrs_off, bool, 0444);
+MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle");

@@ -155,6 +155,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif /* !CONFIG_HOTPLUG_CPU */

+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);

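A hedged sketch (not from this diff) of how the guard defined above is meant to be used via the linux/cleanup.h helpers: the lock is taken for the rest of the scope and released automatically on every return path.

#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>

/* Count online CPUs while holding cpus_read_lock() for the whole scope. */
static int count_online_cpus_locked(void)
{
        int cpu, n = 0;

        guard(cpus_read_lock)();        /* released automatically on return */
        for_each_online_cpu(cpu)
                n++;
        return n;
}
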
@@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_for_each(pos, head) \
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)

+/**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
 /**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos: the &struct list_head to use as a loop cursor.

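A short usage sketch for the new iterator (the struct and helper below are illustrative, not part of this diff): walk a list from tail to head.

#include <linux/list.h>

struct my_item {
        struct list_head node;
        int value;
};

/* Return the value of the last non-zero item on the list, or 0 if none. */
static int last_nonzero_value(struct list_head *head)
{
        struct list_head *pos;

        list_for_each_reverse(pos, head) {
                struct my_item *item = list_entry(pos, struct my_item, node);

                if (item->value)
                        return item->value;
        }
        return 0;
}
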
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;

 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */

@@ -551,9 +551,36 @@ struct vma_lock {
 };

 struct vma_numab_state {
+	/*
+	 * Initialised as time in 'jiffies' after which VMA
+	 * should be scanned. Delays first scan of new VMA by at
+	 * least sysctl_numa_balancing_scan_delay:
+	 */
 	unsigned long next_scan;
-	unsigned long next_pid_reset;
-	unsigned long access_pids[2];
+
+	/*
+	 * Time in jiffies when pids_active[] is reset to
+	 * detect phase change behaviour:
+	 */
+	unsigned long pids_active_reset;
+
+	/*
+	 * Approximate tracking of PIDs that trapped a NUMA hinting
+	 * fault. May produce false positives due to hash collisions.
+	 *
+	 * [0] Previous PID tracking
+	 * [1] Current PID tracking
+	 *
+	 * Window moves after next_pid_reset has expired approximately
+	 * every VMA_PID_RESET_PERIOD jiffies:
+	 */
+	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };

 /*

@@ -25,7 +25,7 @@
 #include <asm/sparsemem.h>

 /* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);

 #ifndef memory_add_physaddr_to_nid
 static inline int memory_add_physaddr_to_nid(u64 start)
@@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 #else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
 {
 	return NUMA_NO_NODE;
 }
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
@@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif

+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 extern const struct attribute_group arch_node_dev_group;
 #endif

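A hedged illustration of the generalized helper (not from this diff): callers can now ask for the nearest node in any node state, where the old numa_map_to_online_node() was limited to N_ONLINE.

#include <linux/numa.h>
#include <linux/nodemask.h>
#include <linux/topology.h>

/* Nearest node to @cpu's node that actually has memory. */
static int nearest_memory_node(int cpu)
{
        return numa_nearest_node(cpu_to_node(cpu), N_MEMORY);
}
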
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
 	return level;
 }

+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
 #define nmi_count() (preempt_count() & NMI_MASK)
 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
 #else
 # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 #endif
-#define irq_count() (nmi_count() | hardirq_count() | softirq_count())

 /*
  * Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
 #define in_nmi() (nmi_count())
 #define in_hardirq() (hardirq_count())
 #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
-#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif

 /*
  * The following macros are deprecated and should not be used in new code:

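A hedged sketch of a common pattern that relies on in_task() (not part of this diff): picking an allocation mode based on the current execution context.

#include <linux/preempt.h>
#include <linux/gfp.h>

/* Sleeping allocations are only safe in plain task context. */
static gfp_t context_gfp_flags(void)
{
        return in_task() ? GFP_KERNEL : GFP_ATOMIC;
}
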
@@ -63,7 +63,6 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 #endif

+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -750,10 +753,8 @@ struct task_struct {
 #endif
 	unsigned int __state;

-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int saved_state;
-#endif

 	/*
 	 * This begins the randomizable portion of task_struct. Only

@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H

 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);

 #endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */

@@ -15,6 +15,16 @@
 #define TNF_FAULT_LOCAL 0x08
 #define TNF_MIGRATE_FAIL 0x10

+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);

@@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *

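The new SD_CLUSTER level backs the cpus_share_resources() helper added elsewhere in this series ("sched: Add cpus_share_resources API"). The sketch below assumes that helper mirrors the cpus_share_cache() signature and is only an illustration, not code from this diff.

#include <linux/sched/topology.h>

/* Prefer the previous CPU when it sits in the same cluster as this one. */
static bool prefer_prev_cpu(int this_cpu, int prev_cpu)
{
        return cpus_share_resources(this_cpu, prev_cpu);
}
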
Some files were not shown because too many files have changed in this diff.