Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Implement the SCHED_DEADLINE server infrastructure - Daniel Bristot
   de Oliveira's last major contribution to the kernel:

     "SCHED_DEADLINE servers can help fixing starvation issues of low
      priority tasks (e.g., SCHED_OTHER) when higher priority tasks
      monopolize CPU cycles. Today we have RT Throttling; DEADLINE
      servers should be able to replace and improve that."

   (Daniel Bristot de Oliveira, Peter Zijlstra, Joel Fernandes, Youssef
   Esmat, Huang Shijie)

 - Preparatory changes for sched_ext integration:
     - Use set_next_task(.first) where required
     - Fix up set_next_task() implementations
     - Clean up DL server vs. core sched
     - Split up put_prev_task_balance()
     - Rework pick_next_task()
     - Combine the last put_prev_task() and the first set_next_task()
     - Rework dl_server
     - Add put_prev_task(.next)

   (Peter Zijlstra, with a fix by Tejun Heo)

 - Complete the EEVDF transition and refine EEVDF scheduling:
     - Implement delayed dequeue
     - Allow shorter slices to wakeup-preempt
     - Use sched_attr::sched_runtime to set request/slice suggestion
     - Document the new feature flags
     - Remove unused and duplicate-functionality fields
     - Simplify & unify pick_next_task_fair()
     - Misc debuggability enhancements

   (Peter Zijlstra, with fixes/cleanups by Dietmar Eggemann, Valentin
   Schneider and Chuyi Zhou)

 - Initialize the vruntime of a new task when it is first enqueued,
   resulting in significant decrease in latency of newly woken tasks
   (Zhang Qiao)

 - Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
   (K Prateek Nayak, Peter Zijlstra)

 - Clean up and clarify the usage of Clean up usage of rt_task()
   (Qais Yousef)

 - Preempt SCHED_IDLE entities in strict cgroup hierarchies
   (Tianchen Ding)

 - Clarify the documentation of time units for deadline scheduler
   parameters (Christian Loehle)

 - Remove the HZ_BW chicken-bit feature flag introduced a year ago,
   the original change seems to be working fine (Phil Auld)

 - Misc fixes and cleanups (Chen Yu, Dan Carpenter, Huang Shijie,
   Peilin He, Qais Yousefm and Vincent Guittot)

* tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/cpufreq: Use NSEC_PER_MSEC for deadline task
  cpufreq/cppc: Use NSEC_PER_MSEC for deadline task
  sched/deadline: Clarify nanoseconds in uapi
  sched/deadline: Convert schedtool example to chrt
  sched/debug: Fix the runnable tasks output
  sched: Fix sched_delayed vs sched_core
  kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
  kthread: Fix task state in kthread worker if being frozen
  sched/pelt: Use rq_clock_task() for hw_pressure
  sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c
  sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
  sched: Add put_prev_task(.next)
  sched: Rework dl_server
  sched: Combine the last put_prev_task() and the first set_next_task()
  sched: Rework pick_next_task()
  sched: Split up put_prev_task_balance()
  sched: Clean up DL server vs core sched
  sched: Fixup set_next_task() implementations
  sched: Use set_next_task(.first) where required
  sched/fair: Properly deactivate sched_delayed task upon class change
  ...
This commit is contained in:
Linus Torvalds
2024-09-19 15:55:58 +02:00
32 changed files with 1695 additions and 747 deletions

View File

@@ -749,21 +749,19 @@ Appendix A. Test suite
of the command line options. Please refer to rt-app documentation for more
details (`<rt-app-sources>/doc/*.json`).
The second testing application is a modification of schedtool, called
schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
certain pid/application. schedtool-dl is available at:
https://github.com/scheduler-tools/schedtool-dl.git.
The second testing application is done using chrt which has support
for SCHED_DEADLINE.
The usage is straightforward::
# schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
# chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app
With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
of 10ms every 100ms (note that parameters are expressed in microseconds).
You can also use schedtool to create a reservation for an already running
of 10ms every 100ms (note that parameters are expressed in nanoseconds).
You can also use chrt to create a reservation for an already running
application, given that you know its pid::
# schedtool -E -t 10000000:100000000 my_app_pid
# chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid
Appendix B. Minimal main()
==========================

View File

@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
.sched_runtime = NSEC_PER_MSEC,
.sched_deadline = 10 * NSEC_PER_MSEC,
.sched_period = 10 * NSEC_PER_MSEC,
};
int ret;

View File

@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock)
*/
rcu_read_lock();
struct task_struct *owner = READ_ONCE(lock->owner);
bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
rcu_read_unlock();
return ret;

View File

@@ -2626,7 +2626,7 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
if (task_is_realtime(p))
if (rt_or_dl_task_policy(p))
slack_ns = 0;
else if (slack_ns == 0)
slack_ns = p->default_timer_slack_ns;

View File

@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task_is_realtime(task))
else if (rt_or_dl_task_policy(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;

View File

@@ -149,8 +149,9 @@ struct user_event_mm;
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
@@ -541,9 +542,14 @@ struct sched_entity {
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
u64 min_slice;
struct list_head group_node;
unsigned int on_rq;
unsigned char on_rq;
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
/* hole */
u64 exec_start;
u64 sum_exec_runtime;
@@ -639,12 +645,26 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*
* @dl_server tells if this is a server entity.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -672,7 +692,7 @@ struct sched_dl_entity {
*/
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*

View File

@@ -10,16 +10,16 @@
#include <linux/sched.h>
#define MAX_DL_PRIO 0
static inline int dl_prio(int prio)
static inline bool dl_prio(int prio)
{
if (unlikely(prio < MAX_DL_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_DL_PRIO);
}
static inline int dl_task(struct task_struct *p)
/*
* Returns true if a task has a priority that belongs to DL class. PI-boosted
* tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
*/
static inline bool dl_task(struct task_struct *p)
{
return dl_prio(p->prio);
}

View File

@@ -14,6 +14,7 @@
*/
#define MAX_RT_PRIO 100
#define MAX_DL_PRIO 0
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)

View File

@@ -6,19 +6,40 @@
struct task_struct;
static inline int rt_prio(int prio)
static inline bool rt_prio(int prio)
{
if (unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
}
static inline int rt_task(struct task_struct *p)
static inline bool rt_or_dl_prio(int prio)
{
return unlikely(prio < MAX_RT_PRIO);
}
/*
* Returns true if a task has a priority that belongs to RT class. PI-boosted
* tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
*/
static inline bool rt_task(struct task_struct *p)
{
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
/*
* Returns true if a task has a priority that belongs to RT or DL classes.
* PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
* PI-boosted tasks.
*/
static inline bool rt_or_dl_task(struct task_struct *p)
{
return rt_or_dl_prio(p->prio);
}
/*
* Returns true if a task has a policy that belongs to RT or DL classes.
* PI-boosted tasks will return false.
*/
static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
{
int policy = tsk->policy;

View File

@@ -58,9 +58,9 @@
*
* This is reflected by the following fields of the sched_attr structure:
*
* @sched_deadline representative of the task's deadline
* @sched_runtime representative of the task's runtime
* @sched_period representative of the task's period
* @sched_deadline representative of the task's deadline in nanoseconds
* @sched_runtime representative of the task's runtime in nanoseconds
* @sched_period representative of the task's period in nanoseconds
*
* Given this task model, there are a multiplicity of scheduling algorithms
* and policies, that can be used to ensure all the tasks will make their

View File

@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop)
bool freeze;
raw_spin_lock_irq(&current->pi_lock);
set_current_state(TASK_FROZEN);
WRITE_ONCE(current->__state, TASK_FROZEN);
/* unstale saved_state so that __thaw_task() will wake us up */
current->saved_state = TASK_RUNNING;
raw_spin_unlock_irq(&current->pi_lock);

View File

@@ -845,8 +845,16 @@ repeat:
* event only cares about the address.
*/
trace_sched_kthread_work_execute_end(work, func);
} else if (!freezing(current))
} else if (!freezing(current)) {
schedule();
} else {
/*
* Handle the case where the current remains
* TASK_INTERRUPTIBLE. try_to_freeze() expects
* the current to be TASK_RUNNING.
*/
__set_current_state(TASK_RUNNING);
}
try_to_freeze();
cond_resched();

View File

@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task)
{
int prio = task->prio;
if (!rt_prio(prio))
if (!rt_or_dl_prio(prio))
return DEFAULT_PRIO;
return prio;
@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
if (rt_or_dl_prio(waiter->tree.prio))
return false;
return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);

View File

@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* if it is an RT task or wait in the wait queue
* for too long.
*/
if (has_handoff || (!rt_task(waiter->task) &&
if (has_handoff || (!rt_or_dl_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -914,7 +914,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (owner_state != OWNER_WRITER) {
if (need_resched())
break;
if (rt_task(current) &&
if (rt_or_dl_task(current) &&
(prev_owner_state != OWNER_WRITER))
break;
}

View File

@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
int a_prio = a->task->prio;
int b_prio = b->task->prio;
if (rt_prio(a_prio) || rt_prio(b_prio)) {
if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
if (a_prio > b_prio)
return true;

File diff suppressed because it is too large Load Diff

View File

@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
.sched_runtime = NSEC_PER_MSEC,
.sched_deadline = 10 * NSEC_PER_MSEC,
.sched_period = 10 * NSEC_PER_MSEC,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;

File diff suppressed because it is too large Load Diff

View File

@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = {
.release = seq_release,
};
enum dl_param {
DL_RUNTIME = 0,
DL_PERIOD,
};
static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos, enum dl_param param)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
u64 runtime, period;
size_t err;
int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
if (err)
return err;
scoped_guard (rq_lock_irqsave, rq) {
runtime = rq->fair_server.dl_runtime;
period = rq->fair_server.dl_period;
switch (param) {
case DL_RUNTIME:
if (runtime == value)
break;
runtime = value;
break;
case DL_PERIOD:
if (value == period)
break;
period = value;
break;
}
if (runtime > period ||
period > fair_server_period_max ||
period < fair_server_period_min) {
return -EINVAL;
}
if (rq->cfs.h_nr_running) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
cnt = retval;
if (!runtime)
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
if (rq->cfs.h_nr_running)
dl_server_start(&rq->fair_server);
}
*ppos += cnt;
return cnt;
}
static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
u64 value;
switch (param) {
case DL_RUNTIME:
value = rq->fair_server.dl_runtime;
break;
case DL_PERIOD:
value = rq->fair_server.dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_RUNTIME);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
}
static const struct file_operations fair_server_runtime_fops = {
.open = sched_fair_server_runtime_open,
.write = sched_fair_server_runtime_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_PERIOD);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_period_show, inode->i_private);
}
static const struct file_operations fair_server_period_fops = {
.open = sched_fair_server_period_open,
.write = sched_fair_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
{
struct dentry *d_fair;
unsigned long cpu;
d_fair = debugfs_create_dir("fair_server", debugfs_sched);
if (!d_fair)
return;
for_each_possible_cpu(cpu) {
struct dentry *d_cpu;
char buf[32];
snprintf(buf, sizeof(buf), "cpu%lu", cpu);
d_cpu = debugfs_create_dir(buf, d_fair);
debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
}
}
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -374,6 +531,8 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
return 0;
}
late_initcall(sched_init_debug);
@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
SPLIT_NS(p->se.deadline),
p->se.custom_slice ? 'S' : ' ',
SPLIT_NS(p->se.slice),
SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf_task_group_path(m, task_group(p), " %s")
SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n");
SEQ_printf(m, " S task PID vruntime eligible "
"deadline slice sum-exec switches "
"prio wait-time sum-sleep sum-block"
#ifdef CONFIG_NUMA_BALANCING
" node group-id"
#endif
#ifdef CONFIG_CGROUP_SCHED
" group-path"
#endif
"\n");
SEQ_printf(m, "-------------------------------------------------------"
"------------------------------------------------------\n");
"------------------------------------------------------"
"------------------------------------------------------"
#ifdef CONFIG_NUMA_BALANCING
"--------------"
#endif
#ifdef CONFIG_CGROUP_SCHED
"--------------"
#endif
"\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
root = __pick_root_entity(cfs_rq);
@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
#endif
#undef PN
#undef PU

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More