mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
91c27493e78df6849baaa21a9d66e26de8b875c0
39 Commits
| Author | SHA1 | Message | Date | |
|---|---|---|---|---|
|
|
6da2ec5605 |
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a * b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
|
||
|
|
bf5015a50f |
sched/topology: Clarify root domain(s) debug string
When scheduler debug is enabled, building scheduling domains outputs
information about how the domains are laid out and to which root domain
each CPU (or sets of CPUs) belongs, e.g.:
CPU0 attaching sched-domain(s):
domain-0: span=0-5 level=MC
groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }
CPU1 attaching sched-domain(s):
domain-0: span=0-5 level=MC
groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }, 0:{ span=0 }
[...]
span: 0-5 (max cpu_capacity = 1024)
The fact that latest line refers to CPUs 0-5 root domain doesn't however look
immediately obvious to me: one might wonder why span 0-5 is reported "again".
Make it more clear by adding "root domain" to it, as to end with the
following:
CPU0 attaching sched-domain(s):
domain-0: span=0-5 level=MC
groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }
CPU1 attaching sched-domain(s):
domain-0: span=0-5 level=MC
groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }, 0:{ span=0 }
[...]
root domain span: 0-5 (max cpu_capacity = 1024)
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180524152936.17611-1-juri.lelli@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||
|
|
325ea10c08 |
sched/headers: Simplify and clean up header usage in the scheduler
Do the following cleanups and simplifications: - sched/sched.h already includes <asm/paravirt.h>, so no need to include it in sched/core.c again. - order the <linux/sched/*.h> headers alphabetically - add all <linux/sched/*.h> headers to kernel/sched/sched.h - remove all unnecessary includes from the .c files that are already included in kernel/sched/sched.h. Finally, make all scheduler .c files use a single common header: #include "sched.h" ... which now contains a union of the relied upon headers. This makes the various .c files easier to read and easier to handle. Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
97fb7a0a89 |
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||
|
|
364f566537 |
sched/rt: Up the root domain ref count when passing it around via IPIs
When issuing an IPI RT push, where an IPI is sent to each CPU that has more
than one RT task scheduled on it, it references the root domain's rto_mask,
that contains all the CPUs within the root domain that has more than one RT
task in the runable state. The problem is, after the IPIs are initiated, the
rq->lock is released. This means that the root domain that is associated to
the run queue could be freed while the IPIs are going around.
Add a sched_get_rd() and a sched_put_rd() that will increment and decrement
the root domain's ref count respectively. This way when initiating the IPIs,
the scheduler will up the root domain's ref count before releasing the
rq->lock, ensuring that the root domain does not go away until the IPI round
is complete.
Reported-by: Pavan Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes:
|
||
|
|
8a103df440 |
Merge branch 'linus' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
b24413180f |
License cleanup: add SPDX GPL-2.0 license identifier to files with no license
Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a */uapi/* one with no licensing information in it, - file was a */uapi/* one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. - Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non */uapi/* files that summary was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a */uapi/* path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the */uapi/* ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). Results summary: SPDX license identifier # files ---------------------------------------------------|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). - when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
||
|
|
edb9382175 |
sched/isolation: Move isolcpus= handling to the housekeeping code
We want to centralize the isolation features, to be done by the housekeeping subsystem and scheduler domain isolation is a significant part of it. No intended behaviour change, we just reuse the housekeeping cpumask and core code. Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Christoph Lameter <cl@linux.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Wanpeng Li <kernellwp@gmail.com> Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
e22cdc3fc5 |
sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
cpulist_parse() uses nr_cpumask_bits as a limit to parse the passed buffer from kernel commandline. What nr_cpumask_bits represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option: - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as NR_CPUS, which might not represent the # of CPUs that really exist (default 64). So, there's a chance of a gap between nr_cpu_ids and NR_CPUS, which ultimately lead towards invalid cpulist_parse() operation. For example, if isolcpus=9 is passed on an 8 cpu system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error that it's supposed to. This patch fixes this bug by finding the last CPU of the passed isolcpus= list and checking it against nr_cpu_ids. It also fixes the error message where the nr_cpu_ids should be nr_cpu_ids-1, since CPU numbering starts from 0. Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: adobriyan@gmail.com Cc: akpm@linux-foundation.org Cc: longman@redhat.com Cc: mka@chromium.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com [ Enhanced the changelog and the kernel message. ] Signed-off-by: Ingo Molnar <mingo@kernel.org> include/linux/cpumask.h | 16 ++++++++++++++++ kernel/sched/topology.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) |
||
|
|
4bdced5c9a |
sched/rt: Simplify the IPI based RT balancing logic
When a CPU lowers its priority (schedules out a high priority task for a
lower priority one), a check is made to see if any other CPU has overloaded
RT tasks (more than one). It checks the rto_mask to determine this and if so
it will request to pull one of those tasks to itself if the non running RT
task is of higher priority than the new priority of the next task to run on
the current CPU.
When we deal with large number of CPUs, the original pull logic suffered
from large lock contention on a single CPU run queue, which caused a huge
latency across all CPUs. This was caused by only having one CPU having
overloaded RT tasks and a bunch of other CPUs lowering their priority. To
solve this issue, commit:
|
||
|
|
051f3ca02e |
sched/topology: Introduce NUMA identity node sched domain
On AMD Family17h-based (EPYC) system, a logical NUMA node can contain
upto 8 cores (16 threads) with the following topology.
----------------------------
C0 | T0 T1 | || | T0 T1 | C4
--------| || |--------
C1 | T0 T1 | L3 || L3 | T0 T1 | C5
--------| || |--------
C2 | T0 T1 | #0 || #1 | T0 T1 | C6
--------| || |--------
C3 | T0 T1 | || | T0 T1 | C7
----------------------------
Here, there are 2 last-level (L3) caches per logical NUMA node.
A socket can contain upto 4 NUMA nodes, and a system can support
upto 2 sockets. With full system configuration, current scheduler
creates 4 sched domains:
domain0 SMT (span a core)
domain1 MC (span a last-level-cache)
domain2 NUMA (span a socket: 4 nodes)
domain3 NUMA (span a system: 8 nodes)
Note that there is no domain to represent cpus spaning a logical
NUMA node. With this hierarchy of sched domains, the scheduler does
not balance properly in the following cases:
Case1:
When running 8 tasks, a properly balanced system should
schedule a task per logical NUMA node. This is not the case for
the current scheduler.
Case2:
In some cases, threads are scheduled on the same cpu, while other
cpus are idle. This results in run-to-run inconsistency. For example:
taskset -c 0-7 sysbench --num-threads=8 --test=cpu \
--cpu-max-prime=100000 run
Total execution time ranges from 25.1s to 33.5s depending on threads
placement, where 25.1s is when all 8 threads are balanced properly
on 8 cpus.
Introducing NUMA identity node sched domain, which is based on how
SRAT/SLIT table define a logical NUMA node. This results in the following
hierarchy of sched domains on the same system described above.
domain0 SMT (span a core)
domain1 MC (span a last-level-cache)
domain2 NODE (span a logical NUMA node)
domain3 NUMA (span a socket: 4 nodes)
domain4 NUMA (span a system: 8 nodes)
This fixes the improper load balancing cases mentioned above.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@suse.de
Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||
|
|
ed4ad1ca08 |
sched/topology: Restore SD_PREFER_SIBLING on MC domains
The normal x86_topology on NHM+ machines degenerates because the MC and CPU domains are of the same size, therefore MC inherits SD_PREFER_SIBLING from CPU (which then gets taken out). The result is that we'll spread tasks across the first NUMA level in order to maximize cache utilization. However, for the x86_numa_in_package_topology we loose the CPU domain, and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct difference in behaviour. Commit: |
||
|
|
ec846ecd63 |
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar: "Three CPU hotplug related fixes and a debugging improvement" * 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/debug: Add debugfs knob for "sched_debug" sched/core: WARN() when migrating to an offline CPU sched/fair: Plug hole between hotplug and active_load_balance() sched/fair: Avoid newidle balance for !active CPUs |
||
|
|
9469eb01db |
sched/debug: Add debugfs knob for "sched_debug"
I'm forever late for editing my kernel cmdline, add a runtime knob to disable the "sched_debug" thing. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20170907150614.142924283@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
9b130ad5bb |
treewide: make "nr_cpu_ids" unsigned
First, number of CPUs can't be negative number. Second, different signnnedness leads to suboptimal code in the following cases: 1) kmalloc(nr_cpu_ids * sizeof(X)); "int" has to be sign extended to size_t. 2) while (loff_t *pos < nr_cpu_ids) MOVSXD is 1 byte longed than the same MOV. Other cases exist as well. Basically compiler is told that nr_cpu_ids can't be negative which can't be deduced if it is "int". Code savings on allyesconfig kernel: -3KB add/remove: 0/0 grow/shrink: 25/264 up/down: 261/-3631 (-3370) function old new delta coretemp_cpu_online 450 512 +62 rcu_init_one 1234 1272 +38 pci_device_probe 374 399 +25 ... pgdat_reclaimable_pages 628 556 -72 select_fallback_rq 446 369 -77 task_numa_find_cpu 1923 1807 -116 Link: http://lkml.kernel.org/r/20170819114959.GA30580@avx2 Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
||
|
|
bbdacdfed2 |
sched/debug: Optimize sched_domain sysctl generation
Currently we unconditionally destroy all sysctl bits and regenerate them after we've rebuild the domains (even if that rebuild is a no-op). And since we unconditionally (re)build the sysctl for all possible CPUs, onlining all CPUs gets us O(n^2) time. Instead change this to only rebuild the bits for CPUs we've actually installed new domains on. Reported-by: Ofer Levi(SW) <oferle@mellanox.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
09e0dd8e0f |
sched/topology: Avoid pointless rebuild
Fix partition_sched_domains() to try and preserve the existing machine wide domain instead of unconditionally destroying it. We do this by attempting to allocate the new single domain, only when that fails to we reuse the fallback_doms. When using fallback_doms we need to first destroy and then recreate because both the old and new could be backed by it. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Ofer Levi(SW) <oferle@mellanox.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vineet.Gupta1@synopsys.com <Vineet.Gupta1@synopsys.com> Cc: rusty@rustcorp.com.au <rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
a090c4f2cd |
sched/topology: Improve comments
Mike provided a better comment for destroy_sched_domain() ... Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
213c5a459a |
sched/topology: Fix memory leak in __sdt_alloc()
Found this issue by kmemleak: the 'sg' and 'sgc' pointers from
__sdt_alloc() might be leaked as each domain holds many groups' ref,
but in destroy_sched_domain(), it only declined the first group ref.
Onlining and offlining a CPU can trigger this leak, and cause OOM.
Reproducer for my 6 CPUs machine:
while true
do
echo 0 > /sys/devices/system/cpu/cpu5/online;
echo 1 > /sys/devices/system/cpu/cpu5/online;
done
unreferenced object 0xffff88007d772a80 (size 64):
comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s)
hex dump (first 32 bytes):
c0 22 77 7d 00 88 ff ff 02 00 00 00 01 00 00 00 ."w}............
40 2a 77 7d 00 88 ff ff 00 00 00 00 00 00 00 00 @*w}............
backtrace:
[<ffffffff8176525a>] kmemleak_alloc+0x4a/0xa0
[<ffffffff8121efe1>] __kmalloc_node+0xf1/0x280
[<ffffffff810d94a8>] build_sched_domains+0x1e8/0xf20
[<ffffffff810da674>] partition_sched_domains+0x304/0x360
[<ffffffff81139557>] cpuset_update_active_cpus+0x17/0x40
[<ffffffff810bdb2e>] sched_cpu_activate+0xae/0xc0
[<ffffffff810900e0>] cpuhp_invoke_callback+0x90/0x400
[<ffffffff81090597>] cpuhp_up_callbacks+0x37/0xb0
[<ffffffff81090887>] cpuhp_thread_fun+0xd7/0xf0
[<ffffffff810b37e0>] smpboot_thread_fn+0x110/0x160
[<ffffffff810af5d9>] kthread+0x109/0x140
[<ffffffff81770e45>] ret_from_fork+0x25/0x30
[<ffffffffffffffff>] 0xffffffffffffffff
unreferenced object 0xffff88007d772a40 (size 64):
comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s)
hex dump (first 32 bytes):
03 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00 ................
00 04 00 00 00 00 00 00 4f 3c fc ff 00 00 00 00 ........O<......
backtrace:
[<ffffffff8176525a>] kmemleak_alloc+0x4a/0xa0
[<ffffffff8121efe1>] __kmalloc_node+0xf1/0x280
[<ffffffff810da16d>] build_sched_domains+0xead/0xf20
[<ffffffff810da674>] partition_sched_domains+0x304/0x360
[<ffffffff81139557>] cpuset_update_active_cpus+0x17/0x40
[<ffffffff810bdb2e>] sched_cpu_activate+0xae/0xc0
[<ffffffff810900e0>] cpuhp_invoke_callback+0x90/0x400
[<ffffffff81090597>] cpuhp_up_callbacks+0x37/0xb0
[<ffffffff81090887>] cpuhp_thread_fun+0xd7/0xf0
[<ffffffff810b37e0>] smpboot_thread_fn+0x110/0x160
[<ffffffff810af5d9>] kthread+0x109/0x140
[<ffffffff81770e45>] ret_from_fork+0x25/0x30
[<ffffffffffffffff>] 0xffffffffffffffff
Reported-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Shu Wang <shuwang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Chunyu Hu <chuhu@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: liwang@redhat.com
Link: http://lkml.kernel.org/r/1502351536-9108-1-git-send-email-shuwang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||
|
|
181a80d1f7 |
sched: Mark pick_next_task_dl() and build_sched_domain() as static
pick_next_task_dl() and build_sched_domain() aren't used outside deadline.c and topology.c. Make them static. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vincent Guittot <vincent.guittot@linaro.org> Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/36e4cbb6210002cadae89920ae97e19e7e513008.1493281605.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
4d13a06d54 |
sched/topology: Drop memset() from init_rootdomain()
There are only two callers of init_rootdomain(). One of them passes a global to it and another one sends dynamically allocated root-domain. There is no need to memset the root-domain in the first case as the structure is already reset. Update alloc_rootdomain() to allocate the memory with kzalloc() and remove the memset() call from init_rootdomain(). Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vincent Guittot <vincent.guittot@linaro.org> Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/fc2f6cc90b098040970c85a97046512572d765bc.1492065513.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
ae4df9d6c9 |
sched/topology: Rename sched_group_cpus()
There's a discrepancy in naming between the sched_domain and
sched_group cpumask accessor. Since we're doing changes, fix it.
$ git grep sched_group_cpus | wc -l
28
$ git grep sched_domain_span | wc -l
38
Suggests changing sched_group_cpus() into sched_group_span():
for i in `git grep -l sched_group_cpus`
do
sed -ie 's/sched_group_cpus/sched_group_span/g' $i
done
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||
|
|
e5c14b1fb8 |
sched/topology: Rename sched_group_mask()
Since sched_group_mask() is now an independent cpumask (it no longer masks sched_group_cpus()), rename the thing. Suggested-by: Lauro Ramos Venancio <lvenanci@redhat.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
af218122b1 |
sched/topology: Simplify sched_group_mask() usage
While writing the comments, it occurred to me that: sg_cpus & sg_mask == sg_mask at least conceptually; the !overlap case sets the all 1s mask. If we correct that we can simplify things and directly use sg_mask. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |
||
|
|
0c0e776a9b |
sched/topology: Rewrite get_group()
We want to attain: sg_cpus() & sg_mask() == sg_mask() for this to be so we must initialize sg_mask() to sg_cpus() for the !overlap case (its currently cpumask_setall()). Since the code makes my head hurt bad, rewrite it into a simpler form, inspired by the now fixed overlap code. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> |