Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "The main changes are: - lockless wakeup support for futexes and IPC message queues (Davidlohr Bueso, Peter Zijlstra) - Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability (Jason Low) - NUMA balancing improvements (Rik van Riel) - SCHED_DEADLINE improvements (Wanpeng Li) - clean up and reorganize preemption helpers (Frederic Weisbecker) - decouple page fault disabling machinery from the preemption counter, to improve debuggability and robustness (David Hildenbrand) - SCHED_DEADLINE documentation updates (Luca Abeni) - topology CPU masks cleanups (Bartosz Golaszewski) - /proc/sched_debug improvements (Srikar Dronamraju)" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (79 commits) sched/deadline: Remove needless parameter in dl_runtime_exceeded() sched: Remove superfluous resetting of the p->dl_throttled flag sched/deadline: Drop duplicate init_sched_dl_class() declaration sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target sched/deadline: Make init_sched_dl_class() __init sched/deadline: Optimize pull_dl_task() sched/preempt: Add static_key() to preempt_notifiers sched/preempt: Fix preempt notifiers documentation about hlist_del() within unsafe iteration sched/stop_machine: Fix deadlock between multiple stop_two_cpus() sched/debug: Add sum_sleep_runtime to /proc/<pid>/sched sched/debug: Replace vruntime with wait_sum in /proc/sched_debug sched/debug: Properly format runnable tasks in /proc/sched_debug sched/numa: Only consider less busy nodes as numa balancing destinations Revert 095bebf61a ("sched/numa: Do not move past the balance point if unbalanced") sched/fair: Prevent throttling in early pick_next_task_fair() preempt: Reorganize the notrace definitions a bit preempt: Use preempt_schedule_context() as the official tracing preemption point sched: Make preempt_schedule_context() function-tracing safe x86: Remove cpu_sibling_mask() and cpu_core_mask() x86: Replace cpu_**_mask() with topology_**_cpumask() ...
2026-05-01 15:00:59 -07:00 · 2015-06-22 15:52:04 -07:00
parent 6bc4c3ad36 6fab541019
commit 23b7776290
138 changed files with 1444 additions and 974 deletions
@@ -1,6 +1,6 @@

 Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo.
+to /proc/cpuinfo output of some architectures:

 1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:

@@ -23,20 +23,35 @@ to /proc/cpuinfo.
 4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:

 	internal kernel map of cpuX's hardware threads within the same
-	core as cpuX
+	core as cpuX.

-5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	core as cpuX.
+
+6) /sys/devices/system/cpu/cpuX/topology/core_siblings:

 	internal kernel map of cpuX's hardware threads within the same
 	physical_package_id.

-6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+7) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	physical_package_id.
+
+8) /sys/devices/system/cpu/cpuX/topology/book_siblings:

 	internal kernel map of cpuX's hardware threads within the same
 	book_id.

+9) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	book_id.
+
 To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+drivers/base/topology.c, is to export the 6 or 9 attributes. The three book
 related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.

 For an architecture to support this feature, it must define some of
@@ -44,20 +59,22 @@ these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
 #define topology_book_id(cpu)
-#define topology_thread_cpumask(cpu)
+#define topology_sibling_cpumask(cpu)
 #define topology_core_cpumask(cpu)
 #define topology_book_cpumask(cpu)

-The type of **_id is int.
-The type of siblings is (const) struct cpumask *.
+The type of **_id macros is int.
+The type of **_cpumask macros is (const) struct cpumask *. The latter
+correspond with appropriate **_siblings sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).

 To be consistent on all architectures, include/linux/topology.h
 provides default definitions for any of the above macros that are
 not defined by include/asm-XXX/topology.h:
 1) physical_package_id: -1
 2) core_id: 0
-3) thread_siblings: just the given CPU
-4) core_siblings: just the given CPU
+3) sibling_cpumask: just the given CPU
+4) core_cpumask: just the given CPU

 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
@@ -8,6 +8,10 @@ CONTENTS
 1. Overview
 2. Scheduling algorithm
 3. Scheduling Real-Time Tasks
+   3.1 Definitions
+   3.2 Schedulability Analysis for Uniprocessor Systems
+   3.3 Schedulability Analysis for Multiprocessor Systems
+   3.4 Relationship with SCHED_DEADLINE Parameters
 4. Bandwidth management
   4.1 System-wide settings
   4.2 Task interface
@@ -43,7 +47,7 @@ CONTENTS
 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
 "runtime" microseconds of execution time every "period" microseconds, and
 these "runtime" microseconds are available within "deadline" microseconds
- from the beginning of the period.  In order to implement this behaviour,
+ from the beginning of the period.  In order to implement this behavior,
 every time the task wakes up, the scheduler computes a "scheduling deadline"
 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
 scheduled using EDF[1] on these scheduling deadlines (the task with the
@@ -52,7 +56,7 @@ CONTENTS
 "admission control" strategy (see Section "4. Bandwidth management") is used
 (clearly, if the system is overloaded this guarantee cannot be respected).

- Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so
+ Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
 that each task runs for at most its runtime every period, avoiding any
 interference between different tasks (bandwidth isolation), while the EDF[1]
 algorithm selects the task with the earliest scheduling deadline as the one
@@ -63,7 +67,7 @@ CONTENTS
 In more details, the CBS algorithm assigns scheduling deadlines to
 tasks in the following way:

-  - Each SCHED_DEADLINE task is characterised by the "runtime",
+  - Each SCHED_DEADLINE task is characterized by the "runtime",
    "deadline", and "period" parameters;

  - The state of the task is described by a "scheduling deadline", and
@@ -78,7 +82,7 @@ CONTENTS

    then, if the scheduling deadline is smaller than the current time, or
    this condition is verified, the scheduling deadline and the
-    remaining runtime are re-initialised as
+    remaining runtime are re-initialized as

         scheduling deadline = current time + deadline
         remaining runtime = runtime
@@ -126,31 +130,37 @@ CONTENTS
 suited for periodic or sporadic real-time tasks that need guarantees on their
 timing behavior, e.g., multimedia, streaming, control applications, etc.

+3.1 Definitions
+------------------------
+
 A typical real-time task is composed of a repetition of computation phases
 (task instances, or jobs) which are activated on a periodic or sporadic
 fashion.
- Each job J_j (where J_j is the j^th job of the task) is characterised by an
+ Each job J_j (where J_j is the j^th job of the task) is characterized by an
 arrival time r_j (the time when the job starts), an amount of computation
 time c_j needed to finish the job, and a job absolute deadline d_j, which
 is the time within which the job should be finished. The maximum execution
- time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task.
+ time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
 sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
 d_j = r_j + D, where D is the task's relative deadline.
- The utilisation of a real-time task is defined as the ratio between its
+ Summing up, a real-time task can be described as
+	Task = (WCET, D, P)
+
+ The utilization of a real-time task is defined as the ratio between its
 WCET and its period (or minimum inter-arrival time), and represents
 the fraction of CPU time needed to execute the task.

- If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal
+ If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
 to the number of CPUs), then the scheduler is unable to respect all the
 deadlines.
- Note that total utilisation is defined as the sum of the utilisations
+ Note that total utilization is defined as the sum of the utilizations
 WCET_i/P_i over all the real-time tasks in the system. When considering
 multiple real-time tasks, the parameters of the i-th task are indicated
 with the "_i" suffix.
- Moreover, if the total utilisation is larger than M, then we risk starving
+ Moreover, if the total utilization is larger than M, then we risk starving
 non- real-time tasks by real-time tasks.
- If, instead, the total utilisation is smaller than M, then non real-time
+ If, instead, the total utilization is smaller than M, then non real-time
 tasks will not be starved and the system might be able to respect all the
 deadlines.
 As a matter of fact, in this case it is possible to provide an upper bound
@@ -159,38 +169,119 @@ CONTENTS
 More precisely, it can be proven that using a global EDF scheduler the
 maximum tardiness of each task is smaller or equal than
 	((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
- where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i}
- is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation.
+ where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
+ is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
+ utilization[12].
+
+3.2 Schedulability Analysis for Uniprocessor Systems
+------------------------

 If M=1 (uniprocessor system), or in case of partitioned scheduling (each
 real-time task is statically assigned to one and only one CPU), it is
 possible to formally check if all the deadlines are respected.
 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
- of all the tasks executing on a CPU if and only if the total utilisation
+ of all the tasks executing on a CPU if and only if the total utilization
 of the tasks running on such a CPU is smaller or equal than 1.
 If D_i != P_i for some task, then it is possible to define the density of
- a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines
- of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the
- densities of the tasks running on such a CPU is smaller or equal than 1
- (notice that this condition is only sufficient, and not necessary).
+ a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
+ of all the tasks running on a CPU if the sum of the densities of the tasks
+ running on such a CPU is smaller or equal than 1:
+	sum(WCET_i / min{D_i, P_i}) <= 1
+ It is important to notice that this condition is only sufficient, and not
+ necessary: there are task sets that are schedulable, but do not respect the
+ condition. For example, consider the task set {Task_1,Task_2} composed by
+ Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
+ EDF is clearly able to schedule the two tasks without missing any deadline
+ (Task_1 is scheduled as soon as it is released, and finishes just in time
+ to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
+ its response time cannot be larger than 50ms + 10ms = 60ms) even if
+	50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
+ Of course it is possible to test the exact schedulability of tasks with
+ D_i != P_i (checking a condition that is both sufficient and necessary),
+ but this cannot be done by comparing the total utilization or density with
+ a constant. Instead, the so called "processor demand" approach can be used,
+ computing the total amount of CPU time h(t) needed by all the tasks to
+ respect all of their deadlines in a time interval of size t, and comparing
+ such a time with the interval size t. If h(t) is smaller than t (that is,
+ the amount of time needed by the tasks in a time interval of size t is
+ smaller than the size of the interval) for all the possible values of t, then
+ EDF is able to schedule the tasks respecting all of their deadlines. Since
+ performing this check for all possible values of t is impossible, it has been
+ proven[4,5,6] that it is sufficient to perform the test for values of t
+ between 0 and a maximum value L. The cited papers contain all of the
+ mathematical details and explain how to compute h(t) and L.
+ In any case, this kind of analysis is too complex as well as too
+ time-consuming to be performed on-line. Hence, as explained in Section
+ 4 Linux uses an admission test based on the tasks' utilizations.
+
+3.3 Schedulability Analysis for Multiprocessor Systems
+------------------------

 On multiprocessor systems with global EDF scheduling (non partitioned
 systems), a sufficient test for schedulability can not be based on the
- utilisations (it can be shown that task sets with utilisations slightly
- larger than 1 can miss deadlines regardless of the number of CPUs M).
- However, as previously stated, enforcing that the total utilisation is smaller
- than M is enough to guarantee that non real-time tasks are not starved and
- that the tardiness of real-time tasks has an upper bound.
+ utilizations or densities: it can be shown that even if D_i = P_i task
+ sets with utilizations slightly larger than 1 can miss deadlines regardless
+ of the number of CPUs.

- SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that
- the jobs' deadlines of a task are respected. In order to do this, a task
- must be scheduled by setting:
+ Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
+ CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
+ and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
+ arbitrarily small worst case execution time (indicated as "e" here) and a
+ period smaller than the one of the first task. Hence, if all the tasks
+ activate at the same time t, global EDF schedules these M tasks first
+ (because their absolute deadlines are equal to t + P - 1, hence they are
+ smaller than the absolute deadline of Task_1, which is t + P). As a
+ result, Task_1 can be scheduled only at time t + e, and will finish at
+ time t + e + P, after its absolute deadline. The total utilization of the
+ task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
+ values of e this can become very close to 1. This is known as "Dhall's
+ effect"[7]. Note: the example in the original paper by Dhall has been
+ slightly simplified here (for example, Dhall more correctly computed
+ lim_{e->0}U).
+
+ More complex schedulability tests for global EDF have been developed in
+ real-time literature[8,9], but they are not based on a simple comparison
+ between total utilization (or density) and a fixed constant. If all tasks
+ have D_i = P_i, a sufficient schedulability condition can be expressed in
+ a simple way:
+	sum(WCET_i / P_i) <= M - (M - 1) · U_max
+ where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
+ M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
+ just confirms the Dhall's effect. A more complete survey of the literature
+ about schedulability tests for multi-processor real-time scheduling can be
+ found in [11].
+
+ As seen, enforcing that the total utilization is smaller than M does not
+ guarantee that global EDF schedules the tasks without missing any deadline
+ (in other words, global EDF is not an optimal scheduling algorithm). However,
+ a total utilization smaller than M is enough to guarantee that non real-time
+ tasks are not starved and that the tardiness of real-time tasks has an upper
+ bound[12] (as previously noted). Different bounds on the maximum tardiness
+ experienced by real-time tasks have been developed in various papers[13,14],
+ but the theoretical result that is important for SCHED_DEADLINE is that if
+ the total utilization is smaller or equal than M then the response times of
+ the tasks are limited.
+
+3.4 Relationship with SCHED_DEADLINE Parameters
+------------------------
+
+ Finally, it is important to understand the relationship between the
+ SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
+ deadline and period) and the real-time task parameters (WCET, D, P)
+ described in this section. Note that the tasks' temporal constraints are
+ represented by its absolute deadlines d_j = r_j + D described above, while
+ SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
+ Section 2).
+ If an admission test is used to guarantee that the scheduling deadlines
+ are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
+ guaranteeing that all the jobs' deadlines of a task are respected.
+ In order to do this, a task must be scheduled by setting:

  - runtime >= WCET
  - deadline = D
  - period <= P

- IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines
+ IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
 and the absolute deadlines (d_j) coincide, so a proper admission control
 allows to respect the jobs' absolute deadlines for this task (this is what is
 called "hard schedulability property" and is an extension of Lemma 1 of [2]).
@@ -206,6 +297,39 @@ CONTENTS
      Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
  3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
      Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
+  4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
+      Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
+      no. 3, pp. 115-118, 1980.
+  5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
+      Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
+      11th IEEE Real-time Systems Symposium, 1990.
+  6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
+      Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
+      One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
+      1990.
+  7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
+      research, vol. 26, no. 1, pp 127-140, 1978.
+  8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
+      Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
+  9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
+      IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
+      pp 760-768, 2005.
+  10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
+       Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
+       vol. 25, no. 2–3, pp. 187–205, 2003.
+  11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
+       Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
+       http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
+  12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
+       Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
+       no. 2, pp 133-189, 2008.
+  13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
+       Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
+       the 26th IEEE Real-Time Systems Symposium, 2005.
+  14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
+       Global EDF. Proceedings of the 22nd Euromicro Conference on
+       Real-Time Systems, 2010.
+

 4. Bandwidth management
 =======================
@@ -218,10 +342,10 @@ CONTENTS
 no guarantee can be given on the actual scheduling of the -deadline tasks.

 As already stated in Section 3, a necessary condition to be respected to
- correctly schedule a set of real-time tasks is that the total utilisation
+ correctly schedule a set of real-time tasks is that the total utilization
 is smaller than M. When talking about -deadline tasks, this requires that
 the sum of the ratio between runtime and period for all tasks is smaller
- than M. Notice that the ratio runtime/period is equivalent to the utilisation
+ than M. Notice that the ratio runtime/period is equivalent to the utilization
 of a "traditional" real-time task, and is also often referred to as
 "bandwidth".
 The interface used to control the CPU bandwidth that can be allocated
@@ -251,7 +375,7 @@ CONTENTS
 The system wide settings are configured under the /proc virtual file system.

 For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realise that this
+ -deadline runtime is accounted against the -rt runtime. We realize that this
 isn't entirely desirable; however, it is better to have a small interface for
 now, and be able to change it easily later. The ideal situation (see 5.) is to
 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
@@ -23,8 +23,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>

 extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);

@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,

 	/* If we're in an interrupt context, or have no user context,
 	   we must not take the fault.  */
-	if (!mm || in_atomic())
+	if (!mm || faulthandler_disabled())
 		goto no_context;

 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;

-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();

 	switch (op) {
 	case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}

-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();

 	if (!ret) {
 		switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	return ret;
 }

-/* Compare-xchg with preemption disabled.
+/* Compare-xchg with pagefaults disabled.
 *  Notes:
 *      -Best-Effort: Exchg happens only if compare succeeds.
 *          If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;

-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();

 	/* TBD : can use llock/scond */
 	__asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	: "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
 	: "cc", "memory");

-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();

 	*uval = val;
 	return val;
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 	if (user_mode(regs))
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

+	preempt_disable();
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	" TUSER(ldr) "	%1, [%4]\n"
 	"	teq	%1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	: "cc", "memory");

 	*uval = val;
+	preempt_enable();
+
 	return ret;
 }

@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

-	pagefault_disable();	/* implies preempt_disable() */
+#ifndef CONFIG_SMP
+	preempt_disable();
+#endif
+	pagefault_disable();

 	switch (op) {
 	case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}

-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
+#ifndef CONFIG_SMP
+	preempt_enable();
+#endif

 	if (!ret) {
 		switch (cmp) {
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].socket_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)

 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 	if (user_mode(regs))
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
 	void *kmap;
 	int type;

+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
 		kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
 	}
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);

@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	int idx, type;
 	struct page *page = pfn_to_page(pfn);

+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;

-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();

 	switch (op) {
 	case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}

-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();

 	if (!ret) {
 		switch (cmp) {
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)

 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	 * If we're in an interrupt or have no user context, we must not take
 	 * the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 	if (user_mode(regs))
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
@@ -14,11 +14,11 @@
 #include <linux/pagemap.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>

 #include <asm/mmu_context.h>
 #include <asm/sysreg.h>
 #include <asm/tlb.h>
-#include <asm/uaccess.h>

 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user context, we must
 	 * not take the fault...
 	 */
-	if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+	if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
 		goto no_context;

 	local_irq_enable();
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <arch/system.h>

 extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	info.si_code = SEGV_MAPERR;

 	/*
-	 * If we're in an interrupt or "atomic" operation or have no
+	 * If we're in an interrupt, have pagefaults disabled or have no
 	 * user context, we must not take the fault.
 	 */

-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 	if (user_mode(regs))
@@ -19,9 +19,9 @@
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>

 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
 #include <asm/gdb-stub.h>

 /*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 	if (user_mode(__frame))
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long paddr;
 	int type;

+	preempt_disable();
 	pagefault_disable();
 	type = kmap_atomic_idx_push();
 	paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
 	}
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
@@ -36,7 +36,8 @@
 * @addr: User space pointer to start of block to check
 * @size: Size of block to check
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * Checks if a pointer to a block of memory in user space is valid.
 *
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
 #define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif

 extern void arch_fix_phys_package_id(int num, u32 slot);
@@ -11,10 +11,10 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/prefetch.h>
+#include <linux/uaccess.h>

 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>

 extern int die(char *, struct pt_regs *, long);

@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * If we're in an interrupt or have no user context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;

 #ifdef CONFIG_VIRTUAL_MEM_MAP
--- a/Show More
+++ b/Show More