You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'linus' into x86/urgent
Merge reason: Merge upstream commits to avoid conflicts in upcoming patches. Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
@@ -849,6 +849,37 @@ All: lockdep-checked RCU-protected pointer access
|
||||
See the comment headers in the source code (or the docbook generated
|
||||
from them) for more information.
|
||||
|
||||
However, given that there are no fewer than four families of RCU APIs
|
||||
in the Linux kernel, how do you choose which one to use? The following
|
||||
list can be helpful:
|
||||
|
||||
a. Will readers need to block? If so, you need SRCU.
|
||||
|
||||
b. What about the -rt patchset? If readers would need to block
|
||||
in a non-rt kernel, you need SRCU. If readers would block
|
||||
in a -rt kernel, but not in a non-rt kernel, SRCU is not
|
||||
necessary.
|
||||
|
||||
c. Do you need to treat NMI handlers, hardirq handlers,
|
||||
and code segments with preemption disabled (whether
|
||||
via preempt_disable(), local_irq_save(), local_bh_disable(),
|
||||
or some other mechanism) as if they were explicit RCU readers?
|
||||
If so, you need RCU-sched.
|
||||
|
||||
d. Do you need RCU grace periods to complete even in the face
|
||||
of softirq monopolization of one or more of the CPUs? For
|
||||
example, is your code subject to network-based denial-of-service
|
||||
attacks? If so, you need RCU-bh.
|
||||
|
||||
e. Is your workload too update-intensive for normal use of
|
||||
RCU, but inappropriate for other synchronization mechanisms?
|
||||
If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
|
||||
|
||||
f. Otherwise, use RCU.
|
||||
|
||||
Of course, this all assumes that you have determined that RCU is in fact
|
||||
the right tool for your job.
|
||||
|
||||
|
||||
8. ANSWERS TO QUICK QUIZZES
|
||||
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
CE4100 I2C
|
||||
----------
|
||||
|
||||
CE4100 has one PCI device which is described as the I2C-Controller. This
|
||||
PCI device has three PCI-bars, each bar contains a complete I2C
|
||||
controller. So we have a total of three independent I2C-Controllers
|
||||
which share only an interrupt line.
|
||||
The driver is probed via the PCI-ID and is gathering the information of
|
||||
attached devices from the device tree.
|
||||
Grant Likely recommended to use the ranges property to map the PCI-Bar
|
||||
number to its physical address and to use this to find the child nodes
|
||||
of the specific I2C controller. These were his exact words:
|
||||
|
||||
Here's where the magic happens. Each entry in
|
||||
ranges describes how the parent pci address space
|
||||
(middle group of 3) is translated to the local
|
||||
address space (first group of 2) and the size of
|
||||
each range (last cell). In this particular case,
|
||||
the first cell of the local address is chosen to be
|
||||
1:1 mapped to the BARs, and the second is the
|
||||
offset from the base of the BAR (which would be
|
||||
non-zero if you had 2 or more devices mapped off
|
||||
the same BAR)
|
||||
|
||||
ranges allows the address mapping to be described
|
||||
in a way that the OS can interpret without
|
||||
requiring custom device driver code.
|
||||
|
||||
This is an example which is used on FalconFalls:
|
||||
------------------------------------------------
|
||||
i2c-controller@b,2 {
|
||||
#address-cells = <2>;
|
||||
#size-cells = <1>;
|
||||
compatible = "pci8086,2e68.2",
|
||||
"pci8086,2e68",
|
||||
"pciclass,ff0000",
|
||||
"pciclass,ff00";
|
||||
|
||||
reg = <0x15a00 0x0 0x0 0x0 0x0>;
|
||||
interrupts = <16 1>;
|
||||
|
||||
/* as described by Grant, the first number in the group of
|
||||
* three is the bar number followed by the 64bit bar address
|
||||
* followed by size of the mapping. The bar address
|
||||
* requires also a valid translation in parents ranges
|
||||
* property.
|
||||
*/
|
||||
ranges = <0 0 0x02000000 0 0xdffe0500 0x100
|
||||
1 0 0x02000000 0 0xdffe0600 0x100
|
||||
2 0 0x02000000 0 0xdffe0700 0x100>;
|
||||
|
||||
i2c@0 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
compatible = "intel,ce4100-i2c-controller";
|
||||
|
||||
/* The first number in the reg property is the
|
||||
* number of the bar
|
||||
*/
|
||||
reg = <0 0 0x100>;
|
||||
|
||||
/* This I2C controller has no devices */
|
||||
};
|
||||
|
||||
i2c@1 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
compatible = "intel,ce4100-i2c-controller";
|
||||
reg = <1 0 0x100>;
|
||||
|
||||
/* This I2C controller has one gpio controller */
|
||||
gpio@26 {
|
||||
#gpio-cells = <2>;
|
||||
compatible = "ti,pcf8575";
|
||||
reg = <0x26>;
|
||||
gpio-controller;
|
||||
};
|
||||
};
|
||||
|
||||
i2c@2 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
compatible = "intel,ce4100-i2c-controller";
|
||||
reg = <2 0 0x100>;
|
||||
|
||||
gpio@26 {
|
||||
#gpio-cells = <2>;
|
||||
compatible = "ti,pcf8575";
|
||||
reg = <0x26>;
|
||||
gpio-controller;
|
||||
};
|
||||
};
|
||||
};
|
||||
@@ -0,0 +1,28 @@
|
||||
Motorola mc146818 compatible RTC
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Required properties:
|
||||
- compatible : "motorola,mc146818"
|
||||
- reg : should contain registers location and length.
|
||||
|
||||
Optional properties:
|
||||
- interrupts : should contain interrupt.
|
||||
- interrupt-parent : interrupt source phandle.
|
||||
- ctrl-reg : Contains the initial value of the control register also
|
||||
called "Register B".
|
||||
- freq-reg : Contains the initial value of the frequency register also
|
||||
called "Register A".
|
||||
|
||||
"Register A" and "B" are usually initialized by the firmware (BIOS for
|
||||
instance). If this is not done, it can be performed by the driver.
|
||||
|
||||
ISA Example:
|
||||
|
||||
rtc@70 {
|
||||
compatible = "motorola,mc146818";
|
||||
interrupts = <8 3>;
|
||||
interrupt-parent = <&ioapic1>;
|
||||
ctrl-reg = <2>;
|
||||
freq-reg = <0x26>;
|
||||
reg = <1 0x70 2>;
|
||||
};
|
||||
@@ -0,0 +1,38 @@
|
||||
CE4100 Device Tree Bindings
|
||||
---------------------------
|
||||
|
||||
The CE4100 SoC uses for in-core peripherals the following compatible
|
||||
format: <vendor>,<chip>-<device>.
|
||||
Many of the "generic" devices like HPET or IO APIC have the ce4100
|
||||
name in their compatible property because they first appeared in this
|
||||
SoC.
|
||||
|
||||
The CPU node
|
||||
------------
|
||||
cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "intel,ce4100";
|
||||
reg = <0>;
|
||||
lapic = <&lapic0>;
|
||||
};
|
||||
|
||||
The reg property describes the CPU number. The lapic property points to
|
||||
the local APIC timer.
|
||||
|
||||
The SoC node
|
||||
------------
|
||||
|
||||
This node describes the in-core peripherals. Required property:
|
||||
compatible = "intel,ce4100-cp";
|
||||
|
||||
The PCI node
|
||||
------------
|
||||
This node describes the PCI bus on the SoC. Its property should be
|
||||
compatible = "intel,ce4100-pci", "pci";
|
||||
|
||||
If the OS is using the IO-APIC for interrupt routing then the reported
|
||||
interrupt numbers for devices are no longer true. In order to obtain the
|
||||
correct interrupt number, the child node which represents the device has
|
||||
to contain the interrupt property. Besides the interrupt property it has
|
||||
to contain at least the reg property containing the PCI bus address and
|
||||
compatible property according to "PCI Bus Binding Revision 2.1".
|
||||
@@ -0,0 +1,26 @@
|
||||
Interrupt chips
|
||||
---------------
|
||||
|
||||
* Intel I/O Advanced Programmable Interrupt Controller (IO APIC)
|
||||
|
||||
Required properties:
|
||||
--------------------
|
||||
compatible = "intel,ce4100-ioapic";
|
||||
#interrupt-cells = <2>;
|
||||
|
||||
Device's interrupt property:
|
||||
|
||||
interrupts = <P S>;
|
||||
|
||||
The first number (P) represents the interrupt pin which is wired to the
|
||||
IO APIC. The second number (S) represents the sense of interrupt which
|
||||
should be configured and can be one of:
|
||||
0 - Edge Rising
|
||||
1 - Level Low
|
||||
2 - Level High
|
||||
3 - Edge Falling
|
||||
|
||||
* Local APIC
|
||||
Required property:
|
||||
|
||||
compatible = "intel,ce4100-lapic";
|
||||
@@ -0,0 +1,6 @@
|
||||
Timers
|
||||
------
|
||||
|
||||
* High Precision Event Timer (HPET)
|
||||
Required property:
|
||||
compatible = "intel,ce4100-hpet";
|
||||
@@ -13,6 +13,7 @@ Table of Contents
|
||||
|
||||
I - Introduction
|
||||
1) Entry point for arch/powerpc
|
||||
2) Entry point for arch/x86
|
||||
|
||||
II - The DT block format
|
||||
1) Header
|
||||
@@ -225,6 +226,25 @@ it with special cases.
|
||||
cannot support both configurations with Book E and configurations
|
||||
with classic Powerpc architectures.
|
||||
|
||||
2) Entry point for arch/x86
|
||||
-------------------------------
|
||||
|
||||
There is one single 32bit entry point to the kernel at code32_start,
|
||||
the decompressor (the real mode entry point goes to the same 32bit
|
||||
entry point once it switched into protected mode). That entry point
|
||||
supports one calling convention which is documented in
|
||||
Documentation/x86/boot.txt
|
||||
The physical pointer to the device-tree block (defined in chapter II)
|
||||
is passed via setup_data which requires at least boot protocol 2.09.
|
||||
The type field is defined as
|
||||
|
||||
#define SETUP_DTB 2
|
||||
|
||||
This device-tree is used as an extension to the "boot page". As such it
|
||||
does not parse / consider data which is already covered by the boot
|
||||
page. This includes memory size, reserved ranges, command line arguments
|
||||
or initrd address. It simply holds information which can not be retrieved
|
||||
otherwise like interrupt routing or a list of devices behind an I2C bus.
|
||||
|
||||
II - The DT block format
|
||||
========================
|
||||
|
||||
@@ -2444,6 +2444,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
||||
<deci-seconds>: poll at this frequency
|
||||
0: no polling (default)
|
||||
|
||||
threadirqs [KNL]
|
||||
Force threading of all interrupt handlers except those
|
||||
marked explicitly IRQF_NO_THREAD.
|
||||
|
||||
topology= [S390]
|
||||
Format: {off | on}
|
||||
Specify if the kernel should make use of the cpu
|
||||
|
||||
@@ -21,6 +21,7 @@ Contents:
|
||||
- SMP barrier pairing.
|
||||
- Examples of memory barrier sequences.
|
||||
- Read memory barriers vs load speculation.
|
||||
- Transitivity
|
||||
|
||||
(*) Explicit kernel barriers.
|
||||
|
||||
@@ -959,6 +960,63 @@ the speculation will be cancelled and the value reloaded:
|
||||
retrieved : : +-------+
|
||||
|
||||
|
||||
TRANSITIVITY
|
||||
------------
|
||||
|
||||
Transitivity is a deeply intuitive notion about ordering that is not
|
||||
always provided by real computer systems. The following example
|
||||
demonstrates transitivity (also called "cumulativity"):
|
||||
|
||||
CPU 1 CPU 2 CPU 3
|
||||
======================= ======================= =======================
|
||||
{ X = 0, Y = 0 }
|
||||
STORE X=1 LOAD X STORE Y=1
|
||||
<general barrier> <general barrier>
|
||||
LOAD Y LOAD X
|
||||
|
||||
Suppose that CPU 2's load from X returns 1 and its load from Y returns 0.
|
||||
This indicates that CPU 2's load from X in some sense follows CPU 1's
|
||||
store to X and that CPU 2's load from Y in some sense preceded CPU 3's
|
||||
store to Y. The question is then "Can CPU 3's load from X return 0?"
|
||||
|
||||
Because CPU 2's load from X in some sense came after CPU 1's store, it
|
||||
is natural to expect that CPU 3's load from X must therefore return 1.
|
||||
This expectation is an example of transitivity: if a load executing on
|
||||
CPU A follows a load from the same variable executing on CPU B, then
|
||||
CPU A's load must either return the same value that CPU B's load did,
|
||||
or must return some later value.
|
||||
|
||||
In the Linux kernel, use of general memory barriers guarantees
|
||||
transitivity. Therefore, in the above example, if CPU 2's load from X
|
||||
returns 1 and its load from Y returns 0, then CPU 3's load from X must
|
||||
also return 1.
|
||||
|
||||
However, transitivity is -not- guaranteed for read or write barriers.
|
||||
For example, suppose that CPU 2's general barrier in the above example
|
||||
is changed to a read barrier as shown below:
|
||||
|
||||
CPU 1 CPU 2 CPU 3
|
||||
======================= ======================= =======================
|
||||
{ X = 0, Y = 0 }
|
||||
STORE X=1 LOAD X STORE Y=1
|
||||
<read barrier> <general barrier>
|
||||
LOAD Y LOAD X
|
||||
|
||||
This substitution destroys transitivity: in this example, it is perfectly
|
||||
legal for CPU 2's load from X to return 1, its load from Y to return 0,
|
||||
and CPU 3's load from X to return 0.
|
||||
|
||||
The key point is that although CPU 2's read barrier orders its pair
|
||||
of loads, it does not guarantee to order CPU 1's store. Therefore, if
|
||||
this example runs on a system where CPUs 1 and 2 share a store buffer
|
||||
or a level of cache, CPU 2 might have early access to CPU 1's writes.
|
||||
General barriers are therefore required to ensure that all CPUs agree
|
||||
on the combined order of CPU 1's and CPU 2's accesses.
|
||||
|
||||
To reiterate, if your code requires transitivity, use general barriers
|
||||
throughout.
|
||||
|
||||
|
||||
========================
|
||||
EXPLICIT KERNEL BARRIERS
|
||||
========================
|
||||
|
||||
+10
-19
@@ -178,38 +178,29 @@ RTC class framework, but can't be supported by the older driver.
|
||||
setting the longer alarm time and enabling its IRQ using a single
|
||||
request (using the same model as EFI firmware).
|
||||
|
||||
* RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably
|
||||
also offers update IRQs whenever the "seconds" counter changes.
|
||||
If needed, the RTC framework can emulate this mechanism.
|
||||
* RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, the RTC framework
|
||||
will emulate this mechanism.
|
||||
|
||||
* RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another
|
||||
feature often accessible with an IRQ line is a periodic IRQ, issued
|
||||
at settable frequencies (usually 2^N Hz).
|
||||
* RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... these ioctls
|
||||
are emulated via a kernel hrtimer.
|
||||
|
||||
In many cases, the RTC alarm can be a system wake event, used to force
|
||||
Linux out of a low power sleep state (or hibernation) back to a fully
|
||||
operational state. For example, a system could enter a deep power saving
|
||||
state until it's time to execute some scheduled tasks.
|
||||
|
||||
Note that many of these ioctls need not actually be implemented by your
|
||||
driver. The common rtc-dev interface handles many of these nicely if your
|
||||
driver returns ENOIOCTLCMD. Some common examples:
|
||||
Note that many of these ioctls are handled by the common rtc-dev interface.
|
||||
Some common examples:
|
||||
|
||||
* RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be
|
||||
called with appropriate values.
|
||||
|
||||
* RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the
|
||||
set_alarm/read_alarm functions will be called.
|
||||
* RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: gets or sets
|
||||
the alarm rtc_timer. May call the set_alarm driver function.
|
||||
|
||||
* RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called
|
||||
to set the frequency while the framework will handle the read for you
|
||||
since the frequency is stored in the irq_freq member of the rtc_device
|
||||
structure. Your driver needs to initialize the irq_freq member during
|
||||
init. Make sure you check the requested frequency is in range of your
|
||||
hardware in the irq_set_freq function. If it isn't, return -EINVAL. If
|
||||
you cannot actually change the frequency, do not define irq_set_freq.
|
||||
* RTC_IRQP_SET, RTC_IRQP_READ: These are emulated by the generic code.
|
||||
|
||||
* RTC_PIE_ON, RTC_PIE_OFF: the irq_set_state function will be called.
|
||||
* RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code.
|
||||
|
||||
If all else fails, check out the rtc-test.c driver!
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock.
|
||||
|
||||
The routines look the same as above:
|
||||
|
||||
rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
|
||||
rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
|
||||
|
||||
unsigned long flags;
|
||||
|
||||
@@ -196,25 +196,3 @@ appropriate:
|
||||
|
||||
For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
|
||||
__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
|
||||
|
||||
SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. These interfere
|
||||
with lockdep state tracking.
|
||||
|
||||
Most of the time, you can simply turn:
|
||||
static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
|
||||
into:
|
||||
static DEFINE_SPINLOCK(xxx_lock);
|
||||
|
||||
Static structure member variables go from:
|
||||
|
||||
struct foo bar {
|
||||
.lock = SPIN_LOCK_UNLOCKED;
|
||||
};
|
||||
|
||||
to:
|
||||
|
||||
struct foo bar {
|
||||
.lock = __SPIN_LOCK_UNLOCKED(bar.lock);
|
||||
};
|
||||
|
||||
Declaration of static rw_locks undergo a similar transformation.
|
||||
|
||||
@@ -247,6 +247,13 @@ You need very few things to get the syscalls tracing in an arch.
|
||||
- Support the TIF_SYSCALL_TRACEPOINT thread flags.
|
||||
- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
|
||||
in the ptrace syscalls tracing path.
|
||||
- If the system call table on this arch is more complicated than a simple array
|
||||
of addresses of the system calls, implement an arch_syscall_addr to return
|
||||
the address of a given system call.
|
||||
- If the symbol names of the system calls do not match the function names on
|
||||
this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and
|
||||
implement arch_syscall_match_sym_name with the appropriate logic to return
|
||||
true if the function name corresponds with the symbol name.
|
||||
- Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
|
||||
|
||||
|
||||
|
||||
+23
-128
@@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files:
|
||||
tracers listed here can be configured by
|
||||
echoing their name into current_tracer.
|
||||
|
||||
tracing_enabled:
|
||||
tracing_on:
|
||||
|
||||
This sets or displays whether the current_tracer
|
||||
is activated and tracing or not. Echo 0 into this
|
||||
file to disable the tracer or 1 to enable it.
|
||||
This sets or displays whether writing to the trace
|
||||
ring buffer is enabled. Echo 0 into this file to disable
|
||||
the tracer or 1 to enable it.
|
||||
|
||||
trace:
|
||||
|
||||
@@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured.
|
||||
to draw a graph of function calls similar to C code
|
||||
source.
|
||||
|
||||
"sched_switch"
|
||||
|
||||
Traces the context switches and wakeups between tasks.
|
||||
|
||||
"irqsoff"
|
||||
|
||||
Traces the areas that disable interrupts and saves
|
||||
@@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the
|
||||
parent function that called this function "path_walk". The
|
||||
timestamp is the time at which the function was entered.
|
||||
|
||||
The sched_switch tracer also includes tracing of task wakeups
|
||||
and context switches.
|
||||
|
||||
ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
|
||||
ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
|
||||
ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R
|
||||
events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R
|
||||
kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
|
||||
ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
|
||||
|
||||
Wake ups are represented by a "+" and the context switches are
|
||||
shown as "==>". The format is:
|
||||
|
||||
Context switches:
|
||||
|
||||
Previous task Next Task
|
||||
|
||||
<pid>:<prio>:<state> ==> <pid>:<prio>:<state>
|
||||
|
||||
Wake ups:
|
||||
|
||||
Current task Task waking up
|
||||
|
||||
<pid>:<prio>:<state> + <pid>:<prio>:<state>
|
||||
|
||||
The prio is the internal kernel priority, which is the inverse
|
||||
of the priority that is usually displayed by user-space tools.
|
||||
Zero represents the highest priority (99). Prio 100 starts the
|
||||
"nice" priorities with 100 being equal to nice -20 and 139 being
|
||||
nice 19. The prio "140" is reserved for the idle task which is
|
||||
the lowest priority thread (pid 0).
|
||||
|
||||
|
||||
Latency trace format
|
||||
--------------------
|
||||
|
||||
@@ -491,78 +454,10 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
|
||||
latencies, as described in "Latency
|
||||
trace format".
|
||||
|
||||
sched_switch
|
||||
------------
|
||||
|
||||
This tracer simply records schedule switches. Here is an example
|
||||
of how to use it.
|
||||
|
||||
# echo sched_switch > current_tracer
|
||||
# echo 1 > tracing_enabled
|
||||
# sleep 1
|
||||
# echo 0 > tracing_enabled
|
||||
# cat trace
|
||||
|
||||
# tracer: sched_switch
|
||||
#
|
||||
# TASK-PID CPU# TIMESTAMP FUNCTION
|
||||
# | | | | |
|
||||
bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R
|
||||
bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R
|
||||
sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R
|
||||
bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S
|
||||
bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R
|
||||
sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R
|
||||
bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D
|
||||
bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R
|
||||
<idle>-0 [00] 240.132589: 0:140:R + 4:115:S
|
||||
<idle>-0 [00] 240.132591: 0:140:R ==> 4:115:R
|
||||
ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R
|
||||
<idle>-0 [00] 240.132598: 0:140:R + 4:115:S
|
||||
<idle>-0 [00] 240.132599: 0:140:R ==> 4:115:R
|
||||
ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R
|
||||
sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R
|
||||
[...]
|
||||
|
||||
|
||||
As we have discussed previously about this format, the header
|
||||
shows the name of the trace and points to the options. The
|
||||
"FUNCTION" is a misnomer since here it represents the wake ups
|
||||
and context switches.
|
||||
|
||||
The sched_switch file only lists the wake ups (represented with
|
||||
'+') and context switches ('==>') with the previous task or
|
||||
current task first followed by the next task or task waking up.
|
||||
The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
|
||||
Remember that the KERNEL-PRIO is the inverse of the actual
|
||||
priority with zero (0) being the highest priority and the nice
|
||||
values starting at 100 (nice -20). Below is a quick chart to map
|
||||
the kernel priority to user land priorities.
|
||||
|
||||
Kernel Space User Space
|
||||
===============================================================
|
||||
0(high) to 98(low) user RT priority 99(high) to 1(low)
|
||||
with SCHED_RR or SCHED_FIFO
|
||||
---------------------------------------------------------------
|
||||
99 sched_priority is not used in scheduling
|
||||
decisions(it must be specified as 0)
|
||||
---------------------------------------------------------------
|
||||
100(high) to 139(low) user nice -20(high) to 19(low)
|
||||
---------------------------------------------------------------
|
||||
140 idle task priority
|
||||
---------------------------------------------------------------
|
||||
|
||||
The task states are:
|
||||
|
||||
R - running : wants to run, may not actually be running
|
||||
S - sleep : process is waiting to be woken up (handles signals)
|
||||
D - disk sleep (uninterruptible sleep) : process must be woken up
|
||||
(ignores signals)
|
||||
T - stopped : process suspended
|
||||
t - traced : process is being traced (with something like gdb)
|
||||
Z - zombie : process waiting to be cleaned up
|
||||
X - unknown
|
||||
|
||||
overwrite - This controls what happens when the trace buffer is
|
||||
full. If "1" (default), the oldest events are
|
||||
discarded and overwritten. If "0", then the newest
|
||||
events are discarded.
|
||||
|
||||
ftrace_enabled
|
||||
--------------
|
||||
@@ -607,10 +502,10 @@ an example:
|
||||
# echo irqsoff > current_tracer
|
||||
# echo latency-format > trace_options
|
||||
# echo 0 > tracing_max_latency
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# ls -ltr
|
||||
[...]
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: irqsoff
|
||||
#
|
||||
@@ -715,10 +610,10 @@ is much like the irqsoff tracer.
|
||||
# echo preemptoff > current_tracer
|
||||
# echo latency-format > trace_options
|
||||
# echo 0 > tracing_max_latency
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# ls -ltr
|
||||
[...]
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: preemptoff
|
||||
#
|
||||
@@ -863,10 +758,10 @@ tracers.
|
||||
# echo preemptirqsoff > current_tracer
|
||||
# echo latency-format > trace_options
|
||||
# echo 0 > tracing_max_latency
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# ls -ltr
|
||||
[...]
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: preemptirqsoff
|
||||
#
|
||||
@@ -1026,9 +921,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under
|
||||
# echo wakeup > current_tracer
|
||||
# echo latency-format > trace_options
|
||||
# echo 0 > tracing_max_latency
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# chrt -f 5 sleep 1
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: wakeup
|
||||
#
|
||||
@@ -1140,9 +1035,9 @@ ftrace_enabled is set; otherwise this tracer is a nop.
|
||||
|
||||
# sysctl kernel.ftrace_enabled=1
|
||||
# echo function > current_tracer
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# usleep 1
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: function
|
||||
#
|
||||
@@ -1180,7 +1075,7 @@ int trace_fd;
|
||||
[...]
|
||||
int main(int argc, char *argv[]) {
|
||||
[...]
|
||||
trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
|
||||
trace_fd = open(tracing_file("tracing_on"), O_WRONLY);
|
||||
[...]
|
||||
if (condition_hit()) {
|
||||
write(trace_fd, "0", 1);
|
||||
@@ -1631,9 +1526,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
|
||||
# echo sys_nanosleep hrtimer_interrupt \
|
||||
> set_ftrace_filter
|
||||
# echo function > current_tracer
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# usleep 1
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: ftrace
|
||||
#
|
||||
@@ -1879,9 +1774,9 @@ different. The trace is live.
|
||||
# echo function > current_tracer
|
||||
# cat trace_pipe > /tmp/trace.out &
|
||||
[1] 4153
|
||||
# echo 1 > tracing_enabled
|
||||
# echo 1 > tracing_on
|
||||
# usleep 1
|
||||
# echo 0 > tracing_enabled
|
||||
# echo 0 > tracing_on
|
||||
# cat trace
|
||||
# tracer: function
|
||||
#
|
||||
|
||||
@@ -42,11 +42,25 @@ Synopsis of kprobe_events
|
||||
+|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
|
||||
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
|
||||
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
|
||||
(u8/u16/u32/u64/s8/s16/s32/s64) and string are supported.
|
||||
(u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
|
||||
are supported.
|
||||
|
||||
(*) only for return probe.
|
||||
(**) this is useful for fetching a field of data structures.
|
||||
|
||||
Types
|
||||
-----
|
||||
Several types are supported for fetch-args. Kprobe tracer will access memory
|
||||
by given type. Prefixes 's' and 'u' mean those types are signed and unsigned
|
||||
respectively. Traced arguments are shown in decimal (signed) or hex (unsigned).
|
||||
String type is a special type, which fetches a "null-terminated" string from
|
||||
kernel space. This means it will fail and store NULL if the string container
|
||||
has been paged out.
|
||||
Bitfield is another special type, which takes 3 parameters, bit-width, bit-
|
||||
offset, and container-size (usually 32). The syntax is:
|
||||
|
||||
b<bit-width>@<bit-offset>/<container-size>
|
||||
|
||||
|
||||
Per-Probe Event Filtering
|
||||
-------------------------
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
#define __O_SYNC 020000000
|
||||
#define O_SYNC (__O_SYNC|O_DSYNC)
|
||||
|
||||
#define O_PATH 040000000
|
||||
|
||||
#define F_GETLK 7
|
||||
#define F_SETLK 8
|
||||
#define F_SETLKW 9
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
: "r" (uaddr), "r"(oparg) \
|
||||
: "memory")
|
||||
|
||||
static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
|
||||
{
|
||||
int op = (encoded_op >> 28) & 7;
|
||||
int cmp = (encoded_op >> 24) & 15;
|
||||
@@ -39,7 +39,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
|
||||
oparg = 1 << oparg;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
|
||||
return -EFAULT;
|
||||
|
||||
pagefault_disable();
|
||||
@@ -81,21 +81,23 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
}
|
||||
|
||||
static inline int
|
||||
futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
|
||||
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
|
||||
u32 oldval, u32 newval)
|
||||
{
|
||||
int prev, cmp;
|
||||
int ret = 0, cmp;
|
||||
u32 prev;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
|
||||
return -EFAULT;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
__ASM_SMP_MB
|
||||
"1: ldl_l %0,0(%2)\n"
|
||||
" cmpeq %0,%3,%1\n"
|
||||
" beq %1,3f\n"
|
||||
" mov %4,%1\n"
|
||||
"2: stl_c %1,0(%2)\n"
|
||||
" beq %1,4f\n"
|
||||
"1: ldl_l %1,0(%3)\n"
|
||||
" cmpeq %1,%4,%2\n"
|
||||
" beq %2,3f\n"
|
||||
" mov %5,%2\n"
|
||||
"2: stl_c %2,0(%3)\n"
|
||||
" beq %2,4f\n"
|
||||
"3: .subsection 2\n"
|
||||
"4: br 1b\n"
|
||||
" .previous\n"
|
||||
@@ -105,11 +107,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
|
||||
" .long 2b-.\n"
|
||||
" lda $31,3b-2b(%0)\n"
|
||||
" .previous\n"
|
||||
: "=&r"(prev), "=&r"(cmp)
|
||||
: "+r"(ret), "=&r"(prev), "=&r"(cmp)
|
||||
: "r"(uaddr), "r"((long)oldval), "r"(newval)
|
||||
: "memory");
|
||||
|
||||
return prev;
|
||||
*uval = prev;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* __KERNEL__ */
|
||||
|
||||
@@ -13,44 +13,13 @@
|
||||
#ifdef __KERNEL__
|
||||
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
struct rwsem_waiter;
|
||||
|
||||
extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
|
||||
extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
|
||||
extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
|
||||
extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
|
||||
|
||||
/*
|
||||
* the semaphore definition
|
||||
*/
|
||||
struct rw_semaphore {
|
||||
long count;
|
||||
#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L
|
||||
#define RWSEM_ACTIVE_BIAS 0x0000000000000001L
|
||||
#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL
|
||||
#define RWSEM_WAITING_BIAS (-0x0000000100000000L)
|
||||
#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
|
||||
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
|
||||
spinlock_t wait_lock;
|
||||
struct list_head wait_list;
|
||||
};
|
||||
|
||||
#define __RWSEM_INITIALIZER(name) \
|
||||
{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
|
||||
LIST_HEAD_INIT((name).wait_list) }
|
||||
|
||||
#define DECLARE_RWSEM(name) \
|
||||
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
|
||||
|
||||
static inline void init_rwsem(struct rw_semaphore *sem)
|
||||
{
|
||||
sem->count = RWSEM_UNLOCKED_VALUE;
|
||||
spin_lock_init(&sem->wait_lock);
|
||||
INIT_LIST_HEAD(&sem->wait_list);
|
||||
}
|
||||
|
||||
static inline void __down_read(struct rw_semaphore *sem)
|
||||
{
|
||||
@@ -250,10 +219,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int rwsem_is_locked(struct rw_semaphore *sem)
|
||||
{
|
||||
return (sem->count != 0);
|
||||
}
|
||||
|
||||
#endif /* __KERNEL__ */
|
||||
#endif /* _ALPHA_RWSEM_H */
|
||||
|
||||
@@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st
|
||||
return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0;
|
||||
}
|
||||
|
||||
static int
|
||||
do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
|
||||
unsigned long bufsiz)
|
||||
SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
|
||||
struct osf_statfs __user *, buffer, unsigned long, bufsiz)
|
||||
{
|
||||
struct kstatfs linux_stat;
|
||||
int error = vfs_statfs(path, &linux_stat);
|
||||
int error = user_statfs(pathname, &linux_stat);
|
||||
if (!error)
|
||||
error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
|
||||
return error;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
|
||||
struct osf_statfs __user *, buffer, unsigned long, bufsiz)
|
||||
{
|
||||
struct path path;
|
||||
int retval;
|
||||
|
||||
retval = user_path(pathname, &path);
|
||||
if (!retval) {
|
||||
retval = do_osf_statfs(&path, buffer, bufsiz);
|
||||
path_put(&path);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd,
|
||||
struct osf_statfs __user *, buffer, unsigned long, bufsiz)
|
||||
{
|
||||
struct file *file;
|
||||
int retval;
|
||||
|
||||
retval = -EBADF;
|
||||
file = fget(fd);
|
||||
if (file) {
|
||||
retval = do_osf_statfs(&file->f_path, buffer, bufsiz);
|
||||
fput(file);
|
||||
}
|
||||
return retval;
|
||||
struct kstatfs linux_stat;
|
||||
int error = fd_statfs(fd, &linux_stat);
|
||||
if (!error)
|
||||
error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -159,7 +159,7 @@ void read_persistent_clock(struct timespec *ts)
|
||||
|
||||
/*
|
||||
* timer_interrupt() needs to keep up the real-time clock,
|
||||
* as well as call the "do_timer()" routine every clocktick
|
||||
* as well as call the "xtime_update()" routine every clocktick
|
||||
*/
|
||||
irqreturn_t timer_interrupt(int irq, void *dev)
|
||||
{
|
||||
@@ -172,8 +172,6 @@ irqreturn_t timer_interrupt(int irq, void *dev)
|
||||
profile_tick(CPU_PROFILING);
|
||||
#endif
|
||||
|
||||
write_seqlock(&xtime_lock);
|
||||
|
||||
/*
|
||||
* Calculate how many ticks have passed since the last update,
|
||||
* including any previous partial leftover. Save any resulting
|
||||
@@ -187,9 +185,7 @@ irqreturn_t timer_interrupt(int irq, void *dev)
|
||||
nticks = delta >> FIX_SHIFT;
|
||||
|
||||
if (nticks)
|
||||
do_timer(nticks);
|
||||
|
||||
write_sequnlock(&xtime_lock);
|
||||
xtime_update(nticks);
|
||||
|
||||
if (test_irq_work_pending()) {
|
||||
clear_irq_work_pending();
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
: "cc", "memory")
|
||||
|
||||
static inline int
|
||||
futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
|
||||
{
|
||||
int op = (encoded_op >> 28) & 7;
|
||||
int cmp = (encoded_op >> 24) & 15;
|
||||
@@ -46,7 +46,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
|
||||
oparg = 1 << oparg;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
|
||||
return -EFAULT;
|
||||
|
||||
pagefault_disable(); /* implies preempt_disable() */
|
||||
@@ -88,36 +88,35 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
|
||||
}
|
||||
|
||||
static inline int
|
||||
futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
|
||||
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
|
||||
u32 oldval, u32 newval)
|
||||
{
|
||||
int val;
|
||||
int ret = 0;
|
||||
u32 val;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
|
||||
return -EFAULT;
|
||||
|
||||
pagefault_disable(); /* implies preempt_disable() */
|
||||
|
||||
__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
|
||||
"1: " T(ldr) " %0, [%3]\n"
|
||||
" teq %0, %1\n"
|
||||
"1: " T(ldr) " %1, [%4]\n"
|
||||
" teq %1, %2\n"
|
||||
" it eq @ explicit IT needed for the 2b label\n"
|
||||
"2: " T(streq) " %2, [%3]\n"
|
||||
"2: " T(streq) " %3, [%4]\n"
|
||||
"3:\n"
|
||||
" .pushsection __ex_table,\"a\"\n"
|
||||
" .align 3\n"
|
||||
" .long 1b, 4f, 2b, 4f\n"
|
||||
" .popsection\n"
|
||||
" .pushsection .fixup,\"ax\"\n"
|
||||
"4: mov %0, %4\n"
|
||||
"4: mov %0, %5\n"
|
||||
" b 3b\n"
|
||||
" .popsection"
|
||||
: "=&r" (val)
|
||||
: "+r" (ret), "=&r" (val)
|
||||
: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
|
||||
: "cc", "memory");
|
||||
|
||||
pagefault_enable(); /* subsumes preempt_enable() */
|
||||
|
||||
return val;
|
||||
*uval = val;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* !SMP */
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user