From 3600b89ef36136c3c0874ce7c07f904df2e3d7bb Mon Sep 17 00:00:00 2001
From: Fedora Kernel Team <kernel-team@fedoraproject.org>
Date: Tue, 14 Feb 2017 23:39:31 -0500
Subject: [PATCH] pf-kernel 4.9.9

---
 Documentation/block/00-INDEX | 2 +
 Documentation/block/bfq-iosched.txt | 530 ++
 Documentation/block/queue-sysfs.txt | 13 +
 Documentation/scheduler/sched-BFS.txt | 351 +
 Documentation/scheduler/sched-MuQSS.txt | 345 +
 Documentation/sysctl/kernel.txt | 37 +
 Makefile | 4 +-
 arch/powerpc/platforms/cell/spufs/sched.c | 5 -
 arch/x86/Kconfig | 30 +-
 arch/x86/Kconfig.cpu | 224 +-
 arch/x86/Makefile | 33 +-
 arch/x86/Makefile_32.cpu | 23 +-
 arch/x86/include/asm/module.h | 38 +
 block/Kconfig | 24 +
 block/Kconfig.iosched | 30 +
 block/Makefile | 4 +-
 block/bfq-cgroup.c | 1213 ++++
 block/bfq-ioc.c | 36 +
 block/bfq-iosched.c | 5318 +++++++++++++++
 block/bfq-sched.c | 1933 ++++++
 block/bfq.h | 933 +++
 block/blk-core.c | 22 +-
 block/blk-mq-sysfs.c | 47 +
 block/blk-mq.c | 41 +-
 block/blk-mq.h | 3 +
 block/blk-settings.c | 16 +
 block/blk-stat.c | 234 +
 block/blk-stat.h | 37 +
 block/blk-sysfs.c | 160 +
 block/blk-wbt.c | 704 ++
 block/blk-wbt.h | 166 +
 block/cfq-iosched.c | 14 +
 drivers/block/swim.c | 6 +-
 drivers/bluetooth/hci_qca.c | 2 +-
 drivers/char/ipmi/ipmi_msghandler.c | 2 +-
 drivers/char/ipmi/ipmi_ssif.c | 2 +-
 drivers/char/snsc.c | 4 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +-
 drivers/hwmon/fam15h_power.c | 2 +-
 drivers/iio/light/tsl2563.c | 6 +-
 drivers/media/i2c/msp3400-driver.c | 4 +-
 drivers/media/pci/cx18/cx18-gpio.c | 4 +-
 drivers/media/pci/ivtv/ivtv-gpio.c | 6 +-
 drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +-
 drivers/media/pci/ivtv/ivtv-streams.c | 2 +-
 drivers/media/radio/radio-mr800.c | 2 +-
 drivers/media/radio/radio-tea5777.c | 2 +-
 drivers/media/radio/tea575x.c | 2 +-
 drivers/mfd/ucb1x00-core.c | 2 +-
 drivers/misc/panel.c | 2 +-
 drivers/misc/sgi-xp/xpc_channel.c | 2 +-
 drivers/net/caif/caif_hsi.c | 2 +-
 drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +-
 drivers/net/usb/lan78xx.c | 2 +-
 drivers/net/usb/usbnet.c | 2 +-
 drivers/net/wireless/ath/ath9k/ath9k.h | 27 +-
 drivers/net/wireless/ath/ath9k/channel.c | 2 -
 drivers/net/wireless/ath/ath9k/debug.c | 14 +-
 drivers/net/wireless/ath/ath9k/debug.h | 2 -
 drivers/net/wireless/ath/ath9k/debug_sta.c | 4 +-
 drivers/net/wireless/ath/ath9k/init.c | 2 +-
 drivers/net/wireless/ath/ath9k/main.c | 9 +-
 drivers/net/wireless/ath/ath9k/xmit.c | 338 +-
 drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 +-
 drivers/ntb/test/ntb_perf.c | 2 +-
 drivers/parport/ieee1284.c | 2 +-
 drivers/parport/ieee1284_ops.c | 2 +-
 drivers/platform/x86/intel_ips.c | 8 +-
 drivers/rtc/rtc-wm8350.c | 6 +-
 drivers/scsi/fnic/fnic_scsi.c | 4 +-
 drivers/scsi/lpfc/lpfc_scsi.c | 2 +-
 drivers/scsi/scsi.c | 3 +
 drivers/scsi/snic/snic_scsi.c | 2 +-
 drivers/staging/comedi/drivers/ni_mio_common.c | 2 +-
 drivers/staging/lustre/lnet/lnet/lib-eq.c | 2 +-
 drivers/staging/rts5208/rtsx.c | 2 +-
 drivers/staging/speakup/speakup_acntpc.c | 4 +-
 drivers/staging/speakup/speakup_apollo.c | 2 +-
 drivers/staging/speakup/speakup_decext.c | 2 +-
 drivers/staging/speakup/speakup_decpc.c | 2 +-
 drivers/staging/speakup/speakup_dectlk.c | 2 +-
 drivers/staging/speakup/speakup_dtlk.c | 4 +-
 drivers/staging/speakup/speakup_keypc.c | 4 +-
 drivers/staging/speakup/synth.c | 2 +-
 drivers/staging/unisys/visorbus/periodic_work.c | 204 +
 drivers/staging/unisys/visornic/visornic_main.c | 6 +-
 drivers/target/target_core_user.c | 2 +-
 drivers/video/fbdev/omap/hwa742.c | 2 +-
 drivers/video/fbdev/pxafb.c | 2 +-
 fs/afs/vlocation.c | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/inode-map.c | 2 +-
 fs/buffer.c | 2 +-
 fs/f2fs/data.c | 2 +-
 fs/f2fs/node.c | 2 +-
 fs/gfs2/meta_io.c | 3 +-
 fs/mpage.c | 2 +-
 fs/proc/base.c | 2 +-
 fs/xfs/xfs_aops.c | 7 +-
 include/linux/backing-dev-defs.h | 2 +
 include/linux/blk_types.h | 20 +-
 include/linux/blkdev.h | 20 +-
 include/linux/freezer.h | 1 +
 include/linux/fs.h | 3 +
 include/linux/init_task.h | 76 +-
 include/linux/ioprio.h | 2 +
 include/linux/sched.h | 97 +-
 include/linux/sched/prio.h | 12 +
 include/linux/skip_list.h | 33 +
 include/linux/writeback.h | 10 +
 include/trace/events/wbt.h | 153 +
 include/uapi/linux/sched.h | 9 +-
 init/Kconfig | 25 +-
 init/main.c | 3 +-
 kernel/Kconfig.hz | 24 +-
 kernel/Kconfig.preempt | 7 +-
 kernel/Makefile | 2 +-
 kernel/delayacct.c | 2 +-
 kernel/exit.c | 2 +-
 kernel/irq/Kconfig | 14 +
 kernel/irq/manage.c | 10 +
 kernel/kthread.c | 30 +-
 kernel/sched/Makefile | 13 +-
 kernel/sched/MuQSS.c | 8033 +++++++++++++++++++++++
 kernel/sched/MuQSS.h | 348 +
 kernel/sched/cpufreq.c | 4 +
 kernel/sched/cpufreq_schedutil.c | 16 +
 kernel/sched/cputime.c | 27 +-
 kernel/sched/idle.c | 14 +-
 kernel/sched/sched.h | 25 +
 kernel/sched/stats.c | 4 +
 kernel/skip_list.c | 148 +
 kernel/sysctl.c | 74 +-
 kernel/time/clockevents.c | 5 +-
 kernel/time/hrtimer.c | 114 +
 kernel/time/posix-cpu-timers.c | 10 +-
 kernel/time/timer.c | 49 +-
 kernel/trace/trace_selftest.c | 5 +
 mm/backing-dev.c | 1 +
 mm/page-writeback.c | 1 +
 net/core/pktgen.c | 2 +-
 sound/pci/maestro3.c | 4 +-
 sound/soc/codecs/rt5631.c | 4 +-
 sound/soc/codecs/wm8350.c | 12 +-
 sound/soc/codecs/wm8900.c | 2 +-
 sound/soc/codecs/wm9713.c | 4 +-
 sound/soc/soc-dapm.c | 2 +-
 sound/usb/line6/pcm.c | 2 +-
 149 files changed, 22356 insertions(+), 463 deletions(-)
 create mode 100644 Documentation/block/bfq-iosched.txt
 create mode 100644 Documentation/scheduler/sched-BFS.txt
 create mode 100644 Documentation/scheduler/sched-MuQSS.txt
 create mode 100644 block/bfq-cgroup.c
 create mode 100644 block/bfq-ioc.c
 create mode 100644 block/bfq-iosched.c
 create mode 100644 block/bfq-sched.c
 create mode 100644 block/bfq.h
 create mode 100644 block/blk-stat.c
 create mode 100644 block/blk-stat.h
 create mode 100644 block/blk-wbt.c
 create mode 100644 block/blk-wbt.h
 create mode 100644 drivers/staging/unisys/visorbus/periodic_work.c
 create mode 100644 include/linux/skip_list.h
 create mode 100644 include/trace/events/wbt.h
 create mode 100644 kernel/sched/MuQSS.c
 create mode 100644 kernel/sched/MuQSS.h
 create mode 100644 kernel/skip_list.c

diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103a..8d55b4b 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
 00-INDEX
 	- This file
+bfq-iosched.txt
+	- BFQ IO scheduler and its tunables
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 0000000..13b5248
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,530 @@
+BFQ (Budget Fair Queueing)
|
|
+==========================
|
|
+
|
|
+BFQ is a proportional-share I/O scheduler, with some extra
|
|
+low-latency capabilities. In addition to cgroups support (blkio or io
|
|
+controllers), BFQ's main features are:
|
|
+- BFQ guarantees a high system and application responsiveness, and a
|
|
+ low latency for time-sensitive applications, such as audio or video
|
|
+ players;
|
|
+- BFQ distributes bandwidth, and not just time, among processes or
|
|
+ groups (switching back to time distribution when needed to keep
|
|
+ throughput high).
|
|
+
|
|
+On average CPUs, the current version of BFQ can handle devices
|
|
+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
|
|
+reference, 30-50 KIOPS correspond to very high bandwidths with
|
|
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
|
|
+to 120-200 MB/s with 4KB random I/O.
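As a rough sanity check of these figures, using the request sizes mentioned above:

  30,000 IOPS x 256 KB  ~  7.7 GB/s        50,000 IOPS x 256 KB  ~  12.8 GB/s
  30,000 IOPS x   4 KB  ~  120 MB/s        50,000 IOPS x   4 KB  ~  200 MB/s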
|
|
+
|
|
+The table of contents follow. Impatients can just jump to Section 3.
|
|
+
|
|
+CONTENTS
|
|
+
|
|
+1. When may BFQ be useful?
|
|
+ 1-1 Personal systems
|
|
+ 1-2 Server systems
|
|
+2. How does BFQ work?
|
|
+3. What are BFQ's tunable?
|
|
+4. BFQ group scheduling
|
|
+ 4-1 Service guarantees provided
|
|
+ 4-2 Interface
|
|
+
|
|
+1. When may BFQ be useful?
|
|
+==========================
|
|
+
|
|
+BFQ provides the following benefits on personal and server systems.
|
|
+
|
|
+1-1 Personal systems
|
|
+--------------------
|
|
+
|
|
+Low latency for interactive applications
|
|
+
|
|
+Regardless of the actual background workload, BFQ guarantees that, for
|
|
+interactive tasks, the storage device is virtually as responsive as if
|
|
+it was idle. For example, even if one or more of the following
|
|
+background workloads are being executed:
|
|
+- one or more large files are being read, written or copied,
|
|
+- a tree of source files is being compiled,
|
|
+- one or more virtual machines are performing I/O,
|
|
+- a software update is in progress,
|
|
+- indexing daemons are scanning filesystems and updating their
|
|
+ databases,
|
|
+starting an application or loading a file from within an application
|
|
+takes about the same time as if the storage device was idle. As a
|
|
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
|
|
+applications experience high latencies, or even become unresponsive
|
|
+until the background workload terminates (also on SSDs).
|
|
+
|
|
+Low latency for soft real-time applications
|
|
+
|
|
+Also soft real-time applications, such as audio and video
|
|
+players/streamers, enjoy a low latency and a low drop rate, regardless
|
|
+of the background I/O workload. As a consequence, these applications
|
|
+do not suffer from almost any glitch due to the background workload.
|
|
+
|
|
+Higher speed for code-development tasks
|
|
+
|
|
+If some additional workload happens to be executed in parallel, then
|
|
+BFQ executes the I/O-related components of typical code-development
|
|
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
|
|
+NOOP or DEADLINE.
|
|
+
|
|
+High throughput
|
|
+
|
|
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
|
|
+up to 150% higher throughput than DEADLINE and NOOP, with all the
|
|
+sequential workloads considered in our tests. With random workloads,
|
|
+and with all the workloads on flash-based devices, BFQ achieves,
|
|
+instead, about the same throughput as the other schedulers.
|
|
+
|
|
+Strong fairness, bandwidth and delay guarantees
|
|
+
|
|
+BFQ distributes the device throughput, and not just the device time,
|
|
+among I/O-bound applications in proportion their weights, with any
|
|
+workload and regardless of the device parameters. From these bandwidth
|
|
+guarantees, it is possible to compute tight per-I/O-request delay
|
|
+guarantees by a simple formula. If not configured for strict service
|
|
+guarantees, BFQ switches to time-based resource sharing (only) for
|
|
+applications that would otherwise cause a throughput loss.
|
|
+
|
|
+1-2 Server systems
|
|
+------------------
|
|
+
|
|
+Most benefits for server systems follow from the same service
|
|
+properties as above. In particular, regardless of whether additional,
|
|
+possibly heavy workloads are being served, BFQ guarantees:
|
|
+
|
|
+. audio and video-streaming with zero or very low jitter and drop
|
|
+ rate;
|
|
+
|
|
+. fast retrieval of WEB pages and embedded objects;
|
|
+
|
|
+. real-time recording of data in live-dumping applications (e.g.,
|
|
+ packet logging);
|
|
+
|
|
+. responsiveness in local and remote access to a server.
|
|
+
|
|
+
|
|
+2. How does BFQ work?
|
|
+=====================
|
|
+
|
|
+BFQ is a proportional-share I/O scheduler, whose general structure,
|
|
+plus a lot of code, are borrowed from CFQ.
|
|
+
|
|
+- Each process doing I/O on a device is associated with a weight and a
|
|
+ (bfq_)queue.
|
|
+
|
|
+- BFQ grants exclusive access to the device, for a while, to one queue
|
|
+ (process) at a time, and implements this service model by
|
|
+ associating every queue with a budget, measured in number of
|
|
+ sectors.
|
|
+
|
|
+ - After a queue is granted access to the device, the budget of the
|
|
+ queue is decremented, on each request dispatch, by the size of the
|
|
+ request.
|
|
+
|
|
+ - The in-service queue is expired, i.e., its service is suspended,
|
|
+ only if one of the following events occurs: 1) the queue finishes
|
|
+ its budget, 2) the queue empties, 3) a "budget timeout" fires.
|
|
+
|
|
+ - The budget timeout prevents processes doing random I/O from
|
|
+ holding the device for too long and dramatically reducing
|
|
+ throughput.
|
|
+
|
|
+ - Actually, as in CFQ, a queue associated with a process issuing
|
|
+ sync requests may not be expired immediately when it empties. In
|
|
+ contrast, BFQ may idle the device for a short time interval,
|
|
+ giving the process the chance to go on being served if it issues
|
|
+ a new request in time. Device idling typically boosts the
|
|
+ throughput on rotational devices, if processes do synchronous
|
|
+ and sequential I/O. In addition, under BFQ, device idling is
|
|
+ also instrumental in guaranteeing the desired throughput
|
|
+ fraction to processes issuing sync requests (see the description
|
|
+ of the slice_idle tunable in this document, or [1, 2], for more
|
|
+ details).
|
|
+
|
|
+ - With respect to idling for service guarantees, if several
|
|
+ processes are competing for the device at the same time, but
|
|
+ all processes (and groups, after the following commit) have
|
|
+ the same weight, then BFQ guarantees the expected throughput
|
|
+ distribution without ever idling the device. Throughput is
|
|
+ thus as high as possible in this common scenario.
|
|
+
|
|
+ - If low-latency mode is enabled (default configuration), BFQ
|
|
+ executes some special heuristics to detect interactive and soft
|
|
+ real-time applications (e.g., video or audio players/streamers),
|
|
+ and to reduce their latency. The most important action taken to
|
|
+ achieve this goal is to give to the queues associated with these
|
|
+ applications more than their fair share of the device
|
|
+ throughput. For brevity, we call just "weight-raising" the whole
|
|
+ sets of actions taken by BFQ to privilege these queues. In
|
|
+ particular, BFQ provides a milder form of weight-raising for
|
|
+ interactive applications, and a stronger form for soft real-time
|
|
+ applications.
|
|
+
|
|
+ - BFQ automatically deactivates idling for queues born in a burst of
|
|
+ queue creations. In fact, these queues are usually associated with
|
|
+ the processes of applications and services that benefit mostly
|
|
+ from a high throughput. Examples are systemd during boot, or git
|
|
+ grep.
|
|
+
|
|
+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
|
|
+ performing random I/O that becomes mostly sequential if
|
|
+ merged. Differently from CFQ, BFQ achieves this goal with a more
|
|
+ reactive mechanism, called Early Queue Merge (EQM). EQM is so
|
|
+ responsive in detecting interleaved I/O (cooperating processes),
|
|
+ that it enables BFQ to achieve a high throughput, by queue
|
|
+ merging, even for queues for which CFQ needs a different
|
|
+ mechanism, preemption, to get a high throughput. As such EQM is a
|
|
+ unified mechanism to achieve a high throughput with interleaved
|
|
+ I/O.
|
|
+
|
|
+ - Queues are scheduled according to a variant of WF2Q+, named
|
|
+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an
|
|
+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
|
|
+ also ready for hierarchical scheduling. However, for a cleaner
|
|
+ logical breakdown, the code that enables and completes
|
|
+ hierarchical support is provided in the next commit, which focuses
|
|
+ exactly on this feature.
|
|
+
|
|
+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
|
|
+ perfectly fair, and smooth service. In particular, B-WF2Q+
|
|
+ guarantees that each queue receives a fraction of the device
|
|
+ throughput proportional to its weight, even if the throughput
|
|
+ fluctuates, and regardless of: the device parameters, the current
|
|
+ workload and the budgets assigned to the queue.
|
|
+
|
|
+ - The last, budget-independence, property (although probably
|
|
+ counterintuitive in the first place) is definitely beneficial, for
|
|
+ the following reasons:
|
|
+
|
|
+ - First, with any proportional-share scheduler, the maximum
|
|
+ deviation with respect to an ideal service is proportional to
|
|
+ the maximum budget (slice) assigned to queues. As a consequence,
|
|
+ BFQ can keep this deviation tight not only because of the
|
|
+ accurate service of B-WF2Q+, but also because BFQ *does not*
|
|
+ need to assign a larger budget to a queue to let the queue
|
|
+ receive a higher fraction of the device throughput.
|
|
+
|
|
+ - Second, BFQ is free to choose, for every process (queue), the
|
|
+ budget that best fits the needs of the process, or best
|
|
+ leverages the I/O pattern of the process. In particular, BFQ
|
|
+ updates queue budgets with a simple feedback-loop algorithm that
|
|
+ allows a high throughput to be achieved, while still providing
|
|
+ tight latency guarantees to time-sensitive applications. When
|
|
+ the in-service queue expires, this algorithm computes the next
|
|
+ budget of the queue so as to:
|
|
+
|
|
+ - Let large budgets be eventually assigned to the queues
|
|
+ associated with I/O-bound applications performing sequential
|
|
+ I/O: in fact, the longer these applications are served once
|
|
+ got access to the device, the higher the throughput is.
|
|
+
|
|
+ - Let small budgets be eventually assigned to the queues
|
|
+ associated with time-sensitive applications (which typically
|
|
+ perform sporadic and short I/O), because, the smaller the
|
|
+ budget assigned to a queue waiting for service is, the sooner
|
|
+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
|
|
+
|
|
+- If several processes are competing for the device at the same time,
|
|
+ but all processes and groups have the same weight, then BFQ
|
|
+ guarantees the expected throughput distribution without ever idling
|
|
+ the device. It uses preemption instead. Throughput is then much
|
|
+ higher in this common scenario.
|
|
+
|
|
+- ioprio classes are served in strict priority order, i.e.,
|
|
+ lower-priority queues are not served as long as there are
|
|
+ higher-priority queues. Among queues in the same class, the
|
|
+ bandwidth is distributed in proportion to the weight of each
|
|
+ queue. A very thin extra bandwidth is however guaranteed to
|
|
+ the Idle class, to prevent it from starving.
|
|
+
|
|
+
|
|
+3. What are BFQ's tunable?
|
|
+==========================
|
|
+
|
|
+The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
|
|
+fifo_expire_sync below are the same as in CFQ. Their description is
|
|
+just copied from that for CFQ. Some considerations in the description
|
|
+of slice_idle are copied from CFQ too.
|
|
+
|
|
+per-process ioprio and weight
|
|
+-----------------------------
|
|
+
|
|
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
|
|
+weights can be assigned to processes only indirectly, through I/O
|
|
+priorities, and according to the relation:
|
|
+weight = (IOPRIO_BE_NR - ioprio) * 10.
|
|
+
|
|
+Beware that, if low-latency is set, then BFQ automatically raises the
|
|
+weight of the queues associated with interactive and soft real-time
|
|
+applications. Unset this tunable if you need/want to control weights.
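As a rough illustration of this mapping (IOPRIO_BE_NR is 8 in the kernel headers,
and the ionice utility is just one way to set the best-effort I/O priority; the
sketch below targets the current shell):

  ionice -c 2 -n 4 -p $$      # best-effort class, priority level 4
  # resulting BFQ weight = (8 - 4) * 10 = 40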
|
|
+
|
|
+slice_idle
|
|
+----------
|
|
+
|
|
+This parameter specifies how long BFQ should idle for next I/O
|
|
+request, when certain sync BFQ queues become empty. By default
|
|
+slice_idle is a non-zero value. Idling has a double purpose: boosting
|
|
+throughput and making sure that the desired throughput distribution is
|
|
+respected (see the description of how BFQ works, and, if needed, the
|
|
+papers referred there).
|
|
+
|
|
+As for throughput, idling can be very helpful on highly seeky media
|
|
+like single spindle SATA/SAS disks where we can cut down on overall
|
|
+number of seeks and see improved throughput.
|
|
+
|
|
+Setting slice_idle to 0 will remove all the idling on queues and one
|
|
+should see an overall improved throughput on faster storage devices
|
|
+like multiple SATA/SAS disks in hardware RAID configuration.
|
|
+
|
|
+So depending on storage and workload, it might be useful to set
|
|
+slice_idle=0. In general for SATA/SAS disks and software RAID of
|
|
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
|
|
+configurations where there are multiple spindles behind single LUN
|
|
+(Host based hardware RAID controller or for storage arrays), setting
|
|
+slice_idle=0 might end up in better throughput and acceptable
|
|
+latencies.
|
|
+
|
|
+Idling is however necessary to have service guarantees enforced in
|
|
+case of differentiated weights or differentiated I/O-request lengths.
|
|
+To see why, suppose that a given BFQ queue A must get several I/O
|
|
+requests served for each request served for another queue B. Idling
|
|
+ensures that, if A makes a new I/O request slightly after becoming
|
|
+empty, then no request of B is dispatched in the middle, and thus A
|
|
+does not lose the possibility to get more than one request dispatched
|
|
+before the next request of B is dispatched. Note that idling
|
|
+guarantees the desired differentiated treatment of queues only in
|
|
+terms of I/O-request dispatches. To guarantee that the actual service
|
|
+order then corresponds to the dispatch order, the strict_guarantees
|
|
+tunable must be set too.
|
|
+
|
|
+There is an important flipside for idling: apart from the above cases
|
|
+where it is beneficial also for throughput, idling can severely impact
|
|
+throughput. One important case is random workload. Because of this
|
|
+issue, BFQ tends to avoid idling as much as possible, when it is not
|
|
+beneficial also for throughput. As a consequence of this behavior, and
|
|
+of further issues described for the strict_guarantees tunable,
|
|
+short-term service guarantees may be occasionally violated. And, in
|
|
+some cases, these guarantees may be more important than guaranteeing
|
|
+maximum throughput. For example, in video playing/streaming, a very
|
|
+low drop rate may be more important than maximum throughput. In these
|
|
+cases, consider setting the strict_guarantees parameter.
|
|
+
|
|
+strict_guarantees
|
|
+-----------------
|
|
+
|
|
+If this parameter is set (default: unset), then BFQ
|
|
+
|
|
+- always performs idling when the in-service queue becomes empty;
|
|
+
|
|
+- forces the device to serve one I/O request at a time, by dispatching a
|
|
+ new request only if there is no outstanding request.
|
|
+
|
|
+In the presence of differentiated weights or I/O-request sizes, both
|
|
+the above conditions are needed to guarantee that every BFQ queue
|
|
+receives its allotted share of the bandwidth. The first condition is
|
|
+needed for the reasons explained in the description of the slice_idle
|
|
+tunable. The second condition is needed because all modern storage
|
|
+devices reorder internally-queued requests, which may trivially break
|
|
+the service guarantees enforced by the I/O scheduler.
|
|
+
|
|
+Setting strict_guarantees may evidently affect throughput.
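A minimal sketch of changing these two tunables at run time, assuming BFQ is the
active scheduler for a hypothetical device sdX and that its tunables sit in the
usual iosched directory:

  echo 0 > /sys/block/sdX/queue/iosched/slice_idle         # disable idling
  echo 1 > /sys/block/sdX/queue/iosched/strict_guarantees  # strict service order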
|
|
+
|
|
+back_seek_max
|
|
+-------------
|
|
+
|
|
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
|
|
+The distance is the amount of space from the current head location to the
|
|
+sectors that are backward in terms of distance.
|
|
+
|
|
+This parameter allows the scheduler to anticipate requests in the "backward"
|
|
+direction and consider them as being the "next" if they are within this
|
|
+distance from the current head location.
|
|
+
|
|
+back_seek_penalty
|
|
+-----------------
|
|
+
|
|
+This parameter is used to compute the cost of backward seeking. If the
|
|
+backward distance of request is just 1/back_seek_penalty from a "front"
|
|
+request, then the seeking cost of two requests is considered equivalent.
|
|
+
|
|
+So scheduler will not bias toward one or the other request (otherwise scheduler
|
|
+will bias toward front request). Default value of back_seek_penalty is 2.
|
|
+
|
|
+fifo_expire_async
|
|
+-----------------
|
|
+
|
|
+This parameter is used to set the timeout of asynchronous requests. Default
|
|
+value of this is 248ms.
|
|
+
|
|
+fifo_expire_sync
|
|
+----------------
|
|
+
|
|
+This parameter is used to set the timeout of synchronous requests. Default
|
|
+value of this is 124ms. In case to favor synchronous requests over asynchronous
|
|
+one, this value should be decreased relative to fifo_expire_async.
|
|
+
|
|
+low_latency
|
|
+-----------
|
|
+
|
|
+This parameter is used to enable/disable BFQ's low latency mode. By
|
|
+default, low latency mode is enabled. If enabled, interactive and soft
|
|
+real-time applications are privileged and experience a lower latency,
|
|
+as explained in more detail in the description of how BFQ works.
|
|
+
|
|
+DO NOT enable this mode if you need full control on bandwidth
|
|
+distribution. In fact, if it is enabled, then BFQ automatically
|
|
+increases the bandwidth share of privileged applications, as the main
|
|
+means to guarantee a lower latency to them.
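For example, to keep full control over bandwidth distribution, low-latency mode
could be switched off (same assumptions about the device name and sysfs path as
in the slice_idle sketch above):

  echo 0 > /sys/block/sdX/queue/iosched/low_latency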
|
|
+
|
|
+timeout_sync
|
|
+------------
|
|
+
|
|
+Maximum amount of device time that can be given to a task (queue) once
|
|
+it has been selected for service. On devices with costly seeks,
|
|
+increasing this time usually increases maximum throughput. On the
|
|
+opposite end, increasing this time coarsens the granularity of the
|
|
+short-term bandwidth and latency guarantees, especially if the
|
|
+following parameter is set to zero.
|
|
+
|
|
+max_budget
|
|
+----------
|
|
+
|
|
+Maximum amount of service, measured in sectors, that can be provided
|
|
+to a BFQ queue once it is set in service (of course within the limits
|
|
+of the above timeout). According to what said in the description of
|
|
+the algorithm, larger values increase the throughput in proportion to
|
|
+the percentage of sequential I/O requests issued. The price of larger
|
|
+values is that they coarsen the granularity of short-term bandwidth
|
|
+and latency guarantees.
|
|
+
|
|
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
|
|
+to the maximum number of sectors that can be served during
|
|
+timeout_sync, according to the estimated peak rate.
|
|
+
|
|
+weights
|
|
+-------
|
|
+
|
|
+Read-only parameter, used to show the weights of the currently active
|
|
+BFQ queues.
|
|
+
|
|
+
|
|
+wr_ tunables
|
|
+------------
|
|
+
|
|
+BFQ exports a few parameters to control/tune the behavior of
|
|
+low-latency heuristics.
|
|
+
|
|
+wr_coeff
|
|
+
|
|
+Factor by which the weight of a weight-raised queue is multiplied. If
|
|
+the queue is deemed soft real-time, then the weight is further
|
|
+multiplied by an additional, constant factor.
|
|
+
|
|
+wr_max_time
|
|
+
|
|
+Maximum duration of a weight-raising period for an interactive task
|
|
+(ms). If set to zero (default value), then this value is computed
|
|
+automatically, as a function of the peak rate of the device. In any
|
|
+case, when the value of this parameter is read, it always reports the
|
|
+current duration, regardless of whether it has been set manually or
|
|
+computed automatically.
|
|
+
|
|
+wr_max_softrt_rate
|
|
+
|
|
+Maximum service rate below which a queue is deemed to be associated
|
|
+with a soft real-time application, and is then weight-raised
|
|
+accordingly (sectors/sec).
|
|
+
|
|
+wr_min_idle_time
|
|
+
|
|
+Minimum idle period after which interactive weight-raising may be
|
|
+reactivated for a queue (in ms).
|
|
+
|
|
+wr_rt_max_time
|
|
+
|
|
+Maximum weight-raising duration for soft real-time queues (in ms). The
|
|
+start time from which this duration is considered is automatically
|
|
+moved forward if the queue is detected to be still soft real-time
|
|
+before the current soft real-time weight-raising period finishes.
|
|
+
|
|
+wr_min_inter_arr_async
|
|
+
|
|
+Minimum period between I/O request arrivals after which weight-raising
|
|
+may be reactivated for an already busy async queue (in ms).
|
|
+
|
|
+
|
|
+4. Group scheduling with BFQ
|
|
+============================
|
|
+
|
|
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
|
|
+blkio and io. In particular, BFQ supports weight-based proportional
|
|
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
|
|
+
|
|
+4-1 Service guarantees provided
|
|
+-------------------------------
|
|
+
|
|
+With BFQ, proportional share means true proportional share of the
|
|
+device bandwidth, according to group weights. For example, a group
|
|
+with weight 200 gets twice the bandwidth, and not just twice the time,
|
|
+of a group with weight 100.
|
|
+
|
|
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
|
|
+distributed among groups and processes in the expected way: for each
|
|
+group, the children of the group share the whole bandwidth of the
|
|
+group in proportion to their weights. In particular, this implies
|
|
+that, for each leaf group, every process of the group receives the
|
|
+same share of the whole group bandwidth, unless the ioprio of the
|
|
+process is modified.
|
|
+
|
|
+The resource-sharing guarantee for a group may partially or totally
|
|
+switch from bandwidth to time, if providing bandwidth guarantees to
|
|
+the group lowers the throughput too much. This switch occurs on a
|
|
+per-process basis: if a process of a leaf group causes throughput loss
|
|
+if served in such a way to receive its share of the bandwidth, then
|
|
+BFQ switches back to just time-based proportional share for that
|
|
+process.
|
|
+
|
|
+4-2 Interface
|
|
+-------------
|
|
+
|
|
+To get proportional sharing of bandwidth with BFQ for a given device,
|
|
+BFQ must of course be the active scheduler for that device.
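For instance, on a kernel with this patch applied, checking and selecting the
scheduler for a hypothetical device sdX would look roughly like this (the list of
available schedulers depends on the kernel configuration):

  cat /sys/block/sdX/queue/scheduler        # e.g.  noop deadline [cfq] bfq
  echo bfq > /sys/block/sdX/queue/scheduler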
|
|
+
|
|
+Within each group directory, the names of the files associated with
|
|
+BFQ-specific cgroup parameters and stats begin with the "bfq."
|
|
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
|
|
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
|
|
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
|
|
+or io.bfq.weight.
|
|
+
|
|
+Parameters to set
|
|
+-----------------
|
|
+
|
|
+For each group, there is only the following parameter to set.
|
|
+
|
|
+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
|
|
+group inside its parent. Available values: 1..10000 (default 100). The
|
|
+linear mapping between ioprio and weights, described at the beginning
|
|
+of the tunable section, is still valid, but all weights higher than
|
|
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
|
|
+
|
|
+Recall that, if low-latency is set, then BFQ automatically raises the
|
|
+weight of the queues associated with interactive and soft real-time
|
|
+applications. Unset this tunable if you need/want to control weights.
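A minimal sketch of the interface, assuming a cgroup-v2 hierarchy mounted at
/sys/fs/cgroup with the io controller enabled (the group name is a placeholder):

  mkdir /sys/fs/cgroup/background
  echo 50 > /sys/fs/cgroup/background/io.bfq.weight
  # cgroups-v1 equivalent:
  #   echo 50 > /sys/fs/cgroup/blkio/background/blkio.bfq.weight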
|
|
+
|
|
+
|
|
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
|
|
+ Scheduler", Proceedings of the First Workshop on Mobile System
|
|
+ Technologies (MST-2015), May 2015.
|
|
+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
|
|
+
|
|
+[2] P. Valente and M. Andreolini, "Improving Application
|
|
+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
|
|
+ the 5th Annual International Systems and Storage Conference
|
|
+ (SYSTOR '12), June 2012.
|
|
+ Slightly extended version:
|
|
+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
|
+ results.pdf
|
|
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2a39040..2847219 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
 command. A value of '0' means write-same is not supported by this
 device.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
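As an illustration of the two attributes documented above (the device name is a
placeholder, and the files are only present when writeback throttling is
registered for the queue; the 75000 is an arbitrary example value):

  cat /sys/block/sdX/queue/wb_lat_usec
  echo 75000 > /sys/block/sdX/queue/wb_lat_usec    # target read latency of 75 ms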
diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
|
|
new file mode 100644
|
|
index 0000000..c028200
|
|
--- /dev/null
|
|
+++ b/Documentation/scheduler/sched-BFS.txt
|
|
@@ -0,0 +1,351 @@
|
|
+BFS - The Brain Fuck Scheduler by Con Kolivas.
|
|
+
|
|
+Goals.
|
|
+
|
|
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
|
|
+completely do away with the complex designs of the past for the cpu process
|
|
+scheduler and instead implement one that is very simple in basic design.
|
|
+The main focus of BFS is to achieve excellent desktop interactivity and
|
|
+responsiveness without heuristics and tuning knobs that are difficult to
|
|
+understand, impossible to model and predict the effect of, and when tuned to
|
|
+one workload cause massive detriment to another.
|
|
+
|
|
+
|
|
+Design summary.
|
|
+
|
|
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
|
|
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
|
|
+deadline first) and my previous Staircase Deadline scheduler. Each component
|
|
+shall be described in order to understand the significance of, and reasoning for
|
|
+it. The codebase when the first stable version was released was approximately
|
|
+9000 lines less code than the existing mainline linux kernel scheduler (in
|
|
+2.6.31). This does not even take into account the removal of documentation and
|
|
+the cgroups code that is not used.
|
|
+
|
|
+Design reasoning.
|
|
+
|
|
+The single runqueue refers to the queued but not running processes for the
|
|
+entire system, regardless of the number of CPUs. The reason for going back to
|
|
+a single runqueue design is that once multiple runqueues are introduced,
|
|
+per-CPU or otherwise, there will be complex interactions as each runqueue will
|
|
+be responsible for the scheduling latency and fairness of the tasks only on its
|
|
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
|
|
+advantage in throughput of having CPU local tasks causes other disadvantages.
|
|
+This is due to requiring a very complex balancing system to at best achieve some
|
|
+semblance of fairness across CPUs and can only maintain relatively low latency
|
|
+for tasks bound to the same CPUs, not across them. To increase said fairness
|
|
+and latency across CPUs, the advantage of local runqueue locking, which makes
|
|
+for better scalability, is lost due to having to grab multiple locks.
|
|
+
|
|
+A significant feature of BFS is that all accounting is done purely based on CPU
|
|
+used and nowhere is sleep time used in any way to determine entitlement or
|
|
+interactivity. Interactivity "estimators" that use some kind of sleep/run
|
|
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
|
|
+tasks that aren't interactive as being so. The reason for this is that it is
|
|
+close to impossible to determine that when a task is sleeping, whether it is
|
|
+doing it voluntarily, as in a userspace application waiting for input in the
|
|
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
|
|
+another thread, process, I/O, kernel activity or whatever. Thus, such an
|
|
+estimator will introduce corner cases, and more heuristics will be required to
|
|
+cope with those corner cases, introducing more corner cases and failed
|
|
+interactivity detection and so on. Interactivity in BFS is built into the design
|
|
+by virtue of the fact that tasks that are waking up have not used up their quota
|
|
+of CPU time, and have earlier effective deadlines, thereby making it very likely
|
|
+they will preempt any CPU bound task of equivalent nice level. See below for
|
|
+more information on the virtual deadline mechanism. Even if they do not preempt
|
|
+a running task, because the rr interval is guaranteed to have a bound upper
|
|
+limit on how long a task will wait for, it will be scheduled within a timeframe
|
|
+that will not cause visible interface jitter.
|
|
+
|
|
+
|
|
+Design details.
|
|
+
|
|
+Task insertion.
|
|
+
|
|
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
|
|
+linked list. On insertion, *every* running queue is checked to see if the newly
|
|
+queued task can run on any idle queue, or preempt the lowest running task on the
|
|
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
|
|
+latency per extra CPU the system has. In this case the lookup is, in the worst
|
|
+case scenario, O(n) where n is the number of CPUs on the system.
|
|
+
|
|
+Data protection.
|
|
+
|
|
+BFS has one single lock protecting the process local data of every task in the
|
|
+global queue. Thus every insertion, removal and modification of task data in the
|
|
+global runqueue needs to grab the global lock. However, once a task is taken by
|
|
+a CPU, the CPU has its own local data copy of the running process' accounting
|
|
+information which only that CPU accesses and modifies (such as during a
|
|
+timer tick) thus allowing the accounting data to be updated lockless. Once a
|
|
+CPU has taken a task to run, it removes it from the global queue. Thus the
|
|
+global queue only ever has, at most,
|
|
+
|
|
+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1
|
|
+
|
|
+tasks in the global queue. This value is relevant for the time taken to look up
|
|
+tasks during scheduling. This will increase if many tasks with CPU affinity set
|
|
+in their policy to limit which CPUs they're allowed to run on if they outnumber
|
|
+the number of CPUs. The +1 is because when rescheduling a task, the CPU's
|
|
+currently running task is put back on the queue. Lookup will be described after
|
|
+the virtual deadline mechanism is explained.
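A quick worked example of this bound: with 16 tasks requesting CPU time on a
machine with 4 logical CPUs, the global queue holds at most 16 - 4 + 1 = 13
tasks that a rescheduling CPU may have to scan.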
|
|
+
|
|
+Virtual deadline.
|
|
+
|
|
+The key to achieving low latency, scheduling fairness, and "nice level"
|
|
+distribution in BFS is entirely in the virtual deadline mechanism. The one
|
|
+tunable in BFS is the rr_interval, or "round robin interval". This is the
|
|
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
|
|
+tasks of the same nice level will be running for, or looking at it the other
|
|
+way around, the longest duration two tasks of the same nice level will be
|
|
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
|
|
+equal to the rr_interval and a virtual deadline. The virtual deadline is
|
|
+offset from the current time in jiffies by this equation:
|
|
+
|
|
+ jiffies + (prio_ratio * rr_interval)
|
|
+
|
|
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
|
|
+and increases by 10% per nice level. The deadline is a virtual one only in that
|
|
+no guarantee is placed that a task will actually be scheduled by this time, but
|
|
+it is used to compare which task should go next. There are three components to
|
|
+how a task is next chosen. First is time_slice expiration. If a task runs out
|
|
+of its time_slice, it is descheduled, the time_slice is refilled, and the
|
|
+deadline reset to that formula above. Second is sleep, where a task no longer
|
|
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
|
|
+adjusted in this case and are just carried over for when the task is next
|
|
+scheduled. Third is preemption, and that is when a newly waking task is deemed
|
|
+higher priority than a currently running task on any cpu by virtue of the fact
|
|
+that it has an earlier virtual deadline than the currently running task. The
|
|
+earlier deadline is the key to which task is next chosen for the first and
|
|
+second cases. Once a task is descheduled, it is put back on the queue, and an
|
|
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
|
|
+the earliest deadline and that task is chosen to receive CPU next.
|
|
+
|
|
+The CPU proportion of different nice tasks works out to be approximately the
|
|
+
|
|
+ (prio_ratio difference)^2
|
|
+
|
|
+The reason it is squared is that a task's deadline does not change while it is
|
|
+running unless it runs out of time_slice. Thus, even if the time actually
|
|
+passes the deadline of another task that is queued, it will not get CPU time
|
|
+unless the current running task deschedules, and the time "base" (jiffies) is
|
|
+constantly moving.
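As a rough illustration of the deadline formula, assuming HZ=1000 (one jiffy per
millisecond), the default rr_interval of 6 ms, and a 10% step that compounds per
nice level:

  nice -20 task:  deadline = jiffies + 1.00 * 6  =  jiffies +  6 jiffies
  nice   0 task:  deadline = jiffies + ~6.7 * 6  ~  jiffies + 40 jiffies   (1.1^20 ~ 6.7)

The nice 0 task's deadline sits much further in the future, so a waking nice -20
task will almost always be chosen ahead of it.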
|
|
+
|
|
+Task lookup.
|
|
+
|
|
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
|
|
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
|
|
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
|
|
+scheduling). When a task of these priorities is queued, a bitmap of running
|
|
+priorities is set showing which of these priorities has tasks waiting for CPU
|
|
+time. When a CPU is made to reschedule, the lookup for the next task to get
|
|
+CPU time is performed in the following way:
|
|
+
|
|
+First the bitmap is checked to see what static priority tasks are queued. If
|
|
+any realtime priorities are found, the corresponding queue is checked and the
|
|
+first task listed there is taken (provided CPU affinity is suitable) and lookup
|
|
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
|
|
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
|
|
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
|
|
+stage, every task in the runlist that corresponds to that priority is checked
|
|
+to see which has the earliest set deadline, and (provided it has suitable CPU
|
|
+affinity) it is taken off the runqueue and given the CPU. If a task has an
|
|
+expired deadline, it is taken and the rest of the lookup aborted (as they are
|
|
+chosen in FIFO order).
|
|
+
|
|
+Thus, the lookup is O(n) in the worst case only, where n is as described
|
|
+earlier, as tasks may be chosen before the whole task list is looked over.
|
|
+
|
|
+
|
|
+Scalability.
|
|
+
|
|
+The major limitations of BFS will be that of scalability, as the separate
|
|
+runqueue designs will have less lock contention as the number of CPUs rises.
|
|
+However they do not scale linearly even with separate runqueues as multiple
|
|
+runqueues will need to be locked concurrently on such designs to be able to
|
|
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
|
|
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
|
|
+other CPUs would be more suited. BFS has the advantage that it requires no
|
|
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
|
|
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
|
|
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
|
|
+good scalability to smaller numbers of CPUs and is likely a more scalable design
|
|
+at these numbers of CPUs.
|
|
+
|
|
+It also has some very low overhead scalability features built into the design
|
|
+when it has been deemed their overhead is so marginal that they're worth adding.
|
|
+The first is the local copy of the running process' data to the CPU it's running
|
|
+on to allow that data to be updated lockless where possible. Then there is
|
|
+deference paid to the last CPU a task was running on, by trying that CPU first
|
|
+when looking for an idle CPU to use the next time it's scheduled. Finally there
|
|
+is the notion of cache locality beyond the last running CPU. The sched_domains
|
|
+information is used to determine the relative virtual "cache distance" that
|
|
+other CPUs have from the last CPU a task was running on. CPUs with shared
|
|
+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
|
|
+as cache local. CPUs without shared caches are treated as not cache local, and
|
|
+CPUs on different NUMA nodes are treated as very distant. This "relative cache
|
|
+distance" is used by modifying the virtual deadline value when doing lookups.
|
|
+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
|
|
+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
|
|
+behind the doubling of deadlines is as follows. The real cost of migrating a
|
|
+task from one CPU to another is entirely dependant on the cache footprint of
|
|
+the task, how cache intensive the task is, how long it's been running on that
|
|
+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
|
|
+how layered the CPU cache is, how fast a context switch is... and so on. In
|
|
+other words, it's close to random in the real world where we do more than just
|
|
+one sole workload. The only thing we can be sure of is that it's not free. So
|
|
+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
|
|
+is more important than cache locality, and cache locality only plays a part
|
|
+after that. Doubling the effective deadline is based on the premise that the
|
|
+"cache local" CPUs will tend to work on the same tasks up to double the number
|
|
+of cache local CPUs, and once the workload is beyond that amount, it is likely
|
|
+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
|
|
+is a value I pulled out of my arse.
|
|
+
|
|
+When choosing an idle CPU for a waking task, the cache locality is determined
|
|
+according to where the task last ran and then idle CPUs are ranked from best
|
|
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
|
|
+node locality and hyperthread sibling business. They are chosen in the
|
|
+following preference (if idle):
|
|
+
|
|
+* Same core, idle or busy cache, idle threads
|
|
+* Other core, same cache, idle or busy cache, idle threads.
|
|
+* Same node, other CPU, idle cache, idle threads.
|
|
+* Same node, other CPU, busy cache, idle threads.
|
|
+* Same core, busy threads.
|
|
+* Other core, same cache, busy threads.
|
|
+* Same node, other CPU, busy threads.
|
|
+* Other node, other CPU, idle cache, idle threads.
|
|
+* Other node, other CPU, busy cache, idle threads.
|
|
+* Other node, other CPU, busy threads.
|
|
+
|
|
+This shows the SMT or "hyperthread" awareness in the design as well which will
|
|
+choose a real idle core first before a logical SMT sibling which already has
|
|
+tasks on the physical CPU.
|
|
+
|
|
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
|
|
+However this benchmarking was performed on an earlier design that was far less
|
|
+scalable than the current one so it's hard to know how scalable it is in terms
|
|
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
|
|
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
|
|
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
|
|
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
|
|
+results are very promising indeed, without needing to tweak any knobs, features
|
|
+or options. Benchmark contributions are most welcome.
|
|
+
|
|
+
|
|
+Features
|
|
+
|
|
+As the initial prime target audience for BFS was the average desktop user, it
|
|
+was designed to not need tweaking, tuning or have features set to obtain benefit
|
|
+from it. Thus the number of knobs and features has been kept to an absolute
|
|
+minimum and should not require extra user input for the vast majority of cases.
|
|
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
|
|
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
|
|
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
|
|
+support for CGROUPS. The average user should neither need to know what these
|
|
+are, nor should they need to be using them to have good desktop behaviour.
|
|
+
|
|
+rr_interval
|
|
+
|
|
+There is only one "scheduler" tunable, the round robin interval. This can be
|
|
+accessed in
|
|
+
|
|
+ /proc/sys/kernel/rr_interval
|
|
+
|
|
+The value is in milliseconds, and the default value is set to 6 on a
|
|
+uniprocessor machine, and automatically set to a progressively higher value on
|
|
+multiprocessor machines. The reasoning behind increasing the value on more CPUs
|
|
+is that the effective latency is decreased by virtue of there being more CPUs on
|
|
+BFS (for reasons explained above), and increasing the value allows for less
|
|
+cache contention and more throughput. Valid values are from 1 to 1000
|
|
+Decreasing the value will decrease latencies at the cost of decreasing
|
|
+throughput, while increasing it will improve throughput, but at the cost of
|
|
+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
|
|
+of the kernel configuration. Thus, the worst case latencies are usually slightly
|
|
+higher than this actual value. The default value of 6 is not an arbitrary one.
|
|
+It is based on the fact that humans can detect jitter at approximately 7ms, so
|
|
+aiming for much lower latencies is pointless under most circumstances. It is
|
|
+worth noting this fact when comparing the latency performance of BFS to other
|
|
+schedulers. Worst case latencies being higher than 7ms are far worse than
|
|
+average latencies not being in the microsecond range.
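For example, the tunable can be read and changed like any other sysctl (the value
3 below is only illustrative, trading some throughput for lower latency):

  cat /proc/sys/kernel/rr_interval        # 6 by default on a uniprocessor machine
  echo 3 > /proc/sys/kernel/rr_interval
  # equivalently:  sysctl -w kernel.rr_interval=3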
|
|
+
|
|
+Isochronous scheduling.
|
|
+
|
|
+Isochronous scheduling is a unique scheduling policy designed to provide
|
|
+near-real-time performance to unprivileged (ie non-root) users without the
|
|
+ability to starve the machine indefinitely. Isochronous tasks (which means
|
|
+"same time") are set using, for example, the schedtool application like so:
|
|
+
|
|
+ schedtool -I -e amarok
|
|
+
|
|
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
|
|
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
|
|
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
|
|
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
|
|
+rate). However if ISO tasks run for more than a tunable finite amount of time,
|
|
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
|
|
+time is the percentage of _total CPU_ available across the machine, configurable
|
|
+as a percentage in the following "resource handling" tunable (as opposed to a
|
|
+scheduler tunable):
|
|
+
|
|
+ /proc/sys/kernel/iso_cpu
|
|
+
|
|
+and is set to 70% by default. It is calculated over a rolling 5 second average
|
|
+Because it is the total CPU available, it means that on a multi CPU machine, it
|
|
+is possible to have an ISO task running as realtime scheduling indefinitely on
|
|
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
|
|
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
|
|
+ability to run any pseudo-realtime tasks.
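For example (the percentage is illustrative; the path is the one given above):

  cat /proc/sys/kernel/iso_cpu            # 70 by default
  echo 50 > /proc/sys/kernel/iso_cpu      # cap SCHED_ISO tasks at 50% of total CPU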
|
|
+
|
|
+A feature of BFS is that it detects when an application tries to obtain a
|
|
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
|
|
+appropriate privileges to use those policies. When it detects this, it will
|
|
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
|
|
+Because some applications constantly set their policy as well as their nice
|
|
+level, there is potential for them to undo the override specified by the user
|
|
+on the command line of setting the policy to SCHED_ISO. To counter this, once
|
|
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
|
|
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
|
|
+processes and threads will also inherit the ISO policy.
|
|
+
|
|
+Idleprio scheduling.
|
|
+
|
|
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
|
|
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
|
|
+ultra low priority tasks to be run in the background that have virtually no
|
|
+effect on the foreground tasks. This is ideally suited to distributed computing
|
|
+clients (like setiathome, folding, mprime etc) but can also be used to start
|
|
+a video encode or so on without any slowdown of other tasks. To avoid this
|
|
+policy from grabbing shared resources and holding them indefinitely, if it
|
|
+detects a state where the task is waiting on I/O, the machine is about to
|
|
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
|
|
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
|
|
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
|
|
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
|
|
+
|
|
+ schedtool -D -e ./mprime
|
|
+
|
|
+Subtick accounting.
|
|
+
|
|
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
|
|
+the accounting is done by simply determining what is happening at the precise
|
|
+moment a timer tick fires off. This becomes increasingly inaccurate as the
|
|
+timer tick frequency (HZ) is lowered. It is possible to create an application
|
|
+which uses almost 100% CPU, yet by being descheduled at the right time, records
|
|
+zero CPU usage. While the main problem with this is that there are possible
|
|
+security implications, it is also difficult to determine how much CPU a task
|
|
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
|
|
+where possible, to determine real CPU usage. This is not entirely reliable, but
|
|
+is far more likely to produce accurate CPU usage data than the existing designs
|
|
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
|
|
+the amount of CPU reported as being used by BFS will more accurately represent
|
|
+how much CPU the task itself is using (as is shown for example by the 'time'
|
|
+application), so the reported values may be quite different to other schedulers.
|
|
+Values reported as the 'load' are more prone to problems with this design, but
|
|
+per process values are closer to real usage. When comparing throughput of BFS
|
|
+to other designs, it is important to compare the actual completed work in terms
|
|
+of total wall clock time taken and total work done, rather than the reported
|
|
+"cpu usage".
|
|
+
|
|
+
|
|
+Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
|
|
diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
|
|
new file mode 100644
|
|
index 0000000..bbd6980
|
|
--- /dev/null
|
|
+++ b/Documentation/scheduler/sched-MuQSS.txt
|
|
@@ -0,0 +1,345 @@
|
|
+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
|
|
+
|
|
+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
|
|
+one 8 level skiplist per runqueue, and fine grained locking for much more
|
|
+scalability.
|
|
+
|
|
+
|
|
+Goals.
|
|
+
|
|
+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
|
|
+here on (pronounced mux) is to completely do away with the complex designs of
|
|
+the past for the cpu process scheduler and instead implement one that is very
|
|
+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
|
|
+interactivity and responsiveness without heuristics and tuning knobs that are
|
|
+difficult to understand, impossible to model and predict the effect of, and when
|
|
+tuned to one workload cause massive detriment to another, while still being
|
|
+scalable to many CPUs and processes.
|
|
+
|
|
+
|
|
+Design summary.
|
|
+
|
|
+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1)
|
|
+lookup, earliest effective virtual deadline first tickless design, loosely based
|
|
+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
|
|
+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
|
|
+Each component shall be described in order to understand the significance of,
|
|
+and reasoning for it.
|
|
+
|
|
+
|
|
+Design reasoning.
|
|
+
|
|
+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
|
|
+need to scan the entire runqueue looking for the process with the earliest
|
|
+deadline and schedule that next, regardless of which CPU it originally came
|
|
+from. This made BFS deterministic with respect to latency and provided
|
|
+guaranteed latencies dependent on number of processes and CPUs. The single
|
|
+runqueue, however, meant that all CPUs would compete for the single lock
|
|
+protecting it, which would lead to increasing lock contention as the number of
|
|
+CPUs rose and appeared to limit scalability of common workloads beyond 16
|
|
+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
|
|
+increased overhead proportionate to the number of queued processes and led to
|
|
+cache thrashing while iterating over the linked list.
|
|
+
|
|
+MuQSS is an evolution of BFS, designed to maintain the same scheduling
|
|
+decision mechanism and be virtually deterministic without relying on the
|
|
+constrained design of the single runqueue by splitting out the single runqueue
|
|
+to be per-CPU and use skiplists instead of linked lists.
|
|
+
|
|
+The original reason for going back to a single runqueue design for BFS was that
|
|
+once multiple runqueues are introduced, per-CPU or otherwise, there will be
|
|
+complex interactions as each runqueue will be responsible for the scheduling
|
|
+latency and fairness of the tasks only on its own runqueue, and to achieve
|
|
+fairness and low latency across multiple CPUs, any advantage in throughput of
|
|
+having CPU local tasks causes other disadvantages. This is due to requiring a
|
|
+very complex balancing system to at best achieve some semblance of fairness
|
|
+across CPUs and can only maintain relatively low latency for tasks bound to the
|
|
+same CPUs, not across them. To increase said fairness and latency across CPUs,
|
|
+the advantage of local runqueue locking, which makes for better scalability, is
|
|
+lost due to having to grab multiple locks.
|
|
+
|
|
+MuQSS works around the problems inherent in multiple runqueue designs by
|
|
+making its skip lists priority ordered and through novel use of lockless
|
|
+examination of each other runqueue it can decide if it should take the earliest
|
|
+deadline task from another runqueue for latency reasons, or for CPU balancing
|
|
+reasons. It still does not have a balancing system, instead allowing the
|
|
+next task scheduling decision and task wakeup CPU choice to let balancing
|
|
+happen by virtue of those choices.
|
|
+
|
|
+
|
|
+Design details.
|
|
+
|
|
+Custom skip list implementation:
|
|
+
|
|
+To avoid the overhead of building up and tearing down skip list structures,
|
|
+the variant used by MuQSS has a number of optimisations making it specific for
|
|
+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
|
|
+building up and tearing down structures dynamically. This makes each runqueue
|
|
+only scale O(log N) up to 256 tasks. However as there is one runqueue per CPU
|
|
+it means that it scales O(log N) up to 256 x number of logical CPUs which is
|
|
+far beyond the realistic task limits each CPU could handle. By being 8 levels
|
|
+it also makes the array exactly one cacheline in size. Additionally, each
|
|
+skip list node is bidirectional making insertion and removal amortised O(1),
|
|
+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
|
|
+first entry in each list at all times with MuQSS, so there is never a need to
|
|
+do a search and thus look up is always O(1).
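+
+As a minimal sketch (the type and field names here are invented for this
+document, not copied from the MuQSS source), such a node can be pictured as:
+
+	#define SKIPLIST_LEVELS 8	/* each 8-entry pointer array is one 64B cacheline */
+
+	struct skiplist_node {
+		int level;	/* highest level this node is linked on */
+		struct skiplist_node *next[SKIPLIST_LEVELS];
+		struct skiplist_node *prev[SKIPLIST_LEVELS];
+	};
+
+Because every node carries both next and prev pointers for each of its levels,
+removal is just unlinking at those levels, and the head's next[0] entry is
+always the first queued task, giving the O(1) lookup described above.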
|
|
+
|
|
+Task insertion:
|
|
+
|
|
+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
|
|
+a custom skip list as described above (based on the original design by William
|
|
+Pugh). Insertion is ordered in such a way that there is never a need to do a
|
|
+search by ordering tasks according to static priority primarily, and then
|
|
+virtual deadline at the time of insertion.
|
|
+
|
|
+Niffies:
|
|
+
|
|
+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
|
|
+of nanosecond resolution. Niffies are calculated per-runqueue from the high
|
|
+resolution TSC timers, and in order to maintain fairness are synchronised
|
|
+between CPUs whenever both runqueues are locked concurrently.
|
|
+
|
|
+Virtual deadline:
|
|
+
|
|
+The key to achieving low latency, scheduling fairness, and "nice level"
|
|
+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
|
|
+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
|
|
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
|
|
+tasks of the same nice level will be running for, or looking at it the other
|
|
+way around, the longest duration two tasks of the same nice level will be
|
|
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
|
|
+equal to the rr_interval and a virtual deadline. The virtual deadline is
|
|
+offset from the current time in niffies by this equation:
|
|
+
|
|
+ niffies + (prio_ratio * rr_interval)
|
|
+
|
|
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
|
|
+and increases by 10% per nice level. The deadline is a virtual one only in that
|
|
+no guarantee is placed that a task will actually be scheduled by this time, but
|
|
+it is used to compare which task should go next. There are three components to
|
|
+how a task is next chosen. First is time_slice expiration. If a task runs out
|
|
+of its time_slice, it is descheduled, the time_slice is refilled, and the
|
|
+deadline reset to that formula above. Second is sleep, where a task no longer
|
|
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
|
|
+adjusted in this case and are just carried over for when the task is next
|
|
+scheduled. Third is preemption, and that is when a newly waking task is deemed
|
|
+higher priority than a currently running task on any cpu by virtue of the fact
|
|
+that it has an earlier virtual deadline than the currently running task. The
|
|
+earlier deadline is the key to which task is next chosen for the first and
|
|
+second cases.
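+
+As a rough illustration (function, type and field names below are invented for
+this document rather than taken from the MuQSS source), the refill step above
+amounts to something like:
+
+	/* Sketch only: refill the quota and push the deadline into the future.
+	 * prio_ratio is treated as a percentage: 100 at nice -20 and roughly
+	 * 10% more per nice level, so nicer tasks get later deadlines. */
+	static void refill_and_set_deadline(struct task *p, u64 niffies,
+					    u64 rr_interval_ns)
+	{
+		p->time_slice = rr_interval_ns;
+		p->deadline = niffies + (p->prio_ratio * rr_interval_ns) / 100;
+	}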
|
|
+
|
|
+The CPU proportion of different nice tasks works out to be approximately the
|
|
+
|
|
+ (prio_ratio difference)^2
|
|
+
|
|
+The reason it is squared is that a task's deadline does not change while it is
|
|
+running unless it runs out of time_slice. Thus, even if the time actually
|
|
+passes the deadline of another task that is queued, it will not get CPU time
|
|
+unless the current running task deschedules, and the time "base" (niffies) is
|
|
+constantly moving.
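+
+As a rough worked example, taking the formula above at face value: if the
+prio_ratio difference between two competing tasks is a factor of 1.5, their
+long term CPU shares will differ by approximately 1.5^2 = 2.25 to 1 while both
+remain runnable.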
|
|
+
|
|
+Task lookup:
|
|
+
|
|
+As tasks are already pre-ordered according to anticipated scheduling order in
|
|
+the skip lists, lookup for the next suitable task per-runqueue is always a
|
|
+matter of simply selecting the first task in the 0th level skip list entry.
|
|
+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
|
|
+novel examination of every other runqueue in cache locality order, choosing the
|
|
+best task across all runqueues. This provides near-determinism of how long any
|
|
+task across the entire system may wait before receiving CPU time. The other
|
|
+runqueues are first examined locklessly and then trylocked to minimise the
|
|
+potential lock contention if they are likely to have a suitable better task.
|
|
+Each other runqueue lock is only held for as long as it takes to examine the
|
|
+entry for suitability. In "interactive" mode, the default setting, MuQSS will
|
|
+look for the best deadline task across all CPUs, while in !interactive mode,
|
|
+it will only select a better deadline task from another CPU if it is more
|
|
+heavily laden than the current one.
|
|
+
|
|
+Lookup is therefore O(k) where k is number of CPUs.
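+
+A simplified sketch of that scan (all helper names here are invented for this
+document; the real code differs in detail) looks like:
+
+	/* Assume first_task() falls back to the idle task, so best is never
+	 * NULL.  Peek at each remote runqueue locklessly and only trylock it
+	 * when its head looks better than what we already have. */
+	struct task *best = first_task(my_rq);
+
+	for_each_runqueue_in_locality_order(rq, my_rq) {
+		struct task *cand = lockless_peek_first(rq);
+
+		if (!cand || !deadline_before(cand->deadline, best->deadline))
+			continue;
+		if (trylock_rq(rq)) {
+			cand = first_task(rq);	/* re-check under the lock */
+			if (cand && deadline_before(cand->deadline,
+						    best->deadline))
+				best = cand;
+			unlock_rq(rq);
+		}
+	}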
|
|
+
|
|
+
|
|
+Latency.
|
|
+
|
|
+Through the use of virtual deadlines to govern the scheduling order of normal
|
|
+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
|
|
+the rr_interval tunable which is set to 6ms by default. This means that the
|
|
+longest a CPU bound task will wait for more CPU is proportional to the number
|
|
+of running tasks and in the common case of 0-2 running tasks per CPU, will be
|
|
+under the 7ms threshold for human perception of jitter. Additionally, as newly
|
|
+woken tasks will have an early deadline from their previous runtime, the very
|
|
+tasks that are usually latency sensitive will have the shortest interval for
|
|
+activation, usually preempting any existing CPU bound tasks.
|
|
+
|
|
+Tickless expiry:
|
|
+
|
|
+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
|
|
+rate in Hz, instead depending entirely on the high resolution timers where
|
|
+possible for sub-millisecond accuracy on timeouts regardless of the underlying
|
|
+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
|
|
+such as 100 by default, benefiting from the improved throughput and lower
|
|
+power usage it provides. Another advantage of this approach is that in
|
|
+combination with the Full No HZ option, which disables ticks on running task
|
|
+CPUs instead of just idle CPUs, the tick can be disabled at all times
|
|
+regardless of how many tasks are running instead of being limited to just one
|
|
+running task. Note that this option is NOT recommended for regular desktop
|
|
+users.
|
|
+
|
|
+
|
|
+Scalability and balancing.
|
|
+
|
|
+Unlike traditional approaches where balancing is a combination of CPU selection
|
|
+at task wakeup and intermittent balancing based on a vast array of rules set
|
|
+according to architecture, busyness calculations and special case management,
|
|
+MuQSS indirectly balances on the fly at task wakeup and next task selection.
|
|
+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
|
|
+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
|
|
+Additionally it selects any idle CPUs, if they are available, at any time over
|
|
+busy CPUs according to the following preference:
|
|
+
|
|
+ * Same thread, idle or busy cache, idle or busy threads
|
|
+ * Other core, same cache, idle or busy cache, idle threads.
|
|
+ * Same node, other CPU, idle cache, idle threads.
|
|
+ * Same node, other CPU, busy cache, idle threads.
|
|
+ * Other core, same cache, busy threads.
|
|
+ * Same node, other CPU, busy threads.
|
|
+ * Other node, other CPU, idle cache, idle threads.
|
|
+ * Other node, other CPU, busy cache, idle threads.
|
|
+ * Other node, other CPU, busy threads.
|
|
+
|
|
+Mux is therefore SMT, MC and NUMA aware without the need for extra
|
|
+intermittent balancing to maintain CPUs busy and make the most of cache
|
|
+coherency.
|
|
+
|
|
+
|
|
+Features
|
|
+
|
|
+As the initial prime target audience for MuQSS was the average desktop user, it
|
|
+was designed to not need tweaking, tuning or have features set to obtain benefit
|
|
+from it. Thus the number of knobs and features has been kept to an absolute
|
|
+minimum and should not require extra user input for the vast majority of cases.
|
|
+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
|
|
+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
|
|
+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
|
|
+does _not_ now feature is support for CGROUPS. The average user should neither
|
|
+need to know what these are, nor should they need to be using them to have good
|
|
+desktop behaviour. However since some applications refuse to work without
|
|
+cgroups, one can enable them with MuQSS as a stub and the filesystem will be
|
|
+created which will allow the applications to work.
|
|
+
|
|
+rr_interval:
|
|
+
|
|
+ /proc/sys/kernel/rr_interval
|
|
+
|
|
+The value is in milliseconds, and the default value is set to 6. Valid values
|
|
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
|
|
+decreasing throughput, while increasing it will improve throughput, but at the
|
|
+cost of worsening latencies. It is based on the fact that humans can detect
|
|
+jitter at approximately 7ms, so aiming for much lower latencies is pointless
|
|
+under most circumstances. It is worth noting this fact when comparing the
|
|
+latency performance of MuQSS to other schedulers. Worst case latencies being
|
|
+higher than 7ms are far worse than average latencies not being in the
|
|
+microsecond range.
|
|
+
|
|
+interactive:
|
|
+
|
|
+ /proc/sys/kernel/interactive
|
|
+
|
|
+The value is a simple boolean of 1 for on and 0 for off and is set to on by
|
|
+default. Disabling this will disable the near-determinism of MuQSS when
|
|
+selecting the next task by not examining all CPUs for the earliest deadline
|
|
+task, or which CPU to wake to, instead prioritising CPU balancing for improved
|
|
+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
|
|
+instead of across the whole system.
|
|
+
|
|
+Isochronous scheduling:
|
|
+
|
|
+Isochronous scheduling is a unique scheduling policy designed to provide
|
|
+near-real-time performance to unprivileged (ie non-root) users without the
|
|
+ability to starve the machine indefinitely. Isochronous tasks (which means
|
|
+"same time") are set using, for example, the schedtool application like so:
|
|
+
|
|
+ schedtool -I -e amarok
|
|
+
|
|
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
|
|
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
|
|
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
|
|
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
|
|
+rate). However if ISO tasks run for more than a tunable finite amount of time,
|
|
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
|
|
+time is the percentage of CPU available per CPU, configurable as a percentage in
|
|
+the following "resource handling" tunable (as opposed to a scheduler tunable):
|
|
+
|
|
+iso_cpu:
|
|
+
|
|
+ /proc/sys/kernel/iso_cpu
|
|
+
|
|
+and is set to 70% by default. It is calculated over a rolling 5 second average.
|
|
+Because it is the total CPU available, it means that on a multi CPU machine, it
|
|
+is possible to have an ISO task running as realtime scheduling indefinitely on
|
|
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
|
|
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
|
|
+ability to run any pseudo-realtime tasks.
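+
+For example, to let unprivileged SCHED_ISO tasks use up to 80% of total CPU
+over the rolling average, one could run, as root:
+
+	echo 80 > /proc/sys/kernel/iso_cpu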
|
|
+
|
|
+A feature of MuQSS is that it detects when an application tries to obtain a
|
|
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
|
|
+appropriate privileges to use those policies. When it detects this, it will
|
|
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
|
|
+
|
|
+
|
|
+Idleprio scheduling:
|
|
+
|
|
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
|
|
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
|
|
+ultra low priority tasks to be run in the background that have virtually no
|
|
+effect on the foreground tasks. This is ideally suited to distributed computing
|
|
+clients (like setiathome, folding, mprime etc) but can also be used to start a
|
|
+video encode or so on without any slowdown of other tasks. To prevent this policy
|
|
+from grabbing shared resources and holding them indefinitely, if it detects a
|
|
+state where the task is waiting on I/O, the machine is about to suspend to ram
|
|
+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
|
|
+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
|
|
+superuser privileges since it is effectively a lower scheduling policy. Tasks
|
|
+can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
|
|
+
|
|
+	schedtool -D -e ./mprime
|
|
+
|
|
+Subtick accounting:
|
|
+
|
|
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
|
|
+the accounting is done by simply determining what is happening at the precise
|
|
+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
|
|
+tick frequency (HZ) is lowered. It is possible to create an application which
|
|
+uses almost 100% CPU, yet by being descheduled at the right time, records zero
|
|
+CPU usage. While the main problem with this is that there are possible security
|
|
+implications, it is also difficult to determine how much CPU a task really does
|
|
+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
|
|
+usage. Thus, the amount of CPU reported as being used by MuQSS will more
|
|
+accurately represent how much CPU the task itself is using (as is shown for
|
|
+example by the 'time' application), so the reported values may be quite
|
|
+different to other schedulers. When comparing throughput of MuQSS to other
|
|
+designs, it is important to compare the actual completed work in terms of total
|
|
+wall clock time taken and total work done, rather than the reported "cpu usage".
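+
+A minimal sketch of the idea (names invented for this document, not the actual
+MuQSS code): on every deschedule the task is charged for the nanoseconds that
+really elapsed since it started running, rather than crediting a whole tick to
+whoever happens to be running when the tick fires:
+
+	static void account_runtime(struct task *p, u64 now_ns)
+	{
+		p->cpu_ns += now_ns - p->last_ran_ns;	/* exact time used */
+		p->last_ran_ns = now_ns;
+	}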
|
|
+
|
|
+Symmetric MultiThreading (SMT) aware nice:
|
|
+
|
|
+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
|
|
+logical CPU count rises by adding thread units to each CPU core, allowing more
|
|
+than one task to be run simultaneously on the same core, the disadvantage of it
|
|
+is that the CPU power is shared between the tasks, not summating to the power
|
|
+of two CPUs. The practical upshot of this is that two tasks running on
|
|
+separate threads of the same core run significantly slower than if they had one
|
|
+core each to run on. While smart CPU selection allows each task to have a core
|
|
+to itself whenever available (as is done on MuQSS), it cannot offset the
|
|
+slowdown that occurs when the cores are all loaded and only a thread is left.
|
|
+Most of the time this is harmless as the CPU is effectively overloaded at this
|
|
+point and the extra thread is of benefit. However when running a niced task in
|
|
+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets
|
|
+precisely the same amount of CPU power as the unniced one. MuQSS has an
|
|
+optional configuration feature known as SMT-NICE which selectively idles the
|
|
+secondary niced thread for a period proportional to the nice difference,
|
|
+allowing CPU distribution according to nice level to be maintained, at the
|
|
+expense of a small amount of extra overhead. If this is configured in on a
|
|
+machine without SMT threads, the overhead is minimal.
|
|
+
|
|
+
|
|
+Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016
|
|
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
|
|
index ffab8b5..2a82a37 100644
|
|
--- a/Documentation/sysctl/kernel.txt
|
|
+++ b/Documentation/sysctl/kernel.txt
|
|
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
|
|
- hung_task_timeout_secs
|
|
- hung_task_warnings
|
|
- kexec_load_disabled
|
|
+- iso_cpu
|
|
- kptr_restrict
|
|
- kstack_depth_to_print [ X86 only ]
|
|
- l2cr [ PPC only ]
|
|
@@ -73,6 +74,7 @@ show up in /proc/sys/kernel:
|
|
- randomize_va_space
|
|
- real-root-dev ==> Documentation/initrd.txt
|
|
- reboot-cmd [ SPARC only ]
|
|
+- rr_interval
|
|
- rtsig-max
|
|
- rtsig-nr
|
|
- sem
|
|
@@ -93,6 +95,7 @@ show up in /proc/sys/kernel:
|
|
- unknown_nmi_panic
|
|
- watchdog
|
|
- watchdog_thresh
|
|
+- yield_type
|
|
- version
|
|
|
|
==============================================================
|
|
@@ -402,6 +405,16 @@ kernel stack.
|
|
|
|
==============================================================
|
|
|
|
+iso_cpu: (MuQSS CPU scheduler only).
|
|
+
|
|
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
|
|
+run effectively at realtime priority, averaged over a rolling five
|
|
+seconds over the -whole- system, meaning all cpus.
|
|
+
|
|
+Set to 70 (percent) by default.
|
|
+
|
|
+==============================================================
|
|
+
|
|
l2cr: (PPC only)
|
|
|
|
This flag controls the L2 cache of G3 processor boards. If
|
|
@@ -818,6 +831,20 @@ rebooting. ???
|
|
|
|
==============================================================
|
|
|
|
+rr_interval: (MuQSS CPU scheduler only)
|
|
+
|
|
+This is the smallest duration that any cpu process scheduling unit
|
|
+will run for. Increasing this value can increase throughput of cpu
|
|
+bound tasks substantially but at the expense of increased latencies
|
|
+overall. Conversely decreasing it will decrease average and maximum
|
|
+latencies but at the expense of throughput. This value is in
|
|
+milliseconds and the default value chosen depends on the number of
|
|
+cpus available at scheduler initialisation with a minimum of 6.
|
|
+
|
|
+Valid values are from 1-1000.
|
|
+
|
|
+==============================================================
|
|
+
|
|
rtsig-max & rtsig-nr:
|
|
|
|
The file rtsig-max can be used to tune the maximum number
|
|
@@ -1056,3 +1083,13 @@ The softlockup threshold is (2 * watchdog_thresh). Setting this
|
|
tunable to zero will disable lockup detection altogether.
|
|
|
|
==============================================================
|
|
+
|
|
+yield_type: (MuQSS CPU scheduler only)
|
|
+
|
|
+This determines what type of yield calls to sched_yield will perform.
|
|
+
|
|
+ 0: No yield.
|
|
+ 1: Yield only to better priority/deadline tasks. (default)
|
|
+ 2: Expire timeslice and recalculate deadline.
|
|
+
|
|
+==============================================================
|
|
diff --git a/Makefile b/Makefile
|
|
index 8585e4e..f6df014 100644
|
|
--- a/Makefile
|
|
+++ b/Makefile
|
|
@@ -1,7 +1,7 @@
|
|
VERSION = 4
|
|
PATCHLEVEL = 9
|
|
-SUBLEVEL = 9
|
|
-EXTRAVERSION =
|
|
+SUBLEVEL = 0
|
|
+EXTRAVERSION = -pf6
|
|
NAME = Roaring Lionus
|
|
|
|
# *DOCUMENTATION*
|
|
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
|
|
index 460f5f3..eeb3e32 100644
|
|
--- a/arch/powerpc/platforms/cell/spufs/sched.c
|
|
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
|
|
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
|
|
static struct timer_list spuloadavg_timer;
|
|
|
|
/*
|
|
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
|
|
- */
|
|
-#define NORMAL_PRIO 120
|
|
-
|
|
-/*
|
|
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
|
|
* tick for every 10 CPU scheduler ticks.
|
|
*/
|
|
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
|
|
index d666ef8..cd1b67c 100644
|
|
--- a/arch/x86/Kconfig
|
|
+++ b/arch/x86/Kconfig
|
|
@@ -926,10 +926,26 @@ config SCHED_SMT
|
|
depends on SMP
|
|
---help---
|
|
SMT scheduler support improves the CPU scheduler's decision making
|
|
- when dealing with Intel Pentium 4 chips with HyperThreading at a
|
|
+ when dealing with Intel P4/Core 2 chips with HyperThreading at a
|
|
cost of slightly increased overhead in some places. If unsure say
|
|
N here.
|
|
|
|
+config SMT_NICE
|
|
+ bool "SMT (Hyperthreading) aware nice priority and policy support"
|
|
+ depends on SCHED_MUQSS && SCHED_SMT
|
|
+ default y
|
|
+ ---help---
|
|
+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness
|
|
+ of the use of 'nice' levels and different scheduling policies
|
|
+ (e.g. realtime) due to sharing of CPU power between hyperthreads.
|
|
+ SMT nice support makes each logical CPU aware of what is running on
|
|
+ its hyperthread siblings, maintaining appropriate distribution of
|
|
+ CPU according to nice levels and scheduling policies at the expense
|
|
+ of slightly increased overhead.
|
|
+
|
|
+ If unsure say Y here.
|
|
+
|
|
+
|
|
config SCHED_MC
|
|
def_bool y
|
|
prompt "Multi-core scheduler support"
|
|
@@ -1295,7 +1311,7 @@ config HIGHMEM64G
|
|
endchoice
|
|
|
|
choice
|
|
- prompt "Memory split" if EXPERT
|
|
+ prompt "Memory split"
|
|
default VMSPLIT_3G
|
|
depends on X86_32
|
|
---help---
|
|
@@ -1315,17 +1331,17 @@ choice
|
|
option alone!
|
|
|
|
config VMSPLIT_3G
|
|
- bool "3G/1G user/kernel split"
|
|
+ bool "Default 896MB lowmem (3G/1G user/kernel split)"
|
|
config VMSPLIT_3G_OPT
|
|
depends on !X86_PAE
|
|
- bool "3G/1G user/kernel split (for full 1G low memory)"
|
|
+ bool "1GB lowmem (3G/1G user/kernel split)"
|
|
config VMSPLIT_2G
|
|
- bool "2G/2G user/kernel split"
|
|
+ bool "2GB lowmem (2G/2G user/kernel split)"
|
|
config VMSPLIT_2G_OPT
|
|
depends on !X86_PAE
|
|
- bool "2G/2G user/kernel split (for full 2G low memory)"
|
|
+ bool "2GB lowmem (2G/2G user/kernel split)"
|
|
config VMSPLIT_1G
|
|
- bool "1G/3G user/kernel split"
|
|
+ bool "3GB lowmem (1G/3G user/kernel split)"
|
|
endchoice
|
|
|
|
config PAGE_OFFSET
|
|
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
|
|
index 3ba5ff2..aecb337 100644
|
|
--- a/arch/x86/Kconfig.cpu
|
|
+++ b/arch/x86/Kconfig.cpu
|
|
@@ -115,6 +115,7 @@ config MPENTIUMM
|
|
config MPENTIUM4
|
|
bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
|
|
depends on X86_32
|
|
+ select X86_P6_NOP
|
|
---help---
|
|
Select this for Intel Pentium 4 chips. This includes the
|
|
Pentium 4, Pentium D, P4-based Celeron and Xeon, and
|
|
@@ -147,9 +148,8 @@ config MPENTIUM4
|
|
-Paxville
|
|
-Dempsey
|
|
|
|
-
|
|
config MK6
|
|
- bool "K6/K6-II/K6-III"
|
|
+ bool "AMD K6/K6-II/K6-III"
|
|
depends on X86_32
|
|
---help---
|
|
Select this for an AMD K6-family processor. Enables use of
|
|
@@ -157,7 +157,7 @@ config MK6
|
|
flags to GCC.
|
|
|
|
config MK7
|
|
- bool "Athlon/Duron/K7"
|
|
+ bool "AMD Athlon/Duron/K7"
|
|
depends on X86_32
|
|
---help---
|
|
Select this for an AMD Athlon K7-family processor. Enables use of
|
|
@@ -165,12 +165,83 @@ config MK7
|
|
flags to GCC.
|
|
|
|
config MK8
|
|
- bool "Opteron/Athlon64/Hammer/K8"
|
|
+ bool "AMD Opteron/Athlon64/Hammer/K8"
|
|
---help---
|
|
Select this for an AMD Opteron or Athlon64 Hammer-family processor.
|
|
Enables use of some extended instructions, and passes appropriate
|
|
optimization flags to GCC.
|
|
|
|
+config MK8SSE3
|
|
+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
|
|
+ ---help---
|
|
+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
|
|
+ Enables use of some extended instructions, and passes appropriate
|
|
+ optimization flags to GCC.
|
|
+
|
|
+config MK10
|
|
+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
|
|
+ ---help---
|
|
+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
|
|
+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
|
|
+ Enables use of some extended instructions, and passes appropriate
|
|
+ optimization flags to GCC.
|
|
+
|
|
+config MBARCELONA
|
|
+ bool "AMD Barcelona"
|
|
+ ---help---
|
|
+ Select this for AMD Family 10h Barcelona processors.
|
|
+
|
|
+ Enables -march=barcelona
|
|
+
|
|
+config MBOBCAT
|
|
+ bool "AMD Bobcat"
|
|
+ ---help---
|
|
+ Select this for AMD Family 14h Bobcat processors.
|
|
+
|
|
+ Enables -march=btver1
|
|
+
|
|
+config MJAGUAR
|
|
+ bool "AMD Jaguar"
|
|
+ ---help---
|
|
+ Select this for AMD Family 16h Jaguar processors.
|
|
+
|
|
+ Enables -march=btver2
|
|
+
|
|
+config MBULLDOZER
|
|
+ bool "AMD Bulldozer"
|
|
+ ---help---
|
|
+ Select this for AMD Family 15h Bulldozer processors.
|
|
+
|
|
+ Enables -march=bdver1
|
|
+
|
|
+config MPILEDRIVER
|
|
+ bool "AMD Piledriver"
|
|
+ ---help---
|
|
+ Select this for AMD Family 15h Piledriver processors.
|
|
+
|
|
+ Enables -march=bdver2
|
|
+
|
|
+config MSTEAMROLLER
|
|
+ bool "AMD Steamroller"
|
|
+ ---help---
|
|
+ Select this for AMD Family 15h Steamroller processors.
|
|
+
|
|
+ Enables -march=bdver3
|
|
+
|
|
+config MEXCAVATOR
|
|
+ bool "AMD Excavator"
|
|
+ ---help---
|
|
+ Select this for AMD Family 15h Excavator processors.
|
|
+
|
|
+ Enables -march=bdver4
|
|
+
|
|
+config MZEN
|
|
+ bool "AMD Zen"
|
|
+ ---help---
|
|
+ Select this for AMD Family 17h Zen processors.
|
|
+
|
|
+ Enables -march=znver1
|
|
+
|
|
config MCRUSOE
|
|
bool "Crusoe"
|
|
depends on X86_32
|
|
@@ -252,6 +323,7 @@ config MVIAC7
|
|
|
|
config MPSC
|
|
bool "Intel P4 / older Netburst based Xeon"
|
|
+ select X86_P6_NOP
|
|
depends on X86_64
|
|
---help---
|
|
Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
|
|
@@ -261,8 +333,19 @@ config MPSC
|
|
using the cpu family field
|
|
in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
|
|
|
|
+config MATOM
|
|
+ bool "Intel Atom"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for the Intel Atom platform. Intel Atom CPUs have an
|
|
+ in-order pipelining architecture and thus can benefit from
|
|
+ accordingly optimized code. Use a recent GCC with specific Atom
|
|
+ support in order to fully benefit from selecting this option.
|
|
+
|
|
config MCORE2
|
|
- bool "Core 2/newer Xeon"
|
|
+ bool "Intel Core 2"
|
|
+ select X86_P6_NOP
|
|
---help---
|
|
|
|
Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
|
|
@@ -270,14 +353,79 @@ config MCORE2
|
|
family in /proc/cpuinfo. Newer ones have 6 and older ones 15
|
|
(not a typo)
|
|
|
|
-config MATOM
|
|
- bool "Intel Atom"
|
|
+ Enables -march=core2
|
|
+
|
|
+config MNEHALEM
|
|
+ bool "Intel Nehalem"
|
|
+ select X86_P6_NOP
|
|
---help---
|
|
|
|
- Select this for the Intel Atom platform. Intel Atom CPUs have an
|
|
- in-order pipelining architecture and thus can benefit from
|
|
- accordingly optimized code. Use a recent GCC with specific Atom
|
|
- support in order to fully benefit from selecting this option.
|
|
+ Select this for 1st Gen Core processors in the Nehalem family.
|
|
+
|
|
+ Enables -march=nehalem
|
|
+
|
|
+config MWESTMERE
|
|
+ bool "Intel Westmere"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for the Intel Westmere formerly Nehalem-C family.
|
|
+
|
|
+ Enables -march=westmere
|
|
+
|
|
+config MSILVERMONT
|
|
+ bool "Intel Silvermont"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for the Intel Silvermont platform.
|
|
+
|
|
+ Enables -march=silvermont
|
|
+
|
|
+config MSANDYBRIDGE
|
|
+ bool "Intel Sandy Bridge"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for 2nd Gen Core processors in the Sandy Bridge family.
|
|
+
|
|
+ Enables -march=sandybridge
|
|
+
|
|
+config MIVYBRIDGE
|
|
+ bool "Intel Ivy Bridge"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for 3rd Gen Core processors in the Ivy Bridge family.
|
|
+
|
|
+ Enables -march=ivybridge
|
|
+
|
|
+config MHASWELL
|
|
+ bool "Intel Haswell"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for 4th Gen Core processors in the Haswell family.
|
|
+
|
|
+ Enables -march=haswell
|
|
+
|
|
+config MBROADWELL
|
|
+ bool "Intel Broadwell"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for 5th Gen Core processors in the Broadwell family.
|
|
+
|
|
+ Enables -march=broadwell
|
|
+
|
|
+config MSKYLAKE
|
|
+ bool "Intel Skylake"
|
|
+ select X86_P6_NOP
|
|
+ ---help---
|
|
+
|
|
+ Select this for 6th Gen Core processors in the Skylake family.
|
|
+
|
|
+ Enables -march=skylake
|
|
|
|
config GENERIC_CPU
|
|
bool "Generic-x86-64"
|
|
@@ -286,6 +434,19 @@ config GENERIC_CPU
|
|
Generic x86-64 CPU.
|
|
Run equally well on all x86-64 CPUs.
|
|
|
|
+config MNATIVE
|
|
+ bool "Native optimizations autodetected by GCC"
|
|
+ ---help---
|
|
+
|
|
+ GCC 4.2 and above support -march=native, which automatically detects
|
|
+ the optimum settings to use based on your processor. -march=native
|
|
+ also detects and applies additional settings beyond -march specific
|
|
+ to your CPU (e.g. -msse4). Unless you have a specific reason not to
|
|
+ (e.g. distcc cross-compiling), you should probably be using
|
|
+ -march=native rather than anything listed below.
|
|
+
|
|
+ Enables -march=native
|
|
+
|
|
endchoice
|
|
|
|
config X86_GENERIC
|
|
@@ -310,7 +471,7 @@ config X86_INTERNODE_CACHE_SHIFT
|
|
config X86_L1_CACHE_SHIFT
|
|
int
|
|
default "7" if MPENTIUM4 || MPSC
|
|
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
|
|
+ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
|
|
default "4" if MELAN || M486 || MGEODEGX1
|
|
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
|
|
|
|
@@ -341,45 +502,46 @@ config X86_ALIGNMENT_16
|
|
|
|
config X86_INTEL_USERCOPY
|
|
def_bool y
|
|
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
|
|
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE
|
|
|
|
config X86_USE_PPRO_CHECKSUM
|
|
def_bool y
|
|
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
|
|
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE
|
|
|
|
config X86_USE_3DNOW
|
|
def_bool y
|
|
depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
|
|
|
|
-#
|
|
-# P6_NOPs are a relatively minor optimization that require a family >=
|
|
-# 6 processor, except that it is broken on certain VIA chips.
|
|
-# Furthermore, AMD chips prefer a totally different sequence of NOPs
|
|
-# (which work on all CPUs). In addition, it looks like Virtual PC
|
|
-# does not understand them.
|
|
-#
|
|
-# As a result, disallow these if we're not compiling for X86_64 (these
|
|
-# NOPs do work on all x86-64 capable chips); the list of processors in
|
|
-# the right-hand clause are the cores that benefit from this optimization.
|
|
-#
|
|
config X86_P6_NOP
|
|
- def_bool y
|
|
- depends on X86_64
|
|
- depends on (MCORE2 || MPENTIUM4 || MPSC)
|
|
+ default n
|
|
+ bool "Support for P6_NOPs on Intel chips"
|
|
+ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE)
|
|
+ ---help---
|
|
+ P6_NOPs are a relatively minor optimization that require a family >=
|
|
+ 6 processor, except that it is broken on certain VIA chips.
|
|
+ Furthermore, AMD chips prefer a totally different sequence of NOPs
|
|
+ (which work on all CPUs). In addition, it looks like Virtual PC
|
|
+ does not understand them.
|
|
+
|
|
+ As a result, disallow these if we're not compiling for X86_64 (these
|
|
+ NOPs do work on all x86-64 capable chips); the list of processors in
|
|
+ the right-hand clause are the cores that benefit from this optimization.
|
|
+
|
|
+ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise.
|
|
|
|
config X86_TSC
|
|
def_bool y
|
|
- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
|
|
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64
|
|
|
|
config X86_CMPXCHG64
|
|
def_bool y
|
|
- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
|
|
+ depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
|
|
|
|
# this should be set for all -march=.. options where the compiler
|
|
# generates cmov.
|
|
config X86_CMOV
|
|
def_bool y
|
|
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
|
|
+ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
|
|
|
|
config X86_MINIMUM_CPU_FAMILY
|
|
int
|
|
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
|
|
index 2d44933..344075c 100644
|
|
--- a/arch/x86/Makefile
|
|
+++ b/arch/x86/Makefile
|
|
@@ -104,13 +104,40 @@ else
|
|
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
|
|
|
|
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
|
|
+ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
|
|
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
|
|
+ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8)
|
|
+ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
|
|
+ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
|
|
+ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
|
|
+ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
|
|
+ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
|
|
+ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
|
|
+ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3)
|
|
+ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4)
|
|
+ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1)
|
|
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
|
|
|
|
cflags-$(CONFIG_MCORE2) += \
|
|
- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
|
|
- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
|
|
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
|
|
+ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
|
|
+ cflags-$(CONFIG_MNEHALEM) += \
|
|
+ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem))
|
|
+ cflags-$(CONFIG_MWESTMERE) += \
|
|
+ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere))
|
|
+ cflags-$(CONFIG_MSILVERMONT) += \
|
|
+ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont))
|
|
+ cflags-$(CONFIG_MSANDYBRIDGE) += \
|
|
+ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge))
|
|
+ cflags-$(CONFIG_MIVYBRIDGE) += \
|
|
+ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge))
|
|
+ cflags-$(CONFIG_MHASWELL) += \
|
|
+ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell))
|
|
+ cflags-$(CONFIG_MBROADWELL) += \
|
|
+ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell))
|
|
+ cflags-$(CONFIG_MSKYLAKE) += \
|
|
+ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake))
|
|
+ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \
|
|
+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
|
|
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
|
|
KBUILD_CFLAGS += $(cflags-y)
|
|
|
|
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
|
|
index 6647ed4..765a71e 100644
|
|
--- a/arch/x86/Makefile_32.cpu
|
|
+++ b/arch/x86/Makefile_32.cpu
|
|
@@ -23,7 +23,18 @@ cflags-$(CONFIG_MK6) += -march=k6
|
|
# Please note, that patches that add -march=athlon-xp and friends are pointless.
|
|
# They make zero difference whatsosever to performance at this time.
|
|
cflags-$(CONFIG_MK7) += -march=athlon
|
|
+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
|
|
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
|
|
+cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon)
|
|
+cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon)
|
|
+cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon)
|
|
+cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon)
|
|
+cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon)
|
|
+cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon)
|
|
+cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon)
|
|
+cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon)
|
|
+cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon)
|
|
+cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon)
|
|
cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
|
|
cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
|
|
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
|
|
@@ -32,8 +43,16 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-f
|
|
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
|
|
cflags-$(CONFIG_MVIAC7) += -march=i686
|
|
cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
|
|
-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
|
|
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
|
|
+cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem)
|
|
+cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere)
|
|
+cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont)
|
|
+cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge)
|
|
+cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge)
|
|
+cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell)
|
|
+cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell)
|
|
+cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake)
|
|
+cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \
|
|
+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
|
|
|
|
# AMD Elan support
|
|
cflags-$(CONFIG_MELAN) += -march=i486
|
|
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
|
|
index e3b7819..2543bde 100644
|
|
--- a/arch/x86/include/asm/module.h
|
|
+++ b/arch/x86/include/asm/module.h
|
|
@@ -15,6 +15,24 @@
|
|
#define MODULE_PROC_FAMILY "586MMX "
|
|
#elif defined CONFIG_MCORE2
|
|
#define MODULE_PROC_FAMILY "CORE2 "
|
|
+#elif defined CONFIG_MNATIVE
|
|
+#define MODULE_PROC_FAMILY "NATIVE "
|
|
+#elif defined CONFIG_MNEHALEM
|
|
+#define MODULE_PROC_FAMILY "NEHALEM "
|
|
+#elif defined CONFIG_MWESTMERE
|
|
+#define MODULE_PROC_FAMILY "WESTMERE "
|
|
+#elif defined CONFIG_MSILVERMONT
|
|
+#define MODULE_PROC_FAMILY "SILVERMONT "
|
|
+#elif defined CONFIG_MSANDYBRIDGE
|
|
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
|
|
+#elif defined CONFIG_MIVYBRIDGE
|
|
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
|
|
+#elif defined CONFIG_MHASWELL
|
|
+#define MODULE_PROC_FAMILY "HASWELL "
|
|
+#elif defined CONFIG_MBROADWELL
|
|
+#define MODULE_PROC_FAMILY "BROADWELL "
|
|
+#elif defined CONFIG_MSKYLAKE
|
|
+#define MODULE_PROC_FAMILY "SKYLAKE "
|
|
#elif defined CONFIG_MATOM
|
|
#define MODULE_PROC_FAMILY "ATOM "
|
|
#elif defined CONFIG_M686
|
|
@@ -33,6 +51,26 @@
|
|
#define MODULE_PROC_FAMILY "K7 "
|
|
#elif defined CONFIG_MK8
|
|
#define MODULE_PROC_FAMILY "K8 "
|
|
+#elif defined CONFIG_MK8SSE3
|
|
+#define MODULE_PROC_FAMILY "K8SSE3 "
|
|
+#elif defined CONFIG_MK10
|
|
+#define MODULE_PROC_FAMILY "K10 "
|
|
+#elif defined CONFIG_MBARCELONA
|
|
+#define MODULE_PROC_FAMILY "BARCELONA "
|
|
+#elif defined CONFIG_MBOBCAT
|
|
+#define MODULE_PROC_FAMILY "BOBCAT "
|
|
+#elif defined CONFIG_MBULLDOZER
|
|
+#define MODULE_PROC_FAMILY "BULLDOZER "
|
|
+#elif defined CONFIG_MPILEDRIVER
|
|
+#define MODULE_PROC_FAMILY "STEAMROLLER "
|
|
+#elif defined CONFIG_MSTEAMROLLER
|
|
+#define MODULE_PROC_FAMILY "PILEDRIVER "
|
|
+#elif defined CONFIG_MJAGUAR
|
|
+#define MODULE_PROC_FAMILY "JAGUAR "
|
|
+#elif defined CONFIG_MEXCAVATOR
|
|
+#define MODULE_PROC_FAMILY "EXCAVATOR "
|
|
+#elif defined CONFIG_MZEN
|
|
+#define MODULE_PROC_FAMILY "ZEN "
|
|
#elif defined CONFIG_MELAN
|
|
#define MODULE_PROC_FAMILY "ELAN "
|
|
#elif defined CONFIG_MCRUSOE
|
|
diff --git a/block/Kconfig b/block/Kconfig
|
|
index 1d4d624..7c8523e 100644
|
|
--- a/block/Kconfig
|
|
+++ b/block/Kconfig
|
|
@@ -112,6 +112,30 @@ config BLK_CMDLINE_PARSER
|
|
|
|
See Documentation/block/cmdline-partition.txt for more information.
|
|
|
|
+config BLK_WBT
|
|
+ bool "Enable support for block device writeback throttling"
|
|
+ default n
|
|
+ ---help---
|
|
+ Enabling this option enables the block layer to throttle buffered
|
|
+ writeback from the VM, making it more smooth and having less
|
|
+ impact on foreground operations.
|
|
+
|
|
+config BLK_WBT_SQ
|
|
+ bool "Single queue writeback throttling"
|
|
+ default n
|
|
+ depends on BLK_WBT
|
|
+ ---help---
|
|
+ Enable writeback throttling by default on legacy single queue devices
|
|
+
|
|
+config BLK_WBT_MQ
|
|
+ bool "Multiqueue writeback throttling"
|
|
+ default y
|
|
+ depends on BLK_WBT
|
|
+ ---help---
|
|
+ Enable writeback throttling by default on multiqueue devices.
|
|
+ Multiqueue currently doesn't have support for IO scheduling,
|
|
+ enabling this option is recommended.
|
|
+
|
|
menu "Partition Types"
|
|
|
|
source "block/partitions/Kconfig"
|
|
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
|
|
index 421bef9..f2cd945 100644
|
|
--- a/block/Kconfig.iosched
|
|
+++ b/block/Kconfig.iosched
|
|
@@ -39,6 +39,25 @@ config CFQ_GROUP_IOSCHED
|
|
---help---
|
|
Enable group IO scheduling in CFQ.
|
|
|
|
+config IOSCHED_BFQ
|
|
+ tristate "BFQ I/O scheduler"
|
|
+ default n
|
|
+ ---help---
|
|
+ The BFQ I/O scheduler distributes bandwidth among all
|
|
+ processes according to their weights, regardless of the
|
|
+ device parameters and with any workload. It also guarantees
|
|
+ a low latency to interactive and soft real-time applications.
|
|
+ Details in Documentation/block/bfq-iosched.txt
|
|
+
|
|
+config BFQ_GROUP_IOSCHED
|
|
+ bool "BFQ hierarchical scheduling support"
|
|
+ depends on IOSCHED_BFQ && BLK_CGROUP
|
|
+ default n
|
|
+ ---help---
|
|
+
|
|
+ Enable hierarchical scheduling in BFQ, using the blkio
|
|
+ (cgroups-v1) or io (cgroups-v2) controller.
|
|
+
|
|
choice
|
|
prompt "Default I/O scheduler"
|
|
default DEFAULT_CFQ
|
|
@@ -52,6 +71,16 @@ choice
|
|
config DEFAULT_CFQ
|
|
bool "CFQ" if IOSCHED_CFQ=y
|
|
|
|
+ config DEFAULT_BFQ
|
|
+ bool "BFQ" if IOSCHED_BFQ=y
|
|
+ help
|
|
+ Selects BFQ as the default I/O scheduler which will be
|
|
+ used by default for all block devices.
|
|
+ The BFQ I/O scheduler aims at distributing the bandwidth
|
|
+ as desired, independently of the disk parameters and with
|
|
+ any workload. It also tries to guarantee low latency to
|
|
+ interactive and soft real-time applications.
|
|
+
|
|
config DEFAULT_NOOP
|
|
bool "No-op"
|
|
|
|
@@ -61,6 +90,7 @@ config DEFAULT_IOSCHED
|
|
string
|
|
default "deadline" if DEFAULT_DEADLINE
|
|
default "cfq" if DEFAULT_CFQ
|
|
+ default "bfq" if DEFAULT_BFQ
|
|
default "noop" if DEFAULT_NOOP
|
|
|
|
endmenu
|
|
diff --git a/block/Makefile b/block/Makefile
|
|
index 36acdd7..5709f59 100644
|
|
--- a/block/Makefile
|
|
+++ b/block/Makefile
|
|
@@ -5,7 +5,7 @@
|
|
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
|
|
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
|
|
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
|
|
- blk-lib.o blk-mq.o blk-mq-tag.o \
|
|
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
|
|
blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
|
|
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
|
|
badblocks.o partitions/
|
|
@@ -18,8 +18,10 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
|
|
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
|
|
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
|
|
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
|
|
+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
|
|
|
|
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
|
|
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
|
|
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
|
|
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
|
|
+obj-$(CONFIG_BLK_WBT) += blk-wbt.o
|
|
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
|
|
new file mode 100644
|
|
index 0000000..106abff
|
|
--- /dev/null
|
|
+++ b/block/bfq-cgroup.c
|
|
@@ -0,0 +1,1213 @@
|
|
+/*
|
|
+ * BFQ: CGROUPS support.
|
|
+ *
|
|
+ * Based on ideas and code from CFQ:
|
|
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
|
|
+ *
|
|
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
|
|
+ * Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
|
|
+ *
|
|
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
|
|
+ * file.
|
|
+ */
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+
|
|
+/* bfqg stats flags */
|
|
+enum bfqg_stats_flags {
|
|
+ BFQG_stats_waiting = 0,
|
|
+ BFQG_stats_idling,
|
|
+ BFQG_stats_empty,
|
|
+};
|
|
+
|
|
+#define BFQG_FLAG_FNS(name) \
|
|
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
|
|
+{ \
|
|
+ stats->flags |= (1 << BFQG_stats_##name); \
|
|
+} \
|
|
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
|
|
+{ \
|
|
+ stats->flags &= ~(1 << BFQG_stats_##name); \
|
|
+} \
|
|
+static int bfqg_stats_##name(struct bfqg_stats *stats) \
|
|
+{ \
|
|
+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
|
|
+} \
|
|
+
|
|
+BFQG_FLAG_FNS(waiting)
|
|
+BFQG_FLAG_FNS(idling)
|
|
+BFQG_FLAG_FNS(empty)
|
|
+#undef BFQG_FLAG_FNS
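+/*
+ * Each BFQG_FLAG_FNS(foo) invocation above expands to three small helpers,
+ * bfqg_stats_mark_foo(), bfqg_stats_clear_foo() and bfqg_stats_foo(), which
+ * set, clear and test the corresponding BFQG_stats_foo bit in stats->flags.
+ */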
|
|
+
|
|
+/* This should be called with the queue_lock held. */
|
|
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
|
|
+{
|
|
+ unsigned long long now;
|
|
+
|
|
+ if (!bfqg_stats_waiting(stats))
|
|
+ return;
|
|
+
|
|
+ now = sched_clock();
|
|
+ if (time_after64(now, stats->start_group_wait_time))
|
|
+ blkg_stat_add(&stats->group_wait_time,
|
|
+ now - stats->start_group_wait_time);
|
|
+ bfqg_stats_clear_waiting(stats);
|
|
+}
|
|
+
|
|
+/* This should be called with the queue_lock held. */
|
|
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
|
|
+ struct bfq_group *curr_bfqg)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+
|
|
+ if (bfqg_stats_waiting(stats))
|
|
+ return;
|
|
+ if (bfqg == curr_bfqg)
|
|
+ return;
|
|
+ stats->start_group_wait_time = sched_clock();
|
|
+ bfqg_stats_mark_waiting(stats);
|
|
+}
|
|
+
|
|
+/* This should be called with the queue_lock held. */
|
|
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
|
|
+{
|
|
+ unsigned long long now;
|
|
+
|
|
+ if (!bfqg_stats_empty(stats))
|
|
+ return;
|
|
+
|
|
+ now = sched_clock();
|
|
+ if (time_after64(now, stats->start_empty_time))
|
|
+ blkg_stat_add(&stats->empty_time,
|
|
+ now - stats->start_empty_time);
|
|
+ bfqg_stats_clear_empty(stats);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
|
|
+{
|
|
+ blkg_stat_add(&bfqg->stats.dequeue, 1);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+
|
|
+ if (blkg_rwstat_total(&stats->queued))
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * group is already marked empty. This can happen if bfqq got new
|
|
+ * request in parent group and moved to this group while being added
|
|
+ * to service tree. Just ignore the event and move on.
|
|
+ */
|
|
+ if (bfqg_stats_empty(stats))
|
|
+ return;
|
|
+
|
|
+ stats->start_empty_time = sched_clock();
|
|
+ bfqg_stats_mark_empty(stats);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+
|
|
+ if (bfqg_stats_idling(stats)) {
|
|
+ unsigned long long now = sched_clock();
|
|
+
|
|
+ if (time_after64(now, stats->start_idle_time))
|
|
+ blkg_stat_add(&stats->idle_time,
|
|
+ now - stats->start_idle_time);
|
|
+ bfqg_stats_clear_idling(stats);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+
|
|
+ stats->start_idle_time = sched_clock();
|
|
+ bfqg_stats_mark_idling(stats);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+
|
|
+ blkg_stat_add(&stats->avg_queue_size_sum,
|
|
+ blkg_rwstat_total(&stats->queued));
|
|
+ blkg_stat_add(&stats->avg_queue_size_samples, 1);
|
|
+ bfqg_stats_update_group_wait_time(stats);
|
|
+}
|
|
+
|
|
+static struct blkcg_policy blkcg_policy_bfq;
|
|
+
|
|
+/*
|
|
+ * blk-cgroup policy-related handlers
|
|
+ * The following functions help in converting between blk-cgroup
|
|
+ * internal structures and BFQ-specific structures.
|
|
+ */
|
|
+
|
|
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
|
|
+{
|
|
+ return pd ? container_of(pd, struct bfq_group, pd) : NULL;
|
|
+}
|
|
+
|
|
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
|
|
+{
|
|
+ return pd_to_blkg(&bfqg->pd);
|
|
+}
|
|
+
|
|
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
|
|
+{
|
|
+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
|
|
+
|
|
+ return pd_to_bfqg(pd);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bfq_group handlers
|
|
+ * The following functions help in navigating the bfq_group hierarchy
|
|
+ * by allowing to find the parent of a bfq_group or the bfq_group
|
|
+ * associated to a bfq_queue.
|
|
+ */
|
|
+
|
|
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
|
|
+
|
|
+ return pblkg ? blkg_to_bfqg(pblkg) : NULL;
|
|
+}
|
|
+
|
|
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *group_entity = bfqq->entity.parent;
|
|
+
|
|
+ return group_entity ? container_of(group_entity, struct bfq_group,
|
|
+ entity) :
|
|
+ bfqq->bfqd->root_group;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The following two functions handle get and put of a bfq_group by
|
|
+ * wrapping the related blk-cgroup hooks.
|
|
+ */
|
|
+
|
|
+static void bfqg_get(struct bfq_group *bfqg)
|
|
+{
|
|
+ return blkg_get(bfqg_to_blkg(bfqg));
|
|
+}
|
|
+
|
|
+static void bfqg_put(struct bfq_group *bfqg)
|
|
+{
|
|
+ return blkg_put(bfqg_to_blkg(bfqg));
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
|
|
+ struct bfq_queue *bfqq,
|
|
+ int op, int op_flags)
|
|
+{
|
|
+ blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
|
|
+ bfqg_stats_end_empty_time(&bfqg->stats);
|
|
+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
|
|
+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
|
|
+ int op_flags)
|
|
+{
|
|
+ blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op,
|
|
+ int op_flags)
|
|
+{
|
|
+ blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
|
|
+ uint64_t start_time, uint64_t io_start_time, int op,
|
|
+ int op_flags)
|
|
+{
|
|
+ struct bfqg_stats *stats = &bfqg->stats;
|
|
+ unsigned long long now = sched_clock();
|
|
+
|
|
+ if (time_after64(now, io_start_time))
|
|
+ blkg_rwstat_add(&stats->service_time, op, op_flags,
|
|
+ now - io_start_time);
|
|
+ if (time_after64(io_start_time, start_time))
|
|
+ blkg_rwstat_add(&stats->wait_time, op, op_flags,
|
|
+ io_start_time - start_time);
|
|
+}
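+
+/*
+ * Worked example with made-up timestamps: if a request arrives at
+ * t = 10 ms (start_time), is dispatched to the device at t = 14 ms
+ * (io_start_time) and completes now, at t = 19 ms, then the code above
+ * charges 5 ms to service_time and 4 ms to wait_time. The
+ * time_after64() checks only guard against clock anomalies that would
+ * otherwise produce negative (wrapped) deltas.
+ */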
|
|
+
|
|
+/* @stats = 0 */
|
|
+static void bfqg_stats_reset(struct bfqg_stats *stats)
|
|
+{
|
|
+ /* queued stats shouldn't be cleared */
|
|
+ blkg_rwstat_reset(&stats->merged);
|
|
+ blkg_rwstat_reset(&stats->service_time);
|
|
+ blkg_rwstat_reset(&stats->wait_time);
|
|
+ blkg_stat_reset(&stats->time);
|
|
+ blkg_stat_reset(&stats->avg_queue_size_sum);
|
|
+ blkg_stat_reset(&stats->avg_queue_size_samples);
|
|
+ blkg_stat_reset(&stats->dequeue);
|
|
+ blkg_stat_reset(&stats->group_wait_time);
|
|
+ blkg_stat_reset(&stats->idle_time);
|
|
+ blkg_stat_reset(&stats->empty_time);
|
|
+}
|
|
+
|
|
+/* @to += @from */
|
|
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
|
|
+{
|
|
+ if (!to || !from)
|
|
+ return;
|
|
+
|
|
+ /* queued stats shouldn't be cleared */
|
|
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
|
|
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
|
|
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
|
|
+	blkg_stat_add_aux(&to->time, &from->time);
|
|
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
|
|
+ blkg_stat_add_aux(&to->avg_queue_size_samples,
|
|
+ &from->avg_queue_size_samples);
|
|
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
|
|
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
|
|
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
|
|
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
|
|
+ * recursive stats can still account for the amount used by this bfqg after
|
|
+ * it's gone.
|
|
+ */
|
|
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfq_group *parent;
|
|
+
|
|
+ if (!bfqg) /* root_group */
|
|
+ return;
|
|
+
|
|
+ parent = bfqg_parent(bfqg);
|
|
+
|
|
+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
|
|
+
|
|
+ if (unlikely(!parent))
|
|
+ return;
|
|
+
|
|
+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
|
|
+ bfqg_stats_reset(&bfqg->stats);
|
|
+}
|
|
+
|
|
+static void bfq_init_entity(struct bfq_entity *entity,
|
|
+ struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ entity->weight = entity->new_weight;
|
|
+ entity->orig_weight = entity->new_weight;
|
|
+ if (bfqq) {
|
|
+ bfqq->ioprio = bfqq->new_ioprio;
|
|
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
|
|
+ bfqg_get(bfqg);
|
|
+ }
|
|
+ entity->parent = bfqg->my_entity; /* NULL for root group */
|
|
+ entity->sched_data = &bfqg->sched_data;
|
|
+}
|
|
+
|
|
+static void bfqg_stats_exit(struct bfqg_stats *stats)
|
|
+{
|
|
+ blkg_rwstat_exit(&stats->merged);
|
|
+ blkg_rwstat_exit(&stats->service_time);
|
|
+ blkg_rwstat_exit(&stats->wait_time);
|
|
+ blkg_rwstat_exit(&stats->queued);
|
|
+ blkg_stat_exit(&stats->time);
|
|
+ blkg_stat_exit(&stats->avg_queue_size_sum);
|
|
+ blkg_stat_exit(&stats->avg_queue_size_samples);
|
|
+ blkg_stat_exit(&stats->dequeue);
|
|
+ blkg_stat_exit(&stats->group_wait_time);
|
|
+ blkg_stat_exit(&stats->idle_time);
|
|
+ blkg_stat_exit(&stats->empty_time);
|
|
+}
|
|
+
|
|
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
|
|
+{
|
|
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
|
|
+ blkg_rwstat_init(&stats->service_time, gfp) ||
|
|
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
|
|
+ blkg_rwstat_init(&stats->queued, gfp) ||
|
|
+ blkg_stat_init(&stats->time, gfp) ||
|
|
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
|
|
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
|
|
+ blkg_stat_init(&stats->dequeue, gfp) ||
|
|
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
|
|
+ blkg_stat_init(&stats->idle_time, gfp) ||
|
|
+ blkg_stat_init(&stats->empty_time, gfp)) {
|
|
+ bfqg_stats_exit(stats);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
|
|
+{
|
|
+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
|
|
+}
|
|
+
|
|
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
|
|
+{
|
|
+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
|
|
+}
|
|
+
|
|
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
|
|
+{
|
|
+ struct bfq_group_data *bgd;
|
|
+
|
|
+ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
|
|
+ if (!bgd)
|
|
+ return NULL;
|
|
+ return &bgd->pd;
|
|
+}
|
|
+
|
|
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
|
|
+{
|
|
+ struct bfq_group_data *d = cpd_to_bfqgd(cpd);
|
|
+
|
|
+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
|
|
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
|
|
+}
|
|
+
|
|
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
|
|
+{
|
|
+ kfree(cpd_to_bfqgd(cpd));
|
|
+}
|
|
+
|
|
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
|
|
+{
|
|
+ struct bfq_group *bfqg;
|
|
+
|
|
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
|
|
+ if (!bfqg)
|
|
+ return NULL;
|
|
+
|
|
+ if (bfqg_stats_init(&bfqg->stats, gfp)) {
|
|
+ kfree(bfqg);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ return &bfqg->pd;
|
|
+}
|
|
+
|
|
+static void bfq_pd_init(struct blkg_policy_data *pd)
|
|
+{
|
|
+ struct blkcg_gq *blkg;
|
|
+ struct bfq_group *bfqg;
|
|
+ struct bfq_data *bfqd;
|
|
+ struct bfq_entity *entity;
|
|
+ struct bfq_group_data *d;
|
|
+
|
|
+ blkg = pd_to_blkg(pd);
|
|
+ BUG_ON(!blkg);
|
|
+ bfqg = blkg_to_bfqg(blkg);
|
|
+ bfqd = blkg->q->elevator->elevator_data;
|
|
+ entity = &bfqg->entity;
|
|
+ d = blkcg_to_bfqgd(blkg->blkcg);
|
|
+
|
|
+ entity->orig_weight = entity->weight = entity->new_weight = d->weight;
|
|
+ entity->my_sched_data = &bfqg->sched_data;
|
|
+ bfqg->my_entity = entity; /*
|
|
+ * the root_group's will be set to NULL
|
|
+ * in bfq_init_queue()
|
|
+ */
|
|
+ bfqg->bfqd = bfqd;
|
|
+ bfqg->active_entities = 0;
|
|
+ bfqg->rq_pos_tree = RB_ROOT;
|
|
+}
|
|
+
|
|
+static void bfq_pd_free(struct blkg_policy_data *pd)
|
|
+{
|
|
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
|
|
+
|
|
+ bfqg_stats_exit(&bfqg->stats);
|
|
+ return kfree(bfqg);
|
|
+}
|
|
+
|
|
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
|
|
+{
|
|
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
|
|
+
|
|
+ bfqg_stats_reset(&bfqg->stats);
|
|
+}
|
|
+
|
|
+static void bfq_group_set_parent(struct bfq_group *bfqg,
|
|
+ struct bfq_group *parent)
|
|
+{
|
|
+ struct bfq_entity *entity;
|
|
+
|
|
+ BUG_ON(!parent);
|
|
+ BUG_ON(!bfqg);
|
|
+ BUG_ON(bfqg == parent);
|
|
+
|
|
+ entity = &bfqg->entity;
|
|
+ entity->parent = parent->my_entity;
|
|
+ entity->sched_data = &parent->sched_data;
|
|
+}
|
|
+
|
|
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
|
|
+ struct blkcg *blkcg)
|
|
+{
|
|
+ struct blkcg_gq *blkg;
|
|
+
|
|
+ blkg = blkg_lookup(blkcg, bfqd->queue);
|
|
+ if (likely(blkg))
|
|
+ return blkg_to_bfqg(blkg);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
|
|
+ struct blkcg *blkcg)
|
|
+{
|
|
+ struct bfq_group *bfqg, *parent;
|
|
+ struct bfq_entity *entity;
|
|
+
|
|
+ assert_spin_locked(bfqd->queue->queue_lock);
|
|
+
|
|
+ bfqg = bfq_lookup_bfqg(bfqd, blkcg);
|
|
+
|
|
+ if (unlikely(!bfqg))
|
|
+ return NULL;
|
|
+
|
|
+ /*
|
|
+ * Update chain of bfq_groups as we might be handling a leaf group
|
|
+ * which, along with some of its relatives, has not been hooked yet
|
|
+ * to the private hierarchy of BFQ.
|
|
+ */
|
|
+ entity = &bfqg->entity;
|
|
+ for_each_entity(entity) {
|
|
+ bfqg = container_of(entity, struct bfq_group, entity);
|
|
+ BUG_ON(!bfqg);
|
|
+ if (bfqg != bfqd->root_group) {
|
|
+ parent = bfqg_parent(bfqg);
|
|
+ if (!parent)
|
|
+ parent = bfqd->root_group;
|
|
+ BUG_ON(!parent);
|
|
+ bfq_group_set_parent(bfqg, parent);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return bfqg;
|
|
+}
|
|
+
|
|
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq);
|
|
+
|
|
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ bool compensate,
|
|
+ enum bfqq_expiration reason);
|
|
+
|
|
+/**
|
|
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
|
|
+ * @bfqd: queue descriptor.
|
|
+ * @bfqq: the queue to move.
|
|
+ * @bfqg: the group to move to.
|
|
+ *
|
|
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
|
|
+ * it on the new one. Avoid putting the entity on the old group idle tree.
|
|
+ *
|
|
+ * Must be called under the queue lock; the cgroup owning @bfqg must
|
|
+ * not disappear (by now this just means that we are called under
|
|
+ * rcu_read_lock()).
|
|
+ */
|
|
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
|
|
+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
|
|
+ && entity->on_st &&
|
|
+ bfqq != bfqd->in_service_queue);
|
|
+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
|
|
+
|
|
+ /* If bfqq is empty, then bfq_bfqq_expire also invokes
|
|
+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity
|
|
+ * from data structures related to current group. Otherwise we
|
|
+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
|
|
+ * we do below.
|
|
+ */
|
|
+ if (bfqq == bfqd->in_service_queue)
|
|
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
|
|
+ false, BFQ_BFQQ_PREEMPTED);
|
|
+
|
|
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
|
|
+ && &bfq_entity_service_tree(entity)->idle !=
|
|
+ entity->tree);
|
|
+
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
|
|
+
|
|
+ if (bfq_bfqq_busy(bfqq))
|
|
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
|
|
+ else if (entity->on_st) {
|
|
+ BUG_ON(&bfq_entity_service_tree(entity)->idle !=
|
|
+ entity->tree);
|
|
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
|
|
+ }
|
|
+ bfqg_put(bfqq_group(bfqq));
|
|
+
|
|
+ /*
|
|
+ * Here we use a reference to bfqg. We don't need a refcounter
|
|
+ * as the cgroup reference will not be dropped, so that its
|
|
+ * destroy() callback will not be invoked.
|
|
+ */
|
|
+ entity->parent = bfqg->my_entity;
|
|
+ entity->sched_data = &bfqg->sched_data;
|
|
+ bfqg_get(bfqg);
|
|
+
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
|
|
+ if (bfq_bfqq_busy(bfqq)) {
|
|
+ bfq_pos_tree_add_move(bfqd, bfqq);
|
|
+ bfq_activate_bfqq(bfqd, bfqq);
|
|
+ }
|
|
+
|
|
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
|
|
+ bfq_schedule_dispatch(bfqd);
|
|
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
|
|
+ && &bfq_entity_service_tree(entity)->idle !=
|
|
+ entity->tree);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
|
|
+ * @bfqd: the queue descriptor.
|
|
+ * @bic: the bic to move.
|
|
+ * @blkcg: the blk-cgroup to move to.
|
|
+ *
|
|
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
|
|
+ * has to make sure that the reference to cgroup is valid across the call.
|
|
+ *
|
|
+ * NOTE: an alternative approach might have been to store the current
|
|
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
|
|
+ * time here, at the price of slightly more complex code.
|
|
+ */
|
|
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
|
|
+ struct bfq_io_cq *bic,
|
|
+ struct blkcg *blkcg)
|
|
+{
|
|
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
|
|
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
|
|
+ struct bfq_group *bfqg;
|
|
+ struct bfq_entity *entity;
|
|
+
|
|
+ lockdep_assert_held(bfqd->queue->queue_lock);
|
|
+
|
|
+ bfqg = bfq_find_set_group(bfqd, blkcg);
|
|
+
|
|
+ if (unlikely(!bfqg))
|
|
+ bfqg = bfqd->root_group;
|
|
+
|
|
+ if (async_bfqq) {
|
|
+ entity = &async_bfqq->entity;
|
|
+
|
|
+ if (entity->sched_data != &bfqg->sched_data) {
|
|
+ bic_set_bfqq(bic, NULL, 0);
|
|
+ bfq_log_bfqq(bfqd, async_bfqq,
|
|
+ "bic_change_group: %p %d",
|
|
+ async_bfqq,
|
|
+ async_bfqq->ref);
|
|
+ bfq_put_queue(async_bfqq);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (sync_bfqq) {
|
|
+ entity = &sync_bfqq->entity;
|
|
+ if (entity->sched_data != &bfqg->sched_data)
|
|
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
|
|
+ }
|
|
+
|
|
+ return bfqg;
|
|
+}
|
|
+
|
|
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
|
|
+{
|
|
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
|
|
+ struct bfq_group *bfqg = NULL;
|
|
+ uint64_t serial_nr;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ serial_nr = bio_blkcg(bio)->css.serial_nr;
|
|
+
|
|
+ /*
|
|
+ * Check whether blkcg has changed. The condition may trigger
|
|
+	 * spuriously on a newly created bic but there's no harm.
|
|
+ */
|
|
+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
|
|
+ goto out;
|
|
+
|
|
+#ifdef CONFIG_BLK_WBT_SQ
|
|
+ /*
|
|
+ * If we have a non-root cgroup, we can depend on that to
|
|
+ * do proper throttling of writes. Turn off wbt for that
|
|
+ * case.
|
|
+ */
|
|
+ if (bio_blkcg(bio) != &blkcg_root) {
|
|
+ struct request_queue *q = bfqd->queue;
|
|
+
|
|
+ if (q->rq_wb)
|
|
+ wbt_disable(q->rq_wb);
|
|
+ }
|
|
+#endif /* CONFIG_BLK_WBT_SQ */
|
|
+
|
|
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
|
|
+ bic->blkcg_serial_nr = serial_nr;
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
|
|
+ * @st: the service tree being flushed.
|
|
+ */
|
|
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
|
|
+{
|
|
+ struct bfq_entity *entity = st->first_idle;
|
|
+
|
|
+ for (; entity ; entity = st->first_idle)
|
|
+ __bfq_deactivate_entity(entity, false);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
|
|
+ * @bfqd: the device data structure with the root group.
|
|
+ * @entity: the entity to move.
|
|
+ */
|
|
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ BUG_ON(!bfqq);
|
|
+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_reparent_active_entities - move to the root group all active
|
|
+ * entities.
|
|
+ * @bfqd: the device data structure with the root group.
|
|
+ * @bfqg: the group to move from.
|
|
+ * @st: the service tree with the entities.
|
|
+ *
|
|
+ * Needs queue_lock to be taken and reference to be valid over the call.
|
|
+ */
|
|
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
|
|
+ struct bfq_group *bfqg,
|
|
+ struct bfq_service_tree *st)
|
|
+{
|
|
+ struct rb_root *active = &st->active;
|
|
+ struct bfq_entity *entity = NULL;
|
|
+
|
|
+ if (!RB_EMPTY_ROOT(&st->active))
|
|
+ entity = bfq_entity_of(rb_first(active));
|
|
+
|
|
+ for (; entity ; entity = bfq_entity_of(rb_first(active)))
|
|
+ bfq_reparent_leaf_entity(bfqd, entity);
|
|
+
|
|
+ if (bfqg->sched_data.in_service_entity)
|
|
+ bfq_reparent_leaf_entity(bfqd,
|
|
+ bfqg->sched_data.in_service_entity);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_pd_offline - deactivate the entity associated with @pd,
|
|
+ * and reparent its children entities.
|
|
+ * @pd: descriptor of the policy going offline.
|
|
+ *
|
|
+ * blkio already grabs the queue_lock for us, so no need to use
|
|
+ * RCU-based magic
|
|
+ */
|
|
+static void bfq_pd_offline(struct blkg_policy_data *pd)
|
|
+{
|
|
+ struct bfq_service_tree *st;
|
|
+ struct bfq_group *bfqg;
|
|
+ struct bfq_data *bfqd;
|
|
+ struct bfq_entity *entity;
|
|
+ int i;
|
|
+
|
|
+ BUG_ON(!pd);
|
|
+ bfqg = pd_to_bfqg(pd);
|
|
+ BUG_ON(!bfqg);
|
|
+ bfqd = bfqg->bfqd;
|
|
+ BUG_ON(bfqd && !bfqd->root_group);
|
|
+
|
|
+ entity = bfqg->my_entity;
|
|
+
|
|
+ if (!entity) /* root group */
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Empty all service_trees belonging to this group before
|
|
+ * deactivating the group itself.
|
|
+ */
|
|
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
|
|
+ BUG_ON(!bfqg->sched_data.service_tree);
|
|
+ st = bfqg->sched_data.service_tree + i;
|
|
+ /*
|
|
+ * The idle tree may still contain bfq_queues belonging
|
|
+		 * to exited tasks because they never migrated to a different
|
|
+ * cgroup from the one being destroyed now. No one else
|
|
+ * can access them so it's safe to act without any lock.
|
|
+ */
|
|
+ bfq_flush_idle_tree(st);
|
|
+
|
|
+ /*
|
|
+ * It may happen that some queues are still active
|
|
+ * (busy) upon group destruction (if the corresponding
|
|
+ * processes have been forced to terminate). We move
|
|
+ * all the leaf entities corresponding to these queues
|
|
+ * to the root_group.
|
|
+ * Also, it may happen that the group has an entity
|
|
+ * in service, which is disconnected from the active
|
|
+ * tree: it must be moved, too.
|
|
+ * There is no need to put the sync queues, as the
|
|
+ * scheduler has taken no reference.
|
|
+ */
|
|
+ bfq_reparent_active_entities(bfqd, bfqg, st);
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
|
|
+ }
|
|
+ BUG_ON(bfqg->sched_data.next_in_service);
|
|
+ BUG_ON(bfqg->sched_data.in_service_entity);
|
|
+
|
|
+ __bfq_deactivate_entity(entity, false);
|
|
+ bfq_put_async_queues(bfqd, bfqg);
|
|
+ BUG_ON(entity->tree);
|
|
+
|
|
+ /*
|
|
+ * @blkg is going offline and will be ignored by
|
|
+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
|
|
+ * that they don't get lost. If IOs complete after this point, the
|
|
+ * stats for them will be lost. Oh well...
|
|
+ */
|
|
+ bfqg_stats_xfer_dead(bfqg);
|
|
+}
|
|
+
|
|
+static void bfq_end_wr_async(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct blkcg_gq *blkg;
|
|
+
|
|
+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
|
|
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
|
|
+ BUG_ON(!bfqg);
|
|
+
|
|
+ bfq_end_wr_async_queues(bfqd, bfqg);
|
|
+ }
|
|
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
|
|
+}
|
|
+
|
|
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
|
|
+{
|
|
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
|
|
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
|
|
+ unsigned int val = 0;
|
|
+
|
|
+ if (bfqgd)
|
|
+ val = bfqgd->weight;
|
|
+
|
|
+ seq_printf(sf, "%u\n", val);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
|
|
+ struct cftype *cftype,
|
|
+ u64 val)
|
|
+{
|
|
+ struct blkcg *blkcg = css_to_blkcg(css);
|
|
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
|
|
+ struct blkcg_gq *blkg;
|
|
+ int ret = -ERANGE;
|
|
+
|
|
+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
|
|
+ return ret;
|
|
+
|
|
+ ret = 0;
|
|
+ spin_lock_irq(&blkcg->lock);
|
|
+ bfqgd->weight = (unsigned short)val;
|
|
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
|
|
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
|
|
+
|
|
+ if (!bfqg)
|
|
+ continue;
|
|
+ /*
|
|
+ * Setting the prio_changed flag of the entity
|
|
+ * to 1 with new_weight == weight would re-set
|
|
+ * the value of the weight to its ioprio mapping.
|
|
+ * Set the flag only if necessary.
|
|
+ */
|
|
+ if ((unsigned short)val != bfqg->entity.new_weight) {
|
|
+ bfqg->entity.new_weight = (unsigned short)val;
|
|
+ /*
|
|
+ * Make sure that the above new value has been
|
|
+ * stored in bfqg->entity.new_weight before
|
|
+ * setting the prio_changed flag. In fact,
|
|
+ * this flag may be read asynchronously (in
|
|
+ * critical sections protected by a different
|
|
+ * lock than that held here), and finding this
|
|
+ * flag set may cause the execution of the code
|
|
+ * for updating parameters whose value may
|
|
+ * depend also on bfqg->entity.new_weight (in
|
|
+ * __bfq_entity_update_weight_prio).
|
|
+ * This barrier makes sure that the new value
|
|
+ * of bfqg->entity.new_weight is correctly
|
|
+ * seen in that code.
|
|
+ */
|
|
+ smp_wmb();
|
|
+ bfqg->entity.prio_changed = 1;
|
|
+ }
|
|
+ }
|
|
+ spin_unlock_irq(&blkcg->lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
|
|
+ char *buf, size_t nbytes,
|
|
+ loff_t off)
|
|
+{
|
|
+ u64 weight;
|
|
+ /* First unsigned long found in the file is used */
|
|
+ int ret = kstrtoull(strim(buf), 0, &weight);
|
|
+
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
|
|
+}
|
|
+
|
|
+static int bfqg_print_stat(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
|
|
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
|
|
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
|
|
+ struct blkg_policy_data *pd, int off)
|
|
+{
|
|
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
|
|
+ &blkcg_policy_bfq, off);
|
|
+ return __blkg_prfill_u64(sf, pd, sum);
|
|
+}
|
|
+
|
|
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
|
|
+ struct blkg_policy_data *pd, int off)
|
|
+{
|
|
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
|
|
+ &blkcg_policy_bfq,
|
|
+ off);
|
|
+ return __blkg_prfill_rwstat(sf, pd, &sum);
|
|
+}
|
|
+
|
|
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
|
+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
|
|
+ seq_cft(sf)->private, false);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
|
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
|
|
+ seq_cft(sf)->private, true);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
|
|
+ int off)
|
|
+{
|
|
+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
|
|
+
|
|
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
|
|
+}
|
|
+
|
|
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
|
+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
|
|
+ struct blkg_policy_data *pd, int off)
|
|
+{
|
|
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
|
|
+ offsetof(struct blkcg_gq, stat_bytes));
|
|
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
|
|
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
|
|
+
|
|
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
|
|
+}
|
|
+
|
|
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
|
+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
|
|
+ false);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+
|
|
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
|
|
+ struct blkg_policy_data *pd, int off)
|
|
+{
|
|
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
|
|
+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
|
|
+ u64 v = 0;
|
|
+
|
|
+ if (samples) {
|
|
+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
|
|
+ v = div64_u64(v, samples);
|
|
+ }
|
|
+ __blkg_prfill_u64(sf, pd, v);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* print avg_queue_size */
|
|
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
|
|
+{
|
|
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
|
+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
|
|
+ 0, false);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct bfq_group *
|
|
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
|
|
+ if (ret)
|
|
+ return NULL;
|
|
+
|
|
+ return blkg_to_bfqg(bfqd->queue->root_blkg);
|
|
+}
|
|
+
|
|
+static struct cftype bfq_blkcg_legacy_files[] = {
|
|
+ {
|
|
+ .name = "bfq.weight",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .seq_show = bfq_io_show_weight,
|
|
+ .write_u64 = bfq_io_set_weight_legacy,
|
|
+ },
|
|
+
|
|
+	/* statistics covering only the tasks in the bfqg */
|
|
+ {
|
|
+ .name = "bfq.time",
|
|
+ .private = offsetof(struct bfq_group, stats.time),
|
|
+ .seq_show = bfqg_print_stat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.sectors",
|
|
+ .seq_show = bfqg_print_stat_sectors,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_service_bytes",
|
|
+ .private = (unsigned long)&blkcg_policy_bfq,
|
|
+ .seq_show = blkg_print_stat_bytes,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_serviced",
|
|
+ .private = (unsigned long)&blkcg_policy_bfq,
|
|
+ .seq_show = blkg_print_stat_ios,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_service_time",
|
|
+ .private = offsetof(struct bfq_group, stats.service_time),
|
|
+ .seq_show = bfqg_print_rwstat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_wait_time",
|
|
+ .private = offsetof(struct bfq_group, stats.wait_time),
|
|
+ .seq_show = bfqg_print_rwstat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_merged",
|
|
+ .private = offsetof(struct bfq_group, stats.merged),
|
|
+ .seq_show = bfqg_print_rwstat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_queued",
|
|
+ .private = offsetof(struct bfq_group, stats.queued),
|
|
+ .seq_show = bfqg_print_rwstat,
|
|
+ },
|
|
+
|
|
+	/* the same statistics which cover the bfqg and its descendants */
|
|
+ {
|
|
+ .name = "bfq.time_recursive",
|
|
+ .private = offsetof(struct bfq_group, stats.time),
|
|
+ .seq_show = bfqg_print_stat_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.sectors_recursive",
|
|
+ .seq_show = bfqg_print_stat_sectors_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_service_bytes_recursive",
|
|
+ .private = (unsigned long)&blkcg_policy_bfq,
|
|
+ .seq_show = blkg_print_stat_bytes_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_serviced_recursive",
|
|
+ .private = (unsigned long)&blkcg_policy_bfq,
|
|
+ .seq_show = blkg_print_stat_ios_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_service_time_recursive",
|
|
+ .private = offsetof(struct bfq_group, stats.service_time),
|
|
+ .seq_show = bfqg_print_rwstat_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_wait_time_recursive",
|
|
+ .private = offsetof(struct bfq_group, stats.wait_time),
|
|
+ .seq_show = bfqg_print_rwstat_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_merged_recursive",
|
|
+ .private = offsetof(struct bfq_group, stats.merged),
|
|
+ .seq_show = bfqg_print_rwstat_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.io_queued_recursive",
|
|
+ .private = offsetof(struct bfq_group, stats.queued),
|
|
+ .seq_show = bfqg_print_rwstat_recursive,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.avg_queue_size",
|
|
+ .seq_show = bfqg_print_avg_queue_size,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.group_wait_time",
|
|
+ .private = offsetof(struct bfq_group, stats.group_wait_time),
|
|
+ .seq_show = bfqg_print_stat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.idle_time",
|
|
+ .private = offsetof(struct bfq_group, stats.idle_time),
|
|
+ .seq_show = bfqg_print_stat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.empty_time",
|
|
+ .private = offsetof(struct bfq_group, stats.empty_time),
|
|
+ .seq_show = bfqg_print_stat,
|
|
+ },
|
|
+ {
|
|
+ .name = "bfq.dequeue",
|
|
+ .private = offsetof(struct bfq_group, stats.dequeue),
|
|
+ .seq_show = bfqg_print_stat,
|
|
+ },
|
|
+ { } /* terminate */
|
|
+};
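+
+/*
+ * Usage sketch (the mount point and group name below are just
+ * examples): with the legacy cgroup hierarchy, the per-group weight
+ * handled by bfq_io_set_weight_legacy() can be tuned with something
+ * like
+ *
+ *   echo 500 > /sys/fs/cgroup/blkio/mygroup/blkio.bfq.weight
+ *
+ * Values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] are rejected with
+ * -ERANGE.
+ */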
|
|
+
|
|
+static struct cftype bfq_blkg_files[] = {
|
|
+ {
|
|
+ .name = "bfq.weight",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .seq_show = bfq_io_show_weight,
|
|
+ .write = bfq_io_set_weight,
|
|
+ },
|
|
+ {} /* terminate */
|
|
+};
|
|
+
|
|
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
|
|
+ struct bfq_queue *bfqq, int op, int op_flags) { }
|
|
+static inline void
|
|
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
|
|
+static inline void
|
|
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
|
|
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
|
|
+ uint64_t start_time, uint64_t io_start_time, int op,
|
|
+ int op_flags) { }
|
|
+static inline void
|
|
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
|
|
+ struct bfq_group *curr_bfqg) { }
|
|
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
|
|
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
|
|
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
|
|
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
|
|
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
|
|
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
|
|
+
|
|
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ struct bfq_group *bfqg) {}
|
|
+
|
|
+static void bfq_init_entity(struct bfq_entity *entity,
|
|
+ struct bfq_group *bfqg)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ entity->weight = entity->new_weight;
|
|
+ entity->orig_weight = entity->new_weight;
|
|
+ if (bfqq) {
|
|
+ bfqq->ioprio = bfqq->new_ioprio;
|
|
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
|
|
+ }
|
|
+ entity->sched_data = &bfqg->sched_data;
|
|
+}
|
|
+
|
|
+static struct bfq_group *
|
|
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
|
|
+{
|
|
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
|
|
+
|
|
+ return bfqd->root_group;
|
|
+}
|
|
+
|
|
+static void bfq_end_wr_async(struct bfq_data *bfqd)
|
|
+{
|
|
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
|
|
+}
|
|
+
|
|
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
|
|
+ struct blkcg *blkcg)
|
|
+{
|
|
+ return bfqd->root_group;
|
|
+}
|
|
+
|
|
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
|
|
+{
|
|
+ return bfqq->bfqd->root_group;
|
|
+}
|
|
+
|
|
+static struct bfq_group *
|
|
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
|
|
+{
|
|
+ struct bfq_group *bfqg;
|
|
+ int i;
|
|
+
|
|
+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
|
|
+ if (!bfqg)
|
|
+ return NULL;
|
|
+
|
|
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
|
|
+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
|
|
+
|
|
+ return bfqg;
|
|
+}
|
|
+#endif
|
|
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
|
|
new file mode 100644
|
|
index 0000000..fb7bb8f
|
|
--- /dev/null
|
|
+++ b/block/bfq-ioc.c
|
|
@@ -0,0 +1,36 @@
|
|
+/*
|
|
+ * BFQ: I/O context handling.
|
|
+ *
|
|
+ * Based on ideas and code from CFQ:
|
|
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
|
|
+ *
|
|
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
|
|
+ * Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
|
|
+ */
|
|
+
|
|
+/**
|
|
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
|
|
+ * @icq: the iocontext queue.
|
|
+ */
|
|
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
|
|
+{
|
|
+ /* bic->icq is the first member, %NULL will convert to %NULL */
|
|
+ return container_of(icq, struct bfq_io_cq, icq);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
|
|
+ * @bfqd: the lookup key.
|
|
+ * @ioc: the io_context of the process doing I/O.
|
|
+ *
|
|
+ * Queue lock must be held.
|
|
+ */
|
|
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
|
|
+ struct io_context *ioc)
|
|
+{
|
|
+ if (ioc)
|
|
+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
|
|
+ return NULL;
|
|
+}
|
|
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
|
|
new file mode 100644
|
|
index 0000000..45e534a
|
|
--- /dev/null
|
|
+++ b/block/bfq-iosched.c
|
|
@@ -0,0 +1,5318 @@
|
|
+/*
|
|
+ * Budget Fair Queueing (BFQ) I/O scheduler.
|
|
+ *
|
|
+ * Based on ideas and code from CFQ:
|
|
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
|
|
+ *
|
|
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
|
|
+ * Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
|
|
+ *
|
|
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
|
|
+ * file.
|
|
+ *
|
|
+ * BFQ is a proportional-share I/O scheduler, with some extra
|
|
+ * low-latency capabilities. BFQ also supports full hierarchical
|
|
+ * scheduling through cgroups. Next paragraphs provide an introduction
|
|
+ * on BFQ inner workings. Details on BFQ benefits and usage can be
|
|
+ * found in Documentation/block/bfq-iosched.txt.
|
|
+ *
|
|
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
|
|
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
|
|
+ * budgets, measured in number of sectors, to processes instead of
|
|
+ * time slices. The device is not granted to the in-service process
|
|
+ * for a given time slice, but until it has exhausted its assigned
|
|
+ * budget. This change from the time to the service domain enables BFQ
|
|
+ * to distribute the device throughput among processes as desired,
|
|
+ * without any distortion due to throughput fluctuations, or to device
|
|
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
|
|
+ * B-WF2Q+, to schedule processes according to their budgets. More
|
|
+ * precisely, BFQ schedules queues associated with processes. Thanks to
|
|
+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high
|
|
+ * budgets to I/O-bound processes issuing sequential requests (to
|
|
+ * boost the throughput), and yet guarantee a low latency to
|
|
+ * interactive and soft real-time applications.
|
|
+ *
|
|
+ * BFQ is described in [1], where also a reference to the initial, more
|
|
+ * theoretical paper on BFQ can be found. The interested reader can find
|
|
+ * in the latter paper full details on the main algorithm, as well as
|
|
+ * formulas of the guarantees and formal proofs of all the properties.
|
|
+ * With respect to the version of BFQ presented in these papers, this
|
|
+ * implementation adds a few more heuristics, such as the one that
|
|
+ * guarantees a low latency to soft real-time applications, and a
|
|
+ * hierarchical extension based on H-WF2Q+.
|
|
+ *
|
|
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
|
|
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
|
|
+ * complexity derives from the one introduced with EEVDF in [3].
|
|
+ *
|
|
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
|
|
+ * Scheduler", Proceedings of the First Workshop on Mobile System
|
|
+ * Technologies (MST-2015), May 2015.
|
|
+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
|
|
+ *
|
|
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
|
|
+ *
|
|
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
|
|
+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
|
|
+ * Oct 1997.
|
|
+ *
|
|
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
|
|
+ *
|
|
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
|
|
+ * First: A Flexible and Accurate Mechanism for Proportional Share
|
|
+ * Resource Allocation,'' technical report.
|
|
+ *
|
|
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
|
|
+ */
|
|
+#include <linux/module.h>
|
|
+#include <linux/slab.h>
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/cgroup.h>
|
|
+#include <linux/elevator.h>
|
|
+#include <linux/jiffies.h>
|
|
+#include <linux/rbtree.h>
|
|
+#include <linux/ioprio.h>
|
|
+#include "bfq.h"
|
|
+#include "blk.h"
|
|
+#include "blk-wbt.h"
|
|
+
|
|
+/* Expiration time of sync (0) and async (1) requests, in ns. */
|
|
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
|
|
+
|
|
+/* Maximum backwards seek, in KiB. */
|
|
+static const int bfq_back_max = (16 * 1024);
|
|
+
|
|
+/* Penalty of a backwards seek, in number of sectors. */
|
|
+static const int bfq_back_penalty = 2;
|
|
+
|
|
+/* Idling period duration, in ns. */
|
|
+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
|
|
+
|
|
+/* Minimum number of assigned budgets for which stats are safe to compute. */
|
|
+static const int bfq_stats_min_budgets = 194;
|
|
+
|
|
+/* Default maximum budget values, in sectors and number of requests. */
|
|
+static const int bfq_default_max_budget = (16 * 1024);
|
|
+
|
|
+/*
|
|
+ * Async to sync throughput distribution is controlled as follows:
|
|
+ * when an async request is served, the entity is charged the number
|
|
+ * of sectors of the request, multiplied by the factor below
|
|
+ */
|
|
+static const int bfq_async_charge_factor = 10;
|
|
+
|
|
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
|
|
+static const int bfq_timeout = (HZ / 8);
|
|
+
|
|
+static struct kmem_cache *bfq_pool;
|
|
+
|
|
+/* Below this threshold (in ns), we consider thinktime immediate. */
|
|
+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
|
|
+
|
|
+/* hw_tag detection: parallel requests threshold and min samples needed. */
|
|
+#define BFQ_HW_QUEUE_THRESHOLD 4
|
|
+#define BFQ_HW_QUEUE_SAMPLES 32
|
|
+
|
|
+#define BFQQ_SEEK_THR (sector_t)(8 * 100)
|
|
+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
|
|
+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
|
|
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
|
|
+
|
|
+/* Min number of samples required to perform peak-rate update */
|
|
+#define BFQ_RATE_MIN_SAMPLES 32
|
|
+/* Min observation time interval required to perform a peak-rate update (ns) */
|
|
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
|
|
+/* Target observation time interval for a peak-rate update (ns) */
|
|
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
|
|
+
|
|
+/* Shift used for peak rate fixed precision calculations. */
|
|
+#define BFQ_RATE_SHIFT 16
|
|
+
|
|
+/*
|
|
+ * By default, BFQ computes the duration of the weight raising for
|
|
+ * interactive applications automatically, using the following formula:
|
|
+ * duration = (R / r) * T, where r is the peak rate of the device, and
|
|
+ * R and T are two reference parameters.
|
|
+ * In particular, R is the peak rate of the reference device (see below),
|
|
+ * and T is a reference time: given the systems that are likely to be
|
|
+ * installed on the reference device according to its speed class, T is
|
|
+ * about the maximum time needed, under BFQ and while reading two files in
|
|
+ * parallel, to load typical large applications on these systems.
|
|
+ * In practice, the slower/faster the device at hand is, the more/less it
|
|
+ * takes to load applications with respect to the reference device.
|
|
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
|
|
+ * applications.
|
|
+ *
|
|
+ * BFQ uses four different reference pairs (R, T), depending on:
|
|
+ * . whether the device is rotational or non-rotational;
|
|
+ * . whether the device is slow, such as old or portable HDDs, as well as
|
|
+ * SD cards, or fast, such as newer HDDs and SSDs.
|
|
+ *
|
|
+ * The device's speed class is dynamically (re)detected in
|
|
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
|
|
+ *
|
|
+ * In the following definitions, R_slow[0]/R_fast[0] and
|
|
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
|
|
+ * rotational device, whereas R_slow[1]/R_fast[1] and
|
|
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
|
|
+ * non-rotational device. Finally, device_speed_thresh are the
|
|
+ * thresholds used to switch between speed classes. The reference
|
|
+ * rates are not the actual peak rates of the devices used as a
|
|
+ * reference, but slightly lower values. The reason for using these
|
|
+ * slightly lower values is that the peak-rate estimator tends to
|
|
+ * yield slightly lower values than the actual peak rate (it can yield
|
|
+ * the actual peak rate only if there is only one process doing I/O,
|
|
+ * and the process does sequential I/O).
|
|
+ *
|
|
+ * Both the reference peak rates and the thresholds are measured in
|
|
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
|
|
+ */
|
|
+static int R_slow[2] = {1000, 10700};
|
|
+static int R_fast[2] = {14000, 33000};
|
|
+/*
|
|
+ * To improve readability, a conversion function is used to initialize the
|
|
+ * following arrays, which entails that they can be initialized only in a
|
|
+ * function.
|
|
+ */
|
|
+static int T_slow[2];
|
|
+static int T_fast[2];
|
|
+static int device_speed_thresh[2];
|
|
+
|
|
+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
|
|
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
|
|
+
|
|
+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
|
|
+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
|
|
+
|
|
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
|
|
+
|
|
+#include "bfq-ioc.c"
|
|
+#include "bfq-sched.c"
|
|
+#include "bfq-cgroup.c"
|
|
+
|
|
+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
|
|
+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
|
|
+
|
|
+#define bfq_sample_valid(samples) ((samples) > 80)
|
|
+
|
|
+/*
|
|
+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
|
|
+ * set (in which case it could also be a direct WRITE).
|
|
+ */
|
|
+static int bfq_bio_sync(struct bio *bio)
|
|
+{
|
|
+ return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Scheduler run of queue, if there are requests pending and no one in the
|
|
+ * driver that will restart queueing.
|
|
+ */
|
|
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
|
|
+{
|
|
+ if (bfqd->queued != 0) {
|
|
+ bfq_log(bfqd, "schedule dispatch");
|
|
+ kblockd_schedule_work(&bfqd->unplug_work);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
|
|
+ * We choose the request that is closest to the head right now. Distance
|
|
+ * behind the head is penalized and only allowed to a certain extent.
|
|
+ */
|
|
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
|
|
+ struct request *rq1,
|
|
+ struct request *rq2,
|
|
+ sector_t last)
|
|
+{
|
|
+ sector_t s1, s2, d1 = 0, d2 = 0;
|
|
+ unsigned long back_max;
|
|
+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
|
|
+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
|
|
+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
|
|
+
|
|
+ if (!rq1 || rq1 == rq2)
|
|
+ return rq2;
|
|
+ if (!rq2)
|
|
+ return rq1;
|
|
+
|
|
+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
|
|
+ return rq1;
|
|
+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
|
|
+ return rq2;
|
|
+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
|
|
+ return rq1;
|
|
+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
|
|
+ return rq2;
|
|
+
|
|
+ s1 = blk_rq_pos(rq1);
|
|
+ s2 = blk_rq_pos(rq2);
|
|
+
|
|
+ /*
|
|
+ * By definition, 1KiB is 2 sectors.
|
|
+ */
|
|
+ back_max = bfqd->bfq_back_max * 2;
|
|
+
|
|
+ /*
|
|
+ * Strict one way elevator _except_ in the case where we allow
|
|
+ * short backward seeks which are biased as twice the cost of a
|
|
+ * similar forward seek.
|
|
+ */
|
|
+ if (s1 >= last)
|
|
+ d1 = s1 - last;
|
|
+ else if (s1 + back_max >= last)
|
|
+ d1 = (last - s1) * bfqd->bfq_back_penalty;
|
|
+ else
|
|
+ wrap |= BFQ_RQ1_WRAP;
|
|
+
|
|
+ if (s2 >= last)
|
|
+ d2 = s2 - last;
|
|
+ else if (s2 + back_max >= last)
|
|
+ d2 = (last - s2) * bfqd->bfq_back_penalty;
|
|
+ else
|
|
+ wrap |= BFQ_RQ2_WRAP;
|
|
+
|
|
+ /* Found required data */
|
|
+
|
|
+ /*
|
|
+ * By doing switch() on the bit mask "wrap" we avoid having to
|
|
+ * check two variables for all permutations: --> faster!
|
|
+ */
|
|
+ switch (wrap) {
|
|
+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
|
|
+ if (d1 < d2)
|
|
+ return rq1;
|
|
+ else if (d2 < d1)
|
|
+ return rq2;
|
|
+
|
|
+ if (s1 >= s2)
|
|
+ return rq1;
|
|
+ else
|
|
+ return rq2;
|
|
+
|
|
+ case BFQ_RQ2_WRAP:
|
|
+ return rq1;
|
|
+ case BFQ_RQ1_WRAP:
|
|
+ return rq2;
|
|
+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
|
|
+ default:
|
|
+ /*
|
|
+ * Since both rqs are wrapped,
|
|
+ * start with the one that's further behind head
|
|
+ * (--> only *one* back seek required),
|
|
+ * since back seek takes more time than forward.
|
|
+ */
|
|
+ if (s1 <= s2)
|
|
+ return rq1;
|
|
+ else
|
|
+ return rq2;
|
|
+ }
|
|
+}
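+
+/*
+ * Worked example with made-up positions: suppose the head is at sector
+ * 1000 (last), rq1 starts at sector 1040 and rq2 at sector 992, with
+ * the default bfq_back_max (16 MiB) and bfq_back_penalty (2). Then
+ * d1 = 40, while rq2 lies behind the head within back_max and is
+ * penalized: d2 = (1000 - 992) * 2 = 16. Neither request wraps, so the
+ * switch above falls into case 0 and rq2 wins, since d2 < d1.
+ */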
|
|
+
|
|
+static struct bfq_queue *
|
|
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
|
|
+ sector_t sector, struct rb_node **ret_parent,
|
|
+ struct rb_node ***rb_link)
|
|
+{
|
|
+ struct rb_node **p, *parent;
|
|
+ struct bfq_queue *bfqq = NULL;
|
|
+
|
|
+ parent = NULL;
|
|
+ p = &root->rb_node;
|
|
+ while (*p) {
|
|
+ struct rb_node **n;
|
|
+
|
|
+ parent = *p;
|
|
+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
|
|
+
|
|
+ /*
|
|
+ * Sort strictly based on sector. Smallest to the left,
|
|
+ * largest to the right.
|
|
+ */
|
|
+ if (sector > blk_rq_pos(bfqq->next_rq))
|
|
+ n = &(*p)->rb_right;
|
|
+ else if (sector < blk_rq_pos(bfqq->next_rq))
|
|
+ n = &(*p)->rb_left;
|
|
+ else
|
|
+ break;
|
|
+ p = n;
|
|
+ bfqq = NULL;
|
|
+ }
|
|
+
|
|
+ *ret_parent = parent;
|
|
+ if (rb_link)
|
|
+ *rb_link = p;
|
|
+
|
|
+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
|
|
+ (unsigned long long) sector,
|
|
+ bfqq ? bfqq->pid : 0);
|
|
+
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct rb_node **p, *parent;
|
|
+ struct bfq_queue *__bfqq;
|
|
+
|
|
+ if (bfqq->pos_root) {
|
|
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
|
|
+ bfqq->pos_root = NULL;
|
|
+ }
|
|
+
|
|
+ if (bfq_class_idle(bfqq))
|
|
+ return;
|
|
+ if (!bfqq->next_rq)
|
|
+ return;
|
|
+
|
|
+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
|
|
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
|
|
+ blk_rq_pos(bfqq->next_rq), &parent, &p);
|
|
+ if (!__bfqq) {
|
|
+ rb_link_node(&bfqq->pos_node, parent, p);
|
|
+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
|
|
+ } else
|
|
+ bfqq->pos_root = NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Tell whether there are active queues or groups with differentiated weights.
|
|
+ */
|
|
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
|
|
+{
|
|
+ /*
|
|
+ * For weights to differ, at least one of the trees must contain
|
|
+ * at least two nodes.
|
|
+ */
|
|
+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
|
|
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
|
|
+ bfqd->queue_weights_tree.rb_node->rb_right)
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ ) ||
|
|
+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
|
|
+ (bfqd->group_weights_tree.rb_node->rb_left ||
|
|
+ bfqd->group_weights_tree.rb_node->rb_right)
|
|
+#endif
|
|
+ );
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The following function returns true if every queue must receive the
|
|
+ * same share of the throughput (this condition is used when deciding
|
|
+ * whether idling may be disabled, see the comments in the function
|
|
+ * bfq_bfqq_may_idle()).
|
|
+ *
|
|
+ * Such a scenario occurs when:
|
|
+ * 1) all active queues have the same weight,
|
|
+ * 2) all active groups at the same level in the groups tree have the same
|
|
+ * weight,
|
|
+ * 3) all active groups at the same level in the groups tree have the same
|
|
+ * number of children.
|
|
+ *
|
|
+ * Unfortunately, keeping the necessary state for evaluating exactly the
|
|
+ * above symmetry conditions would be quite complex and time-consuming.
|
|
+ * Therefore this function evaluates, instead, the following stronger
|
|
+ * sub-conditions, for which it is much easier to maintain the needed
|
|
+ * state:
|
|
+ * 1) all active queues have the same weight,
|
|
+ * 2) all active groups have the same weight,
|
|
+ * 3) all active groups have at most one active child each.
|
|
+ * In particular, the last two conditions are always true if hierarchical
|
|
+ * support and the cgroups interface are not enabled, thus no state needs
|
|
+ * to be maintained in this case.
|
|
+ */
|
|
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
|
|
+{
|
|
+ return !bfq_differentiated_weights(bfqd);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If the weight-counter tree passed as input contains no counter for
|
|
+ * the weight of the input entity, then add that counter; otherwise just
|
|
+ * increment the existing counter.
|
|
+ *
|
|
+ * Note that weight-counter trees contain few nodes in mostly symmetric
|
|
+ * scenarios. For example, if all queues have the same weight, then the
|
|
+ * weight-counter tree for the queues may contain at most one node.
|
|
+ * This holds even if low_latency is on, because weight-raised queues
|
|
+ * are not inserted in the tree.
|
|
+ * In most scenarios, the rate at which nodes are created/destroyed
|
|
+ * should be low too.
|
|
+ */
|
|
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
|
|
+ struct bfq_entity *entity,
|
|
+ struct rb_root *root)
|
|
+{
|
|
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
|
|
+
|
|
+ /*
|
|
+ * Do not insert if the entity is already associated with a
|
|
+ * counter, which happens if:
|
|
+ * 1) the entity is associated with a queue,
|
|
+ * 2) a request arrival has caused the queue to become both
|
|
+ * non-weight-raised, and hence change its weight, and
|
|
+ * backlogged; in this respect, each of the two events
|
|
+ * causes an invocation of this function,
|
|
+ * 3) this is the invocation of this function caused by the
|
|
+ * second event. This second invocation is actually useless,
|
|
+ * and we handle this fact by exiting immediately. More
|
|
+ * efficient or clearer solutions might possibly be adopted.
|
|
+ */
|
|
+ if (entity->weight_counter)
|
|
+ return;
|
|
+
|
|
+ while (*new) {
|
|
+ struct bfq_weight_counter *__counter = container_of(*new,
|
|
+ struct bfq_weight_counter,
|
|
+ weights_node);
|
|
+ parent = *new;
|
|
+
|
|
+ if (entity->weight == __counter->weight) {
|
|
+ entity->weight_counter = __counter;
|
|
+ goto inc_counter;
|
|
+ }
|
|
+ if (entity->weight < __counter->weight)
|
|
+ new = &((*new)->rb_left);
|
|
+ else
|
|
+ new = &((*new)->rb_right);
|
|
+ }
|
|
+
|
|
+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
|
|
+ GFP_ATOMIC);
|
|
+ entity->weight_counter->weight = entity->weight;
|
|
+ rb_link_node(&entity->weight_counter->weights_node, parent, new);
|
|
+ rb_insert_color(&entity->weight_counter->weights_node, root);
|
|
+
|
|
+inc_counter:
|
|
+ entity->weight_counter->num_active++;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Decrement the weight counter associated with the entity, and, if the
|
|
+ * counter reaches 0, remove the counter from the tree.
|
|
+ * See the comments to the function bfq_weights_tree_add() for considerations
|
|
+ * about overhead.
|
|
+ */
|
|
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
|
+ struct bfq_entity *entity,
|
|
+ struct rb_root *root)
|
|
+{
|
|
+ if (!entity->weight_counter)
|
|
+ return;
|
|
+
|
|
+ BUG_ON(RB_EMPTY_ROOT(root));
|
|
+ BUG_ON(entity->weight_counter->weight != entity->weight);
|
|
+
|
|
+ BUG_ON(!entity->weight_counter->num_active);
|
|
+ entity->weight_counter->num_active--;
|
|
+ if (entity->weight_counter->num_active > 0)
|
|
+ goto reset_entity_pointer;
|
|
+
|
|
+ rb_erase(&entity->weight_counter->weights_node, root);
|
|
+ kfree(entity->weight_counter);
|
|
+
|
|
+reset_entity_pointer:
|
|
+ entity->weight_counter = NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return expired entry, or NULL to just start from scratch in rbtree.
|
|
+ */
|
|
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
|
|
+ struct request *last)
|
|
+{
|
|
+ struct request *rq;
|
|
+
|
|
+ if (bfq_bfqq_fifo_expire(bfqq))
|
|
+ return NULL;
|
|
+
|
|
+ bfq_mark_bfqq_fifo_expire(bfqq);
|
|
+
|
|
+ rq = rq_entry_fifo(bfqq->fifo.next);
|
|
+
|
|
+ if (rq == last || ktime_get_ns() < rq->fifo_time)
|
|
+ return NULL;
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
|
|
+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ struct request *last)
|
|
+{
|
|
+ struct rb_node *rbnext = rb_next(&last->rb_node);
|
|
+ struct rb_node *rbprev = rb_prev(&last->rb_node);
|
|
+ struct request *next, *prev = NULL;
|
|
+
|
|
+ BUG_ON(list_empty(&bfqq->fifo));
|
|
+
|
|
+ /* Follow expired path, else get first next available. */
|
|
+ next = bfq_check_fifo(bfqq, last);
|
|
+ if (next) {
|
|
+ BUG_ON(next == last);
|
|
+ return next;
|
|
+ }
|
|
+
|
|
+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
|
|
+
|
|
+ if (rbprev)
|
|
+ prev = rb_entry_rq(rbprev);
|
|
+
|
|
+ if (rbnext)
|
|
+ next = rb_entry_rq(rbnext);
|
|
+ else {
|
|
+ rbnext = rb_first(&bfqq->sort_list);
|
|
+ if (rbnext && rbnext != &last->rb_node)
|
|
+ next = rb_entry_rq(rbnext);
|
|
+ }
|
|
+
|
|
+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
|
|
+}
|
|
+
|
|
+/* see the definition of bfq_async_charge_factor for details */
|
|
+static unsigned long bfq_serv_to_charge(struct request *rq,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
|
|
+ return blk_rq_sectors(rq);
|
|
+
|
|
+ /*
|
|
+ * If there are no weight-raised queues, then amplify service
|
|
+ * by just the async charge factor; otherwise amplify service
|
|
+ * by twice the async charge factor, to further reduce latency
|
|
+ * for weight-raised queues.
|
|
+ */
|
|
+ if (bfqq->bfqd->wr_busy_queues == 0)
|
|
+ return blk_rq_sectors(rq) * bfq_async_charge_factor;
|
|
+
|
|
+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
|
|
+}
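+
+/*
+ * Example with made-up numbers: an async request of 8 sectors is
+ * charged 8 * 10 = 80 sectors of budget when no queue is weight-raised
+ * (bfq_async_charge_factor = 10), and 8 * 2 * 10 = 160 sectors when
+ * wr_busy_queues > 0. Sync requests, and requests of weight-raised
+ * queues, are always charged their actual size.
+ */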
|
|
+
|
|
+/**
|
|
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
|
|
+ * @bfqd: the device data the queue belongs to.
|
|
+ * @bfqq: the queue to update.
|
|
+ *
|
|
+ * If the first request of a queue changes we make sure that the queue
|
|
+ * has enough budget to serve at least its first request (if the
|
|
+ * request has grown). We do this because if the queue has not enough
|
|
+ * budget for its first request, it has to go through two dispatch
|
|
+ * rounds to actually get it dispatched.
|
|
+ */
|
|
+static void bfq_updated_next_req(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+ struct request *next_rq = bfqq->next_rq;
|
|
+ unsigned long new_budget;
|
|
+
|
|
+ if (!next_rq)
|
|
+ return;
|
|
+
|
|
+ if (bfqq == bfqd->in_service_queue)
|
|
+ /*
|
|
+ * In order not to break guarantees, budgets cannot be
|
|
+ * changed after an entity has been selected.
|
|
+ */
|
|
+ return;
|
|
+
|
|
+ BUG_ON(entity->tree != &st->active);
|
|
+ BUG_ON(entity == entity->sched_data->in_service_entity);
|
|
+
|
|
+ new_budget = max_t(unsigned long, bfqq->max_budget,
|
|
+ bfq_serv_to_charge(next_rq, bfqq));
|
|
+ if (entity->budget != new_budget) {
|
|
+ entity->budget = new_budget;
|
|
+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
|
|
+ new_budget);
|
|
+ bfq_requeue_bfqq(bfqd, bfqq);
|
|
+ }
|
|
+}
|
|
+
|
|
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
|
|
+{
|
|
+ u64 dur;
|
|
+
|
|
+ if (bfqd->bfq_wr_max_time > 0)
|
|
+ return bfqd->bfq_wr_max_time;
|
|
+
|
|
+ dur = bfqd->RT_prod;
|
|
+ do_div(dur, bfqd->peak_rate);
|
|
+
|
|
+ /*
|
|
+ * Limit duration between 3 and 13 seconds. Tests show that
|
|
+ * higher values than 13 seconds often yield the opposite of
|
|
+ * the desired result, i.e., worsen responsiveness by letting
|
|
+ * non-interactive and non-soft-real-time applications
|
|
+ * preserve weight raising for a too long time interval.
|
|
+ *
|
|
+ * On the other end, lower values than 3 seconds make it
|
|
+ * difficult for most interactive tasks to complete their jobs
|
|
+ * before weight-raising finishes.
|
|
+ */
|
|
+ if (dur > msecs_to_jiffies(13000))
|
|
+ dur = msecs_to_jiffies(13000);
|
|
+ else if (dur < msecs_to_jiffies(3000))
|
|
+ dur = msecs_to_jiffies(3000);
|
|
+
|
|
+ return dur;
|
|
+}
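/*
 * Illustrative sketch (not from this patch): the clamping rule that
 * bfq_wr_duration() above applies, written as a self-contained helper.
 * The function name and the millisecond units are hypothetical; the patch
 * itself works in jiffies and uses do_div() for the 64-bit division.
 */
static unsigned long wr_duration_ms_sketch(unsigned long long rt_prod,
                                           unsigned long long peak_rate)
{
        /* Duration scales inversely with device speed, e.g. 60000 / 10 = 6000 ms. */
        unsigned long long dur = rt_prod / peak_rate;

        if (dur > 13000)        /* very slow device: cap at 13 s */
                dur = 13000;
        else if (dur < 3000)    /* very fast device: floor at 3 s */
                dur = 3000;
        return dur;
}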
+
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+        if (bic->saved_idle_window)
+                bfq_mark_bfqq_idle_window(bfqq);
+        else
+                bfq_clear_bfqq_idle_window(bfqq);
+
+        if (bic->saved_IO_bound)
+                bfq_mark_bfqq_IO_bound(bfqq);
+        else
+                bfq_clear_bfqq_IO_bound(bfqq);
+
+        bfqq->wr_coeff = bic->saved_wr_coeff;
+        bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+        BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+        bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+        bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+        BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+        if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+            time_is_before_jiffies(bfqq->last_wr_start_finish +
+                                   bfqq->wr_cur_max_time))) {
+                bfq_log_bfqq(bfqq->bfqd, bfqq,
+                             "resume state: switching off wr (%lu + %lu < %lu)",
+                             bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
+                             jiffies);
+
+                bfqq->wr_coeff = 1;
+        }
+        /* make sure weight will be updated, however we got here */
+        bfqq->entity.prio_changed = 1;
+}
+
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+        int process_refs, io_refs;
+
+        lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+
+        io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+        process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
+        BUG_ON(process_refs < 0);
+        return process_refs;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+        struct bfq_queue *item;
+        struct hlist_node *n;
+
+        hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+                hlist_del_init(&item->burst_list_node);
+        hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+        bfqd->burst_size = 1;
+        bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+        /* Increment burst size to take into account also bfqq */
+        bfqd->burst_size++;
+
+        bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
+
+        BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
+
+        if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+                struct bfq_queue *pos, *bfqq_item;
+                struct hlist_node *n;
+
+                /*
+                 * Enough queues have been activated shortly after each
+                 * other to consider this burst as large.
+                 */
+                bfqd->large_burst = true;
+                bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
+
+                /*
+                 * We can now mark all queues in the burst list as
+                 * belonging to a large burst.
+                 */
+                hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+                                     burst_list_node) {
+                        bfq_mark_bfqq_in_large_burst(bfqq_item);
+                        bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
+                }
+                bfq_mark_bfqq_in_large_burst(bfqq);
+                bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
+
+                /*
+                 * From now on, and until the current burst finishes, any
+                 * new queue being activated shortly after the last queue
+                 * was inserted in the burst can be immediately marked as
+                 * belonging to a large burst. So the burst list is not
+                 * needed any more. Remove it.
+                 */
+                hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+                                          burst_list_node)
+                        hlist_del_init(&pos->burst_list_node);
+        } else /*
+                * Burst not yet large: add bfqq to the burst list. Do
+                * not increment the ref counter for bfqq, because bfqq
+                * is removed from the burst list before freeing bfqq
+                * in put_queue.
+                */
+                hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *   . all the queues in the burst list are marked as belonging to a
+ *     large burst
+ *
+ *   . the burst list is deleted; in fact, the burst list already served
+ *     its purpose (keeping temporarily track of the queues in a burst,
+ *     so as to be able to mark them as belonging to a large burst in the
+ *     previous sub-step), and now is not needed any more
+ *
+ *   . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ *   later, i.e., not shortly after, than the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *   . the large-burst mode is reset if set
+ *
+ *   . the burst list is emptied
+ *
+ *   . Q is inserted in the burst list, as Q may be the first queue
+ *     in a possible new burst (then the burst list contains just Q
+ *     after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+        /*
+         * If bfqq is already in the burst list or is part of a large
+         * burst, or finally has just been split, then there is
+         * nothing else to do.
+         */
+        if (!hlist_unhashed(&bfqq->burst_list_node) ||
+            bfq_bfqq_in_large_burst(bfqq) ||
+            time_is_after_eq_jiffies(bfqq->split_time +
+                                     msecs_to_jiffies(10)))
+                return;
+
+        /*
+         * If bfqq's creation happens late enough, or bfqq belongs to
+         * a different group than the burst group, then the current
+         * burst is finished, and related data structures must be
+         * reset.
+         *
+         * In this respect, consider the special case where bfqq is
+         * the very first queue created after BFQ is selected for this
+         * device. In this case, last_ins_in_burst and
+         * burst_parent_entity are not yet significant when we get
+         * here. But it is easy to verify that, whether or not the
+         * following condition is true, bfqq will end up being
+         * inserted into the burst list. In particular the list will
+         * happen to contain only bfqq. And this is exactly what has
+         * to happen, as bfqq may be the first queue of the first
+         * burst.
+         */
+        if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+            bfqd->bfq_burst_interval) ||
+            bfqq->entity.parent != bfqd->burst_parent_entity) {
+                bfqd->large_burst = false;
+                bfq_reset_burst_list(bfqd, bfqq);
+                bfq_log_bfqq(bfqd, bfqq,
+                             "handle_burst: late activation or different group");
+                goto end;
+        }
+
+        /*
+         * If we get here, then bfqq is being activated shortly after the
+         * last queue. So, if the current burst is also large, we can mark
+         * bfqq as belonging to this large burst immediately.
+         */
+        if (bfqd->large_burst) {
+                bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
+                bfq_mark_bfqq_in_large_burst(bfqq);
+                goto end;
+        }
+
+        /*
+         * If we get here, then a large-burst state has not yet been
+         * reached, but bfqq is being activated shortly after the last
+         * queue. Then we add bfqq to the burst.
+         */
+        bfq_add_to_burst(bfqd, bfqq);
+end:
+        /*
+         * At this point, bfqq either has been added to the current
+         * burst or has caused the current burst to terminate and a
+         * possible new burst to start. In particular, in the second
+         * case, bfqq has become the first queue in the possible new
+         * burst. In both cases last_ins_in_burst needs to be moved
+         * forward.
+         */
+        bfqd->last_ins_in_burst = jiffies;
+
+}
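/*
 * Illustrative sketch (not from this patch): a stripped-down model of the
 * burst bookkeeping implemented by bfq_reset_burst_list(), bfq_add_to_burst()
 * and bfq_handle_burst() above.  The structure, helper name and the
 * interval/thresh parameters are hypothetical stand-ins for
 * bfqd->last_ins_in_burst, bfqd->burst_size, bfqd->large_burst,
 * bfqd->bfq_burst_interval and bfqd->bfq_large_burst_thresh; the per-group
 * check and the burst list itself are omitted.
 */
#include <stdbool.h>

struct burst_sketch {
        unsigned long last_ins;         /* time of the last queue creation */
        int size;                       /* queues in the current burst */
        bool large;                     /* threshold already reached? */
};

/* Returns true if a queue created at 'now' should be treated as belonging to
 * a large burst, i.e., should not be weight-raised. */
static bool note_queue_creation(struct burst_sketch *b, unsigned long now,
                                unsigned long interval, int thresh)
{
        if (now - b->last_ins > interval) {
                /* Too much time has passed: the burst is over, start a new one. */
                b->size = 1;
                b->large = false;
        } else if (!b->large && ++b->size >= thresh) {
                b->large = true;        /* the burst has just become large */
        }
        b->last_ins = now;
        return b->large;
}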
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+        struct bfq_entity *entity = &bfqq->entity;
+
+        return entity->budget - entity->service;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+        if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+                return bfq_default_max_budget;
+        else
+                return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+        if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+                return bfq_default_max_budget / 32;
+        else
+                return bfqd->bfq_max_budget / 32;
+}
+
|
|
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ bool compensate,
|
|
+ enum bfqq_expiration reason);
|
|
+
|
|
+/*
|
|
+ * The next function, invoked after the input queue bfqq switches from
|
|
+ * idle to busy, updates the budget of bfqq. The function also tells
|
|
+ * whether the in-service queue should be expired, by returning
|
|
+ * true. The purpose of expiring the in-service queue is to give bfqq
|
|
+ * the chance to possibly preempt the in-service queue, and the reason
|
|
+ * for preempting the in-service queue is to achieve one of the two
|
|
+ * goals below.
|
|
+ *
|
|
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
|
|
+ * expired because it has remained idle. In particular, bfqq may have
|
|
+ * expired for one of the following two reasons:
|
|
+ *
|
|
+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and
|
|
+ * did not make it to issue a new request before its last request
|
|
+ * was served;
|
|
+ *
|
|
+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
|
|
+ * a new request before the expiration of the idling-time.
|
|
+ *
|
|
+ * Even if bfqq has expired for one of the above reasons, the process
|
|
+ * associated with the queue may be however issuing requests greedily,
|
|
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
|
|
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
|
|
+ * idling, I/O throttling somewhere in the path from the process to
|
|
+ * the I/O scheduler, ...). But if, after every expiration for one of
|
|
+ * the above two reasons, bfqq has to wait for the service of at least
|
|
+ * one full budget of another queue before being served again, then
|
|
+ * bfqq is likely to get a much lower bandwidth or resource time than
|
|
+ * its reserved ones. To address this issue, two countermeasures need
|
|
+ * to be taken.
|
|
+ *
|
|
+ * First, the budget and the timestamps of bfqq need to be updated in
|
|
+ * a special way on bfqq reactivation: they need to be updated as if
|
|
+ * bfqq did not remain idle and did not expire. In fact, if they are
|
|
+ * computed as if bfqq expired and remained idle until reactivation,
|
|
+ * then the process associated with bfqq is treated as if, instead of
|
|
+ * being greedy, it stopped issuing requests when bfqq remained idle,
|
|
+ * and restarts issuing requests only on this reactivation. In other
|
|
+ * words, the scheduler does not help the process recover the "service
|
|
+ * hole" between bfqq expiration and reactivation. As a consequence,
|
|
+ * the process receives a lower bandwidth than its reserved one. In
|
|
+ * contrast, to recover this hole, the budget must be updated as if
|
|
+ * bfqq was not expired at all before this reactivation, i.e., it must
|
|
+ * be set to the value of the remaining budget when bfqq was
|
|
+ * expired. Along the same line, timestamps need to be assigned the
|
|
+ * value they had the last time bfqq was selected for service, i.e.,
|
|
+ * before last expiration. Thus timestamps need to be back-shifted
|
|
+ * with respect to their normal computation (see [1] for more details
|
|
+ * on this tricky aspect).
|
|
+ *
|
|
+ * Secondly, to allow the process to recover the hole, the in-service
|
|
+ * queue must be expired too, to give bfqq the chance to preempt it
|
|
+ * immediately. In fact, if bfqq has to wait for a full budget of the
|
|
+ * in-service queue to be completed, then it may become impossible to
|
|
+ * let the process recover the hole, even if the back-shifted
|
|
+ * timestamps of bfqq are lower than those of the in-service queue. If
|
|
+ * this happens for most or all of the holes, then the process may not
|
|
+ * receive its reserved bandwidth. In this respect, it is worth noting
|
|
+ * that, being the service of outstanding requests unpreemptible, a
|
|
+ * little fraction of the holes may however be unrecoverable, thereby
|
|
+ * causing a little loss of bandwidth.
|
|
+ *
|
|
+ * The last important point is detecting whether bfqq does need this
|
|
+ * bandwidth recovery. In this respect, the next function deems the
|
|
+ * process associated with bfqq greedy, and thus allows it to recover
|
|
+ * the hole, if: 1) the process is waiting for the arrival of a new
|
|
+ * request (which implies that bfqq expired for one of the above two
|
|
+ * reasons), and 2) such a request has arrived soon. The first
|
|
+ * condition is controlled through the flag non_blocking_wait_rq,
|
|
+ * while the second through the flag arrived_in_time. If both
|
|
+ * conditions hold, then the function computes the budget in the
|
|
+ * above-described special way, and signals that the in-service queue
|
|
+ * should be expired. Timestamp back-shifting is done later in
|
|
+ * __bfq_activate_entity.
|
|
+ *
|
|
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
|
|
+ * the process associated with bfqq recover a service hole, bfqq may
|
|
+ * however happen to have, after being (re)activated, a lower finish
|
|
+ * timestamp than the in-service queue. That is, the next budget of
|
|
+ * bfqq may have to be completed before the one of the in-service
|
|
+ * queue. If this is the case, then preempting the in-service queue
|
|
+ * allows this goal to be achieved, apart from the unpreemptible,
|
|
+ * outstanding requests mentioned above.
|
|
+ *
|
|
+ * Unfortunately, regardless of which of the above two goals one wants
|
|
+ * to achieve, service trees need first to be updated to know whether
|
|
+ * the in-service queue must be preempted. To have service trees
|
|
+ * correctly updated, the in-service queue must be expired and
|
|
+ * rescheduled, and bfqq must be scheduled too. This is one of the
|
|
+ * most costly operations (in future versions, the scheduling
|
|
+ * mechanism may be re-designed in such a way to make it possible to
|
|
+ * know whether preemption is needed without needing to update service
|
|
+ * trees). In addition, queue preemptions almost always cause random
|
|
+ * I/O, and thus loss of throughput. Because of these facts, the next
|
|
+ * function adopts the following simple scheme to avoid both costly
|
|
+ * operations and too frequent preemptions: it requests the expiration
|
|
+ * of the in-service queue (unconditionally) only for queues that need
|
|
+ * to recover a hole, or that either are weight-raised or deserve to
|
|
+ * be weight-raised.
|
|
+ */
|
|
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ bool arrived_in_time,
|
|
+ bool wr_or_deserves_wr)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
|
|
+ /*
|
|
+ * We do not clear the flag non_blocking_wait_rq here, as
|
|
+ * the latter is used in bfq_activate_bfqq to signal
|
|
+ * that timestamps need to be back-shifted (and is
|
|
+ * cleared right after).
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * In next assignment we rely on that either
|
|
+ * entity->service or entity->budget are not updated
|
|
+ * on expiration if bfqq is empty (see
|
|
+ * __bfq_bfqq_recalc_budget). Thus both quantities
|
|
+ * remain unchanged after such an expiration, and the
|
|
+ * following statement therefore assigns to
|
|
+ * entity->budget the remaining budget on such an
|
|
+ * expiration. For clarity, entity->service is not
|
|
+ * updated on expiration in any case, and, in normal
|
|
+ * operation, is reset only when bfqq is selected for
|
|
+ * service (see bfq_get_next_queue).
|
|
+ */
|
|
+ BUG_ON(bfqq->max_budget < 0);
|
|
+ entity->budget = min_t(unsigned long,
|
|
+ bfq_bfqq_budget_left(bfqq),
|
|
+ bfqq->max_budget);
|
|
+
|
|
+ BUG_ON(entity->budget < 0);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ BUG_ON(bfqq->max_budget < 0);
|
|
+ entity->budget = max_t(unsigned long, bfqq->max_budget,
|
|
+ bfq_serv_to_charge(bfqq->next_rq, bfqq));
|
|
+ BUG_ON(entity->budget < 0);
|
|
+
|
|
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
|
|
+ return wr_or_deserves_wr;
|
|
+}
|
|
+
|
|
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ unsigned int old_wr_coeff,
|
|
+ bool wr_or_deserves_wr,
|
|
+ bool interactive,
|
|
+ bool in_burst,
|
|
+ bool soft_rt)
|
|
+{
|
|
+ if (old_wr_coeff == 1 && wr_or_deserves_wr) {
|
|
+ /* start a weight-raising period */
|
|
+ if (interactive) {
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
|
|
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
|
|
+ } else {
|
|
+ bfqq->wr_start_at_switch_to_srt = jiffies;
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
|
|
+ BFQ_SOFTRT_WEIGHT_FACTOR;
|
|
+ bfqq->wr_cur_max_time =
|
|
+ bfqd->bfq_wr_rt_max_time;
|
|
+ }
|
|
+ /*
|
|
+ * If needed, further reduce budget to make sure it is
|
|
+ * close to bfqq's backlog, so as to reduce the
|
|
+ * scheduling-error component due to a too large
|
|
+ * budget. Do not care about throughput consequences,
|
|
+ * but only about latency. Finally, do not assign a
|
|
+ * too small budget either, to avoid increasing
|
|
+ * latency by causing too frequent expirations.
|
|
+ */
|
|
+ bfqq->entity.budget = min_t(unsigned long,
|
|
+ bfqq->entity.budget,
|
|
+ 2 * bfq_min_budget(bfqd));
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "wrais starting at %lu, rais_max_time %u",
|
|
+ jiffies,
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
|
|
+ } else if (old_wr_coeff > 1) {
|
|
+ if (interactive) { /* update wr coeff and duration */
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
|
|
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
|
|
+ } else if (in_burst) {
|
|
+ bfqq->wr_coeff = 1;
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "wrais ending at %lu, rais_max_time %u",
|
|
+ jiffies,
|
|
+ jiffies_to_msecs(bfqq->
|
|
+ wr_cur_max_time));
|
|
+ } else if (soft_rt) {
|
|
+ /*
|
|
+ * The application is now or still meeting the
|
|
+ * requirements for being deemed soft rt. We
|
|
+ * can then correctly and safely (re)charge
|
|
+ * the weight-raising duration for the
|
|
+ * application with the weight-raising
|
|
+ * duration for soft rt applications.
|
|
+ *
|
|
+ * In particular, doing this recharge now, i.e.,
|
|
+ * before the weight-raising period for the
|
|
+ * application finishes, reduces the probability
|
|
+ * of the following negative scenario:
|
|
+ * 1) the weight of a soft rt application is
|
|
+ * raised at startup (as for any newly
|
|
+ * created application),
|
|
+ * 2) since the application is not interactive,
|
|
+ * at a certain time weight-raising is
|
|
+ * stopped for the application,
|
|
+ * 3) at that time the application happens to
|
|
+ * still have pending requests, and hence
|
|
+ * is destined to not have a chance to be
|
|
+ * deemed soft rt before these requests are
|
|
+ * completed (see the comments to the
|
|
+ * function bfq_bfqq_softrt_next_start()
|
|
+ * for details on soft rt detection),
|
|
+ * 4) these pending requests experience a high
|
|
+ * latency because the application is not
|
|
+ * weight-raised while they are pending.
|
|
+ */
|
|
+ if (bfqq->wr_cur_max_time !=
|
|
+ bfqd->bfq_wr_rt_max_time) {
|
|
+ bfqq->wr_start_at_switch_to_srt =
|
|
+ bfqq->last_wr_start_finish;
|
|
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
|
|
+
|
|
+ bfqq->wr_cur_max_time =
|
|
+ bfqd->bfq_wr_rt_max_time;
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
|
|
+ BFQ_SOFTRT_WEIGHT_FACTOR;
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "switching to soft_rt wr");
|
|
+ } else
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "moving forward soft_rt wr duration");
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ return bfqq->dispatched == 0 &&
|
|
+ time_is_before_jiffies(
|
|
+ bfqq->budget_timeout +
|
|
+ bfqd->bfq_wr_min_idle_time);
|
|
+}
|
|
+
|
|
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ int old_wr_coeff,
|
|
+ struct request *rq,
|
|
+ bool *interactive)
|
|
+{
|
|
+ bool soft_rt, in_burst, wr_or_deserves_wr,
|
|
+ bfqq_wants_to_preempt,
|
|
+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
|
|
+ /*
|
|
+ * See the comments on
|
|
+ * bfq_bfqq_update_budg_for_activation for
|
|
+ * details on the usage of the next variable.
|
|
+ */
|
|
+ arrived_in_time = ktime_get_ns() <=
|
|
+ RQ_BIC(rq)->ttime.last_end_request +
|
|
+ bfqd->bfq_slice_idle * 3;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "bfq_add_request non-busy: "
|
|
+ "jiffies %lu, in_time %d, idle_long %d busyw %d "
|
|
+ "wr_coeff %u",
|
|
+ jiffies, arrived_in_time,
|
|
+ idle_for_long_time,
|
|
+ bfq_bfqq_non_blocking_wait_rq(bfqq),
|
|
+ old_wr_coeff);
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+
|
|
+ BUG_ON(bfqq == bfqd->in_service_queue);
|
|
+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
|
|
+ req_op(rq), rq->cmd_flags);
|
|
+
|
|
+ /*
|
|
+ * bfqq deserves to be weight-raised if:
|
|
+ * - it is sync,
|
|
+ * - it does not belong to a large burst,
|
|
+ * - it has been idle for enough time or is soft real-time,
|
|
+ * - is linked to a bfq_io_cq (it is not shared in any sense)
|
|
+ */
|
|
+ in_burst = bfq_bfqq_in_large_burst(bfqq);
|
|
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
|
|
+ !in_burst &&
|
|
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
|
|
+ *interactive =
|
|
+ !in_burst &&
|
|
+ idle_for_long_time;
|
|
+ wr_or_deserves_wr = bfqd->low_latency &&
|
|
+ (bfqq->wr_coeff > 1 ||
|
|
+ (bfq_bfqq_sync(bfqq) &&
|
|
+ bfqq->bic && (*interactive || soft_rt)));
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "bfq_add_request: "
|
|
+ "in_burst %d, "
|
|
+ "soft_rt %d (next %lu), inter %d, bic %p",
|
|
+ bfq_bfqq_in_large_burst(bfqq), soft_rt,
|
|
+ bfqq->soft_rt_next_start,
|
|
+ *interactive,
|
|
+ bfqq->bic);
|
|
+
|
|
+ /*
|
|
+ * Using the last flag, update budget and check whether bfqq
|
|
+ * may want to preempt the in-service queue.
|
|
+ */
|
|
+ bfqq_wants_to_preempt =
|
|
+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
|
|
+ arrived_in_time,
|
|
+ wr_or_deserves_wr);
|
|
+
|
|
+ /*
|
|
+ * If bfqq happened to be activated in a burst, but has been
|
|
+ * idle for much more than an interactive queue, then we
|
|
+ * assume that, in the overall I/O initiated in the burst, the
|
|
+ * I/O associated with bfqq is finished. So bfqq does not need
|
|
+ * to be treated as a queue belonging to a burst
|
|
+ * anymore. Accordingly, we reset bfqq's in_large_burst flag
|
|
+ * if set, and remove bfqq from the burst list if it's
|
|
+ * there. We do not decrement burst_size, because the fact
|
|
+ * that bfqq does not need to belong to the burst list any
|
|
+ * more does not invalidate the fact that bfqq was created in
|
|
+ * a burst.
|
|
+ */
|
|
+ if (likely(!bfq_bfqq_just_created(bfqq)) &&
|
|
+ idle_for_long_time &&
|
|
+ time_is_before_jiffies(
|
|
+ bfqq->budget_timeout +
|
|
+ msecs_to_jiffies(10000))) {
|
|
+ hlist_del_init(&bfqq->burst_list_node);
|
|
+ bfq_clear_bfqq_in_large_burst(bfqq);
|
|
+ }
|
|
+
|
|
+ bfq_clear_bfqq_just_created(bfqq);
|
|
+
|
|
+ if (!bfq_bfqq_IO_bound(bfqq)) {
|
|
+ if (arrived_in_time) {
|
|
+ bfqq->requests_within_timer++;
|
|
+ if (bfqq->requests_within_timer >=
|
|
+ bfqd->bfq_requests_within_timer)
|
|
+ bfq_mark_bfqq_IO_bound(bfqq);
|
|
+ } else
|
|
+ bfqq->requests_within_timer = 0;
|
|
+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
|
|
+ bfqq->requests_within_timer);
|
|
+ }
|
|
+
|
|
+ if (bfqd->low_latency) {
|
|
+ if (unlikely(time_is_after_jiffies(bfqq->split_time)))
|
|
+ /* wraparound */
|
|
+ bfqq->split_time =
|
|
+ jiffies - bfqd->bfq_wr_min_idle_time - 1;
|
|
+
|
|
+ if (time_is_before_jiffies(bfqq->split_time +
|
|
+ bfqd->bfq_wr_min_idle_time)) {
|
|
+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
|
|
+ old_wr_coeff,
|
|
+ wr_or_deserves_wr,
|
|
+ *interactive,
|
|
+ in_burst,
|
|
+ soft_rt);
|
|
+
|
|
+ if (old_wr_coeff != bfqq->wr_coeff)
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfqq->last_idle_bklogged = jiffies;
|
|
+ bfqq->service_from_backlogged = 0;
|
|
+ bfq_clear_bfqq_softrt_update(bfqq);
|
|
+
|
|
+ bfq_add_bfqq_busy(bfqd, bfqq);
|
|
+
|
|
+ /*
|
|
+ * Expire in-service queue only if preemption may be needed
|
|
+ * for guarantees. In this respect, the function
|
|
+ * next_queue_may_preempt just checks a simple, necessary
|
|
+ * condition, and not a sufficient condition based on
|
|
+ * timestamps. In fact, for the latter condition to be
|
|
+ * evaluated, timestamps would need first to be updated, and
|
|
+ * this operation is quite costly (see the comments on the
|
|
+ * function bfq_bfqq_update_budg_for_activation).
|
|
+ */
|
|
+ if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
|
|
+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
|
|
+ next_queue_may_preempt(bfqd)) {
|
|
+ struct bfq_queue *in_serv =
|
|
+ bfqd->in_service_queue;
|
|
+ BUG_ON(in_serv == bfqq);
|
|
+
|
|
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
|
|
+ false, BFQ_BFQQ_PREEMPTED);
|
|
+ BUG_ON(in_serv->entity.budget < 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bfq_add_request(struct request *rq)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+ struct request *next_rq, *prev;
|
|
+ unsigned int old_wr_coeff = bfqq->wr_coeff;
|
|
+ bool interactive = false;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
|
|
+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
|
|
+
|
|
+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
|
|
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
|
|
+ bfqq->wr_coeff,
|
|
+ bfqq->entity.weight, bfqq->entity.orig_weight);
|
|
+
|
|
+ bfqq->queued[rq_is_sync(rq)]++;
|
|
+ bfqd->queued++;
|
|
+
|
|
+ elv_rb_add(&bfqq->sort_list, rq);
|
|
+
|
|
+ /*
|
|
+ * Check if this request is a better next-to-serve candidate.
|
|
+ */
|
|
+ prev = bfqq->next_rq;
|
|
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
|
|
+ BUG_ON(!next_rq);
|
|
+ bfqq->next_rq = next_rq;
|
|
+
|
|
+ /*
|
|
+ * Adjust priority tree position, if next_rq changes.
|
|
+ */
|
|
+ if (prev != bfqq->next_rq)
|
|
+ bfq_pos_tree_add_move(bfqd, bfqq);
|
|
+
|
|
+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
|
|
+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
|
|
+ rq, &interactive);
|
|
+ else {
|
|
+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
|
|
+ time_is_before_jiffies(
|
|
+ bfqq->last_wr_start_finish +
|
|
+ bfqd->bfq_wr_min_inter_arr_async)) {
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
|
|
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
|
|
+
|
|
+ bfqd->wr_busy_queues++;
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "non-idle wrais starting, "
|
|
+ "wr_max_time %u wr_busy %d",
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
|
|
+ bfqd->wr_busy_queues);
|
|
+ }
|
|
+ if (prev != bfqq->next_rq)
|
|
+ bfq_updated_next_req(bfqd, bfqq);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Assign jiffies to last_wr_start_finish in the following
|
|
+ * cases:
|
|
+ *
|
|
+ * . if bfqq is not going to be weight-raised, because, for
|
|
+ * non weight-raised queues, last_wr_start_finish stores the
|
|
+ * arrival time of the last request; as of now, this piece
|
|
+ * of information is used only for deciding whether to
|
|
+ * weight-raise async queues
|
|
+ *
|
|
+ * . if bfqq is not weight-raised, because, if bfqq is now
|
|
+ * switching to weight-raised, then last_wr_start_finish
|
|
+ * stores the time when weight-raising starts
|
|
+ *
|
|
+ * . if bfqq is interactive, because, regardless of whether
|
|
+ * bfqq is currently weight-raised, the weight-raising
|
|
+ * period must start or restart (this case is considered
|
|
+ * separately because it is not detected by the above
|
|
+ * conditions, if bfqq is already weight-raised)
|
|
+ *
|
|
+ * last_wr_start_finish has to be updated also if bfqq is soft
|
|
+ * real-time, because the weight-raising period is constantly
|
|
+ * restarted on idle-to-busy transitions for these queues, but
|
|
+ * this is already done in bfq_bfqq_handle_idle_busy_switch if
|
|
+ * needed.
|
|
+ */
|
|
+ if (bfqd->low_latency &&
|
|
+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+}
|
|
+
|
|
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
|
|
+ struct bio *bio)
|
|
+{
|
|
+ struct task_struct *tsk = current;
|
|
+ struct bfq_io_cq *bic;
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
|
|
+ if (!bic)
|
|
+ return NULL;
|
|
+
|
|
+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
|
|
+ if (bfqq)
|
|
+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
|
|
+{
|
|
+ sector_t sdist = 0;
|
|
+
|
|
+ if (last_pos) {
|
|
+ if (last_pos < blk_rq_pos(rq))
|
|
+ sdist = blk_rq_pos(rq) - last_pos;
|
|
+ else
|
|
+ sdist = last_pos - blk_rq_pos(rq);
|
|
+ }
|
|
+
|
|
+ return sdist;
|
|
+}
|
|
+
|
|
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ bfqd->rq_in_driver++;
|
|
+}
|
|
+
|
|
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+
|
|
+ BUG_ON(bfqd->rq_in_driver == 0);
|
|
+ bfqd->rq_in_driver--;
|
|
+}
|
|
+
|
|
+static void bfq_remove_request(struct request *rq)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+ const int sync = rq_is_sync(rq);
|
|
+
|
|
+ BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
|
|
+ bfqq == bfqd->in_service_queue);
|
|
+
|
|
+ if (bfqq->next_rq == rq) {
|
|
+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
|
|
+ bfq_updated_next_req(bfqd, bfqq);
|
|
+ }
|
|
+
|
|
+ if (rq->queuelist.prev != &rq->queuelist)
|
|
+ list_del_init(&rq->queuelist);
|
|
+ BUG_ON(bfqq->queued[sync] == 0);
|
|
+ bfqq->queued[sync]--;
|
|
+ bfqd->queued--;
|
|
+ elv_rb_del(&bfqq->sort_list, rq);
|
|
+
|
|
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
|
|
+ bfqq->next_rq = NULL;
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < 0);
|
|
+
|
|
+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
|
|
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
|
|
+ /* bfqq emptied. In normal operation, when
|
|
+ * bfqq is empty, bfqq->entity.service and
|
|
+ * bfqq->entity.budget must contain,
|
|
+ * respectively, the service received and the
|
|
+ * budget used last time bfqq emptied. These
|
|
+ * facts do not hold in this case, as at least
|
|
+ * this last removal occurred while bfqq is
|
|
+ * not in service. To avoid inconsistencies,
|
|
+ * reset both bfqq->entity.service and
|
|
+ * bfqq->entity.budget.
|
|
+ */
|
|
+ bfqq->entity.budget = bfqq->entity.service = 0;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Remove queue from request-position tree as it is empty.
|
|
+ */
|
|
+ if (bfqq->pos_root) {
|
|
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
|
|
+ bfqq->pos_root = NULL;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (rq->cmd_flags & REQ_META) {
|
|
+ BUG_ON(bfqq->meta_pending == 0);
|
|
+ bfqq->meta_pending--;
|
|
+ }
|
|
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
|
|
+ rq->cmd_flags);
|
|
+}
|
|
+
|
|
+static int bfq_merge(struct request_queue *q, struct request **req,
|
|
+ struct bio *bio)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct request *__rq;
|
|
+
|
|
+ __rq = bfq_find_rq_fmerge(bfqd, bio);
|
|
+ if (__rq && elv_bio_merge_ok(__rq, bio)) {
|
|
+ *req = __rq;
|
|
+ return ELEVATOR_FRONT_MERGE;
|
|
+ }
|
|
+
|
|
+ return ELEVATOR_NO_MERGE;
|
|
+}
|
|
+
|
|
+static void bfq_merged_request(struct request_queue *q, struct request *req,
|
|
+ int type)
|
|
+{
|
|
+ if (type == ELEVATOR_FRONT_MERGE &&
|
|
+ rb_prev(&req->rb_node) &&
|
|
+ blk_rq_pos(req) <
|
|
+ blk_rq_pos(container_of(rb_prev(&req->rb_node),
|
|
+ struct request, rb_node))) {
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+ struct request *prev, *next_rq;
|
|
+
|
|
+ /* Reposition request in its sort_list */
|
|
+ elv_rb_del(&bfqq->sort_list, req);
|
|
+ elv_rb_add(&bfqq->sort_list, req);
|
|
+ /* Choose next request to be served for bfqq */
|
|
+ prev = bfqq->next_rq;
|
|
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
|
|
+ bfqd->last_position);
|
|
+ BUG_ON(!next_rq);
|
|
+ bfqq->next_rq = next_rq;
|
|
+ /*
|
|
+ * If next_rq changes, update both the queue's budget to
|
|
+ * fit the new request and the queue's position in its
|
|
+ * rq_pos_tree.
|
|
+ */
|
|
+ if (prev != bfqq->next_rq) {
|
|
+ bfq_updated_next_req(bfqd, bfqq);
|
|
+ bfq_pos_tree_add_move(bfqd, bfqq);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
|
|
+ struct bio *bio)
|
|
+{
|
|
+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
|
|
+ bio->bi_opf);
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
|
|
+ struct request *next)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
|
|
+
|
|
+ /*
|
|
+ * If next and rq belong to the same bfq_queue and next is older
|
|
+ * than rq, then reposition rq in the fifo (by substituting next
|
|
+ * with rq). Otherwise, if next and rq belong to different
|
|
+ * bfq_queues, never reposition rq: in fact, we would have to
|
|
+ * reposition it with respect to next's position in its own fifo,
|
|
+ * which would most certainly be too expensive with respect to
|
|
+ * the benefits.
|
|
+ */
|
|
+ if (bfqq == next_bfqq &&
|
|
+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
|
|
+ next->fifo_time < rq->fifo_time) {
|
|
+ list_del_init(&rq->queuelist);
|
|
+ list_replace_init(&next->queuelist, &rq->queuelist);
|
|
+ rq->fifo_time = next->fifo_time;
|
|
+ }
|
|
+
|
|
+ if (bfqq->next_rq == next)
|
|
+ bfqq->next_rq = rq;
|
|
+
|
|
+ bfq_remove_request(next);
|
|
+ bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
|
|
+ next->cmd_flags);
|
|
+}
|
|
+
|
|
+/* Must be called with bfqq != NULL */
|
|
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
|
|
+{
|
|
+ BUG_ON(!bfqq);
|
|
+
|
|
+ if (bfq_bfqq_busy(bfqq))
|
|
+ bfqq->bfqd->wr_busy_queues--;
|
|
+ bfqq->wr_coeff = 1;
|
|
+ bfqq->wr_cur_max_time = 0;
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+ /*
|
|
+ * Trigger a weight change on the next invocation of
|
|
+ * __bfq_entity_update_weight_prio.
|
|
+ */
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "end_wr: wrais ending at %lu, rais_max_time %u",
|
|
+ bfqq->last_wr_start_finish,
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
|
|
+ bfqq->bfqd->wr_busy_queues);
|
|
+}
|
|
+
|
|
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
|
|
+ struct bfq_group *bfqg)
|
|
+{
|
|
+ int i, j;
|
|
+
|
|
+ for (i = 0; i < 2; i++)
|
|
+ for (j = 0; j < IOPRIO_BE_NR; j++)
|
|
+ if (bfqg->async_bfqq[i][j])
|
|
+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
|
|
+ if (bfqg->async_idle_bfqq)
|
|
+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
|
|
+}
|
|
+
|
|
+static void bfq_end_wr(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ spin_lock_irq(bfqd->queue->queue_lock);
|
|
+
|
|
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
|
|
+ bfq_bfqq_end_wr(bfqq);
|
|
+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
|
|
+ bfq_bfqq_end_wr(bfqq);
|
|
+ bfq_end_wr_async(bfqd);
|
|
+
|
|
+ spin_unlock_irq(bfqd->queue->queue_lock);
|
|
+}
|
|
+
|
|
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
|
|
+{
|
|
+ if (request)
|
|
+ return blk_rq_pos(io_struct);
|
|
+ else
|
|
+ return ((struct bio *)io_struct)->bi_iter.bi_sector;
|
|
+}
|
|
+
|
|
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
|
|
+ sector_t sector)
|
|
+{
|
|
+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
|
|
+ BFQQ_CLOSE_THR;
|
|
+}
|
|
+
|
|
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ sector_t sector)
|
|
+{
|
|
+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
|
|
+ struct rb_node *parent, *node;
|
|
+ struct bfq_queue *__bfqq;
|
|
+
|
|
+ if (RB_EMPTY_ROOT(root))
|
|
+ return NULL;
|
|
+
|
|
+ /*
|
|
+ * First, if we find a request starting at the end of the last
|
|
+ * request, choose it.
|
|
+ */
|
|
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
|
|
+ if (__bfqq)
|
|
+ return __bfqq;
|
|
+
|
|
+ /*
|
|
+ * If the exact sector wasn't found, the parent of the NULL leaf
|
|
+ * will contain the closest sector (rq_pos_tree sorted by
|
|
+ * next_request position).
|
|
+ */
|
|
+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
|
|
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
|
|
+ return __bfqq;
|
|
+
|
|
+ if (blk_rq_pos(__bfqq->next_rq) < sector)
|
|
+ node = rb_next(&__bfqq->pos_node);
|
|
+ else
|
|
+ node = rb_prev(&__bfqq->pos_node);
|
|
+ if (!node)
|
|
+ return NULL;
|
|
+
|
|
+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
|
|
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
|
|
+ return __bfqq;
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *cur_bfqq,
|
|
+ sector_t sector)
|
|
+{
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ /*
|
|
+ * We shall notice if some of the queues are cooperating,
|
|
+ * e.g., working closely on the same area of the device. In
|
|
+ * that case, we can group them together and: 1) don't waste
|
|
+ * time idling, and 2) serve the union of their requests in
|
|
+ * the best possible order for throughput.
|
|
+ */
|
|
+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
|
|
+ if (!bfqq || bfqq == cur_bfqq)
|
|
+ return NULL;
|
|
+
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static struct bfq_queue *
|
|
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
|
|
+{
|
|
+ int process_refs, new_process_refs;
|
|
+ struct bfq_queue *__bfqq;
|
|
+
|
|
+ /*
|
|
+ * If there are no process references on the new_bfqq, then it is
|
|
+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
|
|
+ * may have dropped their last reference (not just their last process
|
|
+ * reference).
|
|
+ */
|
|
+ if (!bfqq_process_refs(new_bfqq))
|
|
+ return NULL;
|
|
+
|
|
+ /* Avoid a circular list and skip interim queue merges. */
|
|
+ while ((__bfqq = new_bfqq->new_bfqq)) {
|
|
+ if (__bfqq == bfqq)
|
|
+ return NULL;
|
|
+ new_bfqq = __bfqq;
|
|
+ }
|
|
+
|
|
+ process_refs = bfqq_process_refs(bfqq);
|
|
+ new_process_refs = bfqq_process_refs(new_bfqq);
|
|
+ /*
|
|
+ * If the process for the bfqq has gone away, there is no
|
|
+ * sense in merging the queues.
|
|
+ */
|
|
+ if (process_refs == 0 || new_process_refs == 0)
|
|
+ return NULL;
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
|
|
+ new_bfqq->pid);
|
|
+
|
|
+ /*
|
|
+ * Merging is just a redirection: the requests of the process
|
|
+ * owning one of the two queues are redirected to the other queue.
|
|
+ * The latter queue, in its turn, is set as shared if this is the
|
|
+ * first time that the requests of some process are redirected to
|
|
+ * it.
|
|
+ *
|
|
+ * We redirect bfqq to new_bfqq and not the opposite, because we
|
|
+ * are in the context of the process owning bfqq, hence we have
|
|
+ * the io_cq of this process. So we can immediately configure this
|
|
+ * io_cq to redirect the requests of the process to new_bfqq.
|
|
+ *
|
|
+ * NOTE, even if new_bfqq coincides with the in-service queue, the
|
|
+ * io_cq of new_bfqq is not available, because, if the in-service
|
|
+ * queue is shared, bfqd->in_service_bic may not point to the
|
|
+ * io_cq of the in-service queue.
|
|
+ * Redirecting the requests of the process owning bfqq to the
|
|
+ * currently in-service queue is in any case the best option, as
|
|
+ * we feed the in-service queue with new requests close to the
|
|
+ * last request served and, by doing so, hopefully increase the
|
|
+ * throughput.
|
|
+ */
|
|
+ bfqq->new_bfqq = new_bfqq;
|
|
+ new_bfqq->ref += process_refs;
|
|
+ return new_bfqq;
|
|
+}
|
|
+
|
|
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
|
|
+ struct bfq_queue *new_bfqq)
|
|
+{
|
|
+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
|
|
+ (bfqq->ioprio_class != new_bfqq->ioprio_class))
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * If either of the queues has already been detected as seeky,
|
|
+ * then merging it with the other queue is unlikely to lead to
|
|
+ * sequential I/O.
|
|
+ */
|
|
+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * Interleaved I/O is known to be done by (some) applications
|
|
+ * only for reads, so it does not make sense to merge async
|
|
+ * queues.
|
|
+ */
|
|
+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If this function returns true, then bfqq cannot be merged. The idea
|
|
+ * is that true cooperation happens very early after processes start
|
|
+ * to do I/O. Usually, late cooperations are just accidental false
|
|
+ * positives. In case bfqq is weight-raised, such false positives
|
|
+ * would evidently degrade latency guarantees for bfqq.
|
|
+ */
|
|
+static bool wr_from_too_long(struct bfq_queue *bfqq)
|
|
+{
|
|
+ return bfqq->wr_coeff > 1 &&
|
|
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
|
|
+ msecs_to_jiffies(100));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Attempt to schedule a merge of bfqq with the currently in-service
|
|
+ * queue or with a close queue among the scheduled queues. Return
|
|
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
|
|
+ * structure otherwise.
|
|
+ *
|
|
+ * The OOM queue is not allowed to participate to cooperation: in fact, since
|
|
+ * the requests temporarily redirected to the OOM queue could be redirected
|
|
+ * again to dedicated queues at any time, the state needed to correctly
|
|
+ * handle merging with the OOM queue would be quite complex and expensive
|
|
+ * to maintain. Besides, in such a critical condition as an out of memory,
|
|
+ * the benefits of queue merging may be little relevant, or even negligible.
|
|
+ *
|
|
+ * Weight-raised queues can be merged only if their weight-raising
|
|
+ * period has just started. In fact cooperating processes are usually
|
|
+ * started together. Thus, with this filter we avoid false positives
|
|
+ * that would jeopardize low-latency guarantees.
|
|
+ *
|
|
+ * WARNING: queue merging may impair fairness among non-weight raised
|
|
+ * queues, for at least two reasons: 1) the original weight of a
|
|
+ * merged queue may change during the merged state, 2) even being the
|
|
+ * weight the same, a merged queue may be bloated with many more
|
|
+ * requests than the ones produced by its originally-associated
|
|
+ * process.
|
|
+ */
|
|
+static struct bfq_queue *
|
|
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ void *io_struct, bool request)
|
|
+{
|
|
+ struct bfq_queue *in_service_bfqq, *new_bfqq;
|
|
+
|
|
+ if (bfqq->new_bfqq)
|
|
+ return bfqq->new_bfqq;
|
|
+
|
|
+ if (io_struct && wr_from_too_long(bfqq) &&
|
|
+ likely(bfqq != &bfqd->oom_bfqq))
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "would have looked for coop, but bfq%d wr",
|
|
+ bfqq->pid);
|
|
+
|
|
+ if (!io_struct ||
|
|
+ wr_from_too_long(bfqq) ||
|
|
+ unlikely(bfqq == &bfqd->oom_bfqq))
|
|
+ return NULL;
|
|
+
|
|
+ /* If there is only one backlogged queue, don't search. */
|
|
+ if (bfqd->busy_queues == 1)
|
|
+ return NULL;
|
|
+
|
|
+ in_service_bfqq = bfqd->in_service_queue;
|
|
+
|
|
+ if (in_service_bfqq && in_service_bfqq != bfqq &&
|
|
+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
|
|
+ && likely(in_service_bfqq == &bfqd->oom_bfqq))
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "would have tried merge with in-service-queue, but wr");
|
|
+
|
|
+ if (!in_service_bfqq || in_service_bfqq == bfqq ||
|
|
+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
|
|
+ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
|
|
+ goto check_scheduled;
|
|
+
|
|
+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
|
|
+ bfqq->entity.parent == in_service_bfqq->entity.parent &&
|
|
+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
|
|
+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
|
|
+ if (new_bfqq)
|
|
+ return new_bfqq;
|
|
+ }
|
|
+ /*
|
|
+ * Check whether there is a cooperator among currently scheduled
|
|
+ * queues. The only thing we need is that the bio/request is not
|
|
+ * NULL, as we need it to establish whether a cooperator exists.
|
|
+ */
|
|
+check_scheduled:
|
|
+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
|
|
+ bfq_io_struct_pos(io_struct, request));
|
|
+
|
|
+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
|
|
+
|
|
+ if (new_bfqq && wr_from_too_long(new_bfqq) &&
|
|
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
|
|
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "would have merged with bfq%d, but wr",
|
|
+ new_bfqq->pid);
|
|
+
|
|
+ if (new_bfqq && !wr_from_too_long(new_bfqq) &&
|
|
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
|
|
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
|
|
+ return bfq_setup_merge(bfqq, new_bfqq);
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_io_cq *bic = bfqq->bic;
|
|
+
|
|
+ /*
|
|
+ * If !bfqq->bic, the queue is already shared or its requests
|
|
+ * have already been redirected to a shared queue; both idle window
|
|
+ * and weight raising state have already been saved. Do nothing.
|
|
+ */
|
|
+ if (!bic)
|
|
+ return;
|
|
+
|
|
+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
|
|
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
|
|
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
|
|
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
|
|
+ bic->saved_wr_coeff = bfqq->wr_coeff;
|
|
+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
|
|
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
|
|
+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
|
|
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
|
|
+}
|
|
+
|
|
+static void bfq_get_bic_reference(struct bfq_queue *bfqq)
|
|
+{
|
|
+ /*
|
|
+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
|
|
+ * is about to begin using a shared bfq_queue.
|
|
+ */
|
|
+ if (bfqq->bic)
|
|
+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
|
|
+}
|
|
+
|
|
+static void
|
|
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
|
|
+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
|
|
+{
|
|
+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
|
|
+ (unsigned long) new_bfqq->pid);
|
|
+ /* Save weight raising and idle window of the merged queues */
|
|
+ bfq_bfqq_save_state(bfqq);
|
|
+ bfq_bfqq_save_state(new_bfqq);
|
|
+ if (bfq_bfqq_IO_bound(bfqq))
|
|
+ bfq_mark_bfqq_IO_bound(new_bfqq);
|
|
+ bfq_clear_bfqq_IO_bound(bfqq);
|
|
+
|
|
+ /*
|
|
+ * If bfqq is weight-raised, then let new_bfqq inherit
|
|
+ * weight-raising. To reduce false positives, neglect the case
|
|
+ * where bfqq has just been created, but has not yet made it
|
|
+ * to be weight-raised (which may happen because EQM may merge
|
|
+ * bfqq even before bfq_add_request is executed for the first
|
|
+ * time for bfqq). Handling this case would however be very
|
|
+ * easy, thanks to the flag just_created.
|
|
+ */
|
|
+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
|
|
+ new_bfqq->wr_coeff = bfqq->wr_coeff;
|
|
+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
|
|
+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
|
|
+ new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
|
|
+ if (bfq_bfqq_busy(new_bfqq))
|
|
+ bfqd->wr_busy_queues++;
|
|
+ new_bfqq->entity.prio_changed = 1;
|
|
+ bfq_log_bfqq(bfqd, new_bfqq,
|
|
+ "wr start after merge with %d, rais_max_time %u",
|
|
+ bfqq->pid,
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
|
|
+ }
|
|
+
|
|
+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
|
|
+ bfqq->wr_coeff = 1;
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ if (bfq_bfqq_busy(bfqq))
|
|
+ bfqd->wr_busy_queues--;
|
|
+ }
|
|
+
|
|
+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
|
|
+ bfqd->wr_busy_queues);
|
|
+
|
|
+ /*
|
|
+ * Grab a reference to the bic, to prevent it from being destroyed
|
|
+ * before being possibly touched by a bfq_split_bfqq().
|
|
+ */
|
|
+ bfq_get_bic_reference(bfqq);
|
|
+ bfq_get_bic_reference(new_bfqq);
|
|
+ /*
|
|
+ * Merge queues (that is, let bic redirect its requests to new_bfqq)
|
|
+ */
|
|
+ bic_set_bfqq(bic, new_bfqq, 1);
|
|
+ bfq_mark_bfqq_coop(new_bfqq);
|
|
+ /*
|
|
+ * new_bfqq now belongs to at least two bics (it is a shared queue):
|
|
+ * set new_bfqq->bic to NULL. bfqq either:
|
|
+ * - does not belong to any bic any more, and hence bfqq->bic must
|
|
+ * be set to NULL, or
|
|
+ * - is a queue whose owning bics have already been redirected to a
|
|
+ * different queue, hence the queue is destined to not belong to
|
|
+ * any bic soon and bfqq->bic is already NULL (therefore the next
|
|
+ * assignment causes no harm).
|
|
+ */
|
|
+ new_bfqq->bic = NULL;
|
|
+ bfqq->bic = NULL;
|
|
+ bfq_put_queue(bfqq);
|
|
+}
|
|
+
|
|
+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
|
|
+ struct bio *bio)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct bfq_io_cq *bic;
|
|
+ struct bfq_queue *bfqq, *new_bfqq;
|
|
+
|
|
+ /*
|
|
+ * Disallow merge of a sync bio into an async request.
|
|
+ */
|
|
+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * Lookup the bfqq that this bio will be queued with. Allow
|
|
+ * merge only if rq is queued there.
|
|
+ * Queue lock is held here.
|
|
+ */
|
|
+ bic = bfq_bic_lookup(bfqd, current->io_context);
|
|
+ if (!bic)
|
|
+ return false;
|
|
+
|
|
+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
|
|
+ /*
|
|
+ * We take advantage of this function to perform an early merge
|
|
+ * of the queues of possible cooperating processes.
|
|
+ */
|
|
+ if (bfqq) {
|
|
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
|
|
+ if (new_bfqq) {
|
|
+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
|
|
+ /*
|
|
+ * If we get here, the bio will be queued in the
|
|
+ * shared queue, i.e., new_bfqq, so use new_bfqq
|
|
+ * to decide whether bio and rq can be merged.
|
|
+ */
|
|
+ bfqq = new_bfqq;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return bfqq == RQ_BFQQ(rq);
|
|
+}
|
|
+
|
|
+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
|
|
+ struct request *next)
|
|
+{
|
|
+ return RQ_BFQQ(rq) == RQ_BFQQ(next);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Set the maximum time for the in-service queue to consume its
|
|
+ * budget. This prevents seeky processes from lowering the throughput.
|
|
+ * In practice, a time-slice service scheme is used with seeky
|
|
+ * processes.
|
|
+ */
|
|
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ unsigned int timeout_coeff;
|
|
+
|
|
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
|
|
+ timeout_coeff = 1;
|
|
+ else
|
|
+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
|
|
+
|
|
+ bfqd->last_budget_start = ktime_get();
|
|
+
|
|
+ bfqq->budget_timeout = jiffies +
|
|
+ bfqd->bfq_timeout * timeout_coeff;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
|
|
+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
|
|
+}
|
|
+
|
|
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ if (bfqq) {
|
|
+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
|
|
+ bfq_mark_bfqq_must_alloc(bfqq);
|
|
+ bfq_clear_bfqq_fifo_expire(bfqq);
|
|
+
|
|
+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
|
|
+
|
|
+ BUG_ON(bfqq == bfqd->in_service_queue);
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+
|
|
+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
|
|
+ bfqq->wr_coeff > 1 &&
|
|
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
|
|
+ time_is_before_jiffies(bfqq->budget_timeout)) {
|
|
+ /*
|
|
+ * For soft real-time queues, move the start
|
|
+ * of the weight-raising period forward by the
|
|
+ * time the queue has not received any
|
|
+ * service. Otherwise, a relatively long
|
|
+ * service delay is likely to cause the
|
|
+ * weight-raising period of the queue to end,
|
|
+ * because of the short duration of the
|
|
+ * weight-raising period of a soft real-time
|
|
+ * queue. It is worth noting that this move
|
|
+ * is not so dangerous for the other queues,
|
|
+ * because soft real-time queues are not
|
|
+ * greedy.
|
|
+ *
|
|
+ * To not add a further variable, we use the
|
|
+ * overloaded field budget_timeout to
|
|
+ * determine for how long the queue has not
|
|
+ * received service, i.e., how much time has
|
|
+ * elapsed since the queue expired. However,
|
|
+ * this is a little imprecise, because
|
|
+ * budget_timeout is set to jiffies if bfqq
|
|
+ * not only expires, but also remains with no
|
|
+ * request.
|
|
+ */
|
|
+ if (time_after(bfqq->budget_timeout,
|
|
+ bfqq->last_wr_start_finish))
|
|
+ bfqq->last_wr_start_finish +=
|
|
+ jiffies - bfqq->budget_timeout;
|
|
+ else
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+
|
|
+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
|
|
+ pr_crit(
|
|
+ "BFQ WARNING:last %lu budget %lu jiffies %lu",
|
|
+ bfqq->last_wr_start_finish,
|
|
+ bfqq->budget_timeout,
|
|
+ jiffies);
|
|
+ pr_crit("diff %lu", jiffies -
|
|
+ max_t(unsigned long,
|
|
+ bfqq->last_wr_start_finish,
|
|
+ bfqq->budget_timeout));
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfq_set_budget_timeout(bfqd, bfqq);
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "set_in_service_queue, cur-budget = %d",
|
|
+ bfqq->entity.budget);
|
|
+ } else
|
|
+ bfq_log(bfqd, "set_in_service_queue: NULL");
|
|
+
|
|
+ bfqd->in_service_queue = bfqq;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get and set a new queue for service.
|
|
+ */
|
|
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
|
|
+
|
|
+ __bfq_set_in_service_queue(bfqd, bfqq);
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
|
|
+ struct bfq_io_cq *bic;
|
|
+ u32 sl;
|
|
+
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+
|
|
+ /* Processes have exited, don't wait. */
|
|
+ bic = bfqd->in_service_bic;
|
|
+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
|
|
+ return;
|
|
+
|
|
+ bfq_mark_bfqq_wait_request(bfqq);
|
|
+
|
|
+ /*
|
|
+ * We don't want to idle for seeks, but we do want to allow
|
|
+ * fair distribution of slice time for a process doing back-to-back
|
|
+ * seeks. So allow a little bit of time for him to submit a new rq.
|
|
+ *
|
|
+ * To prevent processes with (partly) seeky workloads from
|
|
+ * being too ill-treated, grant them a small fraction of the
|
|
+ * assigned budget before reducing the waiting time to
|
|
+ * BFQ_MIN_TT. This happened to help reduce latency.
|
|
+ */
|
|
+ sl = bfqd->bfq_slice_idle;
|
|
+ /*
|
|
+ * Unless the queue is being weight-raised or the scenario is
|
|
+ * asymmetric, grant only minimum idle time if the queue
|
|
+ * is seeky. A long idling is preserved for a weight-raised
|
|
+ * queue, or, more in general, in an asymemtric scenario,
|
|
+ * because a long idling is needed for guaranteeing to a queue
|
|
+ * its reserved share of the throughput (in particular, it is
|
|
+ * needed if the queue has a higher weight than some other
|
|
+ * queue).
|
|
+ */
|
|
+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
|
|
+ bfq_symmetric_scenario(bfqd))
|
|
+ sl = min_t(u32, sl, BFQ_MIN_TT);
|
|
+
|
|
+ bfqd->last_idling_start = ktime_get();
|
|
+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
|
|
+ HRTIMER_MODE_REL);
|
|
+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
|
|
+ bfq_log(bfqd, "arm idle: %ld/%ld ms",
|
|
+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * In autotuning mode, max_budget is dynamically recomputed as the
|
|
+ * amount of sectors transferred in timeout at the estimated peak
|
|
+ * rate. This enables BFQ to utilize a full timeslice with a full
|
|
+ * budget, even if the in-service queue is served at peak rate. And
|
|
+ * this maximises throughput with sequential workloads.
|
|
+ */
|
|
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
|
|
+{
|
|
+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
|
|
+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update parameters related to throughput and responsiveness, as a
|
|
+ * function of the estimated peak rate. See comments on
|
|
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
|
|
+ */
|
|
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
|
|
+{
|
|
+ int dev_type = blk_queue_nonrot(bfqd->queue);
|
|
+
|
|
+ if (bfqd->bfq_user_max_budget == 0) {
|
|
+ bfqd->bfq_max_budget =
|
|
+ bfq_calc_max_budget(bfqd);
|
|
+ BUG_ON(bfqd->bfq_max_budget < 0);
|
|
+ bfq_log(bfqd, "new max_budget = %d",
|
|
+ bfqd->bfq_max_budget);
|
|
+ }
|
|
+
|
|
+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
|
|
+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
|
|
+ bfqd->device_speed = BFQ_BFQD_SLOW;
|
|
+ bfqd->RT_prod = R_slow[dev_type] *
|
|
+ T_slow[dev_type];
|
|
+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
|
|
+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
|
|
+ bfqd->device_speed = BFQ_BFQD_FAST;
|
|
+ bfqd->RT_prod = R_fast[dev_type] *
|
|
+ T_fast[dev_type];
|
|
+ }
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
|
|
+ dev_type == 0 ? "ROT" : "NONROT",
|
|
+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
|
|
+ bfqd->device_speed == BFQ_BFQD_FAST ?
|
|
+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
|
|
+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
|
|
+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
|
|
+ BFQ_RATE_SHIFT);
|
|
+}
|
|
+
|
|
+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
|
|
+{
|
|
+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
|
|
+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
|
|
+ bfqd->peak_rate_samples = 1;
|
|
+ bfqd->sequential_samples = 0;
|
|
+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
|
|
+ blk_rq_sectors(rq);
|
|
+ } else /* no new rq dispatched, just reset the number of samples */
|
|
+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
|
|
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
|
|
+ bfqd->tot_sectors_dispatched);
|
|
+}
|
|
+
|
|
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
|
|
+{
|
|
+ u32 rate, weight, divisor;
|
|
+
|
|
+ /*
|
|
+ * For the convergence property to hold (see comments on
|
|
+ * bfq_update_peak_rate()) and for the assessment to be
|
|
+ * reliable, a minimum number of samples must be present, and
|
|
+ * a minimum amount of time must have elapsed. If not so, do
|
|
+ * not compute new rate. Just reset parameters, to get ready
|
|
+ * for a new evaluation attempt.
|
|
+ */
|
|
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
|
|
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
|
|
+ bfq_log(bfqd,
|
|
+ "update_rate_reset: only resetting, delta_first %lluus samples %d",
|
|
+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
|
|
+ goto reset_computation;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If a new request completion has occurred after last
|
|
+ * dispatch, then, to approximate the rate at which requests
|
|
+ * have been served by the device, it is more precise to
|
|
+ * extend the observation interval to the last completion.
|
|
+ */
|
|
+ bfqd->delta_from_first =
|
|
+ max_t(u64, bfqd->delta_from_first,
|
|
+ bfqd->last_completion - bfqd->first_dispatch);
|
|
+
|
|
+ BUG_ON(bfqd->delta_from_first == 0);
|
|
+ /*
|
|
+ * Rate computed in sects/usec, and not sects/nsec, for
|
|
+ * precision issues.
|
|
+ */
|
|
+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
|
|
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
|
|
+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
|
|
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
|
|
+ rate > 20<<BFQ_RATE_SHIFT);
|
|
+
|
|
+ /*
|
|
+ * Peak rate not updated if:
|
|
+ * - the percentage of sequential dispatches is below 3/4 of the
|
|
+ * total, and rate is below the current estimated peak rate
|
|
+ * - rate is unreasonably high (> 20M sectors/sec)
|
|
+ */
|
|
+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
|
|
+ rate <= bfqd->peak_rate) ||
|
|
+ rate > 20<<BFQ_RATE_SHIFT) {
|
|
+ bfq_log(bfqd,
|
|
+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
|
|
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
|
|
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
|
|
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
|
|
+ goto reset_computation;
|
|
+ } else {
|
|
+ bfq_log(bfqd,
|
|
+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
|
|
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
|
|
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
|
|
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We have to update the peak rate, at last! To this purpose,
|
|
+ * we use a low-pass filter. We compute the smoothing constant
|
|
+ * of the filter as a function of the 'weight' of the new
|
|
+ * measured rate.
|
|
+ *
|
|
+ * As can be seen in next formulas, we define this weight as a
|
|
+ * quantity proportional to how sequential the workload is,
|
|
+ * and to how long the observation time interval is.
|
|
+ *
|
|
+ * The weight runs from 0 to 8. The maximum value of the
|
|
+ * weight, 8, yields the minimum value for the smoothing
|
|
+ * constant. At this minimum value for the smoothing constant,
|
|
+ * the measured rate contributes for half of the next value of
|
|
+ * the estimated peak rate.
|
|
+ *
|
|
+ * So, the first step is to compute the weight as a function
|
|
+ * of how sequential the workload is. Note that the weight
|
|
+ * cannot reach 9, because bfqd->sequential_samples cannot
|
|
+ * become equal to bfqd->peak_rate_samples, which, in its
|
|
+ * turn, holds true because bfqd->sequential_samples is not
|
|
+ * incremented for the first sample.
|
|
+ */
|
|
+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
|
|
+
|
|
+ /*
|
|
+ * Second step: further refine the weight as a function of the
|
|
+ * duration of the observation interval.
|
|
+ */
|
|
+ weight = min_t(u32, 8,
|
|
+ div_u64(weight * bfqd->delta_from_first,
|
|
+ BFQ_RATE_REF_INTERVAL));
|
|
+
|
|
+ /*
|
|
+ * Divisor ranging from 10, for minimum weight, to 2, for
|
|
+ * maximum weight.
|
|
+ */
|
|
+ divisor = 10 - weight;
|
|
+ BUG_ON(divisor == 0);
|
|
+
|
|
+ /*
|
|
+ * Finally, update peak rate:
|
|
+ *
|
|
+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
|
|
+ */
|
|
+ bfqd->peak_rate *= divisor-1;
|
|
+ bfqd->peak_rate /= divisor;
|
|
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
|
|
+ divisor,
|
|
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
|
|
+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
|
|
+
|
|
+ BUG_ON(bfqd->peak_rate == 0);
|
|
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
|
|
+
|
|
+ bfqd->peak_rate += rate;
|
|
+ update_thr_responsiveness_params(bfqd);
|
|
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
|
|
+
|
|
+reset_computation:
|
|
+ bfq_reset_rate_computation(bfqd, rq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update the read/write peak rate (the main quantity used for
|
|
+ * auto-tuning, see update_thr_responsiveness_params()).
|
|
+ *
|
|
+ * It is not trivial to estimate the peak rate (correctly): because of
|
|
+ * the presence of sw and hw queues between the scheduler and the
|
|
+ * device components that finally serve I/O requests, it is hard to
|
|
+ * say exactly when a given dispatched request is served inside the
|
|
+ * device, and for how long. As a consequence, it is hard to know
|
|
+ * precisely at what rate a given set of requests is actually served
|
|
+ * by the device.
|
|
+ *
|
|
+ * On the opposite end, the dispatch time of any request is trivially
|
|
+ * available, and, from this piece of information, the "dispatch rate"
|
|
+ * of requests can be immediately computed. So, the idea in the next
|
|
+ * function is to use what is known, namely request dispatch times
|
|
+ * (plus, when useful, request completion times), to estimate what is
|
|
+ * unknown, namely in-device request service rate.
|
|
+ *
|
|
+ * The main issue is that, because of the above facts, the rate at
|
|
+ * which a certain set of requests is dispatched over a certain time
|
|
+ * interval can vary greatly with respect to the rate at which the
|
|
+ * same requests are then served. But, since the size of any
|
|
+ * intermediate queue is limited, and the service scheme is lossless
|
|
+ * (no request is silently dropped), the following obvious convergence
|
|
+ * property holds: the number of requests dispatched MUST become
|
|
+ * closer and closer to the number of requests completed as the
|
|
+ * observation interval grows. This is the key property used in
|
|
+ * the next function to estimate the peak service rate as a function
|
|
+ * of the observed dispatch rate. The function assumes to be invoked
|
|
+ * on every request dispatch.
|
|
+ */
|
|
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
|
|
+{
|
|
+ u64 now_ns = ktime_get_ns();
|
|
+
|
|
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
|
|
+ bfq_log(bfqd,
|
|
+ "update_peak_rate: goto reset, samples %d",
|
|
+ bfqd->peak_rate_samples) ;
|
|
+ bfq_reset_rate_computation(bfqd, rq);
|
|
+ goto update_last_values; /* will add one sample */
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Device idle for very long: the observation interval lasting
|
|
+ * up to this dispatch cannot be a valid observation interval
|
|
+ * for computing a new peak rate (similarly to the late-
|
|
+ * completion event in bfq_completed_request()). Go to
|
|
+ * update_rate_and_reset to have the following three steps
|
|
+ * taken:
|
|
+ * - close the observation interval at the last (previous)
|
|
+ * request dispatch or completion
|
|
+ * - compute rate, if possible, for that observation interval
|
|
+ * - start a new observation interval with this dispatch
|
|
+ */
|
|
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
|
|
+ bfqd->rq_in_driver == 0) {
|
|
+ bfq_log(bfqd,
|
|
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
|
|
+ (now_ns - bfqd->last_dispatch)>>10,
|
|
+ bfqd->peak_rate_samples) ;
|
|
+ goto update_rate_and_reset;
|
|
+ }
|
|
+
|
|
+ /* Update sampling information */
|
|
+ bfqd->peak_rate_samples++;
|
|
+
|
|
+ if ((bfqd->rq_in_driver > 0 ||
|
|
+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
|
|
+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
|
|
+ bfqd->sequential_samples++;
|
|
+
|
|
+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
|
|
+
|
|
+ /* Reset max observed rq size every 32 dispatches */
|
|
+ if (likely(bfqd->peak_rate_samples % 32))
|
|
+ bfqd->last_rq_max_size =
|
|
+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
|
|
+ else
|
|
+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
|
|
+
|
|
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
|
|
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
|
|
+ bfqd->tot_sectors_dispatched,
|
|
+ bfqd->delta_from_first>>10);
|
|
+
|
|
+ /* Target observation interval not yet reached, go on sampling */
|
|
+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
|
|
+ goto update_last_values;
|
|
+
|
|
+update_rate_and_reset:
|
|
+ bfq_update_rate_reset(bfqd, rq);
|
|
+update_last_values:
|
|
+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
|
|
+ bfqd->last_dispatch = now_ns;
|
|
+
|
|
+ bfq_log(bfqd,
|
|
+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
|
|
+ (now_ns - bfqd->first_dispatch)>>10,
|
|
+ (unsigned long long) bfqd->last_position,
|
|
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
|
|
+ bfq_log(bfqd,
|
|
+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Move request from internal lists to the dispatch list of the request queue
|
|
+ */
|
|
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
|
+
|
|
+ /*
|
|
+ * For consistency, the next instruction should have been executed
|
|
+ * after removing the request from the queue and dispatching it.
|
|
+ * We execute instead this instruction before bfq_remove_request()
|
|
+ * (and hence introduce a temporary inconsistency), for efficiency.
|
|
+ * In fact, in a forced_dispatch, this prevents two counters related
|
|
+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq
|
|
+ * is not in service, and then to be incremented again after
|
|
+ * incrementing bfqq->dispatched.
|
|
+ */
|
|
+ bfqq->dispatched++;
|
|
+ bfq_update_peak_rate(q->elevator->elevator_data, rq);
|
|
+
|
|
+ bfq_remove_request(rq);
|
|
+ elv_dispatch_sort(q, rq);
|
|
+}
|
|
+
|
|
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ BUG_ON(bfqq != bfqd->in_service_queue);
|
|
+
|
|
+ /*
|
|
+ * If this bfqq is shared between multiple processes, check
|
|
+ * to make sure that those processes are still issuing I/Os
|
|
+ * within the mean seek distance. If not, it may be time to
|
|
+ * break the queues apart again.
|
|
+ */
|
|
+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
|
|
+ bfq_mark_bfqq_split_coop(bfqq);
|
|
+
|
|
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
|
|
+ if (bfqq->dispatched == 0)
|
|
+ /*
|
|
+ * Overloading budget_timeout field to store
|
|
+ * the time at which the queue remains with no
|
|
+ * backlog and no outstanding request; used by
|
|
+ * the weight-raising mechanism.
|
|
+ */
|
|
+ bfqq->budget_timeout = jiffies;
|
|
+
|
|
+ bfq_del_bfqq_busy(bfqd, bfqq, true);
|
|
+ } else {
|
|
+ bfq_requeue_bfqq(bfqd, bfqq);
|
|
+ /*
|
|
+ * Resort priority tree of potential close cooperators.
|
|
+ */
|
|
+ bfq_pos_tree_add_move(bfqd, bfqq);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * All in-service entities must have been properly deactivated
|
|
+ * or requeued before executing the next function, which
|
|
+ * resets all in-service entites as no more in service.
|
|
+ */
|
|
+ __bfq_bfqd_reset_in_service(bfqd);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
|
|
+ * @bfqd: device data.
|
|
+ * @bfqq: queue to update.
|
|
+ * @reason: reason for expiration.
|
|
+ *
|
|
+ * Handle the feedback on @bfqq budget at queue expiration.
|
|
+ * See the body for detailed comments.
|
|
+ */
|
|
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ enum bfqq_expiration reason)
|
|
+{
|
|
+ struct request *next_rq;
|
|
+ int budget, min_budget;
|
|
+
|
|
+ BUG_ON(bfqq != bfqd->in_service_queue);
|
|
+
|
|
+ min_budget = bfq_min_budget(bfqd);
|
|
+
|
|
+ if (bfqq->wr_coeff == 1)
|
|
+ budget = bfqq->max_budget;
|
|
+ else /*
|
|
+ * Use a constant, low budget for weight-raised queues,
|
|
+ * to help achieve a low latency. Keep it slightly higher
|
|
+ * than the minimum possible budget, to cause a little
|
|
+ * bit fewer expirations.
|
|
+ */
|
|
+ budget = 2 * min_budget;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
|
|
+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
|
|
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
|
|
+ budget, bfq_min_budget(bfqd));
|
|
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
|
|
+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
|
|
+
|
|
+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
|
|
+ switch (reason) {
|
|
+ /*
|
|
+ * Caveat: in all the following cases we trade latency
|
|
+ * for throughput.
|
|
+ */
|
|
+ case BFQ_BFQQ_TOO_IDLE:
|
|
+ /*
|
|
+ * This is the only case where we may reduce
|
|
+ * the budget: if there is no request of the
|
|
+ * process still waiting for completion, then
|
|
+ * we assume (tentatively) that the timer has
|
|
+ * expired because the batch of requests of
|
|
+ * the process could have been served with a
|
|
+ * smaller budget. Hence, betting that
|
|
+ * process will behave in the same way when it
|
|
+ * becomes backlogged again, we reduce its
|
|
+ * next budget. As long as we guess right,
|
|
+ * this budget cut reduces the latency
|
|
+ * experienced by the process.
|
|
+ *
|
|
+ * However, if there are still outstanding
|
|
+ * requests, then the process may have not yet
|
|
+ * issued its next request just because it is
|
|
+ * still waiting for the completion of some of
|
|
+ * the still outstanding ones. So in this
|
|
+ * subcase we do not reduce its budget, on the
|
|
+ * contrary we increase it to possibly boost
|
|
+ * the throughput, as discussed in the
|
|
+ * comments to the BUDGET_TIMEOUT case.
|
|
+ */
|
|
+ if (bfqq->dispatched > 0) /* still outstanding reqs */
|
|
+ budget = min(budget * 2, bfqd->bfq_max_budget);
|
|
+ else {
|
|
+ if (budget > 5 * min_budget)
|
|
+ budget -= 4 * min_budget;
|
|
+ else
|
|
+ budget = min_budget;
|
|
+ }
|
|
+ break;
|
|
+ case BFQ_BFQQ_BUDGET_TIMEOUT:
|
|
+ /*
|
|
+ * We double the budget here because it gives
|
|
+ * the chance to boost the throughput if this
|
|
+ * is not a seeky process (and has bumped into
|
|
+ * this timeout because of, e.g., ZBR).
|
|
+ */
|
|
+ budget = min(budget * 2, bfqd->bfq_max_budget);
|
|
+ break;
|
|
+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
|
|
+ /*
|
|
+ * The process still has backlog, and did not
|
|
+ * let either the budget timeout or the disk
|
|
+ * idling timeout expire. Hence it is not
|
|
+ * seeky, has a short thinktime and may be
|
|
+ * happy with a higher budget too. So
|
|
+ * definitely increase the budget of this good
|
|
+ * candidate to boost the disk throughput.
|
|
+ */
|
|
+ budget = min(budget * 4, bfqd->bfq_max_budget);
|
|
+ break;
|
|
+ case BFQ_BFQQ_NO_MORE_REQUESTS:
|
|
+ /*
|
|
+ * For queues that expire for this reason, it
|
|
+ * is particularly important to keep the
|
|
+ * budget close to the actual service they
|
|
+ * need. Doing so reduces the timestamp
|
|
+ * misalignment problem described in the
|
|
+ * comments in the body of
|
|
+ * __bfq_activate_entity. In fact, suppose
|
|
+ * that a queue systematically expires for
|
|
+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
|
|
+ * new request in time to enjoy timestamp
|
|
+ * back-shifting. The larger the budget of the
|
|
+ * queue is with respect to the service the
|
|
+ * queue actually requests in each service
|
|
+ * slot, the more times the queue can be
|
|
+ * reactivated with the same virtual finish
|
|
+ * time. It follows that, even if this finish
|
|
+ * time is pushed to the system virtual time
|
|
+ * to reduce the consequent timestamp
|
|
+ * misalignment, the queue unjustly enjoys for
|
|
+ * many re-activations a lower finish time
|
|
+ * than all newly activated queues.
|
|
+ *
|
|
+ * The service needed by bfqq is measured
|
|
+ * quite precisely by bfqq->entity.service.
|
|
+ * Since bfqq does not enjoy device idling,
|
|
+ * bfqq->entity.service is equal to the number
|
|
+ * of sectors that the process associated with
|
|
+ * bfqq requested to read/write before waiting
|
|
+ * for request completions, or blocking for
|
|
+ * other reasons.
|
|
+ */
|
|
+ budget = max_t(int, bfqq->entity.service, min_budget);
|
|
+ break;
|
|
+ default:
|
|
+ return;
|
|
+ }
|
|
+ } else if (!bfq_bfqq_sync(bfqq))
|
|
+ /*
|
|
+ * Async queues get always the maximum possible
|
|
+ * budget, as for them we do not care about latency
|
|
+ * (in addition, their ability to dispatch is limited
|
|
+ * by the charging factor).
|
|
+ */
|
|
+ budget = bfqd->bfq_max_budget;
|
|
+
|
|
+ bfqq->max_budget = budget;
|
|
+
|
|
+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
|
|
+ !bfqd->bfq_user_max_budget)
|
|
+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
|
|
+
|
|
+ /*
|
|
+ * If there is still backlog, then assign a new budget, making
|
|
+ * sure that it is large enough for the next request. Since
|
|
+ * the finish time of bfqq must be kept in sync with the
|
|
+ * budget, be sure to call __bfq_bfqq_expire() *after* this
|
|
+ * update.
|
|
+ *
|
|
+ * If there is no backlog, then no need to update the budget;
|
|
+ * it will be updated on the arrival of a new request.
|
|
+ */
|
|
+ next_rq = bfqq->next_rq;
|
|
+ if (next_rq) {
|
|
+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
|
|
+ reason == BFQ_BFQQ_NO_MORE_REQUESTS);
|
|
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
|
|
+ bfq_serv_to_charge(next_rq, bfqq));
|
|
+ BUG_ON(!bfq_bfqq_busy(bfqq));
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+ }
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
|
|
+ next_rq ? blk_rq_sectors(next_rq) : 0,
|
|
+ bfqq->entity.budget);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return true if the process associated with bfqq is "slow". The slow
|
|
+ * flag is used, in addition to the budget timeout, to reduce the
|
|
+ * amount of service provided to seeky processes, and thus reduce
|
|
+ * their chances to lower the throughput. More details in the comments
|
|
+ * on the function bfq_bfqq_expire().
|
|
+ *
|
|
+ * An important observation is in order: as discussed in the comments
|
|
+ * on the function bfq_update_peak_rate(), with devices with internal
|
|
+ * queues, it is hard if ever possible to know when and for how long
|
|
+ * an I/O request is processed by the device (apart from the trivial
|
|
+ * I/O pattern where a new request is dispatched only after the
|
|
+ * previous one has been completed). This makes it hard to evaluate
|
|
+ * the real rate at which the I/O requests of each bfq_queue are
|
|
+ * served. In fact, for an I/O scheduler like BFQ, serving a
|
|
+ * bfq_queue means just dispatching its requests during its service
|
|
+ * slot (i.e., until the budget of the queue is exhausted, or the
|
|
+ * queue remains idle, or, finally, a timeout fires). But, during the
|
|
+ * service slot of a bfq_queue, around 100 ms at most, the device may
|
|
+ * be even still processing requests of bfq_queues served in previous
|
|
+ * service slots. On the opposite end, the requests of the in-service
|
|
+ * bfq_queue may be completed after the service slot of the queue
|
|
+ * finishes.
|
|
+ *
|
|
+ * Anyway, unless more sophisticated solutions are used
|
|
+ * (where possible), the sum of the sizes of the requests dispatched
|
|
+ * during the service slot of a bfq_queue is probably the only
|
|
+ * approximation available for the service received by the bfq_queue
|
|
+ * during its service slot. And this sum is the quantity used in this
|
|
+ * function to evaluate the I/O speed of a process.
|
|
+ */
|
|
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ bool compensate, enum bfqq_expiration reason,
|
|
+ unsigned long *delta_ms)
|
|
+{
|
|
+ ktime_t delta_ktime;
|
|
+ u32 delta_usecs;
|
|
+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
|
|
+
|
|
+ if (!bfq_bfqq_sync(bfqq))
|
|
+ return false;
|
|
+
|
|
+ if (compensate)
|
|
+ delta_ktime = bfqd->last_idling_start;
|
|
+ else
|
|
+ delta_ktime = ktime_get();
|
|
+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
|
|
+ delta_usecs = ktime_to_us(delta_ktime);
|
|
+
|
|
+ /* don't trust short/unrealistic values. */
|
|
+ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {
|
|
+ if (blk_queue_nonrot(bfqd->queue))
|
|
+ /*
|
|
+ * give same worst-case guarantees as idling
|
|
+ * for seeky
|
|
+ */
|
|
+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
|
|
+ else /* charge at least one seek */
|
|
+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
|
|
+
|
|
+ bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs);
|
|
+
|
|
+ return slow;
|
|
+ }
|
|
+
|
|
+ *delta_ms = delta_usecs / USEC_PER_MSEC;
|
|
+
|
|
+ /*
|
|
+ * Use only long (> 20ms) intervals to filter out excessive
|
|
+ * spikes in service rate estimation.
|
|
+ */
|
|
+ if (delta_usecs > 20000) {
|
|
+ /*
|
|
+ * Caveat for rotational devices: processes doing I/O
|
|
+ * in the slower disk zones tend to be slow(er) even
|
|
+ * if not seeky. In this respect, the estimated peak
|
|
+ * rate is likely to be an average over the disk
|
|
+ * surface. Accordingly, to not be too harsh with
|
|
+ * unlucky processes, a process is deemed slow only if
|
|
+ * its rate has been lower than half of the estimated
|
|
+ * peak rate.
|
|
+ */
|
|
+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
|
|
+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
|
|
+ bfqq->entity.service, bfqd->bfq_max_budget);
|
|
+ }
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
|
|
+
|
|
+ return slow;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * To be deemed as soft real-time, an application must meet two
|
|
+ * requirements. First, the application must not require an average
|
|
+ * bandwidth higher than the approximate bandwidth required to playback or
|
|
+ * record a compressed high-definition video.
|
|
+ * The next function is invoked on the completion of the last request of a
|
|
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
|
|
+ * that, if the next request of the application does not arrive before
|
|
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
|
|
+ *
|
|
+ * The second requirement is that the request pattern of the application is
|
|
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
|
|
+ * the application stops issuing new requests until all its pending requests
|
|
+ * have been completed. After that, the application may issue a new batch,
|
|
+ * and so on.
|
|
+ * For this reason the next function is invoked to compute
|
|
+ * soft_rt_next_start only for applications that meet this requirement,
|
|
+ * whereas soft_rt_next_start is set to infinity for applications that do
|
|
+ * not.
|
|
+ *
|
|
+ * Unfortunately, even a greedy application may happen to behave in an
|
|
+ * isochronous way if the CPU load is high. In fact, the application may
|
|
+ * stop issuing requests while the CPUs are busy serving other processes,
|
|
+ * then restart, then stop again for a while, and so on. In addition, if
|
|
+ * the disk achieves a low enough throughput with the request pattern
|
|
+ * issued by the application (e.g., because the request pattern is random
|
|
+ * and/or the device is slow), then the application may meet the above
|
|
+ * bandwidth requirement too. To prevent such a greedy application to be
|
|
+ * deemed as soft real-time, a further rule is used in the computation of
|
|
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
|
|
+ * time plus the maximum time for which the arrival of a request is waited
|
|
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
|
|
+ * This filters out greedy applications, as the latter issue instead their
|
|
+ * next request as soon as possible after the last one has been completed
|
|
+ * (in contrast, when a batch of requests is completed, a soft real-time
|
|
+ * application spends some time processing data).
|
|
+ *
|
|
+ * Unfortunately, the last filter may easily generate false positives if
|
|
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
|
|
+ * or both the following cases occur:
|
|
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
|
|
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
|
|
+ * HZ=100.
|
|
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
|
|
+ * for a while, then suddenly 'jump' by several units to recover the lost
|
|
+ * increments. This seems to happen, e.g., inside virtual machines.
|
|
+ * To address this issue, we do not use as a reference time interval just
|
|
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
|
|
+ * particular we add the minimum number of jiffies for which the filter
|
|
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
|
|
+ * machines.
|
|
+ */
|
|
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
|
|
+ bfqq->service_from_backlogged,
|
|
+ bfqd->bfq_wr_max_softrt_rate,
|
|
+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
|
|
+ bfqd->bfq_wr_max_softrt_rate));
|
|
+
|
|
+ return max(bfqq->last_idle_bklogged +
|
|
+ HZ * bfqq->service_from_backlogged /
|
|
+ bfqd->bfq_wr_max_softrt_rate,
|
|
+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the farthest future time instant according to jiffies
|
|
+ * macros.
|
|
+ */
|
|
+static unsigned long bfq_greatest_from_now(void)
|
|
+{
|
|
+ return jiffies + MAX_JIFFY_OFFSET;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the farthest past time instant according to jiffies
|
|
+ * macros.
|
|
+ */
|
|
+static unsigned long bfq_smallest_from_now(void)
|
|
+{
|
|
+ return jiffies - MAX_JIFFY_OFFSET;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_bfqq_expire - expire a queue.
|
|
+ * @bfqd: device owning the queue.
|
|
+ * @bfqq: the queue to expire.
|
|
+ * @compensate: if true, compensate for the time spent idling.
|
|
+ * @reason: the reason causing the expiration.
|
|
+ *
|
|
+ * If the process associated with bfqq does slow I/O (e.g., because it
|
|
+ * issues random requests), we charge bfqq with the time it has been
|
|
+ * in service instead of the service it has received (see
|
|
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
|
|
+ * a consequence, bfqq will typically get higher timestamps upon
|
|
+ * reactivation, and hence it will be rescheduled as if it had
|
|
+ * received more service than what it has actually received. In the
|
|
+ * end, bfqq receives less service in proportion to how slowly its
|
|
+ * associated process consumes its budgets (and hence how seriously it
|
|
+ * tends to lower the throughput). In addition, this time-charging
|
|
+ * strategy guarantees time fairness among slow processes. In
|
|
+ * contrast, if the process associated with bfqq is not slow, we
|
|
+ * charge bfqq exactly with the service it has received.
|
|
+ *
|
|
+ * Charging time to the first type of queues and the exact service to
|
|
+ * the other has the effect of using the WF2Q+ policy to schedule the
|
|
+ * former on a timeslice basis, without violating service domain
|
|
+ * guarantees among the latter.
|
|
+ */
|
|
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ bool compensate,
|
|
+ enum bfqq_expiration reason)
|
|
+{
|
|
+ bool slow;
|
|
+ unsigned long delta = 0;
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ BUG_ON(bfqq != bfqd->in_service_queue);
|
|
+
|
|
+ /*
|
|
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
|
|
+ */
|
|
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
|
|
+
|
|
+ /*
|
|
+ * Increase service_from_backlogged before next statement,
|
|
+ * because the possible next invocation of
|
|
+ * bfq_bfqq_charge_time would likely inflate
|
|
+ * entity->service. In contrast, service_from_backlogged must
|
|
+ * contain real service, to enable the soft real-time
|
|
+ * heuristic to correctly compute the bandwidth consumed by
|
|
+ * bfqq.
|
|
+ */
|
|
+ bfqq->service_from_backlogged += entity->service;
|
|
+
|
|
+ /*
|
|
+ * As above explained, charge slow (typically seeky) and
|
|
+ * timed-out queues with the time and not the service
|
|
+ * received, to favor sequential workloads.
|
|
+ *
|
|
+ * Processes doing I/O in the slower disk zones will tend to
|
|
+ * be slow(er) even if not seeky. Therefore, since the
|
|
+ * estimated peak rate is actually an average over the disk
|
|
+ * surface, these processes may timeout just for bad luck. To
|
|
+ * avoid punishing them, do not charge time to processes that
|
|
+ * succeeded in consuming at least 2/3 of their budget. This
|
|
+ * allows BFQ to preserve enough elasticity to still perform
|
|
+ * bandwidth, and not time, distribution with little unlucky
|
|
+ * or quasi-sequential processes.
|
|
+ */
|
|
+ if (bfqq->wr_coeff == 1 &&
|
|
+ (slow ||
|
|
+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
|
|
+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
|
|
+ bfq_bfqq_charge_time(bfqd, bfqq, delta);
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+
|
|
+ if (reason == BFQ_BFQQ_TOO_IDLE &&
|
|
+ entity->service <= 2 * entity->budget / 10)
|
|
+ bfq_clear_bfqq_IO_bound(bfqq);
|
|
+
|
|
+ if (bfqd->low_latency && bfqq->wr_coeff == 1)
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+
|
|
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
|
|
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
|
|
+ /*
|
|
+ * If we get here, and there are no outstanding
|
|
+ * requests, then the request pattern is isochronous
|
|
+ * (see the comments on the function
|
|
+ * bfq_bfqq_softrt_next_start()). Thus we can compute
|
|
+ * soft_rt_next_start. If, instead, the queue still
|
|
+ * has outstanding requests, then we have to wait for
|
|
+ * the completion of all the outstanding requests to
|
|
+ * discover whether the request pattern is actually
|
|
+ * isochronous.
|
|
+ */
|
|
+ BUG_ON(bfqd->busy_queues < 1);
|
|
+ if (bfqq->dispatched == 0) {
|
|
+ bfqq->soft_rt_next_start =
|
|
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
|
|
+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
|
|
+ bfqq->soft_rt_next_start);
|
|
+ } else {
|
|
+ /*
|
|
+ * The application is still waiting for the
|
|
+ * completion of one or more requests:
|
|
+ * prevent it from possibly being incorrectly
|
|
+ * deemed as soft real-time by setting its
|
|
+ * soft_rt_next_start to infinity. In fact,
|
|
+ * without this assignment, the application
|
|
+ * would be incorrectly deemed as soft
|
|
+ * real-time if:
|
|
+ * 1) it issued a new request before the
|
|
+ * completion of all its in-flight
|
|
+ * requests, and
|
|
+ * 2) at that time, its soft_rt_next_start
|
|
+ * happened to be in the past.
|
|
+ */
|
|
+ bfqq->soft_rt_next_start =
|
|
+ bfq_greatest_from_now();
|
|
+ /*
|
|
+ * Schedule an update of soft_rt_next_start to when
|
|
+ * the task may be discovered to be isochronous.
|
|
+ */
|
|
+ bfq_mark_bfqq_softrt_update(bfqq);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",
|
|
+ reason, slow, bfqq->dispatched,
|
|
+ bfq_bfqq_idle_window(bfqq), entity->weight);
|
|
+
|
|
+ /*
|
|
+ * Increase, decrease or leave budget unchanged according to
|
|
+ * reason.
|
|
+ */
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
|
|
+ BUG_ON(bfqq->next_rq == NULL &&
|
|
+ bfqq->entity.budget < bfqq->entity.service);
|
|
+ __bfq_bfqq_expire(bfqd, bfqq);
|
|
+
|
|
+ BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
|
|
+ !bfq_class_idle(bfqq));
|
|
+
|
|
+ if (!bfq_bfqq_busy(bfqq) &&
|
|
+ reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
|
|
+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
|
|
+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Budget timeout is not implemented through a dedicated timer, but
|
|
+ * just checked on request arrivals and completions, as well as on
|
|
+ * idle timer expirations.
|
|
+ */
|
|
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
|
|
+{
|
|
+ return time_is_before_eq_jiffies(bfqq->budget_timeout);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If we expire a queue that is actively waiting (i.e., with the
|
|
+ * device idled) for the arrival of a new request, then we may incur
|
|
+ * the timestamp misalignment problem described in the body of the
|
|
+ * function __bfq_activate_entity. Hence we return true only if this
|
|
+ * condition does not hold, or if the queue is slow enough to deserve
|
|
+ * only to be kicked off for preserving a high throughput.
|
|
+ */
|
|
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
|
|
+{
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "may_budget_timeout: wait_request %d left %d timeout %d",
|
|
+ bfq_bfqq_wait_request(bfqq),
|
|
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
|
|
+ bfq_bfqq_budget_timeout(bfqq));
|
|
+
|
|
+ return (!bfq_bfqq_wait_request(bfqq) ||
|
|
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
|
|
+ &&
|
|
+ bfq_bfqq_budget_timeout(bfqq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For a queue that becomes empty, device idling is allowed only if
|
|
+ * this function returns true for that queue. As a consequence, since
|
|
+ * device idling plays a critical role for both throughput boosting
|
|
+ * and service guarantees, the return value of this function plays a
|
|
+ * critical role as well.
|
|
+ *
|
|
+ * In a nutshell, this function returns true only if idling is
|
|
+ * beneficial for throughput or, even if detrimental for throughput,
|
|
+ * idling is however necessary to preserve service guarantees (low
|
|
+ * latency, desired throughput distribution, ...). In particular, on
|
|
+ * NCQ-capable devices, this function tries to return false, so as to
|
|
+ * help keep the drives' internal queues full, whenever this helps the
|
|
+ * device boost the throughput without causing any service-guarantee
|
|
+ * issue.
|
|
+ *
|
|
+ * In more detail, the return value of this function is obtained by,
|
|
+ * first, computing a number of boolean variables that take into
|
|
+ * account throughput and service-guarantee issues, and, then,
|
|
+ * combining these variables in a logical expression. Most of the
|
|
+ * issues taken into account are not trivial. We discuss these issues
|
|
+ * while introducing the variables.
|
|
+ */
|
|
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+ bool idling_boosts_thr, idling_boosts_thr_without_issues,
|
|
+ idling_needed_for_service_guarantees,
|
|
+ asymmetric_scenario;
|
|
+
|
|
+ if (bfqd->strict_guarantees)
|
|
+ return true;
|
|
+
|
|
+ /*
|
|
+ * The next variable takes into account the cases where idling
|
|
+ * boosts the throughput.
|
|
+ *
|
|
+ * The value of the variable is computed considering, first, that
|
|
+ * idling is virtually always beneficial for the throughput if:
|
|
+ * (a) the device is not NCQ-capable, or
|
|
+ * (b) regardless of the presence of NCQ, the device is rotational
|
|
+ * and the request pattern for bfqq is I/O-bound and sequential.
|
|
+ *
|
|
+ * Secondly, and in contrast to the above item (b), idling an
|
|
+ * NCQ-capable flash-based device would not boost the
|
|
+ * throughput even with sequential I/O; rather it would lower
|
|
+ * the throughput in proportion to how fast the device
|
|
+ * is. Accordingly, the next variable is true if any of the
|
|
+ * above conditions (a) and (b) is true, and, in particular,
|
|
+ * happens to be false if bfqd is an NCQ-capable flash-based
|
|
+ * device.
|
|
+ */
|
|
+ idling_boosts_thr = !bfqd->hw_tag ||
|
|
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
|
|
+ bfq_bfqq_idle_window(bfqq));
|
|
+
|
|
+ /*
|
|
+ * The value of the next variable,
|
|
+ * idling_boosts_thr_without_issues, is equal to that of
|
|
+ * idling_boosts_thr, unless a special case holds. In this
|
|
+ * special case, described below, idling may cause problems to
|
|
+ * weight-raised queues.
|
|
+ *
|
|
+ * When the request pool is saturated (e.g., in the presence
|
|
+ * of write hogs), if the processes associated with
|
|
+ * non-weight-raised queues ask for requests at a lower rate,
|
|
+ * then processes associated with weight-raised queues have a
|
|
+ * higher probability to get a request from the pool
|
|
+ * immediately (or at least soon) when they need one. Thus
|
|
+ * they have a higher probability to actually get a fraction
|
|
+ * of the device throughput proportional to their high
|
|
+ * weight. This is especially true with NCQ-capable drives,
|
|
+ * which enqueue several requests in advance, and further
|
|
+ * reorder internally-queued requests.
|
|
+ *
|
|
+ * For this reason, we force to false the value of
|
|
+ * idling_boosts_thr_without_issues if there are weight-raised
|
|
+ * busy queues. In this case, and if bfqq is not weight-raised,
|
|
+ * this guarantees that the device is not idled for bfqq (if,
|
|
+ * instead, bfqq is weight-raised, then idling will be
|
|
+ * guaranteed by another variable, see below). Combined with
|
|
+ * the timestamping rules of BFQ (see [1] for details), this
|
|
+ * behavior causes bfqq, and hence any sync non-weight-raised
|
|
+ * queue, to get a lower number of requests served, and thus
|
|
+ * to ask for a lower number of requests from the request
|
|
+ * pool, before the busy weight-raised queues get served
|
|
+ * again. This often mitigates starvation problems in the
|
|
+ * presence of heavy write workloads and NCQ, thereby
|
|
+ * guaranteeing a higher application and system responsiveness
|
|
+ * in these hostile scenarios.
|
|
+ */
|
|
+ idling_boosts_thr_without_issues = idling_boosts_thr &&
|
|
+ bfqd->wr_busy_queues == 0;
|
|
+
|
|
+ /*
|
|
+ * There is then a case where idling must be performed not
|
|
+ * for throughput concerns, but to preserve service
|
|
+ * guarantees.
|
|
+ *
|
|
+ * To introduce this case, we can note that allowing the drive
|
|
+ * to enqueue more than one request at a time, and hence
|
|
+ * delegating de facto final scheduling decisions to the
|
|
+ * drive's internal scheduler, entails loss of control on the
|
|
+ * actual request service order. In particular, the critical
|
|
+ * situation is when requests from different processes happen
|
|
+ * to be present, at the same time, in the internal queue(s)
|
|
+ * of the drive. In such a situation, the drive, by deciding
|
|
+ * the service order of the internally-queued requests, does
|
|
+ * determine also the actual throughput distribution among
|
|
+ * these processes. But the drive typically has no notion or
|
|
+ * concern about per-process throughput distribution, and
|
|
+ * makes its decisions only on a per-request basis. Therefore,
|
|
+ * the service distribution enforced by the drive's internal
|
|
+ * scheduler is likely to coincide with the desired
|
|
+ * device-throughput distribution only in a completely
|
|
+ * symmetric scenario where:
|
|
+ * (i) each of these processes must get the same throughput as
|
|
+ * the others;
|
|
+ * (ii) all these processes have the same I/O pattern
|
|
+ * (either sequential or random).
|
|
+ * In fact, in such a scenario, the drive will tend to treat
|
|
+ * the requests of each of these processes in about the same
|
|
+ * way as the requests of the others, and thus to provide
|
|
+ * each of these processes with about the same throughput
|
|
+ * (which is exactly the desired throughput distribution). In
|
|
+ * contrast, in any asymmetric scenario, device idling is
|
|
+ * certainly needed to guarantee that bfqq receives its
|
|
+ * assigned fraction of the device throughput (see [1] for
|
|
+ * details).
|
|
+ *
|
|
+ * We address this issue by controlling, actually, only the
|
|
+ * symmetry sub-condition (i), i.e., provided that
|
|
+ * sub-condition (i) holds, idling is not performed,
|
|
+ * regardless of whether sub-condition (ii) holds. In other
|
|
+ * words, only if sub-condition (i) holds, then idling is
|
|
+ * allowed, and the device tends to be prevented from queueing
|
|
+ * many requests, possibly of several processes. The reason
|
|
+ * for not controlling also sub-condition (ii) is that we
|
|
+ * exploit preemption to preserve guarantees in case of
|
|
+ * symmetric scenarios, even if (ii) does not hold, as
|
|
+ * explained in the next two paragraphs.
|
|
+ *
|
|
+ * Even if a queue, say Q, is expired when it remains idle, Q
|
|
+ * can still preempt the new in-service queue if the next
|
|
+ * request of Q arrives soon (see the comments on
|
|
+ * bfq_bfqq_update_budg_for_activation). If all queues and
|
|
+ * groups have the same weight, this form of preemption,
|
|
+ * combined with the hole-recovery heuristic described in the
|
|
+ * comments on function bfq_bfqq_update_budg_for_activation,
|
|
+ * are enough to preserve a correct bandwidth distribution in
|
|
+ * the mid term, even without idling. In fact, even if not
|
|
+ * idling allows the internal queues of the device to contain
|
|
+ * many requests, and thus to reorder requests, we can rather
|
|
+ * safely assume that the internal scheduler still preserves a
|
|
+ * minimum of mid-term fairness. The motivation for using
|
|
+ * preemption instead of idling is that, by not idling,
|
|
+ * service guarantees are preserved without minimally
|
|
+ * sacrificing throughput. In other words, both a high
|
|
+ * throughput and its desired distribution are obtained.
|
|
+ *
|
|
+ * More precisely, this preemption-based, idleless approach
|
|
+ * provides fairness in terms of IOPS, and not sectors per
|
|
+ * second. This can be seen with a simple example. Suppose
|
|
+ * that there are two queues with the same weight, but that
|
|
+ * the first queue receives requests of 8 sectors, while the
|
|
+ * second queue receives requests of 1024 sectors. In
|
|
+ * addition, suppose that each of the two queues contains at
|
|
+ * most one request at a time, which implies that each queue
|
|
+ * always remains idle after it is served. Finally, after
|
|
+ * remaining idle, each queue receives very quickly a new
|
|
+ * request. It follows that the two queues are served
|
|
+ * alternatively, preempting each other if needed. This
|
|
+ * implies that, although both queues have the same weight,
|
|
+ * the queue with large requests receives a service that is
|
|
+ * 1024/8 times as high as the service received by the other
|
|
+ * queue.
|
|
+ *
|
|
+ * On the other hand, device idling is performed, and thus
|
|
+ * pure sector-domain guarantees are provided, for the
|
|
+ * following queues, which are likely to need stronger
|
|
+ * throughput guarantees: weight-raised queues, and queues
|
|
+ * with a higher weight than other queues. When such queues
|
|
+ * are active, sub-condition (i) is false, which triggers
|
|
+ * device idling.
|
|
+ *
|
|
+ * According to the above considerations, the next variable is
|
|
+ * true (only) if sub-condition (i) holds. To compute the
|
|
+ * value of this variable, we not only use the return value of
|
|
+ * the function bfq_symmetric_scenario(), but also check
|
|
+ * whether bfqq is being weight-raised, because
|
|
+ * bfq_symmetric_scenario() does not take into account also
|
|
+ * weight-raised queues (see comments on
|
|
+ * bfq_weights_tree_add()).
|
|
+ *
|
|
+ * As a side note, it is worth considering that the above
|
|
+ * device-idling countermeasures may however fail in the
|
|
+ * following unlucky scenario: if idling is (correctly)
|
|
+ * disabled in a time period during which all symmetry
|
|
+ * sub-conditions hold, and hence the device is allowed to
|
|
+ * enqueue many requests, but at some later point in time some
|
|
+ * sub-condition stops to hold, then it may become impossible
|
|
+ * to let requests be served in the desired order until all
|
|
+ * the requests already queued in the device have been served.
|
|
+ */
|
|
+ asymmetric_scenario = bfqq->wr_coeff > 1 ||
|
|
+ !bfq_symmetric_scenario(bfqd);
|
|
+
|
|
+ /*
|
|
+ * Finally, there is a case where maximizing throughput is the
|
|
+ * best choice even if it may cause unfairness toward
|
|
+ * bfqq. Such a case is when bfqq became active in a burst of
|
|
+ * queue activations. Queues that became active during a large
|
|
+ * burst benefit only from throughput, as discussed in the
|
|
+ * comments on bfq_handle_burst. Thus, if bfqq became active
|
|
+ * in a burst and not idling the device maximizes throughput,
|
|
+ * then the device must no be idled, because not idling the
|
|
+ * device provides bfqq and all other queues in the burst with
|
|
+ * maximum benefit. Combining this and the above case, we can
|
|
+ * now establish when idling is actually needed to preserve
|
|
+ * service guarantees.
|
|
+ */
|
|
+ idling_needed_for_service_guarantees =
|
|
+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
|
|
+
|
|
+ /*
|
|
+ * We have now all the components we need to compute the return
|
|
+ * value of the function, which is true only if both the following
|
|
+ * conditions hold:
|
|
+ * 1) bfqq is sync, because idling make sense only for sync queues;
|
|
+ * 2) idling either boosts the throughput (without issues), or
|
|
+ * is necessary to preserve service guarantees.
|
|
+ */
|
|
+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
|
|
+ bfq_bfqq_sync(bfqq), idling_boosts_thr);
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
|
|
+ bfqd->wr_busy_queues,
|
|
+ idling_boosts_thr_without_issues,
|
|
+ bfq_bfqq_IO_bound(bfqq),
|
|
+ idling_needed_for_service_guarantees);
|
|
+
|
|
+ return bfq_bfqq_sync(bfqq) &&
|
|
+ (idling_boosts_thr_without_issues ||
|
|
+ idling_needed_for_service_guarantees);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
|
|
+ * returns true, then:
|
|
+ * 1) the queue must remain in service and cannot be expired, and
|
|
+ * 2) the device must be idled to wait for the possible arrival of a new
|
|
+ * request for the queue.
|
|
+ * See the comments on the function bfq_bfqq_may_idle for the reasons
|
|
+ * why performing device idling is the best choice to boost the throughput
|
|
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
|
|
+ * returns true.
|
|
+ */
|
|
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+
|
|
+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
|
|
+ bfq_bfqq_may_idle(bfqq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Select a queue for service. If we have a current queue in service,
|
|
+ * check whether to continue servicing it, or retrieve and set a new one.
|
|
+ */
|
|
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_queue *bfqq;
|
|
+ struct request *next_rq;
|
|
+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
|
|
+
|
|
+ bfqq = bfqd->in_service_queue;
|
|
+ if (!bfqq)
|
|
+ goto new_queue;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
|
|
+
|
|
+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
|
|
+ !hrtimer_active(&bfqd->idle_slice_timer) &&
|
|
+ !bfq_bfqq_must_idle(bfqq))
|
|
+ goto expire;
|
|
+
|
|
+check_queue:
|
|
+ /*
|
|
+ * This loop is rarely executed more than once. Even when it
|
|
+ * happens, it is much more convenient to re-execute this loop
|
|
+ * than to return NULL and trigger a new dispatch to get a
|
|
+ * request served.
|
|
+ */
|
|
+ next_rq = bfqq->next_rq;
|
|
+ /*
|
|
+ * If bfqq has requests queued and it has enough budget left to
|
|
+ * serve them, keep the queue, otherwise expire it.
|
|
+ */
|
|
+ if (next_rq) {
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+
|
|
+ if (bfq_serv_to_charge(next_rq, bfqq) >
|
|
+ bfq_bfqq_budget_left(bfqq)) {
|
|
+ /*
|
|
+ * Expire the queue for budget exhaustion,
|
|
+ * which makes sure that the next budget is
|
|
+ * enough to serve the next request, even if
|
|
+ * it comes from the fifo expired path.
|
|
+ */
|
|
+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
|
|
+ goto expire;
|
|
+ } else {
|
|
+ /*
|
|
+ * The idle timer may be pending because we may
|
|
+ * not disable disk idling even when a new request
|
|
+ * arrives.
|
|
+ */
|
|
+ if (bfq_bfqq_wait_request(bfqq)) {
|
|
+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
|
|
+ /*
|
|
+ * If we get here: 1) at least one new request
|
|
+ * has arrived but we have not disabled the
|
|
+ * timer because the request was too small,
|
|
+ * 2) then the block layer has unplugged
|
|
+ * the device, causing the dispatch to be
|
|
+ * invoked.
|
|
+ *
|
|
+ * Since the device is unplugged, now the
|
|
+ * requests are probably large enough to
|
|
+ * provide a reasonable throughput.
|
|
+ * So we disable idling.
|
|
+ */
|
|
+ bfq_clear_bfqq_wait_request(bfqq);
|
|
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
|
|
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
|
|
+ }
|
|
+ goto keep_queue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * No requests pending. However, if the in-service queue is idling
|
|
+ * for a new request, or has requests waiting for a completion and
|
|
+ * may idle after their completion, then keep it anyway.
|
|
+ */
|
|
+ if (hrtimer_active(&bfqd->idle_slice_timer) ||
|
|
+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
|
|
+ bfqq = NULL;
|
|
+ goto keep_queue;
|
|
+ }
|
|
+
|
|
+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
|
|
+expire:
|
|
+ bfq_bfqq_expire(bfqd, bfqq, false, reason);
|
|
+new_queue:
|
|
+ bfqq = bfq_set_in_service_queue(bfqd);
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
|
|
+ goto check_queue;
|
|
+ }
|
|
+keep_queue:
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
|
|
+ else
|
|
+ bfq_log(bfqd, "select_queue: no queue returned");
|
|
+
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
|
|
+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
|
|
+ time_is_after_jiffies(bfqq->last_wr_start_finish));
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
|
|
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
|
|
+ bfqq->wr_coeff,
|
|
+ bfqq->entity.weight, bfqq->entity.orig_weight);
|
|
+
|
|
+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
|
|
+ entity->orig_weight * bfqq->wr_coeff);
|
|
+ if (entity->prio_changed)
|
|
+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
|
|
+
|
|
+ /*
|
|
+ * If the queue was activated in a burst, or too much
|
|
+ * time has elapsed from the beginning of this
|
|
+ * weight-raising period, then end weight raising.
|
|
+ */
|
|
+ if (bfq_bfqq_in_large_burst(bfqq))
|
|
+ bfq_bfqq_end_wr(bfqq);
|
|
+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
|
|
+ bfqq->wr_cur_max_time)) {
|
|
+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
|
|
+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
|
|
+ bfq_wr_duration(bfqd)))
|
|
+ bfq_bfqq_end_wr(bfqq);
|
|
+ else {
|
|
+ /* switch back to interactive wr */
|
|
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
|
|
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
|
|
+ bfqq->last_wr_start_finish =
|
|
+ bfqq->wr_start_at_switch_to_srt;
|
|
+ BUG_ON(time_is_after_jiffies(
|
|
+ bfqq->last_wr_start_finish));
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "back to interactive wr");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ /* Update weight both if it must be raised and if it must be lowered */
|
|
+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
|
|
+ __bfq_entity_update_weight_prio(
|
|
+ bfq_entity_service_tree(entity),
|
|
+ entity);
|
|
+}
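As a side illustration (not part of the patch, all names hypothetical), the nested time checks above reduce to a three-way decision once each jiffies comparison is treated as a boolean:

#include <stdbool.h>

/* Illustrative restatement of the decision taken in bfq_update_wr_data(). */
enum wr_action { WR_KEEP, WR_END, WR_BACK_TO_INTERACTIVE };

static enum wr_action wr_update_action(bool in_large_burst,
				       bool cur_wr_period_elapsed,
				       bool cur_wr_is_soft_rt,
				       bool interactive_wr_also_elapsed)
{
	if (in_large_burst)
		return WR_END;		/* bursty queues lose weight raising */
	if (!cur_wr_period_elapsed)
		return WR_KEEP;		/* current period still running */
	if (!cur_wr_is_soft_rt || interactive_wr_also_elapsed)
		return WR_END;
	/* soft-rt period over, interactive period still valid: switch back */
	return WR_BACK_TO_INTERACTIVE;
}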
|
|
+
|
|
+/*
|
|
+ * Dispatch one request from bfqq, moving it to the request queue
|
|
+ * dispatch list.
|
|
+ */
|
|
+static int bfq_dispatch_request(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq)
|
|
+{
|
|
+ int dispatched = 0;
|
|
+ struct request *rq = bfqq->next_rq;
|
|
+ unsigned long service_to_charge;
|
|
+
|
|
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+ BUG_ON(!rq);
|
|
+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
|
|
+
|
|
+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+
|
|
+ bfq_bfqq_served(bfqq, service_to_charge);
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+
|
|
+ bfq_dispatch_insert(bfqd->queue, rq);
|
|
+
|
|
+ /*
|
|
+ * If weight raising has to terminate for bfqq, then next
|
|
+ * function causes an immediate update of bfqq's weight,
|
|
+ * without waiting for next activation. As a consequence, on
|
|
+ * expiration, bfqq will be timestamped as if it had never been
|
|
+ * weight-raised during this service slot, even if it has
|
|
+ * received part or even most of the service as a
|
|
+ * weight-raised queue. This inflates bfqq's timestamps, which
|
|
+ * is beneficial, as bfqq is then more willing to leave the
|
|
+ * device immediately to possible other weight-raised queues.
|
|
+ */
|
|
+ bfq_update_wr_data(bfqd, bfqq);
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "dispatched %u sec req (%llu), budg left %d",
|
|
+ blk_rq_sectors(rq),
|
|
+ (unsigned long long) blk_rq_pos(rq),
|
|
+ bfq_bfqq_budget_left(bfqq));
|
|
+
|
|
+ dispatched++;
|
|
+
|
|
+ if (!bfqd->in_service_bic) {
|
|
+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
|
|
+ bfqd->in_service_bic = RQ_BIC(rq);
|
|
+ }
|
|
+
|
|
+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
|
|
+ goto expire;
|
|
+
|
|
+ return dispatched;
|
|
+
|
|
+expire:
|
|
+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
|
|
+ return dispatched;
|
|
+}
|
|
+
|
|
+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
|
|
+{
|
|
+ int dispatched = 0;
|
|
+
|
|
+ while (bfqq->next_rq) {
|
|
+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
|
|
+ dispatched++;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!list_empty(&bfqq->fifo));
|
|
+ return dispatched;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Drain our current requests.
|
|
+ * Used for barriers and when switching io schedulers on-the-fly.
|
|
+ */
|
|
+static int bfq_forced_dispatch(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_queue *bfqq, *n;
|
|
+ struct bfq_service_tree *st;
|
|
+ int dispatched = 0;
|
|
+
|
|
+ bfqq = bfqd->in_service_queue;
|
|
+ if (bfqq)
|
|
+ __bfq_bfqq_expire(bfqd, bfqq);
|
|
+
|
|
+ /*
|
|
+ * Loop through classes, and be careful to leave the scheduler
|
|
+ * in a consistent state, as feedback mechanisms and vtime
|
|
+ * updates cannot be disabled during the process.
|
|
+ */
|
|
+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
|
|
+ st = bfq_entity_service_tree(&bfqq->entity);
|
|
+
|
|
+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
|
|
+
|
|
+ bfqq->max_budget = bfq_max_budget(bfqd);
|
|
+ bfq_forget_idle(st);
|
|
+ }
|
|
+
|
|
+ BUG_ON(bfqd->busy_queues != 0);
|
|
+
|
|
+ return dispatched;
|
|
+}
|
|
+
|
|
+static int bfq_dispatch_requests(struct request_queue *q, int force)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
|
|
+
|
|
+ if (bfqd->busy_queues == 0)
|
|
+ return 0;
|
|
+
|
|
+ if (unlikely(force))
|
|
+ return bfq_forced_dispatch(bfqd);
|
|
+
|
|
+ /*
|
|
+ * Force device to serve one request at a time if
|
|
+ * strict_guarantees is true. Forcing this service scheme is
|
|
+ * currently the ONLY way to guarantee that the request
|
|
+ * service order enforced by the scheduler is respected by a
|
|
+ * queueing device. Otherwise the device is free even to make
|
|
+ * some unlucky request wait for as long as the device
|
|
+ * wishes.
|
|
+ *
|
|
+ * Of course, serving one request at a time may cause loss of
|
|
+ * throughput.
|
|
+ */
|
|
+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
|
|
+ return 0;
|
|
+
|
|
+ bfqq = bfq_select_queue(bfqd);
|
|
+ if (!bfqq)
|
|
+ return 0;
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
|
|
+
|
|
+ BUG_ON(bfq_bfqq_wait_request(bfqq));
|
|
+
|
|
+ if (!bfq_dispatch_request(bfqd, bfqq))
|
|
+ return 0;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
|
|
+ bfq_bfqq_sync(bfqq) ? "sync" : "async");
|
|
+
|
|
+ BUG_ON(bfqq->next_rq == NULL &&
|
|
+ bfqq->entity.budget < bfqq->entity.service);
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Task holds one reference to the queue, dropped when task exits. Each rq
|
|
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
|
|
+ *
|
|
+ * Queue lock must be held here.
|
|
+ */
|
|
+static void bfq_put_queue(struct bfq_queue *bfqq)
|
|
+{
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ struct bfq_group *bfqg = bfqq_group(bfqq);
|
|
+#endif
|
|
+
|
|
+ BUG_ON(bfqq->ref <= 0);
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
|
|
+ bfqq->ref--;
|
|
+ if (bfqq->ref)
|
|
+ return;
|
|
+
|
|
+ BUG_ON(rb_first(&bfqq->sort_list));
|
|
+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
|
|
+ BUG_ON(bfqq->entity.tree);
|
|
+ BUG_ON(bfq_bfqq_busy(bfqq));
|
|
+
|
|
+ if (bfq_bfqq_sync(bfqq))
|
|
+ /*
|
|
+ * The fact that this queue is being destroyed does not
|
|
+ * invalidate the fact that this queue may have been
|
|
+ * activated during the current burst. As a consequence,
|
|
+ * although the queue does not exist anymore, and hence
|
|
+ * needs to be removed from the burst list if it is there,
|
|
+ * the burst size must not be decremented.
|
|
+ */
|
|
+ hlist_del_init(&bfqq->burst_list_node);
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
|
|
+
|
|
+ kmem_cache_free(bfq_pool, bfqq);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ bfqg_put(bfqg);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_queue *__bfqq, *next;
|
|
+
|
|
+ /*
|
|
+ * If this queue was scheduled to merge with another queue, be
|
|
+ * sure to drop the reference taken on that queue (and others in
|
|
+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
|
|
+ */
|
|
+ __bfqq = bfqq->new_bfqq;
|
|
+ while (__bfqq) {
|
|
+ if (__bfqq == bfqq)
|
|
+ break;
|
|
+ next = __bfqq->new_bfqq;
|
|
+ bfq_put_queue(__bfqq);
|
|
+ __bfqq = next;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ if (bfqq == bfqd->in_service_queue) {
|
|
+ __bfq_bfqq_expire(bfqd, bfqq);
|
|
+ bfq_schedule_dispatch(bfqd);
|
|
+ }
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
|
|
+
|
|
+ bfq_put_cooperator(bfqq);
|
|
+
|
|
+ bfq_put_queue(bfqq);
|
|
+}
|
|
+
|
|
+static void bfq_init_icq(struct io_cq *icq)
|
|
+{
|
|
+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
|
|
+}
|
|
+
|
|
+static void bfq_exit_icq(struct io_cq *icq)
|
|
+{
|
|
+ struct bfq_io_cq *bic = icq_to_bic(icq);
|
|
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
|
|
+
|
|
+ if (bic_to_bfqq(bic, false)) {
|
|
+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
|
|
+ bic_set_bfqq(bic, NULL, false);
|
|
+ }
|
|
+
|
|
+ if (bic_to_bfqq(bic, true)) {
|
|
+ /*
|
|
+ * If the bic is using a shared queue, put the reference
|
|
+ * taken on the io_context when the bic started using a
|
|
+ * shared bfq_queue.
|
|
+ */
|
|
+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
|
|
+ put_io_context(icq->ioc);
|
|
+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
|
|
+ bic_set_bfqq(bic, NULL, true);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update the entity prio values; note that the new values will not
|
|
+ * be used until the next (re)activation.
|
|
+ */
|
|
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
|
|
+ struct bfq_io_cq *bic)
|
|
+{
|
|
+ struct task_struct *tsk = current;
|
|
+ int ioprio_class;
|
|
+
|
|
+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
|
|
+ switch (ioprio_class) {
|
|
+ default:
|
|
+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
|
|
+ "bfq: bad prio class %d\n", ioprio_class);
|
|
+ case IOPRIO_CLASS_NONE:
|
|
+ /*
|
|
+ * No prio set, inherit CPU scheduling settings.
|
|
+ */
|
|
+ bfqq->new_ioprio = task_nice_ioprio(tsk);
|
|
+ bfqq->new_ioprio_class = task_nice_ioclass(tsk);
|
|
+ break;
|
|
+ case IOPRIO_CLASS_RT:
|
|
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
|
|
+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
|
|
+ break;
|
|
+ case IOPRIO_CLASS_BE:
|
|
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
|
|
+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
|
|
+ break;
|
|
+ case IOPRIO_CLASS_IDLE:
|
|
+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
|
|
+ bfqq->new_ioprio = 7;
|
|
+ bfq_clear_bfqq_idle_window(bfqq);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
|
|
+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
|
|
+ bfqq->new_ioprio);
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
|
|
+ bfqq->entity.prio_changed = 1;
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "set_next_ioprio_data: bic_class %d prio %d class %d",
|
|
+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
|
|
+}
|
|
+
|
|
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
|
|
+{
|
|
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
|
|
+ struct bfq_queue *bfqq;
|
|
+ unsigned long uninitialized_var(flags);
|
|
+ int ioprio = bic->icq.ioc->ioprio;
|
|
+
|
|
+ /*
|
|
+ * This condition may trigger on a newly created bic; be sure to
|
|
+ * drop the lock before returning.
|
|
+ */
|
|
+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
|
|
+ return;
|
|
+
|
|
+ bic->ioprio = ioprio;
|
|
+
|
|
+ bfqq = bic_to_bfqq(bic, false);
|
|
+ if (bfqq) {
|
|
+ bfq_put_queue(bfqq);
|
|
+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
|
|
+ bic_set_bfqq(bic, bfqq, false);
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "check_ioprio_change: bfqq %p %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ }
|
|
+
|
|
+ bfqq = bic_to_bfqq(bic, true);
|
|
+ if (bfqq)
|
|
+ bfq_set_next_ioprio_data(bfqq, bic);
|
|
+}
|
|
+
|
|
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ struct bfq_io_cq *bic, pid_t pid, int is_sync)
|
|
+{
|
|
+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
|
|
+ INIT_LIST_HEAD(&bfqq->fifo);
|
|
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
|
|
+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
|
|
+
|
|
+ bfqq->ref = 0;
|
|
+ bfqq->bfqd = bfqd;
|
|
+
|
|
+ if (bic)
|
|
+ bfq_set_next_ioprio_data(bfqq, bic);
|
|
+
|
|
+ if (is_sync) {
|
|
+ if (!bfq_class_idle(bfqq))
|
|
+ bfq_mark_bfqq_idle_window(bfqq);
|
|
+ bfq_mark_bfqq_sync(bfqq);
|
|
+ bfq_mark_bfqq_just_created(bfqq);
|
|
+ } else
|
|
+ bfq_clear_bfqq_sync(bfqq);
|
|
+ bfq_mark_bfqq_IO_bound(bfqq);
|
|
+
|
|
+ /* Tentative initial value to trade off between thr and lat */
|
|
+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
|
|
+ bfqq->pid = pid;
|
|
+
|
|
+ bfqq->wr_coeff = 1;
|
|
+ bfqq->last_wr_start_finish = jiffies;
|
|
+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
|
|
+ bfqq->budget_timeout = bfq_smallest_from_now();
|
|
+ bfqq->split_time = bfq_smallest_from_now();
|
|
+
|
|
+ /*
|
|
+ * Set to the value for which bfqq will not be deemed as
|
|
+ * soft rt when it becomes backlogged.
|
|
+ */
|
|
+ bfqq->soft_rt_next_start = bfq_greatest_from_now();
|
|
+
|
|
+ /* first request is almost certainly seeky */
|
|
+ bfqq->seek_history = 1;
|
|
+}
|
|
+
|
|
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
|
|
+ struct bfq_group *bfqg,
|
|
+ int ioprio_class, int ioprio)
|
|
+{
|
|
+ switch (ioprio_class) {
|
|
+ case IOPRIO_CLASS_RT:
|
|
+ return &bfqg->async_bfqq[0][ioprio];
|
|
+ case IOPRIO_CLASS_NONE:
|
|
+ ioprio = IOPRIO_NORM;
|
|
+ /* fall through */
|
|
+ case IOPRIO_CLASS_BE:
|
|
+ return &bfqg->async_bfqq[1][ioprio];
|
|
+ case IOPRIO_CLASS_IDLE:
|
|
+ return &bfqg->async_idle_bfqq;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
|
|
+ struct bio *bio, bool is_sync,
|
|
+ struct bfq_io_cq *bic)
|
|
+{
|
|
+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
|
|
+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
|
|
+ struct bfq_queue **async_bfqq = NULL;
|
|
+ struct bfq_queue *bfqq;
|
|
+ struct bfq_group *bfqg;
|
|
+
|
|
+ rcu_read_lock();
|
|
+
|
|
+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
|
|
+ if (!bfqg) {
|
|
+ bfqq = &bfqd->oom_bfqq;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (!is_sync) {
|
|
+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
|
|
+ ioprio);
|
|
+ bfqq = *async_bfqq;
|
|
+ if (bfqq)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
|
|
+ bfqd->queue->node);
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
|
|
+ is_sync);
|
|
+ bfq_init_entity(&bfqq->entity, bfqg);
|
|
+ bfq_log_bfqq(bfqd, bfqq, "allocated");
|
|
+ } else {
|
|
+ bfqq = &bfqd->oom_bfqq;
|
|
+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Pin the queue now that it's allocated, scheduler exit will
|
|
+ * prune it.
|
|
+ */
|
|
+ if (async_bfqq) {
|
|
+ bfqq->ref++; /*
|
|
+ * Extra group reference, w.r.t. sync
|
|
+ * queue. This extra reference is removed
|
|
+ * only if bfqq->bfqg disappears, to
|
|
+ * guarantee that this queue is not freed
|
|
+ * until its group goes away.
|
|
+ */
|
|
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ *async_bfqq = bfqq;
|
|
+ }
|
|
+
|
|
+out:
|
|
+ bfqq->ref++;
|
|
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
|
|
+ rcu_read_unlock();
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
|
|
+ struct bfq_io_cq *bic)
|
|
+{
|
|
+ struct bfq_ttime *ttime = &bic->ttime;
|
|
+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
|
|
+
|
|
+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
|
|
+
|
|
+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
|
|
+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
|
|
+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
|
|
+ ttime->ttime_samples);
|
|
+}
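The three updates above are a fixed-point exponentially weighted moving average with weight 1/8: ttime_samples converges towards 256, ttime_total keeps a decayed sum of 256 * elapsed, and ttime_mean is their rounded ratio (elapsed having already been clamped to twice bfq_slice_idle). A minimal user-space sketch of the same arithmetic, illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Fixed-point EWMA in the style of bfq_update_io_thinktime(). */
struct ewma {
	uint64_t samples;	/* converges towards 256 */
	uint64_t total;		/* decayed sum of 256 * elapsed */
};

static uint64_t ewma_update(struct ewma *e, uint64_t elapsed)
{
	e->samples = (7 * e->samples + 256) / 8;
	e->total = (7 * e->total + 256 * elapsed) / 8;
	return (e->total + 128) / e->samples;	/* rounded mean */
}

int main(void)
{
	struct ewma e = { 0, 0 };
	const uint64_t think_ns[] = { 1000, 1000, 1000, 8000, 1000, 1000 };

	for (unsigned int i = 0; i < 6; i++)
		printf("sample %llu ns -> mean %llu ns\n",
		       (unsigned long long)think_ns[i],
		       (unsigned long long)ewma_update(&e, think_ns[i]));
	return 0;
}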
|
|
+
|
|
+static void
|
|
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ struct request *rq)
|
|
+{
|
|
+ bfqq->seek_history <<= 1;
|
|
+ bfqq->seek_history |=
|
|
+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
|
|
+ (!blk_queue_nonrot(bfqd->queue) ||
|
|
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
|
|
+}
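bfqq->seek_history thus works as a shift register: each request shifts the word left and stores, in the lowest bit, whether that request looked seeky, i.e. far from the previous position and, on non-rotational devices, also small; the BFQQ_SEEKY() classification used elsewhere is derived from how many of those recent bits are set. A simplified user-space sketch that keeps only the distance test (threshold and names made up; not part of the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEEK_THR_SECTORS	8192	/* hypothetical distance threshold */

/* Record, in bit 0, whether the newest request was a seek. */
static uint32_t update_seek_history(uint32_t history, uint64_t last_pos,
				    uint64_t new_pos)
{
	uint64_t dist = new_pos > last_pos ? new_pos - last_pos
					   : last_pos - new_pos;
	bool seeky = dist > SEEK_THR_SECTORS;

	return (history << 1) | seeky;
}

int main(void)
{
	uint32_t h = 1;	/* first request assumed seeky, as in bfq_init_bfqq() */
	const uint64_t pos[] = { 0, 8, 16, 1000000, 1000008 };

	for (unsigned int i = 1; i < 5; i++)
		h = update_seek_history(h, pos[i - 1], pos[i]);
	printf("seek history: 0x%x (%d recent seeky requests)\n",
	       h, __builtin_popcount(h));
	return 0;
}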
|
|
+
|
|
+/*
|
|
+ * Disable idle window if the process thinks too long or seeks so much that
|
|
+ * it doesn't matter.
|
|
+ */
|
|
+static void bfq_update_idle_window(struct bfq_data *bfqd,
|
|
+ struct bfq_queue *bfqq,
|
|
+ struct bfq_io_cq *bic)
|
|
+{
|
|
+ int enable_idle;
|
|
+
|
|
+ /* Don't idle for async or idle io prio class. */
|
|
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
|
|
+ return;
|
|
+
|
|
+ /* Idle window just restored, statistics are meaningless. */
|
|
+ if (time_is_after_eq_jiffies(bfqq->split_time +
|
|
+ bfqd->bfq_wr_min_idle_time))
|
|
+ return;
|
|
+
|
|
+ enable_idle = bfq_bfqq_idle_window(bfqq);
|
|
+
|
|
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
|
|
+ bfqd->bfq_slice_idle == 0 ||
|
|
+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
|
|
+ bfqq->wr_coeff == 1))
|
|
+ enable_idle = 0;
|
|
+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
|
|
+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
|
|
+ bfqq->wr_coeff == 1)
|
|
+ enable_idle = 0;
|
|
+ else
|
|
+ enable_idle = 1;
|
|
+ }
|
|
+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
|
|
+ enable_idle);
|
|
+
|
|
+ if (enable_idle)
|
|
+ bfq_mark_bfqq_idle_window(bfqq);
|
|
+ else
|
|
+ bfq_clear_bfqq_idle_window(bfqq);
|
|
+}
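Stripped of the kernel plumbing, the decision above reduces to the boolean function sketched below (hypothetical names, illustrative only, not part of the patch): idling is never enabled for an inactive io_context, a zero slice_idle, or a seeky, non-weight-raised queue on a queueing device; otherwise, with enough samples, it follows the measured think time, and with too few samples the previous state is kept.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative restatement of the enable_idle logic above. */
static bool idle_window_enabled(bool ioc_still_active, uint64_t slice_idle,
				bool hw_tag, bool seeky, bool weight_raised,
				bool ttime_samples_valid, uint64_t ttime_mean,
				bool currently_enabled)
{
	if (!ioc_still_active || slice_idle == 0 ||
	    (hw_tag && seeky && !weight_raised))
		return false;
	if (ttime_samples_valid)
		return !(ttime_mean > slice_idle && !weight_raised);
	return currently_enabled;	/* not enough samples: keep old state */
}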
|
|
+
|
|
+/*
|
|
+ * Called when a new fs request (rq) is added to bfqq. Check if there's
|
|
+ * something we should do about it.
|
|
+ */
|
|
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ struct request *rq)
|
|
+{
|
|
+ struct bfq_io_cq *bic = RQ_BIC(rq);
|
|
+
|
|
+ if (rq->cmd_flags & REQ_META)
|
|
+ bfqq->meta_pending++;
|
|
+
|
|
+ bfq_update_io_thinktime(bfqd, bic);
|
|
+ bfq_update_io_seektime(bfqd, bfqq, rq);
|
|
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
|
|
+ !BFQQ_SEEKY(bfqq))
|
|
+ bfq_update_idle_window(bfqd, bfqq, bic);
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "rq_enqueued: idle_window=%d (seeky %d)",
|
|
+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
|
|
+
|
|
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
|
|
+
|
|
+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
|
|
+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
|
|
+ blk_rq_sectors(rq) < 32;
|
|
+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
|
|
+
|
|
+ /*
|
|
+ * There is just this request queued: if the request
|
|
+ * is small and the queue is not to be expired, then
|
|
+ * just exit.
|
|
+ *
|
|
+ * In this way, if the device is being idled to wait
|
|
+ * for a new request from the in-service queue, we
|
|
+ * avoid unplugging the device and committing the
|
|
+ * device to serve just a small request. On the
|
|
+ * contrary, we wait for the block layer to decide
|
|
+ * when to unplug the device: hopefully, new requests
|
|
+ * will be merged to this one quickly, then the device
|
|
+ * will be unplugged and larger requests will be
|
|
+ * dispatched.
|
|
+ */
|
|
+ if (small_req && !budget_timeout)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * A large enough request arrived, or the queue is to
|
|
+ * be expired: in both cases disk idling is to be
|
|
+ * stopped, so clear wait_request flag and reset
|
|
+ * timer.
|
|
+ */
|
|
+ bfq_clear_bfqq_wait_request(bfqq);
|
|
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
|
|
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
|
|
+
|
|
+ /*
|
|
+ * The queue is not empty, because a new request just
|
|
+ * arrived. Hence we can safely expire the queue, in
|
|
+ * case of budget timeout, without risking that the
|
|
+ * timestamps of the queue are not updated correctly.
|
|
+ * See [1] for more details.
|
|
+ */
|
|
+ if (budget_timeout)
|
|
+ bfq_bfqq_expire(bfqd, bfqq, false,
|
|
+ BFQ_BFQQ_BUDGET_TIMEOUT);
|
|
+
|
|
+ /*
|
|
+ * Let the request rip immediately, or let a new queue be
|
|
+ * selected if bfqq has just been expired.
|
|
+ */
|
|
+ __blk_run_queue(bfqd->queue);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bfq_insert_request(struct request_queue *q, struct request *rq)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
|
|
+
|
|
+ assert_spin_locked(bfqd->queue->queue_lock);
|
|
+
|
|
+ /*
|
|
+ * An unplug may trigger a requeue of a request from the device
|
|
+ * driver: make sure we are in process context while trying to
|
|
+ * merge two bfq_queues.
|
|
+ */
|
|
+ if (!in_interrupt()) {
|
|
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
|
|
+ if (new_bfqq) {
|
|
+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
|
|
+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
|
|
+ /*
|
|
+ * Release the request's reference to the old bfqq
|
|
+ * and make sure one is taken to the shared queue.
|
|
+ */
|
|
+ new_bfqq->allocated[rq_data_dir(rq)]++;
|
|
+ bfqq->allocated[rq_data_dir(rq)]--;
|
|
+ new_bfqq->ref++;
|
|
+ bfq_clear_bfqq_just_created(bfqq);
|
|
+ bfq_put_queue(bfqq);
|
|
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
|
|
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
|
|
+ bfqq, new_bfqq);
|
|
+ rq->elv.priv[1] = new_bfqq;
|
|
+ bfqq = new_bfqq;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfq_add_request(rq);
|
|
+
|
|
+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
|
|
+ list_add_tail(&rq->queuelist, &bfqq->fifo);
|
|
+
|
|
+ bfq_rq_enqueued(bfqd, bfqq, rq);
|
|
+}
|
|
+
|
|
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
|
|
+{
|
|
+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
|
|
+ bfqd->rq_in_driver);
|
|
+
|
|
+ if (bfqd->hw_tag == 1)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * This sample is valid if the number of outstanding requests
|
|
+ * is large enough to allow a queueing behavior. Note that the
|
|
+ * sum is not exact, as it's not taking into account deactivated
|
|
+ * requests.
|
|
+ */
|
|
+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
|
|
+ return;
|
|
+
|
|
+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
|
|
+ return;
|
|
+
|
|
+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
|
|
+ bfqd->max_rq_in_driver = 0;
|
|
+ bfqd->hw_tag_samples = 0;
|
|
+}
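In other words, the probe refuses to draw a conclusion until it has seen BFQ_HW_QUEUE_SAMPLES valid samples taken under sufficient load, and then reports a queueing device only if the driver ever held more than BFQ_HW_QUEUE_THRESHOLD requests at once. A user-space restatement of that sampling logic (constants below are illustrative stand-ins for the ones defined in bfq.h; not part of the patch):

#include <stdio.h>

#define HW_QUEUE_THRESHOLD	4	/* illustrative value */
#define HW_QUEUE_SAMPLES	32	/* illustrative value */

struct hw_tag_probe {
	int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing */
	int max_in_driver;
	int samples;
};

/* Mirror of the sampling steps performed by bfq_update_hw_tag(). */
static void hw_tag_sample(struct hw_tag_probe *p, int in_driver, int queued)
{
	if (in_driver > p->max_in_driver)
		p->max_in_driver = in_driver;
	if (p->hw_tag == 1)
		return;
	if (in_driver + queued < HW_QUEUE_THRESHOLD)
		return;			/* not enough load: sample invalid */
	if (p->samples++ < HW_QUEUE_SAMPLES)
		return;			/* keep collecting samples */
	p->hw_tag = p->max_in_driver > HW_QUEUE_THRESHOLD;
	p->max_in_driver = 0;
	p->samples = 0;
}

int main(void)
{
	struct hw_tag_probe p = { -1, 0, 0 };

	for (int i = 0; i < 40; i++)
		hw_tag_sample(&p, 8, 2);	/* 8 requests kept in flight */
	printf("hw_tag = %d\n", p.hw_tag);	/* 1: device queues requests */
	return 0;
}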
|
|
+
|
|
+static void bfq_completed_request(struct request_queue *q, struct request *rq)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
|
+ struct bfq_data *bfqd = bfqq->bfqd;
|
|
+ u64 now_ns;
|
|
+ u32 delta_us;
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
|
|
+ blk_rq_sectors(rq));
|
|
+
|
|
+ assert_spin_locked(bfqd->queue->queue_lock);
|
|
+ bfq_update_hw_tag(bfqd);
|
|
+
|
|
+ BUG_ON(!bfqd->rq_in_driver);
|
|
+ BUG_ON(!bfqq->dispatched);
|
|
+ bfqd->rq_in_driver--;
|
|
+ bfqq->dispatched--;
|
|
+ bfqg_stats_update_completion(bfqq_group(bfqq),
|
|
+ rq_start_time_ns(rq),
|
|
+ rq_io_start_time_ns(rq), req_op(rq),
|
|
+ rq->cmd_flags);
|
|
+
|
|
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+ /*
|
|
+ * Set budget_timeout (which we overload to store the
|
|
+ * time at which the queue remains with no backlog and
|
|
+ * no outstanding request; used by the weight-raising
|
|
+ * mechanism).
|
|
+ */
|
|
+ bfqq->budget_timeout = jiffies;
|
|
+
|
|
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
|
+ &bfqd->queue_weights_tree);
|
|
+ }
|
|
+
|
|
+ now_ns = ktime_get_ns();
|
|
+
|
|
+ RQ_BIC(rq)->ttime.last_end_request = now_ns;
|
|
+
|
|
+ /*
|
|
+ * Using us instead of ns, to get a reasonable precision in
|
|
+ * computing the rate in the next check.
|
|
+ */
|
|
+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
|
|
+
|
|
+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
|
|
+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
|
|
+ (USEC_PER_SEC*
|
|
+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
|
|
+ >>BFQ_RATE_SHIFT,
|
|
+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
|
|
+
|
|
+ /*
|
|
+ * If the request took rather long to complete, and, according
|
|
+ * to the maximum request size recorded, this completion latency
|
|
+ * implies that the request was certainly served at a very low
|
|
+ * rate (less than 1M sectors/sec), then the whole observation
|
|
+ * interval that lasts up to this time instant cannot be a
|
|
+ * valid time interval for computing a new peak rate. Invoke
|
|
+ * bfq_update_rate_reset to have the following three steps
|
|
+ * taken:
|
|
+ * - close the observation interval at the last (previous)
|
|
+ * request dispatch or completion
|
|
+ * - compute rate, if possible, for that observation interval
|
|
+ * - reset to zero samples, which will trigger a proper
|
|
+ * re-initialization of the observation interval on next
|
|
+ * dispatch
|
|
+ */
|
|
+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
|
|
+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
|
|
+ 1UL<<(BFQ_RATE_SHIFT - 10))
|
|
+ bfq_update_rate_reset(bfqd, NULL);
|
|
+ bfqd->last_completion = now_ns;
|
|
+
|
|
+ /*
|
|
+ * If we are waiting to discover whether the request pattern
|
|
+ * of the task associated with the queue is actually
|
|
+ * isochronous, and both requisites for this condition to hold
|
|
+ * are now satisfied, then compute soft_rt_next_start (see the
|
|
+ * comments on the function bfq_bfqq_softrt_next_start()). We
|
|
+ * schedule this delayed check when bfqq expires, if it still
|
|
+ * has in-flight requests.
|
|
+ */
|
|
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
|
|
+ RB_EMPTY_ROOT(&bfqq->sort_list))
|
|
+ bfqq->soft_rt_next_start =
|
|
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
|
|
+
|
|
+ /*
|
|
+ * If this is the in-service queue, check if it needs to be expired,
|
|
+ * or if we want to idle in case it has no pending requests.
|
|
+ */
|
|
+ if (bfqd->in_service_queue == bfqq) {
|
|
+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
|
|
+ bfq_arm_slice_timer(bfqd);
|
|
+ goto out;
|
|
+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
|
|
+ bfq_bfqq_expire(bfqd, bfqq, false,
|
|
+ BFQ_BFQQ_BUDGET_TIMEOUT);
|
|
+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
|
|
+ (bfqq->dispatched == 0 ||
|
|
+ !bfq_bfqq_may_idle(bfqq)))
|
|
+ bfq_bfqq_expire(bfqd, bfqq, false,
|
|
+ BFQ_BFQQ_NO_MORE_REQUESTS);
|
|
+ }
|
|
+
|
|
+ if (!bfqd->rq_in_driver)
|
|
+ bfq_schedule_dispatch(bfqd);
|
|
+
|
|
+out:
|
|
+ return;
|
|
+}
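Both the check and the log line above handle rates stored in fixed point, as (sectors << BFQ_RATE_SHIFT) per microsecond; multiplying by USEC_PER_SEC and shifting right by BFQ_RATE_SHIFT, exactly as the log line does, converts such a value back to plain sectors per second. A tiny conversion sketch (the shift value below is only an assumed stand-in; not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define RATE_SHIFT	16		/* stand-in for BFQ_RATE_SHIFT */
#define USEC_PER_SEC	1000000ULL

/* Convert a fixed-point rate, (sectors << RATE_SHIFT) per us, to sectors/s. */
static uint64_t rate_to_sectors_per_sec(uint64_t rate_fp)
{
	return (USEC_PER_SEC * rate_fp) >> RATE_SHIFT;
}

int main(void)
{
	/* Example: a 512-sector request completed 1000 us after the last one. */
	uint64_t size = 512, delta_us = 1000;
	uint64_t rate_fp = (size << RATE_SHIFT) / delta_us;

	printf("observed rate: %llu sectors/s\n",
	       (unsigned long long)rate_to_sectors_per_sec(rate_fp));
	return 0;
}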
|
|
+
|
|
+static int __bfq_may_queue(struct bfq_queue *bfqq)
|
|
+{
|
|
+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
|
|
+ bfq_clear_bfqq_must_alloc(bfqq);
|
|
+ return ELV_MQUEUE_MUST;
|
|
+ }
|
|
+
|
|
+ return ELV_MQUEUE_MAY;
|
|
+}
|
|
+
|
|
+static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct task_struct *tsk = current;
|
|
+ struct bfq_io_cq *bic;
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ /*
|
|
+ * Don't force setup of a queue from here, as a call to may_queue
|
|
+ * does not necessarily imply that a request actually will be
|
|
+ * queued. So just look up a possibly existing queue, or return
|
|
+ * 'may queue' if that fails.
|
|
+ */
|
|
+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
|
|
+ if (!bic)
|
|
+ return ELV_MQUEUE_MAY;
|
|
+
|
|
+ bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
|
|
+ if (bfqq)
|
|
+ return __bfq_may_queue(bfqq);
|
|
+
|
|
+ return ELV_MQUEUE_MAY;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Queue lock held here.
|
|
+ */
|
|
+static void bfq_put_request(struct request *rq)
|
|
+{
|
|
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
|
+
|
|
+ if (bfqq) {
|
|
+ const int rw = rq_data_dir(rq);
|
|
+
|
|
+ BUG_ON(!bfqq->allocated[rw]);
|
|
+ bfqq->allocated[rw]--;
|
|
+
|
|
+ rq->elv.priv[0] = NULL;
|
|
+ rq->elv.priv[1] = NULL;
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ bfq_put_queue(bfqq);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
|
|
+ * was the last process referring to that bfqq.
|
|
+ */
|
|
+static struct bfq_queue *
|
|
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
|
|
+{
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
|
|
+
|
|
+ put_io_context(bic->icq.ioc);
|
|
+
|
|
+ if (bfqq_process_refs(bfqq) == 1) {
|
|
+ bfqq->pid = current->pid;
|
|
+ bfq_clear_bfqq_coop(bfqq);
|
|
+ bfq_clear_bfqq_split_coop(bfqq);
|
|
+ return bfqq;
|
|
+ }
|
|
+
|
|
+ bic_set_bfqq(bic, NULL, 1);
|
|
+
|
|
+ bfq_put_cooperator(bfqq);
|
|
+
|
|
+ bfq_put_queue(bfqq);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate bfq data structures associated with this request.
|
|
+ */
|
|
+static int bfq_set_request(struct request_queue *q, struct request *rq,
|
|
+ struct bio *bio, gfp_t gfp_mask)
|
|
+{
|
|
+ struct bfq_data *bfqd = q->elevator->elevator_data;
|
|
+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
|
|
+ const int rw = rq_data_dir(rq);
|
|
+ const int is_sync = rq_is_sync(rq);
|
|
+ struct bfq_queue *bfqq;
|
|
+ unsigned long flags;
|
|
+ bool split = false;
|
|
+
|
|
+ spin_lock_irqsave(q->queue_lock, flags);
|
|
+ bfq_check_ioprio_change(bic, bio);
|
|
+
|
|
+ if (!bic)
|
|
+ goto queue_fail;
|
|
+
|
|
+ bfq_bic_update_cgroup(bic, bio);
|
|
+
|
|
+new_queue:
|
|
+ bfqq = bic_to_bfqq(bic, is_sync);
|
|
+ if (!bfqq || bfqq == &bfqd->oom_bfqq) {
|
|
+ if (bfqq)
|
|
+ bfq_put_queue(bfqq);
|
|
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
|
|
+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
|
|
+
|
|
+ bic_set_bfqq(bic, bfqq, is_sync);
|
|
+ if (split && is_sync) {
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "set_request: was_in_list %d "
|
|
+ "was_in_large_burst %d "
|
|
+ "large burst in progress %d",
|
|
+ bic->was_in_burst_list,
|
|
+ bic->saved_in_large_burst,
|
|
+ bfqd->large_burst);
|
|
+
|
|
+ if ((bic->was_in_burst_list && bfqd->large_burst) ||
|
|
+ bic->saved_in_large_burst) {
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "set_request: marking in "
|
|
+ "large burst");
|
|
+ bfq_mark_bfqq_in_large_burst(bfqq);
|
|
+ } else {
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "set_request: clearing in "
|
|
+ "large burst");
|
|
+ bfq_clear_bfqq_in_large_burst(bfqq);
|
|
+ if (bic->was_in_burst_list)
|
|
+ hlist_add_head(&bfqq->burst_list_node,
|
|
+ &bfqd->burst_list);
|
|
+ }
|
|
+ bfqq->split_time = jiffies;
|
|
+ }
|
|
+ } else {
|
|
+ /* If the queue was seeky for too long, break it apart. */
|
|
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
|
|
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
|
|
+
|
|
+ /* Update bic before losing reference to bfqq */
|
|
+ if (bfq_bfqq_in_large_burst(bfqq))
|
|
+ bic->saved_in_large_burst = true;
|
|
+
|
|
+ bfqq = bfq_split_bfqq(bic, bfqq);
|
|
+ split = true;
|
|
+ if (!bfqq)
|
|
+ goto new_queue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfqq->allocated[rw]++;
|
|
+ bfqq->ref++;
|
|
+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
|
|
+
|
|
+ rq->elv.priv[0] = bic;
|
|
+ rq->elv.priv[1] = bfqq;
|
|
+
|
|
+ /*
|
|
+ * If a bfq_queue has only one process reference, it is owned
|
|
+ * by only one bfq_io_cq: we can set the bic field of the
|
|
+ * bfq_queue to the address of that structure. Also, if the
|
|
+ * queue has just been split, mark a flag so that the
|
|
+ * information is available to the other scheduler hooks.
|
|
+ */
|
|
+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
|
|
+ bfqq->bic = bic;
|
|
+ if (split) {
|
|
+ /*
|
|
+ * If the queue has just been split from a shared
|
|
+ * queue, restore the idle window and the possible
|
|
+ * weight raising period.
|
|
+ */
|
|
+ bfq_bfqq_resume_state(bfqq, bic);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
|
|
+ bfq_handle_burst(bfqd, bfqq);
|
|
+
|
|
+ spin_unlock_irqrestore(q->queue_lock, flags);
|
|
+
|
|
+ return 0;
|
|
+
|
|
+queue_fail:
|
|
+ bfq_schedule_dispatch(bfqd);
|
|
+ spin_unlock_irqrestore(q->queue_lock, flags);
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static void bfq_kick_queue(struct work_struct *work)
|
|
+{
|
|
+ struct bfq_data *bfqd =
|
|
+ container_of(work, struct bfq_data, unplug_work);
|
|
+ struct request_queue *q = bfqd->queue;
|
|
+
|
|
+ spin_lock_irq(q->queue_lock);
|
|
+ __blk_run_queue(q);
|
|
+ spin_unlock_irq(q->queue_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Handler of the expiration of the timer running if the in-service queue
|
|
+ * is idling inside its time slice.
|
|
+ */
|
|
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
|
|
+{
|
|
+ struct bfq_data *bfqd = container_of(timer, struct bfq_data,
|
|
+ idle_slice_timer);
|
|
+ struct bfq_queue *bfqq;
|
|
+ unsigned long flags;
|
|
+ enum bfqq_expiration reason;
|
|
+
|
|
+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
|
|
+
|
|
+ bfqq = bfqd->in_service_queue;
|
|
+ /*
|
|
+ * Theoretical race here: the in-service queue can be NULL or
|
|
+ * different from the queue that was idling if the timer handler
|
|
+ * spins on the queue_lock and a new request arrives for the
|
|
+ * current queue and there is a full dispatch cycle that changes
|
|
+ * the in-service queue. This can hardly happen, but in the worst
|
|
+ * case we just expire a queue too early.
|
|
+ */
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
|
|
+ bfq_clear_bfqq_wait_request(bfqq);
|
|
+
|
|
+ if (bfq_bfqq_budget_timeout(bfqq))
|
|
+ /*
|
|
+ * Also here the queue can be safely expired
|
|
+ * for budget timeout without wasting
|
|
+ * guarantees
|
|
+ */
|
|
+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
|
|
+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
|
|
+ /*
|
|
+ * The queue may not be empty upon timer expiration,
|
|
+ * because we may not disable the timer when the
|
|
+ * first request of the in-service queue arrives
|
|
+ * during disk idling.
|
|
+ */
|
|
+ reason = BFQ_BFQQ_TOO_IDLE;
|
|
+ else
|
|
+ goto schedule_dispatch;
|
|
+
|
|
+ bfq_bfqq_expire(bfqd, bfqq, true, reason);
|
|
+ }
|
|
+
|
|
+schedule_dispatch:
|
|
+ bfq_schedule_dispatch(bfqd);
|
|
+
|
|
+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
|
|
+ return HRTIMER_NORESTART;
|
|
+}
|
|
+
|
|
+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
|
|
+{
|
|
+ hrtimer_cancel(&bfqd->idle_slice_timer);
|
|
+ cancel_work_sync(&bfqd->unplug_work);
|
|
+}
|
|
+
|
|
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
|
|
+ struct bfq_queue **bfqq_ptr)
|
|
+{
|
|
+ struct bfq_group *root_group = bfqd->root_group;
|
|
+ struct bfq_queue *bfqq = *bfqq_ptr;
|
|
+
|
|
+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
|
|
+ if (bfqq) {
|
|
+ bfq_bfqq_move(bfqd, bfqq, root_group);
|
|
+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ bfq_put_queue(bfqq);
|
|
+ *bfqq_ptr = NULL;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release all the bfqg references to its async queues. If we are
|
|
+ * deallocating the group these queues may still contain requests, so
|
|
+ * we reparent them to the root cgroup (i.e., the only one that will
|
|
+ * exist for sure until all the requests on a device are gone).
|
|
+ */
|
|
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
|
|
+{
|
|
+ int i, j;
|
|
+
|
|
+ for (i = 0; i < 2; i++)
|
|
+ for (j = 0; j < IOPRIO_BE_NR; j++)
|
|
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
|
|
+
|
|
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
|
|
+}
|
|
+
|
|
+static void bfq_exit_queue(struct elevator_queue *e)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ struct request_queue *q = bfqd->queue;
|
|
+ struct bfq_queue *bfqq, *n;
|
|
+
|
|
+ bfq_shutdown_timer_wq(bfqd);
|
|
+
|
|
+ spin_lock_irq(q->queue_lock);
|
|
+
|
|
+ BUG_ON(bfqd->in_service_queue);
|
|
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
|
|
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
|
|
+
|
|
+ spin_unlock_irq(q->queue_lock);
|
|
+
|
|
+ bfq_shutdown_timer_wq(bfqd);
|
|
+
|
|
+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ blkcg_deactivate_policy(q, &blkcg_policy_bfq);
|
|
+#else
|
|
+ bfq_put_async_queues(bfqd, bfqd->root_group);
|
|
+ kfree(bfqd->root_group);
|
|
+#endif
|
|
+
|
|
+ kfree(bfqd);
|
|
+}
|
|
+
|
|
+static void bfq_init_root_group(struct bfq_group *root_group,
|
|
+ struct bfq_data *bfqd)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ root_group->entity.parent = NULL;
|
|
+ root_group->my_entity = NULL;
|
|
+ root_group->bfqd = bfqd;
|
|
+#endif
|
|
+ root_group->rq_pos_tree = RB_ROOT;
|
|
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
|
|
+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
|
|
+ root_group->sched_data.bfq_class_idle_last_service = jiffies;
|
|
+}
|
|
+
|
|
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
|
+{
|
|
+ struct bfq_data *bfqd;
|
|
+ struct elevator_queue *eq;
|
|
+
|
|
+ eq = elevator_alloc(q, e);
|
|
+ if (!eq)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
|
|
+ if (!bfqd) {
|
|
+ kobject_put(&eq->kobj);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+ eq->elevator_data = bfqd;
|
|
+
|
|
+ /*
|
|
+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
|
|
+ * Grab a permanent reference to it, so that the normal code flow
|
|
+ * will not attempt to free it.
|
|
+ */
|
|
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
|
|
+ bfqd->oom_bfqq.ref++;
|
|
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
|
|
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
|
|
+ bfqd->oom_bfqq.entity.new_weight =
|
|
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
|
|
+
|
|
+ /* oom_bfqq does not participate in bursts */
|
|
+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
|
|
+ /*
|
|
+ * Trigger weight initialization, according to ioprio, at the
|
|
+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
|
|
+ * class won't be changed any more.
|
|
+ */
|
|
+ bfqd->oom_bfqq.entity.prio_changed = 1;
|
|
+
|
|
+ bfqd->queue = q;
|
|
+
|
|
+ spin_lock_irq(q->queue_lock);
|
|
+ q->elevator = eq;
|
|
+ spin_unlock_irq(q->queue_lock);
|
|
+
|
|
+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
|
|
+ if (!bfqd->root_group)
|
|
+ goto out_free;
|
|
+ bfq_init_root_group(bfqd->root_group, bfqd);
|
|
+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
|
|
+
|
|
+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
|
|
+ HRTIMER_MODE_REL);
|
|
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
|
|
+
|
|
+ bfqd->queue_weights_tree = RB_ROOT;
|
|
+ bfqd->group_weights_tree = RB_ROOT;
|
|
+
|
|
+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
|
|
+
|
|
+ INIT_LIST_HEAD(&bfqd->active_list);
|
|
+ INIT_LIST_HEAD(&bfqd->idle_list);
|
|
+ INIT_HLIST_HEAD(&bfqd->burst_list);
|
|
+
|
|
+ bfqd->hw_tag = -1;
|
|
+
|
|
+ bfqd->bfq_max_budget = bfq_default_max_budget;
|
|
+
|
|
+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
|
|
+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
|
|
+ bfqd->bfq_back_max = bfq_back_max;
|
|
+ bfqd->bfq_back_penalty = bfq_back_penalty;
|
|
+ bfqd->bfq_slice_idle = bfq_slice_idle;
|
|
+ bfqd->bfq_timeout = bfq_timeout;
|
|
+
|
|
+ bfqd->bfq_requests_within_timer = 120;
|
|
+
|
|
+ bfqd->bfq_large_burst_thresh = 8;
|
|
+ bfqd->bfq_burst_interval = msecs_to_jiffies(180);
|
|
+
|
|
+ bfqd->low_latency = true;
|
|
+
|
|
+ /*
|
|
+ * Trade-off between responsiveness and fairness.
|
|
+ */
|
|
+ bfqd->bfq_wr_coeff = 30;
|
|
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
|
|
+ bfqd->bfq_wr_max_time = 0;
|
|
+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
|
|
+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
|
|
+ bfqd->bfq_wr_max_softrt_rate = 7000; /*
|
|
+ * Approximate rate required
|
|
+ * to play back or record a
|
|
+ * high-definition compressed
|
|
+ * video.
|
|
+ */
|
|
+ bfqd->wr_busy_queues = 0;
|
|
+
|
|
+ /*
|
|
+ * Begin by assuming, optimistically, that the device is a
|
|
+ * high-speed one, and that its peak rate is equal to 2/3 of
|
|
+ * the highest reference rate.
|
|
+ */
|
|
+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
|
|
+ T_fast[blk_queue_nonrot(bfqd->queue)];
|
|
+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
|
|
+ bfqd->device_speed = BFQ_BFQD_FAST;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+out_free:
|
|
+ kfree(bfqd);
|
|
+ kobject_put(&eq->kobj);
|
|
+ return -ENOMEM;
|
|
+}
|
|
+
|
|
+static void bfq_slab_kill(void)
|
|
+{
|
|
+ kmem_cache_destroy(bfq_pool);
|
|
+}
|
|
+
|
|
+static int __init bfq_slab_setup(void)
|
|
+{
|
|
+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
|
|
+ if (!bfq_pool)
|
|
+ return -ENOMEM;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_var_show(unsigned int var, char *page)
|
|
+{
|
|
+ return sprintf(page, "%u\n", var);
|
|
+}
|
|
+
|
|
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
|
|
+ size_t count)
|
|
+{
|
|
+ unsigned long new_val;
|
|
+ int ret = kstrtoul(page, 10, &new_val);
|
|
+
|
|
+ if (ret == 0)
|
|
+ *var = new_val;
|
|
+
|
|
+ return count;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+
|
|
+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
|
|
+ jiffies_to_msecs(bfqd->bfq_wr_max_time) :
|
|
+ jiffies_to_msecs(bfq_wr_duration(bfqd)));
|
|
+}
|
|
+
|
|
+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
|
|
+{
|
|
+ struct bfq_queue *bfqq;
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ ssize_t num_char = 0;
|
|
+
|
|
+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
|
|
+ bfqd->queued);
|
|
+
|
|
+ spin_lock_irq(bfqd->queue->queue_lock);
|
|
+
|
|
+ num_char += sprintf(page + num_char, "Active:\n");
|
|
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
|
|
+ num_char += sprintf(page + num_char,
|
|
+ "pid%d: weight %hu, nr_queued %d %d, ",
|
|
+ bfqq->pid,
|
|
+ bfqq->entity.weight,
|
|
+ bfqq->queued[0],
|
|
+ bfqq->queued[1]);
|
|
+ num_char += sprintf(page + num_char,
|
|
+ "dur %d/%u\n",
|
|
+ jiffies_to_msecs(
|
|
+ jiffies -
|
|
+ bfqq->last_wr_start_finish),
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
|
|
+ }
|
|
+
|
|
+ num_char += sprintf(page + num_char, "Idle:\n");
|
|
+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
|
|
+ num_char += sprintf(page + num_char,
|
|
+ "pid%d: weight %hu, dur %d/%u\n",
|
|
+ bfqq->pid,
|
|
+ bfqq->entity.weight,
|
|
+ jiffies_to_msecs(jiffies -
|
|
+ bfqq->last_wr_start_finish),
|
|
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
|
|
+ }
|
|
+
|
|
+ spin_unlock_irq(bfqd->queue->queue_lock);
|
|
+
|
|
+ return num_char;
|
|
+}
|
|
+
|
|
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
|
|
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
|
|
+{ \
|
|
+ struct bfq_data *bfqd = e->elevator_data; \
|
|
+ u64 __data = __VAR; \
|
|
+ if (__CONV == 1) \
|
|
+ __data = jiffies_to_msecs(__data); \
|
|
+ else if (__CONV == 2) \
|
|
+ __data = div_u64(__data, NSEC_PER_MSEC); \
|
|
+ return bfq_var_show(__data, (page)); \
|
|
+}
|
|
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
|
|
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
|
|
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
|
|
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
|
|
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
|
|
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
|
|
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
|
|
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
|
|
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
|
|
+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
|
|
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
|
|
+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
|
|
+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
|
|
+ 1);
|
|
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
|
|
+#undef SHOW_FUNCTION
|
|
+
|
|
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
|
|
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
|
|
+{ \
|
|
+ struct bfq_data *bfqd = e->elevator_data; \
|
|
+ u64 __data = __VAR; \
|
|
+ __data = div_u64(__data, NSEC_PER_USEC); \
|
|
+ return bfq_var_show(__data, (page)); \
|
|
+}
|
|
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
|
|
+#undef USEC_SHOW_FUNCTION
|
|
+
|
|
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
|
|
+static ssize_t \
|
|
+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
|
|
+{ \
|
|
+ struct bfq_data *bfqd = e->elevator_data; \
|
|
+ unsigned long uninitialized_var(__data); \
|
|
+ int ret = bfq_var_store(&__data, (page), count); \
|
|
+ if (__data < (MIN)) \
|
|
+ __data = (MIN); \
|
|
+ else if (__data > (MAX)) \
|
|
+ __data = (MAX); \
|
|
+ if (__CONV == 1) \
|
|
+ *(__PTR) = msecs_to_jiffies(__data); \
|
|
+ else if (__CONV == 2) \
|
|
+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
|
|
+ else \
|
|
+ *(__PTR) = __data; \
|
|
+ return ret; \
|
|
+}
|
|
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
|
|
+ INT_MAX, 2);
|
|
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
|
|
+ INT_MAX, 2);
|
|
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
|
|
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
|
|
+ INT_MAX, 0);
|
|
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
|
|
+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
|
|
+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
|
|
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
|
|
+ 1);
|
|
+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
|
|
+ INT_MAX, 1);
|
|
+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
|
|
+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
|
|
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
|
|
+ INT_MAX, 0);
|
|
+#undef STORE_FUNCTION
|
|
+
|
|
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
|
|
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
|
|
+{ \
|
|
+ struct bfq_data *bfqd = e->elevator_data; \
|
|
+ unsigned long uninitialized_var(__data); \
|
|
+ int ret = bfq_var_store(&__data, (page), count); \
|
|
+ if (__data < (MIN)) \
|
|
+ __data = (MIN); \
|
|
+ else if (__data > (MAX)) \
|
|
+ __data = (MAX); \
|
|
+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \
|
|
+ return ret; \
|
|
+}
|
|
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
|
|
+ UINT_MAX);
|
|
+#undef USEC_STORE_FUNCTION
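For reference, this is approximately what SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0) above expands to once the two conversion branches, which are dead with __CONV equal to 0, are folded away; it is shown only for readability and is not an additional definition in the patch:

/* Approximate expansion of one SHOW_FUNCTION() instance (__CONV == 0). */
static ssize_t bfq_back_seek_max_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	u64 __data = bfqd->bfq_back_max;

	return bfq_var_show(__data, page);
}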
|
|
+
|
|
+/* do nothing for the moment */
|
|
+static ssize_t bfq_weights_store(struct elevator_queue *e,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ return count;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ unsigned long uninitialized_var(__data);
|
|
+ int ret = bfq_var_store(&__data, (page), count);
|
|
+
|
|
+ if (__data == 0)
|
|
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
|
|
+ else {
|
|
+ if (__data > INT_MAX)
|
|
+ __data = INT_MAX;
|
|
+ bfqd->bfq_max_budget = __data;
|
|
+ }
|
|
+
|
|
+ bfqd->bfq_user_max_budget = __data;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Leaving this name to preserve name compatibility with cfq
|
|
+ * parameters, but this timeout is used for both sync and async.
|
|
+ */
|
|
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ unsigned long uninitialized_var(__data);
|
|
+ int ret = bfq_var_store(&__data, (page), count);
|
|
+
|
|
+ if (__data < 1)
|
|
+ __data = 1;
|
|
+ else if (__data > INT_MAX)
|
|
+ __data = INT_MAX;
|
|
+
|
|
+ bfqd->bfq_timeout = msecs_to_jiffies(__data);
|
|
+ if (bfqd->bfq_user_max_budget == 0)
|
|
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ unsigned long uninitialized_var(__data);
|
|
+ int ret = bfq_var_store(&__data, (page), count);
|
|
+
|
|
+ if (__data > 1)
|
|
+ __data = 1;
|
|
+ if (!bfqd->strict_guarantees && __data == 1
|
|
+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
|
|
+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
|
|
+
|
|
+ bfqd->strict_guarantees = __data;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ struct bfq_data *bfqd = e->elevator_data;
|
|
+ unsigned long uninitialized_var(__data);
|
|
+ int ret = bfq_var_store(&__data, (page), count);
|
|
+
|
|
+ if (__data > 1)
|
|
+ __data = 1;
|
|
+ if (__data == 0 && bfqd->low_latency != 0)
|
|
+ bfq_end_wr(bfqd);
|
|
+ bfqd->low_latency = __data;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#define BFQ_ATTR(name) \
|
|
+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
|
|
+
|
|
+static struct elv_fs_entry bfq_attrs[] = {
|
|
+ BFQ_ATTR(fifo_expire_sync),
|
|
+ BFQ_ATTR(fifo_expire_async),
|
|
+ BFQ_ATTR(back_seek_max),
|
|
+ BFQ_ATTR(back_seek_penalty),
|
|
+ BFQ_ATTR(slice_idle),
|
|
+ BFQ_ATTR(slice_idle_us),
|
|
+ BFQ_ATTR(max_budget),
|
|
+ BFQ_ATTR(timeout_sync),
|
|
+ BFQ_ATTR(strict_guarantees),
|
|
+ BFQ_ATTR(low_latency),
|
|
+ BFQ_ATTR(wr_coeff),
|
|
+ BFQ_ATTR(wr_max_time),
|
|
+ BFQ_ATTR(wr_rt_max_time),
|
|
+ BFQ_ATTR(wr_min_idle_time),
|
|
+ BFQ_ATTR(wr_min_inter_arr_async),
|
|
+ BFQ_ATTR(wr_max_softrt_rate),
|
|
+ BFQ_ATTR(weights),
|
|
+ __ATTR_NULL
|
|
+};
|
|
+
|
|
+static struct elevator_type iosched_bfq = {
|
|
+ .ops = {
|
|
+ .elevator_merge_fn = bfq_merge,
|
|
+ .elevator_merged_fn = bfq_merged_request,
|
|
+ .elevator_merge_req_fn = bfq_merged_requests,
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ .elevator_bio_merged_fn = bfq_bio_merged,
|
|
+#endif
|
|
+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge,
|
|
+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge,
|
|
+ .elevator_dispatch_fn = bfq_dispatch_requests,
|
|
+ .elevator_add_req_fn = bfq_insert_request,
|
|
+ .elevator_activate_req_fn = bfq_activate_request,
|
|
+ .elevator_deactivate_req_fn = bfq_deactivate_request,
|
|
+ .elevator_completed_req_fn = bfq_completed_request,
|
|
+ .elevator_former_req_fn = elv_rb_former_request,
|
|
+ .elevator_latter_req_fn = elv_rb_latter_request,
|
|
+ .elevator_init_icq_fn = bfq_init_icq,
|
|
+ .elevator_exit_icq_fn = bfq_exit_icq,
|
|
+ .elevator_set_req_fn = bfq_set_request,
|
|
+ .elevator_put_req_fn = bfq_put_request,
|
|
+ .elevator_may_queue_fn = bfq_may_queue,
|
|
+ .elevator_init_fn = bfq_init_queue,
|
|
+ .elevator_exit_fn = bfq_exit_queue,
|
|
+ },
|
|
+ .icq_size = sizeof(struct bfq_io_cq),
|
|
+ .icq_align = __alignof__(struct bfq_io_cq),
|
|
+ .elevator_attrs = bfq_attrs,
|
|
+ .elevator_name = "bfq",
|
|
+ .elevator_owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static struct blkcg_policy blkcg_policy_bfq = {
|
|
+ .dfl_cftypes = bfq_blkg_files,
|
|
+ .legacy_cftypes = bfq_blkcg_legacy_files,
|
|
+
|
|
+ .cpd_alloc_fn = bfq_cpd_alloc,
|
|
+ .cpd_init_fn = bfq_cpd_init,
|
|
+ .cpd_bind_fn = bfq_cpd_init,
|
|
+ .cpd_free_fn = bfq_cpd_free,
|
|
+
|
|
+ .pd_alloc_fn = bfq_pd_alloc,
|
|
+ .pd_init_fn = bfq_pd_init,
|
|
+ .pd_offline_fn = bfq_pd_offline,
|
|
+ .pd_free_fn = bfq_pd_free,
|
|
+ .pd_reset_stats_fn = bfq_pd_reset_stats,
|
|
+};
|
|
+#endif
|
|
+
|
|
+static int __init bfq_init(void)
|
|
+{
|
|
+ int ret;
|
|
+ char msg[60] = "BFQ I/O-scheduler: v8r8-rc2";
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ ret = blkcg_policy_register(&blkcg_policy_bfq);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+#endif
|
|
+
|
|
+ ret = -ENOMEM;
|
|
+ if (bfq_slab_setup())
|
|
+ goto err_pol_unreg;
|
|
+
|
|
+ /*
|
|
+ * Times to load large popular applications for the typical
|
|
+ * systems installed on the reference devices (see the
|
|
+ * comments before the definitions of the next two
|
|
+ * arrays). Actually, we use slightly lower values, as the
|
|
+ * estimated peak rate tends to be smaller than the actual
|
|
+ * peak rate. The reason for this last fact is that estimates
|
|
+ * are computed over much shorter time intervals than the long
|
|
+ * intervals typically used for benchmarking. Why? First, to
|
|
+ * adapt more quickly to variations. Second, because an I/O
|
|
+ * scheduler cannot rely on a peak-rate-evaluation workload to
|
|
+ * be run for a long time.
|
|
+ */
|
|
+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
|
|
+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
|
|
+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
|
|
+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
|
|
+
|
|
+ /*
|
|
+ * Thresholds that determine the switch between speed classes
|
|
+ * (see the comments before the definition of the array
|
|
+ * device_speed_thresh). These thresholds are biased towards
|
|
+ * transitions to the fast class. This is safer than the
|
|
+ * opposite bias. In fact, a wrong transition to the slow
|
|
+ * class results in short weight-raising periods, because the
|
|
+ * speed of the device then tends to be higher than the
|
|
+ * reference peak rate. On the opposite end, a wrong
|
|
+ * transition to the fast class tends to increase
|
|
+ * weight-raising periods, because of the opposite reason.
|
|
+ */
|
|
+ device_speed_thresh[0] = (4 * R_slow[0]) / 3;
|
|
+ device_speed_thresh[1] = (4 * R_slow[1]) / 3;
|
|
+
|
|
+ ret = elv_register(&iosched_bfq);
|
|
+ if (ret)
|
|
+ goto err_pol_unreg;
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ strcat(msg, " (with cgroups support)");
|
|
+#endif
|
|
+ pr_info("%s", msg);
|
|
+
|
|
+ return 0;
|
|
+
|
|
+err_pol_unreg:
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ blkcg_policy_unregister(&blkcg_policy_bfq);
|
|
+#endif
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void __exit bfq_exit(void)
|
|
+{
|
|
+ elv_unregister(&iosched_bfq);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ blkcg_policy_unregister(&blkcg_policy_bfq);
|
|
+#endif
|
|
+ bfq_slab_kill();
|
|
+}
|
|
+
|
|
+module_init(bfq_init);
|
|
+module_exit(bfq_exit);
|
|
+
|
|
+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
|
|
+MODULE_LICENSE("GPL");
|
|
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
|
|
new file mode 100644
|
|
index 0000000..2e9dc59
|
|
--- /dev/null
|
|
+++ b/block/bfq-sched.c
|
|
@@ -0,0 +1,1933 @@
|
|
+/*
|
|
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
|
|
+ *
|
|
+ * Based on ideas and code from CFQ:
|
|
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
|
|
+ *
|
|
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
|
|
+ * Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
|
|
+ */
|
|
+
|
|
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
|
|
+
|
|
+/**
|
|
+ * bfq_gt - compare two timestamps.
|
|
+ * @a: first ts.
|
|
+ * @b: second ts.
|
|
+ *
|
|
+ * Return @a > @b, dealing with wrapping correctly.
|
|
+ */
|
|
+static int bfq_gt(u64 a, u64 b)
|
|
+{
|
|
+ return (s64)(a - b) > 0;
|
|
+}
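/*
 * Editor's illustrative note (not part of the patch): the cast above
 * makes the comparison safe across u64 wraparound. A minimal sketch:
 *
 *	u64 a = 5, b = (u64)-10;	/* b has wrapped near U64_MAX */
 *	a - b == 15, so (s64)(a - b) > 0 and bfq_gt(a, b) is true,
 *	whereas a plain "a > b" would wrongly report false here.
 */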
|
|
+
|
|
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
|
|
+{
|
|
+ struct rb_node *node = tree->rb_node;
|
|
+
|
|
+ return rb_entry(node, struct bfq_entity, rb_node);
|
|
+}
|
|
+
|
|
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
|
|
+
|
|
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
|
|
+
|
|
+/**
|
|
+ * bfq_update_next_in_service - update sd->next_in_service
|
|
+ * @sd: sched_data for which to perform the update.
|
|
+ * @new_entity: if not NULL, pointer to the entity whose activation,
|
|
+ * requeueing or repositioning triggered the invocation of
|
|
+ * this function.
|
|
+ *
|
|
+ * This function is called to update sd->next_in_service, which, in
|
|
+ * its turn, may change as a consequence of the insertion or
|
|
+ * extraction of an entity into/from one of the active trees of
|
|
+ * sd. These insertions/extractions occur as a consequence of
|
|
+ * activations/deactivations of entities, with some activations being
|
|
+ * 'true' activations, and other activations being requeueings (i.e.,
|
|
+ * implementing the second, requeueing phase of the mechanism used to
|
|
+ * reposition an entity in its active tree; see comments on
|
|
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
|
|
+ * both the last two activation sub-cases, new_entity points to the
|
|
+ * just activated or requeued entity.
|
|
+ *
|
|
+ * Returns true if sd->next_in_service changes in such a way that
|
|
+ * entity->parent may become the next_in_service for its parent
|
|
+ * entity.
|
|
+ */
|
|
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
|
|
+ struct bfq_entity *new_entity)
|
|
+{
|
|
+ struct bfq_entity *next_in_service = sd->next_in_service;
|
|
+ struct bfq_queue *bfqq;
|
|
+ bool parent_sched_may_change = false;
|
|
+
|
|
+ /*
|
|
+ * If this update is triggered by the activation, requeueing
|
|
+ * or repositioning of an entity that does not coincide with
|
|
+ * sd->next_in_service, then a full lookup in the active tree
|
|
+ * can be avoided. In fact, it is enough to check whether the
|
|
+ * just-modified entity has a higher priority than
|
|
+ * sd->next_in_service, or, even if it has the same priority
|
|
+ * as sd->next_in_service, is eligible and has a lower virtual
|
|
+ * finish time than sd->next_in_service. If this compound
|
|
+ * condition holds, then the new entity becomes the new
|
|
+ * next_in_service. Otherwise no change is needed.
|
|
+ */
|
|
+ if (new_entity && new_entity != sd->next_in_service) {
|
|
+ /*
|
|
+ * Flag used to decide whether to replace
|
|
+ * sd->next_in_service with new_entity. Tentatively
|
|
+ * set to true, and left as true if
|
|
+ * sd->next_in_service is NULL.
|
|
+ */
|
|
+ bool replace_next = true;
|
|
+
|
|
+ /*
|
|
+ * If there is already a next_in_service candidate
|
|
+ * entity, then compare class priorities or timestamps
|
|
+ * to decide whether to replace sd->next_in_service with
|
|
+ * new_entity.
|
|
+ */
|
|
+ if (next_in_service) {
|
|
+ unsigned int new_entity_class_idx =
|
|
+ bfq_class_idx(new_entity);
|
|
+ struct bfq_service_tree *st =
|
|
+ sd->service_tree + new_entity_class_idx;
|
|
+
|
|
+ /*
|
|
+ * For efficiency, evaluate the most likely
|
|
+ * sub-condition first.
|
|
+ */
|
|
+ replace_next =
|
|
+ (new_entity_class_idx ==
|
|
+ bfq_class_idx(next_in_service)
|
|
+ &&
|
|
+ !bfq_gt(new_entity->start, st->vtime)
|
|
+ &&
|
|
+ bfq_gt(next_in_service->finish,
|
|
+ new_entity->finish))
|
|
+ ||
|
|
+ new_entity_class_idx <
|
|
+ bfq_class_idx(next_in_service);
|
|
+ }
|
|
+
|
|
+ if (replace_next)
|
|
+ next_in_service = new_entity;
|
|
+ } else /* invoked because of a deactivation: lookup needed */
|
|
+ next_in_service = bfq_lookup_next_entity(sd);
|
|
+
|
|
+ if (next_in_service) {
|
|
+ parent_sched_may_change = !sd->next_in_service ||
|
|
+ bfq_update_parent_budget(next_in_service);
|
|
+ }
|
|
+
|
|
+ sd->next_in_service = next_in_service;
|
|
+
|
|
+ if (!next_in_service)
|
|
+ return parent_sched_may_change;
|
|
+
|
|
+ bfqq = bfq_entity_to_bfqq(next_in_service);
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "update_next_in_service: chosen this queue");
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(next_in_service,
|
|
+ struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "update_next_in_service: chosen this entity");
|
|
+ }
|
|
+#endif
|
|
+ return parent_sched_may_change;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+/* both next loops stop at one of the child entities of the root group */
|
|
+#define for_each_entity(entity) \
|
|
+ for (; entity ; entity = entity->parent)
|
|
+
|
|
+#define for_each_entity_safe(entity, parent) \
|
|
+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
|
|
+
|
|
+/*
|
|
+ * Returns true if this budget change may let next_in_service->parent
|
|
+ * become the next_in_service entity for its parent entity.
|
|
+ */
|
|
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
|
|
+{
|
|
+ struct bfq_entity *bfqg_entity;
|
|
+ struct bfq_group *bfqg;
|
|
+ struct bfq_sched_data *group_sd;
|
|
+ bool ret = false;
|
|
+
|
|
+ BUG_ON(!next_in_service);
|
|
+
|
|
+ group_sd = next_in_service->sched_data;
|
|
+
|
|
+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
|
|
+ /*
|
|
+ * bfq_group's my_entity field is not NULL only if the group
|
|
+ * is not the root group. We must not touch the root entity
|
|
+ * as it must never become an in-service entity.
|
|
+ */
|
|
+ bfqg_entity = bfqg->my_entity;
|
|
+ if (bfqg_entity) {
|
|
+ if (bfqg_entity->budget > next_in_service->budget)
|
|
+ ret = true;
|
|
+ bfqg_entity->budget = next_in_service->budget;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This function tells whether entity stops being a candidate for next
|
|
+ * service, according to the following logic.
|
|
+ *
|
|
+ * This function is invoked for an entity that is about to be set in
|
|
+ * service. If such an entity is a queue, then the entity is no longer
|
|
+ * a candidate for next service (i.e., a candidate entity to serve
|
|
+ * after the in-service entity is expired). The function then returns
|
|
+ * true.
|
|
+ *
|
|
+ * In contrast, the entity could still be a candidate for next service
|
|
+ * if it is not a queue, and has more than one child. In fact, even if
|
|
+ * one of its children is about to be set in service, other children
|
|
+ * may still be the next to serve. As a consequence, a non-queue
|
|
+ * entity is not a candidate for next-service only if it has only one
|
|
+ * child. And only if this condition holds, then the function returns
|
|
+ * true for a non-queue entity.
|
|
+ */
|
|
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_group *bfqg;
|
|
+
|
|
+ if (bfq_entity_to_bfqq(entity))
|
|
+ return true;
|
|
+
|
|
+ bfqg = container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
|
|
+ BUG_ON(bfqg->active_entities == 0);
|
|
+ if (bfqg->active_entities == 1)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+#define for_each_entity(entity) \
|
|
+ for (; entity ; entity = NULL)
|
|
+
|
|
+#define for_each_entity_safe(entity, parent) \
|
|
+ for (parent = NULL; entity ; entity = parent)
|
|
+
|
|
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
|
|
+{
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+/*
|
|
+ * Shift for timestamp calculations. This actually limits the maximum
|
|
+ * service allowed in one timestamp delta (small shift values increase it),
|
|
+ * the maximum total weight that can be used for the queues in the system
|
|
+ * (big shift values increase it), and the period of virtual time
|
|
+ * wraparounds.
|
|
+ */
|
|
+#define WFQ_SERVICE_SHIFT 22
|
|
+
|
|
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = NULL;
|
|
+
|
|
+ BUG_ON(!entity);
|
|
+
|
|
+ if (!entity->my_sched_data)
|
|
+ bfqq = container_of(entity, struct bfq_queue, entity);
|
|
+
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * bfq_delta - map service into the virtual time domain.
|
|
+ * @service: amount of service.
|
|
+ * @weight: scale factor (weight of an entity or weight sum).
|
|
+ */
|
|
+static u64 bfq_delta(unsigned long service, unsigned long weight)
|
|
+{
|
|
+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
|
|
+
|
|
+ do_div(d, weight);
|
|
+ return d;
|
|
+}
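/*
 * Editor's illustrative note (not part of the patch): with
 * WFQ_SERVICE_SHIFT = 22, bfq_delta() computes
 *
 *	delta = (service * 2^22) / weight
 *
 * For example, charging 8 sectors of service gives
 *	weight 100: delta = 8 * 4194304 / 100 = 335544
 *	weight 200: delta = 8 * 4194304 / 200 = 167772
 * i.e., an entity with twice the weight accumulates virtual time half
 * as fast, and is therefore entitled to twice the bandwidth share.
 */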
|
|
+
|
|
+/**
|
|
+ * bfq_calc_finish - assign the finish time to an entity.
|
|
+ * @entity: the entity to act upon.
|
|
+ * @service: the service to be charged to the entity.
|
|
+ */
|
|
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ unsigned long long start, finish, delta;
|
|
+
|
|
+ BUG_ON(entity->weight == 0);
|
|
+
|
|
+ entity->finish = entity->start +
|
|
+ bfq_delta(service, entity->weight);
|
|
+
|
|
+ start = ((entity->start>>10)*1000)>>12;
|
|
+ finish = ((entity->finish>>10)*1000)>>12;
|
|
+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "calc_finish: serv %lu, w %d",
|
|
+ service, entity->weight);
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "calc_finish: start %llu, finish %llu, delta %llu",
|
|
+ start, finish, delta);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ } else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "calc_finish group: serv %lu, w %d",
|
|
+ service, entity->weight);
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "calc_finish group: start %llu, finish %llu, delta %llu",
|
|
+ start, finish, delta);
|
|
+#endif
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_entity_of - get an entity from a node.
|
|
+ * @node: the node field of the entity.
|
|
+ *
|
|
+ * Convert a node pointer to the corresponding entity. This is used only
|
|
+ * to simplify the logic of some functions and not as the generic
|
|
+ * conversion mechanism because, e.g., in the tree walking functions,
|
|
+ * the check for a %NULL value would be redundant.
|
|
+ */
|
|
+static struct bfq_entity *bfq_entity_of(struct rb_node *node)
|
|
+{
|
|
+ struct bfq_entity *entity = NULL;
|
|
+
|
|
+ if (node)
|
|
+ entity = rb_entry(node, struct bfq_entity, rb_node);
|
|
+
|
|
+ return entity;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_extract - remove an entity from a tree.
|
|
+ * @root: the tree root.
|
|
+ * @entity: the entity to remove.
|
|
+ */
|
|
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
|
|
+{
|
|
+ BUG_ON(entity->tree != root);
|
|
+
|
|
+ entity->tree = NULL;
|
|
+ rb_erase(&entity->rb_node, root);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_idle_extract - extract an entity from the idle tree.
|
|
+ * @st: the service tree of the owning @entity.
|
|
+ * @entity: the entity being removed.
|
|
+ */
|
|
+static void bfq_idle_extract(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct rb_node *next;
|
|
+
|
|
+ BUG_ON(entity->tree != &st->idle);
|
|
+
|
|
+ if (entity == st->first_idle) {
|
|
+ next = rb_next(&entity->rb_node);
|
|
+ st->first_idle = bfq_entity_of(next);
|
|
+ }
|
|
+
|
|
+ if (entity == st->last_idle) {
|
|
+ next = rb_prev(&entity->rb_node);
|
|
+ st->last_idle = bfq_entity_of(next);
|
|
+ }
|
|
+
|
|
+ bfq_extract(&st->idle, entity);
|
|
+
|
|
+ if (bfqq)
|
|
+ list_del(&bfqq->bfqq_list);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_insert - generic tree insertion.
|
|
+ * @root: tree root.
|
|
+ * @entity: entity to insert.
|
|
+ *
|
|
+ * This is used for the idle and the active tree, since they are both
|
|
+ * ordered by finish time.
|
|
+ */
|
|
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_entity *entry;
|
|
+ struct rb_node **node = &root->rb_node;
|
|
+ struct rb_node *parent = NULL;
|
|
+
|
|
+ BUG_ON(entity->tree);
|
|
+
|
|
+ while (*node) {
|
|
+ parent = *node;
|
|
+ entry = rb_entry(parent, struct bfq_entity, rb_node);
|
|
+
|
|
+ if (bfq_gt(entry->finish, entity->finish))
|
|
+ node = &parent->rb_left;
|
|
+ else
|
|
+ node = &parent->rb_right;
|
|
+ }
|
|
+
|
|
+ rb_link_node(&entity->rb_node, parent, node);
|
|
+ rb_insert_color(&entity->rb_node, root);
|
|
+
|
|
+ entity->tree = root;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_update_min - update the min_start field of an entity.
|
|
+ * @entity: the entity to update.
|
|
+ * @node: one of its children.
|
|
+ *
|
|
+ * This function is called when @entity may store an invalid value for
|
|
+ * min_start due to updates to the active tree. The function assumes
|
|
+ * that the subtree rooted at @node (which may be its left or its right
|
|
+ * child) has a valid min_start value.
|
|
+ */
|
|
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
|
|
+{
|
|
+ struct bfq_entity *child;
|
|
+
|
|
+ if (node) {
|
|
+ child = rb_entry(node, struct bfq_entity, rb_node);
|
|
+ if (bfq_gt(entity->min_start, child->min_start))
|
|
+ entity->min_start = child->min_start;
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_update_active_node - recalculate min_start.
|
|
+ * @node: the node to update.
|
|
+ *
|
|
+ * @node may have changed position or one of its children may have moved,
|
|
+ * this function updates its min_start value. The left and right subtrees
|
|
+ * are assumed to hold a correct min_start value.
|
|
+ */
|
|
+static void bfq_update_active_node(struct rb_node *node)
|
|
+{
|
|
+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ entity->min_start = entity->start;
|
|
+ bfq_update_min(entity, node->rb_right);
|
|
+ bfq_update_min(entity, node->rb_left);
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "update_active_node: new min_start %llu",
|
|
+ ((entity->min_start>>10)*1000)>>12);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ } else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "update_active_node: new min_start %llu",
|
|
+ ((entity->min_start>>10)*1000)>>12);
|
|
+#endif
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_update_active_tree - update min_start for the whole active tree.
|
|
+ * @node: the starting node.
|
|
+ *
|
|
+ * @node must be the deepest modified node after an update. This function
|
|
+ * updates its min_start using the values held by its children, assuming
|
|
+ * that they did not change, and then updates all the nodes that may have
|
|
+ * changed in the path to the root. The only nodes that may have changed
|
|
+ * are the ones in the path or their siblings.
|
|
+ */
|
|
+static void bfq_update_active_tree(struct rb_node *node)
|
|
+{
|
|
+ struct rb_node *parent;
|
|
+
|
|
+up:
|
|
+ bfq_update_active_node(node);
|
|
+
|
|
+ parent = rb_parent(node);
|
|
+ if (!parent)
|
|
+ return;
|
|
+
|
|
+ if (node == parent->rb_left && parent->rb_right)
|
|
+ bfq_update_active_node(parent->rb_right);
|
|
+ else if (parent->rb_left)
|
|
+ bfq_update_active_node(parent->rb_left);
|
|
+
|
|
+ node = parent;
|
|
+ goto up;
|
|
+}
|
|
+
|
|
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
|
|
+ struct bfq_entity *entity,
|
|
+ struct rb_root *root);
|
|
+
|
|
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
|
+ struct bfq_entity *entity,
|
|
+ struct rb_root *root);
|
|
+
|
|
+
|
|
+/**
|
|
+ * bfq_active_insert - insert an entity in the active tree of its
|
|
+ * group/device.
|
|
+ * @st: the service tree of the entity.
|
|
+ * @entity: the entity being inserted.
|
|
+ *
|
|
+ * The active tree is ordered by finish time, but an extra key is kept
|
|
+ * for each node, containing the minimum value of the start times of
|
|
+ * its children (and the node itself), so it's possible to search for
|
|
+ * the eligible node with the lowest finish time in logarithmic time.
|
|
+ */
|
|
+static void bfq_active_insert(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct rb_node *node = &entity->rb_node;
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ struct bfq_sched_data *sd = NULL;
|
|
+ struct bfq_group *bfqg = NULL;
|
|
+ struct bfq_data *bfqd = NULL;
|
|
+#endif
|
|
+
|
|
+ bfq_insert(&st->active, entity);
|
|
+
|
|
+ if (node->rb_left)
|
|
+ node = node->rb_left;
|
|
+ else if (node->rb_right)
|
|
+ node = node->rb_right;
|
|
+
|
|
+ bfq_update_active_tree(node);
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ sd = entity->sched_data;
|
|
+ bfqg = container_of(sd, struct bfq_group, sched_data);
|
|
+ BUG_ON(!bfqg);
|
|
+ bfqd = (struct bfq_data *)bfqg->bfqd;
|
|
+#endif
|
|
+ if (bfqq)
|
|
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else { /* bfq_group */
|
|
+ BUG_ON(!bfqd);
|
|
+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
|
|
+ }
|
|
+ if (bfqg != bfqd->root_group) {
|
|
+ BUG_ON(!bfqg);
|
|
+ BUG_ON(!bfqd);
|
|
+ bfqg->active_entities++;
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
|
|
+ * @ioprio: the ioprio value to convert.
|
|
+ */
|
|
+static unsigned short bfq_ioprio_to_weight(int ioprio)
|
|
+{
|
|
+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
|
|
+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
|
|
+ * @weight: the weight value to convert.
|
|
+ *
|
|
+ * To preserve as much as possible the old only-ioprio user interface,
|
|
+ * 0 is used as an escape ioprio value for weights (numerically) equal to or
|
|
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
|
|
+ */
|
|
+static unsigned short bfq_weight_to_ioprio(int weight)
|
|
+{
|
|
+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
|
|
+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
|
|
+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
|
|
+}
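/*
 * Editor's illustrative note (not part of the patch), assuming the
 * usual values IOPRIO_BE_NR = 8 and BFQ_WEIGHT_CONVERSION_COEFF = 10:
 *
 *	bfq_ioprio_to_weight(0) = (8 - 0) * 10 = 80	(highest BE prio)
 *	bfq_ioprio_to_weight(4) = (8 - 4) * 10 = 40	(default)
 *	bfq_ioprio_to_weight(7) = (8 - 7) * 10 = 10	(lowest BE prio)
 *
 * while bfq_weight_to_ioprio(weight) = max(0, 80 - weight), so any
 * weight of 80 or more maps to the escape ioprio value 0.
 */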
|
|
+
|
|
+static void bfq_get_entity(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfqq->ref++;
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
|
|
+ * @node: the node being removed.
|
|
+ *
|
|
+ * Do the first step of an extraction in an rb tree, looking for the
|
|
+ * node that will replace @node, and returning the deepest node that
|
|
+ * the following modifications to the tree can touch. If @node is the
|
|
+ * last node in the tree return %NULL.
|
|
+ */
|
|
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
|
|
+{
|
|
+ struct rb_node *deepest;
|
|
+
|
|
+ if (!node->rb_right && !node->rb_left)
|
|
+ deepest = rb_parent(node);
|
|
+ else if (!node->rb_right)
|
|
+ deepest = node->rb_left;
|
|
+ else if (!node->rb_left)
|
|
+ deepest = node->rb_right;
|
|
+ else {
|
|
+ deepest = rb_next(node);
|
|
+ if (deepest->rb_right)
|
|
+ deepest = deepest->rb_right;
|
|
+ else if (rb_parent(deepest) != node)
|
|
+ deepest = rb_parent(deepest);
|
|
+ }
|
|
+
|
|
+ return deepest;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_active_extract - remove an entity from the active tree.
|
|
+ * @st: the service_tree containing the tree.
|
|
+ * @entity: the entity being removed.
|
|
+ */
|
|
+static void bfq_active_extract(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct rb_node *node;
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ struct bfq_sched_data *sd = NULL;
|
|
+ struct bfq_group *bfqg = NULL;
|
|
+ struct bfq_data *bfqd = NULL;
|
|
+#endif
|
|
+
|
|
+ node = bfq_find_deepest(&entity->rb_node);
|
|
+ bfq_extract(&st->active, entity);
|
|
+
|
|
+ if (node)
|
|
+ bfq_update_active_tree(node);
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ sd = entity->sched_data;
|
|
+ bfqg = container_of(sd, struct bfq_group, sched_data);
|
|
+ BUG_ON(!bfqg);
|
|
+ bfqd = (struct bfq_data *)bfqg->bfqd;
|
|
+#endif
|
|
+ if (bfqq)
|
|
+ list_del(&bfqq->bfqq_list);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else { /* bfq_group */
|
|
+ BUG_ON(!bfqd);
|
|
+ bfq_weights_tree_remove(bfqd, entity,
|
|
+ &bfqd->group_weights_tree);
|
|
+ }
|
|
+ if (bfqg != bfqd->root_group) {
|
|
+ BUG_ON(!bfqg);
|
|
+ BUG_ON(!bfqd);
|
|
+ BUG_ON(!bfqg->active_entities);
|
|
+ bfqg->active_entities--;
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_idle_insert - insert an entity into the idle tree.
|
|
+ * @st: the service tree containing the tree.
|
|
+ * @entity: the entity to insert.
|
|
+ */
|
|
+static void bfq_idle_insert(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct bfq_entity *first_idle = st->first_idle;
|
|
+ struct bfq_entity *last_idle = st->last_idle;
|
|
+
|
|
+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
|
|
+ st->first_idle = entity;
|
|
+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
|
|
+ st->last_idle = entity;
|
|
+
|
|
+ bfq_insert(&st->idle, entity);
|
|
+
|
|
+ if (bfqq)
|
|
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_forget_entity - remove an entity from the wfq trees.
|
|
+ * @st: the service tree.
|
|
+ * @entity: the entity being removed.
|
|
+ *
|
|
+ * Update the device status and forget everything about @entity, putting
|
|
+ * the device reference to it, if it is a queue. Entities belonging to
|
|
+ * groups are not refcounted.
|
|
+ */
|
|
+static void bfq_forget_entity(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct bfq_sched_data *sd;
|
|
+
|
|
+ BUG_ON(!entity->on_st);
|
|
+
|
|
+ entity->on_st = false;
|
|
+ st->wsum -= entity->weight;
|
|
+ if (bfqq) {
|
|
+ sd = entity->sched_data;
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
|
|
+ bfqq, bfqq->ref);
|
|
+ bfq_put_queue(bfqq);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
|
|
+ * @st: service tree for the entity.
|
|
+ * @entity: the entity being released.
|
|
+ */
|
|
+static void bfq_put_idle_entity(struct bfq_service_tree *st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ bfq_idle_extract(st, entity);
|
|
+ bfq_forget_entity(st, entity);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_forget_idle - update the idle tree if necessary.
|
|
+ * @st: the service tree to act upon.
|
|
+ *
|
|
+ * To preserve the global O(log N) complexity we only remove one entry here;
|
|
+ * as the idle tree will not grow indefinitely this can be done safely.
|
|
+ */
|
|
+static void bfq_forget_idle(struct bfq_service_tree *st)
|
|
+{
|
|
+ struct bfq_entity *first_idle = st->first_idle;
|
|
+ struct bfq_entity *last_idle = st->last_idle;
|
|
+
|
|
+ if (RB_EMPTY_ROOT(&st->active) && last_idle &&
|
|
+ !bfq_gt(last_idle->finish, st->vtime)) {
|
|
+ /*
|
|
+ * Forget the whole idle tree, increasing the vtime past
|
|
+ * the last finish time of idle entities.
|
|
+ */
|
|
+ st->vtime = last_idle->finish;
|
|
+ }
|
|
+
|
|
+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
|
|
+ bfq_put_idle_entity(st, first_idle);
|
|
+}
|
|
+
|
|
+static struct bfq_service_tree *
|
|
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
|
+ struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_service_tree *new_st = old_st;
|
|
+
|
|
+ if (entity->prio_changed) {
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ unsigned int prev_weight, new_weight;
|
|
+ struct bfq_data *bfqd = NULL;
|
|
+ struct rb_root *root;
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ struct bfq_sched_data *sd;
|
|
+ struct bfq_group *bfqg;
|
|
+#endif
|
|
+
|
|
+ if (bfqq)
|
|
+ bfqd = bfqq->bfqd;
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ sd = entity->my_sched_data;
|
|
+ bfqg = container_of(sd, struct bfq_group, sched_data);
|
|
+ BUG_ON(!bfqg);
|
|
+ bfqd = (struct bfq_data *)bfqg->bfqd;
|
|
+ BUG_ON(!bfqd);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ BUG_ON(old_st->wsum < entity->weight);
|
|
+ old_st->wsum -= entity->weight;
|
|
+
|
|
+ if (entity->new_weight != entity->orig_weight) {
|
|
+ if (entity->new_weight < BFQ_MIN_WEIGHT ||
|
|
+ entity->new_weight > BFQ_MAX_WEIGHT) {
|
|
+ pr_crit("update_weight_prio: new_weight %d\n",
|
|
+ entity->new_weight);
|
|
+ if (entity->new_weight < BFQ_MIN_WEIGHT)
|
|
+ entity->new_weight = BFQ_MIN_WEIGHT;
|
|
+ else
|
|
+ entity->new_weight = BFQ_MAX_WEIGHT;
|
|
+ }
|
|
+ entity->orig_weight = entity->new_weight;
|
|
+ if (bfqq)
|
|
+ bfqq->ioprio =
|
|
+ bfq_weight_to_ioprio(entity->orig_weight);
|
|
+ }
|
|
+
|
|
+ if (bfqq)
|
|
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
|
|
+ entity->prio_changed = 0;
|
|
+
|
|
+ /*
|
|
+ * NOTE: here we may be changing the weight too early,
|
|
+ * this will cause unfairness. The correct approach
|
|
+ * would have required additional complexity to defer
|
|
+ * weight changes to the proper time instants (i.e.,
|
|
+ * when entity->finish <= old_st->vtime).
|
|
+ */
|
|
+ new_st = bfq_entity_service_tree(entity);
|
|
+
|
|
+ prev_weight = entity->weight;
|
|
+ new_weight = entity->orig_weight *
|
|
+ (bfqq ? bfqq->wr_coeff : 1);
|
|
+ /*
|
|
+ * If the weight of the entity changes, remove the entity
|
|
+ * from its old weight counter (if there is a counter
|
|
+ * associated with the entity), and add it to the counter
|
|
+ * associated with its new weight.
|
|
+ */
|
|
+ if (prev_weight != new_weight) {
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "weight changed %d %d(%d %d)",
|
|
+ prev_weight, new_weight,
|
|
+ entity->orig_weight,
|
|
+ bfqq->wr_coeff);
|
|
+
|
|
+ root = bfqq ? &bfqd->queue_weights_tree :
|
|
+ &bfqd->group_weights_tree;
|
|
+ bfq_weights_tree_remove(bfqd, entity, root);
|
|
+ }
|
|
+ entity->weight = new_weight;
|
|
+ /*
|
|
+ * Add the entity to its weights tree only if it is
|
|
+ * not associated with a weight-raised queue.
|
|
+ */
|
|
+ if (prev_weight != new_weight &&
|
|
+ (bfqq ? bfqq->wr_coeff == 1 : 1))
|
|
+ /* If we get here, root has been initialized. */
|
|
+ bfq_weights_tree_add(bfqd, entity, root);
|
|
+
|
|
+ new_st->wsum += entity->weight;
|
|
+
|
|
+ if (new_st != old_st)
|
|
+ entity->start = new_st->vtime;
|
|
+ }
|
|
+
|
|
+ return new_st;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
|
|
+#endif
|
|
+
|
|
+/**
|
|
+ * bfq_bfqq_served - update the scheduler status after selection for
|
|
+ * service.
|
|
+ * @bfqq: the queue being served.
|
|
+ * @served: bytes to transfer.
|
|
+ *
|
|
+ * NOTE: this can be optimized, as the timestamps of upper level entities
|
|
+ * are synchronized every time a new bfqq is selected for service. By now,
|
|
+ * we keep it to better check consistency.
|
|
+ */
|
|
+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+ struct bfq_service_tree *st;
|
|
+
|
|
+ for_each_entity(entity) {
|
|
+ st = bfq_entity_service_tree(entity);
|
|
+
|
|
+ entity->service += served;
|
|
+
|
|
+ BUG_ON(st->wsum == 0);
|
|
+
|
|
+ st->vtime += bfq_delta(served, st->wsum);
|
|
+ bfq_forget_idle(st);
|
|
+ }
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
|
|
+#endif
|
|
+ st = bfq_entity_service_tree(&bfqq->entity);
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
|
|
+ served, ((st->vtime>>10)*1000)>>12, st);
|
|
+}
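/*
 * Editor's illustrative note (not part of the patch): the system
 * virtual time of each service tree advances by served/wsum, scaled by
 * 2^WFQ_SERVICE_SHIFT. E.g., serving 8 sectors on a tree whose weights
 * sum to 400 advances st->vtime by 8 * 4194304 / 400 = 83886, the same
 * amount by which an entity of weight 400 would see its own finish
 * time advance for that service.
 */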
|
|
+
|
|
+/**
|
|
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
|
|
+ * of the time interval during which bfqq has been in
|
|
+ * service.
|
|
+ * @bfqd: the device
|
|
+ * @bfqq: the queue that needs a service update.
|
|
+ * @time_ms: the amount of time during which the queue has received service
|
|
+ *
|
|
+ * If a queue does not consume its budget fast enough, then providing
|
|
+ * the queue with service fairness may impair throughput, more or less
|
|
+ * severely. For this reason, queues that consume their budget slowly
|
|
+ * are provided with time fairness instead of service fairness. This
|
|
+ * goal is achieved through the BFQ scheduling engine, even if such an
|
|
+ * engine works in the service, and not in the time domain. The trick
|
|
+ * is charging these queues with an inflated amount of service, equal
|
|
+ * to the amount of service that they would have received during their
|
|
+ * service slot if they had been fast, i.e., if their requests had
|
|
+ * been dispatched at a rate equal to the estimated peak rate.
|
|
+ *
|
|
+ * It is worth noting that time fairness can cause important
|
|
+ * distortions in terms of bandwidth distribution, on devices with
|
|
+ * internal queueing. The reason is that I/O requests dispatched
|
|
+ * during the service slot of a queue may be served after that service
|
|
+ * slot is finished, and may have a total processing time loosely
|
|
+ * correlated with the duration of the service slot. This is
|
|
+ * especially true for short service slots.
|
|
+ */
|
|
+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ unsigned long time_ms)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+ int tot_serv_to_charge = entity->service;
|
|
+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
|
|
+
|
|
+ if (time_ms > 0 && time_ms < timeout_ms)
|
|
+ tot_serv_to_charge =
|
|
+ (bfqd->bfq_max_budget * time_ms) / timeout_ms;
|
|
+
|
|
+ if (tot_serv_to_charge < entity->service)
|
|
+ tot_serv_to_charge = entity->service;
|
|
+
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "charge_time: %lu/%u ms, %d/%d/%d sectors",
|
|
+ time_ms, timeout_ms, entity->service,
|
|
+ tot_serv_to_charge, entity->budget);
|
|
+
|
|
+ /* Increase budget to avoid inconsistencies */
|
|
+ if (tot_serv_to_charge > entity->budget)
|
|
+ entity->budget = tot_serv_to_charge;
|
|
+
|
|
+ bfq_bfqq_served(bfqq,
|
|
+ max_t(int, 0, tot_serv_to_charge - entity->service));
|
|
+}
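/*
 * Editor's illustrative note (not part of the patch), with purely
 * hypothetical figures: if bfqd->bfq_max_budget were 16384 sectors and
 * the timeout 125 ms, a queue that held the device for time_ms = 25 ms
 * while transferring only 100 sectors would be charged
 *
 *	tot_serv_to_charge = 16384 * 25 / 125 = 3276 sectors
 *
 * i.e., the service it would have consumed at the estimated peak rate,
 * so slowly-dispatching queues pay for the time they occupy the device.
 */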
|
|
+
|
|
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
|
|
+ struct bfq_service_tree *st,
|
|
+ bool backshifted)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ struct bfq_sched_data *sd = entity->sched_data;
|
|
+
|
|
+ st = __bfq_entity_update_weight_prio(st, entity);
|
|
+ bfq_calc_finish(entity, entity->budget);
|
|
+
|
|
+ /*
|
|
+ * If some queues enjoy backshifting for a while, then their
|
|
+ * (virtual) finish timestamps may happen to become lower and
|
|
+ * lower than the system virtual time. In particular, if
|
|
+ * these queues often happen to be idle for short time
|
|
+ * periods, and during such time periods other queues with
|
|
+ * higher timestamps happen to be busy, then the backshifted
|
|
+ * timestamps of the former queues can become much lower than
|
|
+ * the system virtual time. In fact, to serve the queues with
|
|
+ * higher timestamps while the ones with lower timestamps are
|
|
+ * idle, the system virtual time may be pushed-up to much
|
|
+ * higher values than the finish timestamps of the idle
|
|
+ * queues. As a consequence, the finish timestamps of all new
|
|
+ * or newly activated queues may end up being much larger than
|
|
+ * those of lucky queues with backshifted timestamps. The
|
|
+ * latter queues may then monopolize the device for a lot of
|
|
+ * time. This would simply break service guarantees.
|
|
+ *
|
|
+ * To reduce this problem, push up a little bit the
|
|
+ * backshifted timestamps of the queue associated with this
|
|
+ * entity (only a queue can happen to have the backshifted
|
|
+ * flag set): just enough to let the finish timestamp of the
|
|
+ * queue be equal to the current value of the system virtual
|
|
+ * time. This may introduce a little unfairness among queues
|
|
+ * with backshifted timestamps, but it does not break
|
|
+ * worst-case fairness guarantees.
|
|
+ *
|
|
+ * As a special case, if bfqq is weight-raised, push up
|
|
+ * timestamps much less, to keep very low the probability that
|
|
+ * this push up causes the backshifted finish timestamps of
|
|
+ * weight-raised queues to become higher than the backshifted
|
|
+ * finish timestamps of non weight-raised queues.
|
|
+ */
|
|
+ if (backshifted && bfq_gt(st->vtime, entity->finish)) {
|
|
+ unsigned long delta = st->vtime - entity->finish;
|
|
+
|
|
+ if (bfqq)
|
|
+ delta /= bfqq->wr_coeff;
|
|
+
|
|
+ entity->start += delta;
|
|
+ entity->finish += delta;
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "__activate_entity: new queue finish %llu",
|
|
+ ((entity->finish>>10)*1000)>>12);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ } else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "__activate_entity: new group finish %llu",
|
|
+ ((entity->finish>>10)*1000)>>12);
|
|
+#endif
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bfq_active_insert(st, entity);
|
|
+
|
|
+ if (bfqq) {
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "__activate_entity: queue %seligible in st %p",
|
|
+ entity->start <= st->vtime ? "" : "non ", st);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ } else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "__activate_entity: group %seligible in st %p",
|
|
+ entity->start <= st->vtime ? "" : "non ", st);
|
|
+#endif
|
|
+ }
|
|
+ BUG_ON(RB_EMPTY_ROOT(&st->active));
|
|
+ BUG_ON(&st->active != &sd->service_tree->active &&
|
|
+ &st->active != &(sd->service_tree+1)->active &&
|
|
+ &st->active != &(sd->service_tree+2)->active);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bfq_activate_entity - handle activation of entity.
|
|
+ * @entity: the entity being activated.
|
|
+ * @non_blocking_wait_rq: true if entity was waiting for a request
|
|
+ *
|
|
+ * Called for a 'true' activation, i.e., if entity is not active and
|
|
+ * one of its children receives a new request.
|
|
+ *
|
|
+ * Basically, this function updates the timestamps of entity and
|
|
+ * inserts entity into its active tree, after possibly extracting it
|
|
+ * from its idle tree.
|
|
+ */
|
|
+static void __bfq_activate_entity(struct bfq_entity *entity,
|
|
+ bool non_blocking_wait_rq)
|
|
+{
|
|
+ struct bfq_sched_data *sd = entity->sched_data;
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ bool backshifted = false;
|
|
+ unsigned long long min_vstart;
|
|
+
|
|
+ BUG_ON(!sd);
|
|
+ BUG_ON(!st);
|
|
+
|
|
+ /* See comments on bfq_bfqq_update_budg_for_activation */
|
|
+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
|
|
+ backshifted = true;
|
|
+ min_vstart = entity->finish;
|
|
+ } else
|
|
+ min_vstart = st->vtime;
|
|
+
|
|
+ if (entity->tree == &st->idle) {
|
|
+ /*
|
|
+ * Must be on the idle tree, bfq_idle_extract() will
|
|
+ * check for that.
|
|
+ */
|
|
+ bfq_idle_extract(st, entity);
|
|
+ entity->start = bfq_gt(min_vstart, entity->finish) ?
|
|
+ min_vstart : entity->finish;
|
|
+ } else {
|
|
+ /*
|
|
+ * The finish time of the entity may be invalid, and
|
|
+ * it is in the past for sure, otherwise the queue
|
|
+ * would have been on the idle tree.
|
|
+ */
|
|
+ entity->start = min_vstart;
|
|
+ st->wsum += entity->weight;
|
|
+ bfq_get_entity(entity);
|
|
+
|
|
+ BUG_ON(entity->on_st && bfqq);
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ if (entity->on_st && !bfqq) {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group,
|
|
+ entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
|
|
+ bfqg,
|
|
+ "activate bug, class %d in_service %p",
|
|
+ bfq_class_idx(entity), sd->in_service_entity);
|
|
+ }
|
|
+#endif
|
|
+ BUG_ON(entity->on_st && !bfqq);
|
|
+ entity->on_st = true;
|
|
+ }
|
|
+
|
|
+ bfq_update_fin_time_enqueue(entity, st, backshifted);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
|
|
+ * @entity: the entity being requeued or repositioned.
|
|
+ *
|
|
+ * Requeueing is needed if this entity stops being served, which
|
|
+ * happens if a leaf descendant entity has expired. On the other hand,
|
|
+ * repositioning is needed if the next_in_service entity for the child
|
|
+ * entity has changed. See the comments inside the function for
|
|
+ * details.
|
|
+ *
|
|
+ * Basically, this function: 1) removes entity from its active tree if
|
|
+ * present there, 2) updates the timestamps of entity and 3) inserts
|
|
+ * entity back into its active tree (in the new, right position for
|
|
+ * the new values of the timestamps).
|
|
+ */
|
|
+static void __bfq_requeue_entity(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_sched_data *sd = entity->sched_data;
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+
|
|
+ BUG_ON(!sd);
|
|
+ BUG_ON(!st);
|
|
+
|
|
+ BUG_ON(entity != sd->in_service_entity &&
|
|
+ entity->tree != &st->active);
|
|
+
|
|
+ if (entity == sd->in_service_entity) {
|
|
+ /*
|
|
+ * We are requeueing the current in-service entity,
|
|
+ * which may have to be done for one of the following
|
|
+ * reasons:
|
|
+ * - entity represents the in-service queue, and the
|
|
+ * in-service queue is being requeued after an
|
|
+ * expiration;
|
|
+ * - entity represents a group, and its budget has
|
|
+ * changed because one of its child entities has
|
|
+ * just been either activated or requeued for some
|
|
+ * reason; the timestamps of the entity need then to
|
|
+ * be updated, and the entity needs to be enqueued
|
|
+ * or repositioned accordingly.
|
|
+ *
|
|
+ * In particular, before requeueing, the start time of
|
|
+ * the entity must be moved forward to account for the
|
|
+ * service that the entity has received while in
|
|
+ * service. This is done by the next instructions. The
|
|
+ * finish time will then be updated according to this
|
|
+ * new value of the start time, and to the budget of
|
|
+ * the entity.
|
|
+ */
|
|
+ bfq_calc_finish(entity, entity->service);
|
|
+ entity->start = entity->finish;
|
|
+ BUG_ON(entity->tree && entity->tree != &st->active);
|
|
+ /*
|
|
+ * In addition, if the entity had more than one child
|
|
+ * when set in service, then it was not extracted from
|
|
+ * the active tree. This implies that the position of
|
|
+ * the entity in the active tree may need to be
|
|
+ * changed now, because we have just updated the start
|
|
+ * time of the entity, and we will update its finish
|
|
+ * time in a moment (the requeueing is then, more
|
|
+ * precisely, a repositioning in this case). To
|
|
+ * implement this repositioning, we: 1) dequeue the
|
|
+ * entity here, 2) update the finish time and
|
|
+ * requeue the entity according to the new
|
|
+ * timestamps below.
|
|
+ */
|
|
+ if (entity->tree)
|
|
+ bfq_active_extract(st, entity);
|
|
+ } else { /* The entity is already active, and not in service */
|
|
+ /*
|
|
+ * In this case, this function gets called only if the
|
|
+ * next_in_service entity below this entity has
|
|
+ * changed, and this change has caused the budget of
|
|
+ * this entity to change, which, finally implies that
|
|
+ * the finish time of this entity must be
|
|
+ * updated. Such an update may cause the scheduling,
|
|
+ * i.e., the position in the active tree, of this
|
|
+ * entity to change. We handle this change by: 1)
|
|
+ * dequeueing the entity here, 2) updating the finish
|
|
+ * time and requeueing the entity according to the new
|
|
+ * timestamps below. This is the same approach as the
|
|
+ * non-extracted-entity sub-case above.
|
|
+ */
|
|
+ bfq_active_extract(st, entity);
|
|
+ }
|
|
+
|
|
+ bfq_update_fin_time_enqueue(entity, st, false);
|
|
+}
|
|
+
|
|
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
|
|
+ struct bfq_sched_data *sd,
|
|
+ bool non_blocking_wait_rq)
|
|
+{
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+
|
|
+ if (sd->in_service_entity == entity || entity->tree == &st->active)
|
|
+ /*
|
|
+ * in service or already queued on the active tree,
|
|
+ * requeue or reposition
|
|
+ */
|
|
+ __bfq_requeue_entity(entity);
|
|
+ else
|
|
+ /*
|
|
+ * Not in service and not queued on its active tree:
|
|
+ * the entity is idle and this is a true activation.
|
|
+ */
|
|
+ __bfq_activate_entity(entity, non_blocking_wait_rq);
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
|
|
+ * and activate, requeue or reposition all ancestors
|
|
+ * for which such an update becomes necessary.
|
|
+ * @entity: the entity to activate.
|
|
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
|
|
+ * @requeue: true if this is a requeue, which implies that bfqq is
|
|
+ * being expired; thus ALL its ancestors stop being served and must
|
|
+ * therefore be requeued
|
|
+ */
|
|
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
|
|
+ bool non_blocking_wait_rq,
|
|
+ bool requeue)
|
|
+{
|
|
+ struct bfq_sched_data *sd;
|
|
+
|
|
+ for_each_entity(entity) {
|
|
+ BUG_ON(!entity);
|
|
+ sd = entity->sched_data;
|
|
+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
|
|
+
|
|
+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) &&
|
|
+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
|
|
+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
|
|
+
|
|
+ if (!bfq_update_next_in_service(sd, entity) && !requeue) {
|
|
+ BUG_ON(!sd->next_in_service);
|
|
+ break;
|
|
+ }
|
|
+ BUG_ON(!sd->next_in_service);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
|
|
+ * @entity: the entity to deactivate.
|
|
+ * @ins_into_idle_tree: if false, the entity will not be put into the
|
|
+ * idle tree.
|
|
+ *
|
|
+ * Deactivates an entity, independently from its previous state. Must
|
|
+ * be invoked only if entity is on a service tree. Extracts the entity
|
|
+ * from that tree, and if necessary and allowed, puts it on the idle
|
|
+ * tree.
|
|
+ */
|
|
+static bool __bfq_deactivate_entity(struct bfq_entity *entity,
|
|
+ bool ins_into_idle_tree)
|
|
+{
|
|
+ struct bfq_sched_data *sd = entity->sched_data;
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+ bool was_in_service = entity == sd->in_service_entity;
|
|
+
|
|
+ if (!entity->on_st) { /* entity never activated, or already inactive */
|
|
+ BUG_ON(entity == entity->sched_data->in_service_entity);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ BUG_ON(was_in_service && entity->tree && entity->tree != &st->active);
|
|
+
|
|
+ if (was_in_service)
|
|
+ bfq_calc_finish(entity, entity->service);
|
|
+
|
|
+ if (entity->tree == &st->active)
|
|
+ bfq_active_extract(st, entity);
|
|
+ else if (!was_in_service && entity->tree == &st->idle)
|
|
+ bfq_idle_extract(st, entity);
|
|
+ else if (entity->tree)
|
|
+ BUG();
|
|
+
|
|
+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
|
|
+ bfq_forget_entity(st, entity);
|
|
+ else
|
|
+ bfq_idle_insert(st, entity);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
|
|
+ * @entity: the entity to deactivate.
|
|
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
|
|
+ */
|
|
+static void bfq_deactivate_entity(struct bfq_entity *entity,
|
|
+ bool ins_into_idle_tree,
|
|
+ bool expiration)
|
|
+{
|
|
+ struct bfq_sched_data *sd;
|
|
+ struct bfq_entity *parent = NULL;
|
|
+
|
|
+ for_each_entity_safe(entity, parent) {
|
|
+ sd = entity->sched_data;
|
|
+
|
|
+ BUG_ON(sd == NULL); /*
|
|
+ * It would mean that this is the
|
|
+ * root group.
|
|
+ */
|
|
+
|
|
+ BUG_ON(expiration && entity != sd->in_service_entity);
|
|
+
|
|
+ BUG_ON(entity != sd->in_service_entity &&
|
|
+ entity->tree ==
|
|
+ &bfq_entity_service_tree(entity)->active &&
|
|
+ !sd->next_in_service);
|
|
+
|
|
+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
|
|
+ /*
|
|
+ * Entity is not in any tree any more, so this
|
|
+ * deactivation is a no-op, and there is
|
|
+ * nothing to change for upper-level entities
|
|
+ * (in case of expiration, this can never
|
|
+ * happen).
|
|
+ */
|
|
+ BUG_ON(expiration); /*
|
|
+ * entity cannot be already out of
|
|
+ * any tree
|
|
+ */
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (sd->next_in_service == entity)
|
|
+ /*
|
|
+ * entity was the next_in_service entity,
|
|
+ * then, since entity has just been
|
|
+ * deactivated, a new one must be found.
|
|
+ */
|
|
+ bfq_update_next_in_service(sd, NULL);
|
|
+
|
|
+ if (sd->next_in_service) {
|
|
+ /*
|
|
+ * The parent entity is still backlogged,
|
|
+ * because next_in_service is not NULL. So, no
|
|
+ * further upwards deactivation must be
|
|
+ * performed. Yet, next_in_service has
|
|
+ * changed. Then the schedule does need to be
|
|
+ * updated upwards.
|
|
+ */
|
|
+ BUG_ON(sd->next_in_service == entity);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If we get here, then the parent is no longer
|
|
+ * backlogged and we need to propagate the
|
|
+ * deactivation upwards. Thus let the loop go on.
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * Also let parent be queued into the idle tree on
|
|
+ * deactivation, to preserve service guarantees, and
|
|
+ * assuming that who invoked this function does not
|
|
+ * need parent entities too to be removed completely.
|
|
+ */
|
|
+ ins_into_idle_tree = true;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the deactivation loop is fully executed, then there are
|
|
+ * no more entities to touch and next loop is not executed at
|
|
+ * all. Otherwise, requeue remaining entities if they are
|
|
+ * about to stop receiving service, or reposition them if this
|
|
+ * is not the case.
|
|
+ */
|
|
+ entity = parent;
|
|
+ for_each_entity(entity) {
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ /*
|
|
+ * Invoke __bfq_requeue_entity on entity, even if
|
|
+ * already active, to requeue/reposition it in the
|
|
+ * active tree (because sd->next_in_service has
|
|
+ * changed)
|
|
+ */
|
|
+ __bfq_requeue_entity(entity);
|
|
+
|
|
+ sd = entity->sched_data;
|
|
+ BUG_ON(expiration && sd->in_service_entity != entity);
|
|
+
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "invoking udpdate_next for this queue");
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity,
|
|
+ struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "invoking udpdate_next for this entity");
|
|
+ }
|
|
+#endif
|
|
+ if (!bfq_update_next_in_service(sd, entity) &&
|
|
+ !expiration)
|
|
+ /*
|
|
+ * next_in_service unchanged or not causing
|
|
+ * any change in entity->parent->sd, and no
|
|
+ * requeueing needed for expiration: stop
|
|
+ * here.
|
|
+ */
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
|
|
+ * if needed, to have at least one entity eligible.
|
|
+ * @st: the service tree to act upon.
|
|
+ *
|
|
+ * Assumes that st is not empty.
|
|
+ */
|
|
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
|
|
+{
|
|
+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
|
|
+
|
|
+ if (bfq_gt(root_entity->min_start, st->vtime)) {
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
|
|
+
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "calc_vtime_jump: new value %llu",
|
|
+ root_entity->min_start);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(root_entity, struct bfq_group,
|
|
+ entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "calc_vtime_jump: new value %llu",
|
|
+ root_entity->min_start);
|
|
+ }
|
|
+#endif
|
|
+ return root_entity->min_start;
|
|
+ }
|
|
+ return st->vtime;
|
|
+}
|
|
+
|
|
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
|
|
+{
|
|
+ if (new_value > st->vtime) {
|
|
+ st->vtime = new_value;
|
|
+ bfq_forget_idle(st);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_first_active_entity - find the eligible entity with
|
|
+ * the smallest finish time
|
|
+ * @st: the service tree to select from.
|
|
+ * @vtime: the system virtual time to use as a reference for eligibility
|
|
+ *
|
|
+ * This function searches the first schedulable entity, starting from the
|
|
+ * root of the tree and going on the left every time on this side there is
|
|
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
|
|
+ * the right is followed only if a) the left subtree contains no eligible
|
|
+ * entities and b) no eligible entity has been found yet.
|
|
+ */
|
|
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
|
|
+ u64 vtime)
|
|
+{
|
|
+ struct bfq_entity *entry, *first = NULL;
|
|
+ struct rb_node *node = st->active.rb_node;
|
|
+
|
|
+ while (node) {
|
|
+ entry = rb_entry(node, struct bfq_entity, rb_node);
|
|
+left:
|
|
+ if (!bfq_gt(entry->start, vtime))
|
|
+ first = entry;
|
|
+
|
|
+ BUG_ON(bfq_gt(entry->min_start, vtime));
|
|
+
|
|
+ if (node->rb_left) {
|
|
+ entry = rb_entry(node->rb_left,
|
|
+ struct bfq_entity, rb_node);
|
|
+ if (!bfq_gt(entry->min_start, vtime)) {
|
|
+ node = node->rb_left;
|
|
+ goto left;
|
|
+ }
|
|
+ }
|
|
+ if (first)
|
|
+ break;
|
|
+ node = node->rb_right;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
|
|
+ return first;
|
|
+}
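/*
 * Editor's illustrative note (not part of the patch): the active tree
 * is keyed by finish time and each node caches min_start over its
 * subtree. A minimal sketch with (finish, start) pairs and vtime = 20:
 *
 *	        (30, 25)		min_start = 10
 *	       /        \
 *	  (20, 10)    (40, 35)		min_start = 10 and 35
 *
 * The root itself is not eligible (start 25 > 20), but its left child
 * advertises min_start = 10 <= vtime, so the search descends left and
 * returns (20, 10): the eligible entity with the smallest finish time,
 * without ever visiting the right subtree.
 */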
|
|
+
|
|
+/**
|
|
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
|
|
+ * @st: the service tree.
|
|
+ *
|
|
+ * If there is no in-service entity for the sched_data st belongs to,
|
|
+ * then return the entity that will be set in service if:
|
|
+ * 1) the parent entity this st belongs to is set in service;
|
|
+ * 2) no entity belonging to such parent entity undergoes a state change
|
|
+ * that would influence the timestamps of the entity (e.g., becomes idle,
|
|
+ * becomes backlogged, changes its budget, ...).
|
|
+ *
|
|
+ * In this first case, update the virtual time in @st too (see the
|
|
+ * comments on this update inside the function).
|
|
+ *
|
|
+ * In contrast, if there is an in-service entity, then return the
|
|
+ * entity that would be set in service if not only the above
|
|
+ * conditions, but also the next one held true: the currently
|
|
+ * in-service entity, on expiration,
|
|
+ * 1) gets a finish time equal to the current one, or
|
|
+ * 2) is not eligible any more, or
|
|
+ * 3) is idle.
|
|
+ */
|
|
+static struct bfq_entity *
|
|
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
|
|
+#if 0
|
|
+ , bool force
|
|
+#endif
|
|
+ )
|
|
+{
|
|
+ struct bfq_entity *entity
|
|
+#if 0
|
|
+ , *new_next_in_service = NULL
|
|
+#endif
|
|
+ ;
|
|
+ u64 new_vtime;
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ if (RB_EMPTY_ROOT(&st->active))
|
|
+ return NULL;
|
|
+
|
|
+ /*
|
|
+ * Get the value of the system virtual time for which at
|
|
+ * least one entity is eligible.
|
|
+ */
|
|
+ new_vtime = bfq_calc_vtime_jump(st);
|
|
+
|
|
+ /*
|
|
+ * If there is no in-service entity for the sched_data this
|
|
+ * active tree belongs to, then push the system virtual time
|
|
+ * up to the value that guarantees that at least one entity is
|
|
+ * eligible. If, instead, there is an in-service entity, then
|
|
+ * do not make any such update, because there is already an
|
|
+ * eligible entity, namely the in-service one (even if the
|
|
+ * entity is not on st, because it was extracted when set in
|
|
+ * service).
|
|
+ */
|
|
+ if (!in_service)
|
|
+ bfq_update_vtime(st, new_vtime);
|
|
+
|
|
+ entity = bfq_first_active_entity(st, new_vtime);
|
|
+ BUG_ON(bfq_gt(entity->start, new_vtime));
|
|
+
|
|
+ /* Log some information */
|
|
+ bfqq = bfq_entity_to_bfqq(entity);
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "__lookup_next: start %llu vtime %llu st %p",
|
|
+ ((entity->start>>10)*1000)>>12,
|
|
+ ((new_vtime>>10)*1000)>>12, st);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "__lookup_next: start %llu vtime %llu st %p",
|
|
+ ((entity->start>>10)*1000)>>12,
|
|
+ ((new_vtime>>10)*1000)>>12, st);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ BUG_ON(!entity);
|
|
+
|
|
+ return entity;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
|
|
+ * @sd: the sched_data.
|
|
+ *
|
|
+ * This function is invoked when there has been a change in the trees
|
|
+ * for sd, and we need to know what the new next entity is after this
|
|
+ * change.
|
|
+ */
|
|
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
|
|
+{
|
|
+ struct bfq_service_tree *st = sd->service_tree;
|
|
+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
|
|
+ struct bfq_entity *entity = NULL;
|
|
+ struct bfq_queue *bfqq;
|
|
+ int class_idx = 0;
|
|
+
|
|
+ BUG_ON(!sd);
|
|
+ BUG_ON(!st);
|
|
+ /*
|
|
+ * Choose from idle class, if needed to guarantee a minimum
|
|
+ * bandwidth to this class (and if there is some active entity
|
|
+ * in idle class). This should also mitigate
|
|
+ * priority-inversion problems in case a low priority task is
|
|
+ * holding file system resources.
|
|
+ */
|
|
+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
|
|
+ BFQ_CL_IDLE_TIMEOUT)) {
|
|
+ if (!RB_EMPTY_ROOT(&idle_class_st->active))
|
|
+ class_idx = BFQ_IOPRIO_CLASSES - 1;
|
|
+ /* About to be served if backlogged, or not yet backlogged */
|
|
+ sd->bfq_class_idle_last_service = jiffies;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Find the next entity to serve for the highest-priority
|
|
+ * class, unless the idle class needs to be served.
|
|
+ */
|
|
+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
|
|
+ entity = __bfq_lookup_next_entity(st + class_idx,
|
|
+ sd->in_service_entity);
|
|
+
|
|
+ if (entity)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!entity &&
|
|
+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) ||
|
|
+ !RB_EMPTY_ROOT(&(st+2)->active)));
|
|
+
|
|
+ if (!entity)
|
|
+ return NULL;
|
|
+
|
|
+ /* Log some information */
|
|
+ bfqq = bfq_entity_to_bfqq(entity);
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d",
|
|
+ st + class_idx, class_idx);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "chosen from st %p %d",
|
|
+ st + class_idx, class_idx);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ return entity;
|
|
+}
|
|
+
|
|
+static bool next_queue_may_preempt(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
|
|
+
|
|
+ return sd->next_in_service != sd->in_service_entity;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get next queue for service.
|
|
+ */
|
|
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_entity *entity = NULL;
|
|
+ struct bfq_sched_data *sd;
|
|
+ struct bfq_queue *bfqq;
|
|
+
|
|
+ BUG_ON(bfqd->in_service_queue);
|
|
+
|
|
+ if (bfqd->busy_queues == 0)
|
|
+ return NULL;
|
|
+
|
|
+ /*
|
|
+ * Traverse the path from the root to the leaf entity to
|
|
+ * serve. Set in service all the entities visited along the
|
|
+ * way.
|
|
+ */
|
|
+ sd = &bfqd->root_group->sched_data;
|
|
+ for (; sd ; sd = entity->my_sched_data) {
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ if (entity) {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg(bfqd, bfqg,
|
|
+ "get_next_queue: lookup in this group");
|
|
+ if (!sd->next_in_service)
|
|
+ pr_crit("get_next_queue: lookup in this group");
|
|
+ } else {
|
|
+ bfq_log_bfqg(bfqd, bfqd->root_group,
|
|
+ "get_next_queue: lookup in root group");
|
|
+ if (!sd->next_in_service)
|
|
+ pr_crit("get_next_queue: lookup in root group");
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ BUG_ON(!sd->next_in_service);
|
|
+
|
|
+ /*
|
|
+ * WARNING. We are about to set the in-service entity
|
|
+ * to sd->next_in_service, i.e., to the (cached) value
|
|
+ * returned by bfq_lookup_next_entity(sd) the last
|
|
+ * time it was invoked, i.e., the last time when the
|
|
+ * service order in sd changed as a consequence of the
|
|
+ * activation or deactivation of an entity. In this
|
|
+ * respect, if we execute bfq_lookup_next_entity(sd)
|
|
+ * in this very moment, it may, although with low
|
|
+ * probability, yield a different entity than that
|
|
+ * pointed to by sd->next_in_service. This rare event
|
|
+ * happens in case there was no CLASS_IDLE entity to
|
|
+ * serve for sd when bfq_lookup_next_entity(sd) was
|
|
+ * invoked for the last time, while there is now one
|
|
+ * such entity.
|
|
+ *
|
|
+ * If the above event happens, then the scheduling of
|
|
+ * such entity in CLASS_IDLE is postponed until the
|
|
+ * service of the sd->next_in_service entity
|
|
+ * finishes. In fact, when the latter is expired,
|
|
+ * bfq_lookup_next_entity(sd) gets called again,
|
|
+ * exactly to update sd->next_in_service.
|
|
+ */
|
|
+
|
|
+ /* Make next_in_service entity become in_service_entity */
|
|
+ entity = sd->next_in_service;
|
|
+ sd->in_service_entity = entity;
|
|
+
|
|
+ /*
|
|
+ * Reset the accumulator of the amount of service that
|
|
+ * the entity is about to receive.
|
|
+ */
|
|
+ entity->service = 0;
|
|
+
|
|
+ /*
|
|
+ * If entity is no longer a candidate for next
|
|
+ * service, then we extract it from its active tree,
|
|
+ * for the following reason. To further boost the
|
|
+ * throughput in some special case, BFQ needs to know
|
|
+ * which is the next candidate entity to serve, while
|
|
+ * there is already an entity in service. In this
|
|
+ * respect, to make it easy to compute/update the next
|
|
+ * candidate entity to serve after the current
|
|
+ * candidate has been set in service, there is a case
|
|
+ * where it is necessary to extract the current
|
|
+ * candidate from its service tree. Such a case is
|
|
+ * when the entity just set in service cannot also be
|
|
+ * a candidate for next service. Details about when
|
|
+ * this condition holds are reported in the comments
|
|
+ * on the function bfq_no_longer_next_in_service()
|
|
+ * invoked below.
|
|
+ */
|
|
+ if (bfq_no_longer_next_in_service(entity))
|
|
+ bfq_active_extract(bfq_entity_service_tree(entity),
|
|
+ entity);
|
|
+
|
|
+ /*
|
|
+ * For the same reason why we may have just extracted
|
|
+ * entity from its active tree, we may need to update
|
|
+ * next_in_service for the sched_data of entity too,
|
|
+ * regardless of whether entity has been extracted.
|
|
+ * In fact, even if entity has not been extracted, a
|
|
+ * descendant entity may get extracted. Such an event
|
|
+ * would cause a change in next_in_service for the
|
|
+ * level of the descendant entity, and thus possibly
|
|
+ * back to upper levels.
|
|
+ *
|
|
+ * We cannot perform the resulting needed update
|
|
+ * before the end of this loop, because, to know which
|
|
+ * is the correct next-to-serve candidate entity for
|
|
+ * each level, we need first to find the leaf entity
|
|
+ * to set in service. In fact, only after we know
|
|
+ * which is the next-to-serve leaf entity, we can
|
|
+ * discover whether the parent entity of the leaf
|
|
+ * entity becomes the next-to-serve, and so on.
|
|
+ */
|
|
+
|
|
+ /* Log some information */
|
|
+ bfqq = bfq_entity_to_bfqq(entity);
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqd, bfqq,
|
|
+ "get_next_queue: this queue, finish %llu",
|
|
+ (((entity->finish>>10)*1000)>>10)>>2);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg(bfqd, bfqg,
|
|
+ "get_next_queue: this entity, finish %llu",
|
|
+ (((entity->finish>>10)*1000)>>10)>>2);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ }
|
|
+
|
|
+ BUG_ON(!entity);
|
|
+ bfqq = bfq_entity_to_bfqq(entity);
|
|
+ BUG_ON(!bfqq);
|
|
+
|
|
+ /*
|
|
+ * We can finally update all next-to-serve entities along the
|
|
+ * path from the leaf entity just set in service to the root.
|
|
+ */
|
|
+ for_each_entity(entity) {
|
|
+ struct bfq_sched_data *sd = entity->sched_data;
|
|
+
|
|
+ if (!bfq_update_next_in_service(sd, NULL))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return bfqq;
|
|
+}
|
|
+
|
|
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqd->in_service_queue->entity;
|
|
+
|
|
+ if (bfqd->in_service_bic) {
|
|
+ put_io_context(bfqd->in_service_bic->icq.ioc);
|
|
+ bfqd->in_service_bic = NULL;
|
|
+ }
|
|
+
|
|
+ bfq_clear_bfqq_wait_request(bfqd->in_service_queue);
|
|
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
|
|
+ bfqd->in_service_queue = NULL;
|
|
+
|
|
+ /*
|
|
+ * When this function is called, all in-service entities have
|
|
+ * been properly deactivated or requeued, so we can safely
|
|
+ * execute the final step: reset in_service_entity along the
|
|
+ * path from entity to the root.
|
|
+ */
|
|
+ for_each_entity(entity)
|
|
+ entity->sched_data->in_service_entity = NULL;
|
|
+}
|
|
+
|
|
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ bool ins_into_idle_tree, bool expiration)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
|
|
+}
|
|
+
|
|
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
|
+
|
|
+ BUG_ON(bfqq == bfqd->in_service_queue);
|
|
+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle &&
|
|
+ entity->on_st);
|
|
+
|
|
+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
|
|
+ false);
|
|
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
|
|
+}
|
|
+
|
|
+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *entity = &bfqq->entity;
|
|
+
|
|
+ bfq_activate_requeue_entity(entity, false,
|
|
+ bfqq == bfqd->in_service_queue);
|
|
+}
|
|
+
|
|
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
|
|
+
|
|
+/*
|
|
+ * Called when the bfqq no longer has requests pending, remove it from
|
|
+ * the service tree. As a special case, it can be invoked during an
|
|
+ * expiration.
|
|
+ */
|
|
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|
+ bool expiration)
|
|
+{
|
|
+ BUG_ON(!bfq_bfqq_busy(bfqq));
|
|
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
|
|
+
|
|
+ bfq_clear_bfqq_busy(bfqq);
|
|
+
|
|
+ BUG_ON(bfqd->busy_queues == 0);
|
|
+ bfqd->busy_queues--;
|
|
+
|
|
+ if (!bfqq->dispatched)
|
|
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
|
+ &bfqd->queue_weights_tree);
|
|
+
|
|
+ if (bfqq->wr_coeff > 1)
|
|
+ bfqd->wr_busy_queues--;
|
|
+
|
|
+ bfqg_stats_update_dequeue(bfqq_group(bfqq));
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < 0);
|
|
+
|
|
+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
|
|
+
|
|
+ BUG_ON(bfqq->entity.budget < 0);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Called when an inactive queue receives a new request.
|
|
+ */
|
|
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|
+{
|
|
+ BUG_ON(bfq_bfqq_busy(bfqq));
|
|
+ BUG_ON(bfqq == bfqd->in_service_queue);
|
|
+
|
|
+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
|
|
+
|
|
+ bfq_activate_bfqq(bfqd, bfqq);
|
|
+
|
|
+ bfq_mark_bfqq_busy(bfqq);
|
|
+ bfqd->busy_queues++;
|
|
+
|
|
+ if (!bfqq->dispatched)
|
|
+ if (bfqq->wr_coeff == 1)
|
|
+ bfq_weights_tree_add(bfqd, &bfqq->entity,
|
|
+ &bfqd->queue_weights_tree);
|
|
+
|
|
+ if (bfqq->wr_coeff > 1)
|
|
+ bfqd->wr_busy_queues++;
|
|
+}
|
|
diff --git a/block/bfq.h b/block/bfq.h
|
|
new file mode 100644
|
|
index 0000000..d5c5c56
|
|
--- /dev/null
|
|
+++ b/block/bfq.h
|
|
@@ -0,0 +1,933 @@
|
|
+/*
|
|
+ * BFQ v8r8-rc2 for 4.9.0: data structures and common functions prototypes.
|
|
+ *
|
|
+ * Based on ideas and code from CFQ:
|
|
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
|
|
+ *
|
|
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
|
|
+ * Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
|
|
+ *
|
|
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
|
|
+ */
|
|
+
|
|
+#ifndef _BFQ_H
|
|
+#define _BFQ_H
|
|
+
|
|
+#include <linux/blktrace_api.h>
|
|
+#include <linux/hrtimer.h>
|
|
+#include <linux/ioprio.h>
|
|
+#include <linux/rbtree.h>
|
|
+#include <linux/blk-cgroup.h>
|
|
+
|
|
+#define BFQ_IOPRIO_CLASSES 3
|
|
+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
|
|
+
|
|
+#define BFQ_MIN_WEIGHT 1
|
|
+#define BFQ_MAX_WEIGHT 1000
|
|
+#define BFQ_WEIGHT_CONVERSION_COEFF 10
|
|
+
|
|
+#define BFQ_DEFAULT_QUEUE_IOPRIO 4
|
|
+
|
|
+#define BFQ_WEIGHT_LEGACY_DFL 100
|
|
+#define BFQ_DEFAULT_GRP_IOPRIO 0
|
|
+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
|
|
+
|
|
+/*
|
|
+ * Soft real-time applications are far more latency-sensitive
|
|
+ * than interactive ones. Over-raise the weight of the former to
|
|
+ * privilege them against the latter.
|
|
+ */
|
|
+#define BFQ_SOFTRT_WEIGHT_FACTOR 100
|
|
+
|
|
+struct bfq_entity;
|
|
+
|
|
+/**
|
|
+ * struct bfq_service_tree - per ioprio_class service tree.
|
|
+ *
|
|
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
|
|
+ * ioprio_class has its own independent scheduler, and so its own
|
|
+ * bfq_service_tree. All the fields are protected by the queue lock
|
|
+ * of the containing bfqd.
|
|
+ */
|
|
+struct bfq_service_tree {
|
|
+ /* tree for active entities (i.e., those backlogged) */
|
|
+ struct rb_root active;
|
|
+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
|
|
+ struct rb_root idle;
|
|
+
|
|
+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */
|
|
+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */
|
|
+
|
|
+ u64 vtime; /* scheduler virtual time */
|
|
+ /* scheduler weight sum; active and idle entities contribute to it */
|
|
+ unsigned long wsum;
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_sched_data - multi-class scheduler.
|
|
+ *
|
|
+ * bfq_sched_data is the basic scheduler queue. It supports three
|
|
+ * ioprio_classes, and can be used either as a toplevel queue or as an
|
|
+ * intermediate queue on a hierarchical setup. @next_in_service
|
|
+ * points to the active entity of the sched_data service trees that
|
|
+ * will be scheduled next. It is used to reduce the number of steps
|
|
+ * needed for each hierarchical-schedule update.
|
|
+ *
|
|
+ * The supported ioprio_classes are the same as in CFQ, in descending
|
|
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
|
|
+ * Requests from higher priority queues are served before all the
|
|
+ * requests from lower priority queues; among requests of the same
|
|
+ * queue requests are served according to B-WF2Q+.
|
|
+ * All the fields are protected by the queue lock of the containing bfqd.
|
|
+ */
|
|
+struct bfq_sched_data {
|
|
+ struct bfq_entity *in_service_entity; /* entity in service */
|
|
+ /* head-of-the-line entity in the scheduler (see comments above) */
|
|
+ struct bfq_entity *next_in_service;
|
|
+ /* array of service trees, one per ioprio_class */
|
|
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
|
|
+ /* last time CLASS_IDLE was served */
|
|
+ unsigned long bfq_class_idle_last_service;
|
|
+
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_weight_counter - counter of the number of all active entities
|
|
+ * with a given weight.
|
|
+ */
|
|
+struct bfq_weight_counter {
|
|
+ unsigned int weight; /* weight of the entities this counter refers to */
|
|
+ unsigned int num_active; /* nr of active entities with this weight */
|
|
+ /*
|
|
+ * Weights tree member (see bfq_data's @queue_weights_tree and
|
|
+ * @group_weights_tree)
|
|
+ */
|
|
+ struct rb_node weights_node;
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_entity - schedulable entity.
|
|
+ *
|
|
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
|
|
+ * cgroup hierarchy) or a bfq_group in the upper-level scheduler. Each
|
|
+ * entity belongs to the sched_data of the parent group in the cgroup
|
|
+ * hierarchy. Non-leaf entities have also their own sched_data, stored
|
|
+ * in @my_sched_data.
|
|
+ *
|
|
+ * Each entity stores independently its priority values; this would
|
|
+ * allow different weights on different devices, but this
|
|
+ * functionality is not exported to userspace by now. Priorities and
|
|
+ * weights are updated lazily, first storing the new values into the
|
|
+ * new_* fields, then setting the @prio_changed flag. As soon as
|
|
+ * there is a transition in the entity state that allows the priority
|
|
+ * update to take place the effective and the requested priority
|
|
+ * values are synchronized.
|
|
+ *
|
|
+ * Unless cgroups are used, the weight value is calculated from the
|
|
+ * ioprio to export the same interface as CFQ. When dealing with
|
|
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
|
|
+ * time to consume their budget and have true sequential behavior, and
|
|
+ * when there are no external factors breaking anticipation) the
|
|
+ * relative weights at each level of the cgroups hierarchy should be
|
|
+ * guaranteed. All the fields are protected by the queue lock of the
|
|
+ * containing bfqd.
|
|
+ */
|
|
+struct bfq_entity {
|
|
+ struct rb_node rb_node; /* service_tree member */
|
|
+ /* pointer to the weight counter associated with this entity */
|
|
+ struct bfq_weight_counter *weight_counter;
|
|
+
|
|
+ /*
|
|
+ * Flag, true if the entity is on a tree (either the active or
|
|
+ * the idle one of its service_tree) or is in service.
|
|
+ */
|
|
+ bool on_st;
|
|
+
|
|
+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
|
|
+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */
|
|
+
|
|
+ /* tree the entity is enqueued into; %NULL if not on a tree */
|
|
+ struct rb_root *tree;
|
|
+
|
|
+ /*
|
|
+ * minimum start time of the (active) subtree rooted at this
|
|
+ * entity; used for O(log N) lookups into active trees
|
|
+ */
|
|
+ u64 min_start;
|
|
+
|
|
+ /* amount of service received during the last service slot */
|
|
+ int service;
|
|
+
|
|
+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
|
|
+ int budget;
|
|
+
|
|
+ unsigned int weight; /* weight of the queue */
|
|
+ unsigned int new_weight; /* next weight if a change is in progress */
|
|
+
|
|
+ /* original weight, used to implement weight boosting */
|
|
+ unsigned int orig_weight;
|
|
+
|
|
+ /* parent entity, for hierarchical scheduling */
|
|
+ struct bfq_entity *parent;
|
|
+
|
|
+ /*
|
|
+ * For non-leaf nodes in the hierarchy, the associated
|
|
+ * scheduler queue, %NULL on leaf nodes.
|
|
+ */
|
|
+ struct bfq_sched_data *my_sched_data;
|
|
+ /* the scheduler queue this entity belongs to */
|
|
+ struct bfq_sched_data *sched_data;
|
|
+
|
|
+ /* flag, set to request a weight, ioprio or ioprio_class change */
|
|
+ int prio_changed;
|
|
+};
|
|
+
|
|
+struct bfq_group;
|
|
+
|
|
+/**
|
|
+ * struct bfq_queue - leaf schedulable entity.
|
|
+ *
|
|
+ * A bfq_queue is a leaf request queue; it can be associated with an
|
|
+ * io_context or more, if it is async or shared between cooperating
|
|
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
|
|
+ * does not disappear while a bfqq still references it (mostly to avoid
|
|
+ * races between request issuing and task migration followed by cgroup
|
|
+ * destruction).
|
|
+ * All the fields are protected by the queue lock of the containing bfqd.
|
|
+ */
|
|
+struct bfq_queue {
|
|
+ /* reference counter */
|
|
+ int ref;
|
|
+ /* parent bfq_data */
|
|
+ struct bfq_data *bfqd;
|
|
+
|
|
+ /* current ioprio and ioprio class */
|
|
+ unsigned short ioprio, ioprio_class;
|
|
+ /* next ioprio and ioprio class if a change is in progress */
|
|
+ unsigned short new_ioprio, new_ioprio_class;
|
|
+
|
|
+ /*
|
|
+ * Shared bfq_queue if queue is cooperating with one or more
|
|
+ * other queues.
|
|
+ */
|
|
+ struct bfq_queue *new_bfqq;
|
|
+ /* request-position tree member (see bfq_group's @rq_pos_tree) */
|
|
+ struct rb_node pos_node;
|
|
+ /* request-position tree root (see bfq_group's @rq_pos_tree) */
|
|
+ struct rb_root *pos_root;
|
|
+
|
|
+ /* sorted list of pending requests */
|
|
+ struct rb_root sort_list;
|
|
+ /* if fifo isn't expired, next request to serve */
|
|
+ struct request *next_rq;
|
|
+ /* number of sync and async requests queued */
|
|
+ int queued[2];
|
|
+ /* number of sync and async requests currently allocated */
|
|
+ int allocated[2];
|
|
+ /* number of pending metadata requests */
|
|
+ int meta_pending;
|
|
+ /* fifo list of requests in sort_list */
|
|
+ struct list_head fifo;
|
|
+
|
|
+ /* entity representing this queue in the scheduler */
|
|
+ struct bfq_entity entity;
|
|
+
|
|
+ /* maximum budget allowed from the feedback mechanism */
|
|
+ int max_budget;
|
|
+ /* budget expiration (in jiffies) */
|
|
+ unsigned long budget_timeout;
|
|
+
|
|
+ /* number of requests on the dispatch list or inside driver */
|
|
+ int dispatched;
|
|
+
|
|
+ unsigned int flags; /* status flags */
|
|
+
|
|
+ /* node for active/idle bfqq list inside parent bfqd */
|
|
+ struct list_head bfqq_list;
|
|
+
|
|
+ /* bit vector: a 1 for each seeky requests in history */
|
|
+ u32 seek_history;
|
|
+
|
|
+ /* node for the device's burst list */
|
|
+ struct hlist_node burst_list_node;
|
|
+
|
|
+ /* position of the last request enqueued */
|
|
+ sector_t last_request_pos;
|
|
+
|
|
+ /* Number of consecutive pairs of request completion and
|
|
+ * arrival, such that the queue becomes idle after the
|
|
+ * completion, but the next request arrives within an idle
|
|
+ * time slice; used only if the queue's IO_bound flag has been
|
|
+ * cleared.
|
|
+ */
|
|
+ unsigned int requests_within_timer;
|
|
+
|
|
+ /* pid of the process owning the queue, used for logging purposes */
|
|
+ pid_t pid;
|
|
+
|
|
+ /*
|
|
+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
|
|
+ * if the queue is shared.
|
|
+ */
|
|
+ struct bfq_io_cq *bic;
|
|
+
|
|
+ /* current maximum weight-raising time for this queue */
|
|
+ unsigned long wr_cur_max_time;
|
|
+ /*
|
|
+ * Minimum time instant such that, only if a new request is
|
|
+ * enqueued after this time instant in an idle @bfq_queue with
|
|
+ * no outstanding requests, then the task associated with the
|
|
+ * queue is deemed as soft real-time (see the comments on
|
|
+ * the function bfq_bfqq_softrt_next_start())
|
|
+ */
|
|
+ unsigned long soft_rt_next_start;
|
|
+ /*
|
|
+ * Start time of the current weight-raising period if
|
|
+ * the @bfq-queue is being weight-raised, otherwise
|
|
+ * finish time of the last weight-raising period.
|
|
+ */
|
|
+ unsigned long last_wr_start_finish;
|
|
+ /* factor by which the weight of this queue is multiplied */
|
|
+ unsigned int wr_coeff;
|
|
+ /*
|
|
+ * Time of the last transition of the @bfq_queue from idle to
|
|
+ * backlogged.
|
|
+ */
|
|
+ unsigned long last_idle_bklogged;
|
|
+ /*
|
|
+ * Cumulative service received from the @bfq_queue since the
|
|
+ * last transition from idle to backlogged.
|
|
+ */
|
|
+ unsigned long service_from_backlogged;
|
|
+ /*
|
|
+ * Value of wr start time when switching to soft rt
|
|
+ */
|
|
+ unsigned long wr_start_at_switch_to_srt;
|
|
+
|
|
+ unsigned long split_time; /* time of last split */
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_ttime - per process thinktime stats.
|
|
+ */
|
|
+struct bfq_ttime {
|
|
+ u64 last_end_request; /* completion time of last request */
|
|
+
|
|
+ u64 ttime_total; /* total process thinktime */
|
|
+ unsigned long ttime_samples; /* number of thinktime samples */
|
|
+ u64 ttime_mean; /* average process thinktime */
|
|
+
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
|
|
+ */
|
|
+struct bfq_io_cq {
|
|
+ /* associated io_cq structure */
|
|
+ struct io_cq icq; /* must be the first member */
|
|
+ /* array of two process queues, the sync and the async */
|
|
+ struct bfq_queue *bfqq[2];
|
|
+ /* associated @bfq_ttime struct */
|
|
+ struct bfq_ttime ttime;
|
|
+ /* per (request_queue, blkcg) ioprio */
|
|
+ int ioprio;
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Snapshot of the idle window before merging; taken to
|
|
+ * remember this value while the queue is merged, so as to be
|
|
+ * able to restore it in case of split.
|
|
+ */
|
|
+ bool saved_idle_window;
|
|
+ /*
|
|
+ * Same purpose as the previous two fields for the I/O bound
|
|
+ * classification of a queue.
|
|
+ */
|
|
+ bool saved_IO_bound;
|
|
+
|
|
+ /*
|
|
+ * Same purpose as the previous fields for the value of the
|
|
+ * field keeping the queue's belonging to a large burst
|
|
+ */
|
|
+ bool saved_in_large_burst;
|
|
+ /*
|
|
+ * True if the queue belonged to a burst list before its merge
|
|
+ * with another cooperating queue.
|
|
+ */
|
|
+ bool was_in_burst_list;
|
|
+
|
|
+ /*
|
|
+ * Similar to previous fields: save wr information.
|
|
+ */
|
|
+ unsigned long saved_wr_coeff;
|
|
+ unsigned long saved_last_wr_start_finish;
|
|
+ unsigned long saved_wr_start_at_switch_to_srt;
|
|
+ unsigned int saved_wr_cur_max_time;
|
|
+};
|
|
+
|
|
+enum bfq_device_speed {
|
|
+ BFQ_BFQD_FAST,
|
|
+ BFQ_BFQD_SLOW,
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_data - per-device data structure.
|
|
+ *
|
|
+ * All the fields are protected by the @queue lock.
|
|
+ */
|
|
+struct bfq_data {
|
|
+ /* request queue for the device */
|
|
+ struct request_queue *queue;
|
|
+
|
|
+ /* root bfq_group for the device */
|
|
+ struct bfq_group *root_group;
|
|
+
|
|
+ /*
|
|
+ * rbtree of weight counters of @bfq_queues, sorted by
|
|
+ * weight. Used to keep track of whether all @bfq_queues have
|
|
+ * the same weight. The tree contains one counter for each
|
|
+ * distinct weight associated to some active and not
|
|
+ * weight-raised @bfq_queue (see the comments to the functions
|
|
+ * bfq_weights_tree_[add|remove] for further details).
|
|
+ */
|
|
+ struct rb_root queue_weights_tree;
|
|
+ /*
|
|
+ * rbtree of non-queue @bfq_entity weight counters, sorted by
|
|
+ * weight. Used to keep track of whether all @bfq_groups have
|
|
+ * the same weight. The tree contains one counter for each
|
|
+ * distinct weight associated to some active @bfq_group (see
|
|
+ * the comments to the functions bfq_weights_tree_[add|remove]
|
|
+ * for further details).
|
|
+ */
|
|
+ struct rb_root group_weights_tree;
|
|
+
|
|
+ /*
|
|
+ * Number of bfq_queues containing requests (including the
|
|
+ * queue in service, even if it is idling).
|
|
+ */
|
|
+ int busy_queues;
|
|
+ /* number of weight-raised busy @bfq_queues */
|
|
+ int wr_busy_queues;
|
|
+ /* number of queued requests */
|
|
+ int queued;
|
|
+ /* number of requests dispatched and waiting for completion */
|
|
+ int rq_in_driver;
|
|
+
|
|
+ /*
|
|
+ * Maximum number of requests in driver in the last
|
|
+ * @hw_tag_samples completed requests.
|
|
+ */
|
|
+ int max_rq_in_driver;
|
|
+ /* number of samples used to calculate hw_tag */
|
|
+ int hw_tag_samples;
|
|
+ /* flag set to one if the driver is showing a queueing behavior */
|
|
+ int hw_tag;
|
|
+
|
|
+ /* number of budgets assigned */
|
|
+ int budgets_assigned;
|
|
+
|
|
+ /*
|
|
+ * Timer set when idling (waiting) for the next request from
|
|
+ * the queue in service.
|
|
+ */
|
|
+ struct hrtimer idle_slice_timer;
|
|
+ /* delayed work to restart dispatching on the request queue */
|
|
+ struct work_struct unplug_work;
|
|
+
|
|
+ /* bfq_queue in service */
|
|
+ struct bfq_queue *in_service_queue;
|
|
+ /* bfq_io_cq (bic) associated with the @in_service_queue */
|
|
+ struct bfq_io_cq *in_service_bic;
|
|
+
|
|
+ /* on-disk position of the last served request */
|
|
+ sector_t last_position;
|
|
+
|
|
+ /* time of last request completion (ns) */
|
|
+ u64 last_completion;
|
|
+
|
|
+ /* time of first rq dispatch in current observation interval (ns) */
|
|
+ u64 first_dispatch;
|
|
+ /* time of last rq dispatch in current observation interval (ns) */
|
|
+ u64 last_dispatch;
|
|
+
|
|
+ /* beginning of the last budget */
|
|
+ ktime_t last_budget_start;
|
|
+ /* beginning of the last idle slice */
|
|
+ ktime_t last_idling_start;
|
|
+
|
|
+ /* number of samples in current observation interval */
|
|
+ int peak_rate_samples;
|
|
+ /* num of samples of seq dispatches in current observation interval */
|
|
+ u32 sequential_samples;
|
|
+ /* total num of sectors transferred in current observation interval */
|
|
+ u64 tot_sectors_dispatched;
|
|
+ /* max rq size seen during current observation interval (sectors) */
|
|
+ u32 last_rq_max_size;
|
|
+ /* time elapsed from first dispatch in current observ. interval (us) */
|
|
+ u64 delta_from_first;
|
|
+ /* current estimate of device peak rate */
|
|
+ u32 peak_rate;
|
|
+
|
|
+ /* maximum budget allotted to a bfq_queue before rescheduling */
|
|
+ int bfq_max_budget;
|
|
+
|
|
+ /* list of all the bfq_queues active on the device */
|
|
+ struct list_head active_list;
|
|
+ /* list of all the bfq_queues idle on the device */
|
|
+ struct list_head idle_list;
|
|
+
|
|
+ /*
|
|
+ * Timeout for async/sync requests; when it fires, requests
|
|
+ * are served in fifo order.
|
|
+ */
|
|
+ u64 bfq_fifo_expire[2];
|
|
+ /* weight of backward seeks wrt forward ones */
|
|
+ unsigned int bfq_back_penalty;
|
|
+ /* maximum allowed backward seek */
|
|
+ unsigned int bfq_back_max;
|
|
+ /* maximum idling time */
|
|
+ u32 bfq_slice_idle;
|
|
+
|
|
+ /* user-configured max budget value (0 for auto-tuning) */
|
|
+ int bfq_user_max_budget;
|
|
+ /*
|
|
+ * Timeout for bfq_queues to consume their budget; used to
|
|
+ * prevent seeky queues from imposing long latencies on
|
|
+ * sequential or quasi-sequential ones (this also implies that
|
|
+ * seeky queues cannot receive guarantees in the service
|
|
+ * domain; after a timeout they are charged for the time they
|
|
+ * have been in service, to preserve fairness among them, but
|
|
+ * without service-domain guarantees).
|
|
+ */
|
|
+ unsigned int bfq_timeout;
|
|
+
|
|
+ /*
|
|
+ * Number of consecutive requests that must be issued within
|
|
+ * the idle time slice to re-enable idling for a queue that
|
|
+ * was marked as non-I/O-bound (see the definition of the
|
|
+ * IO_bound flag for further details).
|
|
+ */
|
|
+ unsigned int bfq_requests_within_timer;
|
|
+
|
|
+ /*
|
|
+ * Force device idling whenever needed to provide accurate
|
|
+ * service guarantees, without caring about throughput
|
|
+ * issues. CAVEAT: this may even increase latencies, in case
|
|
+ * of useless idling for processes that have stopped doing I/O.
|
|
+ */
|
|
+ bool strict_guarantees;
|
|
+
|
|
+ /*
|
|
+ * Last time at which a queue entered the current burst of
|
|
+ * queues being activated shortly after each other; for more
|
|
+ * details about this and the following parameters related to
|
|
+ * a burst of activations, see the comments on the function
|
|
+ * bfq_handle_burst.
|
|
+ */
|
|
+ unsigned long last_ins_in_burst;
|
|
+ /*
|
|
+ * Reference time interval used to decide whether a queue has
|
|
+ * been activated shortly after @last_ins_in_burst.
|
|
+ */
|
|
+ unsigned long bfq_burst_interval;
|
|
+ /* number of queues in the current burst of queue activations */
|
|
+ int burst_size;
|
|
+
|
|
+ /* common parent entity for the queues in the burst */
|
|
+ struct bfq_entity *burst_parent_entity;
|
|
+ /* Maximum burst size above which the current queue-activation
|
|
+ * burst is deemed as 'large'.
|
|
+ */
|
|
+ unsigned long bfq_large_burst_thresh;
|
|
+ /* true if a large queue-activation burst is in progress */
|
|
+ bool large_burst;
|
|
+ /*
|
|
+ * Head of the burst list (as for the above fields, more
|
|
+ * details in the comments on the function bfq_handle_burst).
|
|
+ */
|
|
+ struct hlist_head burst_list;
|
|
+
|
|
+ /* if set to true, low-latency heuristics are enabled */
|
|
+ bool low_latency;
|
|
+ /*
|
|
+ * Maximum factor by which the weight of a weight-raised queue
|
|
+ * is multiplied.
|
|
+ */
|
|
+ unsigned int bfq_wr_coeff;
|
|
+ /* maximum duration of a weight-raising period (jiffies) */
|
|
+ unsigned int bfq_wr_max_time;
|
|
+
|
|
+ /* Maximum weight-raising duration for soft real-time processes */
|
|
+ unsigned int bfq_wr_rt_max_time;
|
|
+ /*
|
|
+ * Minimum idle period after which weight-raising may be
|
|
+ * reactivated for a queue (in jiffies).
|
|
+ */
|
|
+ unsigned int bfq_wr_min_idle_time;
|
|
+ /*
|
|
+ * Minimum period between request arrivals after which
|
|
+ * weight-raising may be reactivated for an already busy async
|
|
+ * queue (in jiffies).
|
|
+ */
|
|
+ unsigned long bfq_wr_min_inter_arr_async;
|
|
+
|
|
+ /* Max service-rate for a soft real-time queue, in sectors/sec */
|
|
+ unsigned int bfq_wr_max_softrt_rate;
|
|
+ /*
|
|
+ * Cached value of the product R*T, used for computing the
|
|
+ * maximum duration of weight raising automatically.
|
|
+ */
|
|
+ u64 RT_prod;
|
|
+ /* device-speed class for the low-latency heuristic */
|
|
+ enum bfq_device_speed device_speed;
|
|
+
|
|
+ /* fallback dummy bfqq for extreme OOM conditions */
|
|
+ struct bfq_queue oom_bfqq;
|
|
+};
|
|
+
|
|
+enum bfqq_state_flags {
|
|
+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */
|
|
+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */
|
|
+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
|
|
+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
|
|
+ * waiting for a request
|
|
+ * without idling the device
|
|
+ */
|
|
+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
|
|
+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
|
|
+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
|
|
+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
|
|
+ BFQ_BFQQ_FLAG_IO_bound, /*
|
|
+ * bfqq has timed-out at least once
|
|
+ * having consumed at most 2/10 of
|
|
+ * its budget
|
|
+ */
|
|
+ BFQ_BFQQ_FLAG_in_large_burst, /*
|
|
+ * bfqq activated in a large burst,
|
|
+ * see comments to bfq_handle_burst.
|
|
+ */
|
|
+ BFQ_BFQQ_FLAG_softrt_update, /*
|
|
+ * may need softrt-next-start
|
|
+ * update
|
|
+ */
|
|
+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
|
|
+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */
|
|
+};
|
|
+
|
|
+#define BFQ_BFQQ_FNS(name) \
|
|
+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
|
|
+{ \
|
|
+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
|
|
+} \
|
|
+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
|
|
+{ \
|
|
+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
|
|
+} \
|
|
+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
|
|
+{ \
|
|
+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
|
|
+}
|
|
+
|
|
+BFQ_BFQQ_FNS(just_created);
|
|
+BFQ_BFQQ_FNS(busy);
|
|
+BFQ_BFQQ_FNS(wait_request);
|
|
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
|
|
+BFQ_BFQQ_FNS(must_alloc);
|
|
+BFQ_BFQQ_FNS(fifo_expire);
|
|
+BFQ_BFQQ_FNS(idle_window);
|
|
+BFQ_BFQQ_FNS(sync);
|
|
+BFQ_BFQQ_FNS(IO_bound);
|
|
+BFQ_BFQQ_FNS(in_large_burst);
|
|
+BFQ_BFQQ_FNS(coop);
|
|
+BFQ_BFQQ_FNS(split_coop);
|
|
+BFQ_BFQQ_FNS(softrt_update);
|
|
+#undef BFQ_BFQQ_FNS
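For reference, each BFQ_BFQQ_FNS(name) invocation above generates three tiny helpers. As an illustrative expansion (shown here only as a sketch, not as additional patch content), BFQ_BFQQ_FNS(busy) produces:

static void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

so calls such as bfq_mark_bfqq_busy(bfqq) and bfq_bfqq_busy(bfqq) used throughout the scheduler simply set, clear and test the corresponding bit in bfqq->flags.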
|
|
+
|
|
+/* Logging facilities. */
|
|
+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
|
|
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
|
|
+
|
|
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
|
|
+ char __pbuf[128]; \
|
|
+ \
|
|
+ assert_spin_locked((bfqd)->queue->queue_lock); \
|
|
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
|
|
+ pr_crit("bfq%d%c %s " fmt "\n", \
|
|
+ (bfqq)->pid, \
|
|
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
|
|
+ __pbuf, ##args); \
|
|
+} while (0)
|
|
+
|
|
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
|
|
+ char __pbuf[128]; \
|
|
+ \
|
|
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
|
|
+ pr_crit("%s " fmt "\n", __pbuf, ##args); \
|
|
+} while (0)
|
|
+
|
|
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
|
|
+ pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \
|
|
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
|
|
+ ##args)
|
|
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
|
|
+
|
|
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+#define bfq_log(bfqd, fmt, args...) \
|
|
+ pr_crit("bfq " fmt "\n", ##args)
|
|
+
|
|
+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
|
|
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
|
|
+
|
|
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
|
|
+ char __pbuf[128]; \
|
|
+ \
|
|
+ assert_spin_locked((bfqd)->queue->queue_lock); \
|
|
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
|
|
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
|
|
+ (bfqq)->pid, \
|
|
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
|
|
+ __pbuf, ##args); \
|
|
+} while (0)
|
|
+
|
|
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
|
|
+ char __pbuf[128]; \
|
|
+ \
|
|
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
|
|
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
|
|
+} while (0)
|
|
+
|
|
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
|
|
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
|
|
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
|
|
+ ##args)
|
|
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
|
|
+
|
|
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
|
|
+
|
|
+#define bfq_log(bfqd, fmt, args...) \
|
|
+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
|
|
+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
|
|
+
|
|
+/* Expiration reasons. */
|
|
+enum bfqq_expiration {
|
|
+ BFQ_BFQQ_TOO_IDLE = 0, /*
|
|
+ * queue has been idling for
|
|
+ * too long
|
|
+ */
|
|
+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
|
|
+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
|
|
+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
|
|
+ BFQ_BFQQ_PREEMPTED /* preemption in progress */
|
|
+};
|
|
+
|
|
+
|
|
+struct bfqg_stats {
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ /* number of ios merged */
|
|
+ struct blkg_rwstat merged;
|
|
+ /* total time spent on device in ns, may not be accurate w/ queueing */
|
|
+ struct blkg_rwstat service_time;
|
|
+ /* total time spent waiting in scheduler queue in ns */
|
|
+ struct blkg_rwstat wait_time;
|
|
+ /* number of IOs queued up */
|
|
+ struct blkg_rwstat queued;
|
|
+ /* total disk time and nr sectors dispatched by this group */
|
|
+ struct blkg_stat time;
|
|
+ /* sum of number of ios queued across all samples */
|
|
+ struct blkg_stat avg_queue_size_sum;
|
|
+ /* count of samples taken for average */
|
|
+ struct blkg_stat avg_queue_size_samples;
|
|
+ /* how many times this group has been removed from service tree */
|
|
+ struct blkg_stat dequeue;
|
|
+ /* total time spent waiting for it to be assigned a timeslice. */
|
|
+ struct blkg_stat group_wait_time;
|
|
+ /* time spent idling for this blkcg_gq */
|
|
+ struct blkg_stat idle_time;
|
|
+ /* total time with empty current active q with other requests queued */
|
|
+ struct blkg_stat empty_time;
|
|
+ /* fields after this shouldn't be cleared on stat reset */
|
|
+ uint64_t start_group_wait_time;
|
|
+ uint64_t start_idle_time;
|
|
+ uint64_t start_empty_time;
|
|
+ uint16_t flags;
|
|
+#endif
|
|
+};
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+/*
|
|
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
|
|
+ *
|
|
+ * @pd: blkcg_policy_data that this structure inherits (must be the first member)
|
|
+ * @weight: weight of the bfq_group
|
|
+ */
|
|
+struct bfq_group_data {
|
|
+ /* must be the first member */
|
|
+ struct blkcg_policy_data pd;
|
|
+
|
|
+ unsigned int weight;
|
|
+};
|
|
+
|
|
+/**
|
|
+ * struct bfq_group - per (device, cgroup) data structure.
|
|
+ * @entity: schedulable entity to insert into the parent group sched_data.
|
|
+ * @sched_data: own sched_data, to contain child entities (they may be
|
|
+ * both bfq_queues and bfq_groups).
|
|
+ * @bfqd: the bfq_data for the device this group acts upon.
|
|
+ * @async_bfqq: array of async queues for all the tasks belonging to
|
|
+ * the group, one queue per ioprio value per ioprio_class,
|
|
+ * except for the idle class that has only one queue.
|
|
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
|
|
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
|
|
+ * to avoid too many special cases during group creation/
|
|
+ * migration.
|
|
+ * @active_entities: number of active entities belonging to the group;
|
|
+ * unused for the root group. Used to know whether there
|
|
+ * are groups with more than one active @bfq_entity
|
|
+ * (see the comments to the function
|
|
+ * bfq_bfqq_may_idle()).
|
|
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
|
|
+ * determining if two or more queues have interleaving
|
|
+ * requests (see bfq_find_close_cooperator()).
|
|
+ *
|
|
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
|
|
+ * there is a set of bfq_groups, each one collecting the lower-level
|
|
+ * entities belonging to the group that are acting on the same device.
|
|
+ *
|
|
+ * Locking works as follows:
|
|
+ * o @bfqd is protected by the queue lock, RCU is used to access it
|
|
+ * from the readers.
|
|
+ * o All the other fields are protected by the @bfqd queue lock.
|
|
+ */
|
|
+struct bfq_group {
|
|
+ /* must be the first member */
|
|
+ struct blkg_policy_data pd;
|
|
+
|
|
+ struct bfq_entity entity;
|
|
+ struct bfq_sched_data sched_data;
|
|
+
|
|
+ void *bfqd;
|
|
+
|
|
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
|
|
+ struct bfq_queue *async_idle_bfqq;
|
|
+
|
|
+ struct bfq_entity *my_entity;
|
|
+
|
|
+ int active_entities;
|
|
+
|
|
+ struct rb_root rq_pos_tree;
|
|
+
|
|
+ struct bfqg_stats stats;
|
|
+};
|
|
+
|
|
+#else
|
|
+struct bfq_group {
|
|
+ struct bfq_sched_data sched_data;
|
|
+
|
|
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
|
|
+ struct bfq_queue *async_idle_bfqq;
|
|
+
|
|
+ struct rb_root rq_pos_tree;
|
|
+};
|
|
+#endif
|
|
+
|
|
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
|
|
+
|
|
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+
|
|
+ return bfqq ? bfqq->ioprio_class - 1 :
|
|
+ BFQ_DEFAULT_GRP_CLASS - 1;
|
|
+}
|
|
+
|
|
+static struct bfq_service_tree *
|
|
+bfq_entity_service_tree(struct bfq_entity *entity)
|
|
+{
|
|
+ struct bfq_sched_data *sched_data = entity->sched_data;
|
|
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
|
+ unsigned int idx = bfq_class_idx(entity);
|
|
+
|
|
+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
|
|
+ BUG_ON(sched_data == NULL);
|
|
+
|
|
+ if (bfqq)
|
|
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
|
|
+ "entity_service_tree %p %d",
|
|
+ sched_data->service_tree + idx, idx);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+ else {
|
|
+ struct bfq_group *bfqg =
|
|
+ container_of(entity, struct bfq_group, entity);
|
|
+
|
|
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
|
|
+ "entity_service_tree %p %d",
|
|
+ sched_data->service_tree + idx, idx);
|
|
+ }
|
|
+#endif
|
|
+ return sched_data->service_tree + idx;
|
|
+}
|
|
+
|
|
+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
|
|
+{
|
|
+ return bic->bfqq[is_sync];
|
|
+}
|
|
+
|
|
+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
|
|
+ bool is_sync)
|
|
+{
|
|
+ bic->bfqq[is_sync] = bfqq;
|
|
+}
|
|
+
|
|
+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
|
|
+{
|
|
+ return bic->icq.q->elevator->elevator_data;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+
|
|
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
|
|
+{
|
|
+ struct bfq_entity *group_entity = bfqq->entity.parent;
|
|
+
|
|
+ if (!group_entity)
|
|
+ group_entity = &bfqq->bfqd->root_group->entity;
|
|
+
|
|
+ return container_of(group_entity, struct bfq_group, entity);
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
|
|
+{
|
|
+ return bfqq->bfqd->root_group;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
|
|
+static void bfq_put_queue(struct bfq_queue *bfqq);
|
|
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
|
|
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
|
|
+ struct bio *bio, bool is_sync,
|
|
+ struct bfq_io_cq *bic);
|
|
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
|
|
+ struct bfq_group *bfqg);
|
|
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
|
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
|
|
+#endif
|
|
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
|
|
+
|
|
+#endif /* _BFQ_H */
|
|
diff --git a/block/blk-core.c b/block/blk-core.c
|
|
index 14d7c07..1655112 100644
|
|
--- a/block/blk-core.c
|
|
+++ b/block/blk-core.c
|
|
@@ -39,6 +39,7 @@
|
|
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
+#include "blk-wbt.h"
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
|
|
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
|
|
@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
|
|
|
|
fail:
|
|
blk_free_flush_queue(q->fq);
|
|
+ wbt_exit(q->rq_wb);
|
|
+ q->rq_wb = NULL;
|
|
return NULL;
|
|
}
|
|
EXPORT_SYMBOL(blk_init_allocated_queue);
|
|
@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
|
|
blk_delete_timer(rq);
|
|
blk_clear_rq_complete(rq);
|
|
trace_block_rq_requeue(q, rq);
|
|
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
|
|
|
|
if (rq->cmd_flags & REQ_QUEUED)
|
|
blk_queue_end_tag(q, rq);
|
|
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
|
|
/* this is a bio leak */
|
|
WARN_ON(req->bio != NULL);
|
|
|
|
+ wbt_done(q->rq_wb, &req->issue_stat);
|
|
+
|
|
/*
|
|
* Request may not have originated from ll_rw_blk. if not,
|
|
* it didn't come out of our reserved rq pools
|
|
@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
|
int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT;
|
|
struct request *req;
|
|
unsigned int request_count = 0;
|
|
+ unsigned int wb_acct;
|
|
|
|
/*
|
|
* low level driver can indicate that it wants pages above a
|
|
@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
|
}
|
|
|
|
get_rq:
|
|
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock);
|
|
+
|
|
/*
|
|
* This sync check and mask will be re-done in init_request_from_bio(),
|
|
* but we need to set it earlier to expose the sync flag to the
|
|
@@ -1738,11 +1747,14 @@ get_rq:
|
|
*/
|
|
req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO);
|
|
if (IS_ERR(req)) {
|
|
+ __wbt_done(q->rq_wb, wb_acct);
|
|
bio->bi_error = PTR_ERR(req);
|
|
bio_endio(bio);
|
|
goto out_unlock;
|
|
}
|
|
|
|
+ wbt_track(&req->issue_stat, wb_acct);
|
|
+
|
|
/*
|
|
* After dropping the lock and possibly sleeping here, our request
|
|
* may now be mergeable after it had proven unmergeable (above).
|
|
@@ -2475,6 +2487,9 @@ void blk_start_request(struct request *req)
|
|
{
|
|
blk_dequeue_request(req);
|
|
|
|
+ blk_stat_set_issue_time(&req->issue_stat);
|
|
+ wbt_issue(req->q->rq_wb, &req->issue_stat);
|
|
+
|
|
/*
|
|
* We are now handing the request to the hardware, initialize
|
|
* resid_len to full count and add the timeout handler.
|
|
@@ -2542,6 +2557,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
|
|
|
|
trace_block_rq_complete(req->q, req, nr_bytes);
|
|
|
|
+ blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
|
|
+
|
|
if (!req->bio)
|
|
return false;
|
|
|
|
@@ -2709,9 +2726,10 @@ void blk_finish_request(struct request *req, int error)
|
|
|
|
blk_account_io_done(req);
|
|
|
|
- if (req->end_io)
|
|
+ if (req->end_io) {
|
|
+ wbt_done(req->q->rq_wb, &req->issue_stat);
|
|
req->end_io(req, error);
|
|
- else {
|
|
+ } else {
|
|
if (blk_bidi_rq(req))
|
|
__blk_put_request(req->next_rq->q, req->next_rq);
|
|
|
|
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
|
|
index 01fb455..633c79a 100644
|
|
--- a/block/blk-mq-sysfs.c
|
|
+++ b/block/blk-mq-sysfs.c
|
|
@@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
return ret;
|
|
}
|
|
|
|
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
|
|
+{
|
|
+ struct blk_mq_ctx *ctx;
|
|
+ unsigned int i;
|
|
+
|
|
+ hctx_for_each_ctx(hctx, ctx, i) {
|
|
+ blk_stat_init(&ctx->stat[0]);
|
|
+ blk_stat_init(&ctx->stat[1]);
|
|
+ }
|
|
+}
|
|
+
|
|
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
|
|
+ const char *page, size_t count)
|
|
+{
|
|
+ blk_mq_stat_clear(hctx);
|
|
+ return count;
|
|
+}
|
|
+
|
|
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
|
|
+{
|
|
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
|
|
+ pre, (long long) stat->nr_samples,
|
|
+ (long long) stat->mean, (long long) stat->min,
|
|
+ (long long) stat->max);
|
|
+}
|
|
+
|
|
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
+{
|
|
+ struct blk_rq_stat stat[2];
|
|
+ ssize_t ret;
|
|
+
|
|
+ blk_stat_init(&stat[0]);
|
|
+ blk_stat_init(&stat[1]);
|
|
+
|
|
+ blk_hctx_stat_get(hctx, stat);
|
|
+
|
|
+ ret = print_stat(page, &stat[0], "read :");
|
|
+ ret += print_stat(page + ret, &stat[1], "write:");
|
|
+ return ret;
|
|
+}
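With the "stats" attribute registered below, reading the per-hardware-context stats file under the queue's mq sysfs directory would return one line per direction in the format produced by print_stat() above; the values here are made up for illustration:

	read : samples=1284, mean=93210, min=41200, max=1821000
	write: samples=407, mean=188774, min=60100, max=9412300

Writing anything to the same file clears the accumulated samples via blk_mq_hw_sysfs_stat_store()/blk_mq_stat_clear().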
|
|
+
|
|
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
|
|
.attr = {.name = "dispatched", .mode = S_IRUGO },
|
|
.show = blk_mq_sysfs_dispatched_show,
|
|
@@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
|
|
.show = blk_mq_hw_sysfs_poll_show,
|
|
.store = blk_mq_hw_sysfs_poll_store,
|
|
};
|
|
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
|
|
+ .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
|
|
+ .show = blk_mq_hw_sysfs_stat_show,
|
|
+ .store = blk_mq_hw_sysfs_stat_store,
|
|
+};
|
|
|
|
static struct attribute *default_hw_ctx_attrs[] = {
|
|
&blk_mq_hw_sysfs_queued.attr,
|
|
@@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
|
|
&blk_mq_hw_sysfs_cpus.attr,
|
|
&blk_mq_hw_sysfs_active.attr,
|
|
&blk_mq_hw_sysfs_poll.attr,
|
|
+ &blk_mq_hw_sysfs_stat.attr,
|
|
NULL,
|
|
};
|
|
|
|
diff --git a/block/blk-mq.c b/block/blk-mq.c
|
|
index 81caceb..9dddd99 100644
|
|
--- a/block/blk-mq.c
|
|
+++ b/block/blk-mq.c
|
|
@@ -30,6 +30,8 @@
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
#include "blk-mq-tag.h"
|
|
+#include "blk-stat.h"
|
|
+#include "blk-wbt.h"
|
|
|
|
static DEFINE_MUTEX(all_q_mutex);
|
|
static LIST_HEAD(all_q_list);
|
|
@@ -300,6 +302,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
|
|
|
|
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
|
|
atomic_dec(&hctx->nr_active);
|
|
+
|
|
+ wbt_done(q->rq_wb, &rq->issue_stat);
|
|
rq->cmd_flags = 0;
|
|
|
|
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
|
|
@@ -328,6 +332,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
|
|
blk_account_io_done(rq);
|
|
|
|
if (rq->end_io) {
|
|
+ wbt_done(rq->q->rq_wb, &rq->issue_stat);
|
|
rq->end_io(rq, error);
|
|
} else {
|
|
if (unlikely(blk_bidi_rq(rq)))
|
|
@@ -378,10 +383,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
|
|
put_cpu();
|
|
}
|
|
|
|
+static void blk_mq_stat_add(struct request *rq)
|
|
+{
|
|
+ struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
|
|
+
|
|
+ blk_stat_add(stat, rq);
|
|
+}
|
|
+
|
|
static void __blk_mq_complete_request(struct request *rq)
|
|
{
|
|
struct request_queue *q = rq->q;
|
|
|
|
+ blk_mq_stat_add(rq);
|
|
+
|
|
if (!q->softirq_done_fn)
|
|
blk_mq_end_request(rq, rq->errors);
|
|
else
|
|
@@ -425,6 +439,9 @@ void blk_mq_start_request(struct request *rq)
|
|
if (unlikely(blk_bidi_rq(rq)))
|
|
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
|
|
|
|
+ blk_stat_set_issue_time(&rq->issue_stat);
|
|
+ wbt_issue(q->rq_wb, &rq->issue_stat);
|
|
+
|
|
blk_add_timer(rq);
|
|
|
|
/*
|
|
@@ -460,6 +477,7 @@ static void __blk_mq_requeue_request(struct request *rq)
|
|
struct request_queue *q = rq->q;
|
|
|
|
trace_block_rq_requeue(q, rq);
|
|
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
|
|
|
|
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
|
|
if (q->dma_drain_size && blk_rq_bytes(rq))
|
|
@@ -1271,6 +1289,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
|
struct blk_plug *plug;
|
|
struct request *same_queue_rq = NULL;
|
|
blk_qc_t cookie;
|
|
+ unsigned int wb_acct;
|
|
|
|
blk_queue_bounce(q, &bio);
|
|
|
|
@@ -1285,9 +1304,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
|
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
|
|
return BLK_QC_T_NONE;
|
|
|
|
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
|
|
+
|
|
rq = blk_mq_map_request(q, bio, &data);
|
|
- if (unlikely(!rq))
|
|
+ if (unlikely(!rq)) {
|
|
+ __wbt_done(q->rq_wb, wb_acct);
|
|
return BLK_QC_T_NONE;
|
|
+ }
|
|
+
|
|
+ wbt_track(&rq->issue_stat, wb_acct);
|
|
|
|
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
|
|
|
|
@@ -1364,6 +1389,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
|
|
struct blk_map_ctx data;
|
|
struct request *rq;
|
|
blk_qc_t cookie;
|
|
+ unsigned int wb_acct;
|
|
|
|
blk_queue_bounce(q, &bio);
|
|
|
|
@@ -1380,9 +1406,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
|
|
} else
|
|
request_count = blk_plug_queued_count(q);
|
|
|
|
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
|
|
+
|
|
rq = blk_mq_map_request(q, bio, &data);
|
|
- if (unlikely(!rq))
|
|
+ if (unlikely(!rq)) {
|
|
+ __wbt_done(q->rq_wb, wb_acct);
|
|
return BLK_QC_T_NONE;
|
|
+ }
|
|
+
|
|
+ wbt_track(&rq->issue_stat, wb_acct);
|
|
|
|
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
|
|
|
|
@@ -1721,6 +1753,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
|
|
spin_lock_init(&__ctx->lock);
|
|
INIT_LIST_HEAD(&__ctx->rq_list);
|
|
__ctx->queue = q;
|
|
+ blk_stat_init(&__ctx->stat[0]);
|
|
+ blk_stat_init(&__ctx->stat[1]);
|
|
|
|
/* If the cpu isn't online, the cpu is mapped to first hctx */
|
|
if (!cpu_online(i))
|
|
@@ -2051,6 +2085,9 @@ void blk_mq_free_queue(struct request_queue *q)
|
|
list_del_init(&q->all_q_node);
|
|
mutex_unlock(&all_q_mutex);
|
|
|
|
+ wbt_exit(q->rq_wb);
|
|
+ q->rq_wb = NULL;
|
|
+
|
|
blk_mq_del_queue_tag_set(q);
|
|
|
|
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
|
|
diff --git a/block/blk-mq.h b/block/blk-mq.h
|
|
index e5d2524..8cf16cb 100644
|
|
--- a/block/blk-mq.h
|
|
+++ b/block/blk-mq.h
|
|
@@ -1,6 +1,8 @@
|
|
#ifndef INT_BLK_MQ_H
|
|
#define INT_BLK_MQ_H
|
|
|
|
+#include "blk-stat.h"
|
|
+
|
|
struct blk_mq_tag_set;
|
|
|
|
struct blk_mq_ctx {
|
|
@@ -18,6 +20,7 @@ struct blk_mq_ctx {
|
|
|
|
/* incremented at completion time */
|
|
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
|
|
+ struct blk_rq_stat stat[2];
|
|
|
|
struct request_queue *queue;
|
|
struct kobject kobj;
|
|
diff --git a/block/blk-settings.c b/block/blk-settings.c
|
|
index f679ae1..b51ad19 100644
|
|
--- a/block/blk-settings.c
|
|
+++ b/block/blk-settings.c
|
|
@@ -13,6 +13,7 @@
|
|
#include <linux/gfp.h>
|
|
|
|
#include "blk.h"
|
|
+#include "blk-wbt.h"
|
|
|
|
unsigned long blk_max_low_pfn;
|
|
EXPORT_SYMBOL(blk_max_low_pfn);
|
|
@@ -832,6 +833,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
|
|
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
|
|
|
|
/**
|
|
+ * blk_set_queue_depth - tell the block layer about the device queue depth
|
|
+ * @q: the request queue for the device
|
|
+ * @depth: queue depth
|
|
+ *
|
|
+ */
|
|
+void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
|
|
+{
|
|
+ q->queue_depth = depth;
|
|
+ wbt_set_queue_depth(q->rq_wb, depth);
|
|
+}
|
|
+EXPORT_SYMBOL(blk_set_queue_depth);
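blk_set_queue_depth() gives drivers a single call that records the device's effective queue depth on the request queue and forwards it to the writeback throttle, which scales its limits from it. A minimal hypothetical use, assuming q is the driver's struct request_queue and depth is the depth negotiated with the device:

	/* Propagate the negotiated depth to the block layer and,
	 * through wbt_set_queue_depth(), to the writeback throttle.
	 */
	blk_set_queue_depth(q, depth);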
|
|
+
|
|
+/**
|
|
* blk_queue_write_cache - configure queue's write cache
|
|
* @q: the request queue for the device
|
|
* @wc: write back cache on or off
|
|
@@ -851,6 +865,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
|
|
else
|
|
queue_flag_clear(QUEUE_FLAG_FUA, q);
|
|
spin_unlock_irq(q->queue_lock);
|
|
+
|
|
+ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
|
|
|
|
diff --git a/block/blk-stat.c b/block/blk-stat.c
|
|
new file mode 100644
|
|
index 0000000..c219f1b
|
|
--- /dev/null
|
|
+++ b/block/blk-stat.c
|
|
@@ -0,0 +1,234 @@
|
|
+/*
|
|
+ * Block stat tracking code
|
|
+ *
|
|
+ * Copyright (C) 2016 Jens Axboe
|
|
+ */
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/blk-mq.h>
|
|
+
|
|
+#include "blk-stat.h"
|
|
+#include "blk-mq.h"
|
|
+
|
|
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
|
|
+{
|
|
+ if (!stat->nr_batch)
|
|
+ return;
|
|
+ if (!stat->nr_samples)
|
|
+ stat->mean = div64_s64(stat->batch, stat->nr_batch);
|
|
+ else {
|
|
+ stat->mean = div64_s64((stat->mean * stat->nr_samples) +
|
|
+ stat->batch,
|
|
+ stat->nr_samples + stat->nr_batch);
|
|
+ }
|
|
+
|
|
+ stat->nr_samples += stat->nr_batch;
|
|
+ stat->nr_batch = stat->batch = 0;
|
|
+}
|
|
+
|
|
+void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
|
|
+{
|
|
+ if (!src->nr_samples)
|
|
+ return;
|
|
+
|
|
+ blk_stat_flush_batch(src);
|
|
+
|
|
+ dst->min = min(dst->min, src->min);
|
|
+ dst->max = max(dst->max, src->max);
|
|
+
|
|
+ if (!dst->nr_samples)
|
|
+ dst->mean = src->mean;
|
|
+ else {
|
|
+ dst->mean = div64_s64((src->mean * src->nr_samples) +
|
|
+ (dst->mean * dst->nr_samples),
|
|
+ dst->nr_samples + src->nr_samples);
|
|
+ }
|
|
+ dst->nr_samples += src->nr_samples;
|
|
+}
|
|
+
|
|
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
|
|
+{
|
|
+ struct blk_mq_hw_ctx *hctx;
|
|
+ struct blk_mq_ctx *ctx;
|
|
+ uint64_t latest = 0;
|
|
+ int i, j, nr;
|
|
+
|
|
+ blk_stat_init(&dst[0]);
|
|
+ blk_stat_init(&dst[1]);
|
|
+
|
|
+ nr = 0;
|
|
+ do {
|
|
+ uint64_t newest = 0;
|
|
+
|
|
+ queue_for_each_hw_ctx(q, hctx, i) {
|
|
+ hctx_for_each_ctx(hctx, ctx, j) {
|
|
+ blk_stat_flush_batch(&ctx->stat[0]);
|
|
+ blk_stat_flush_batch(&ctx->stat[1]);
|
|
+
|
|
+ if (!ctx->stat[0].nr_samples &&
|
|
+ !ctx->stat[1].nr_samples)
|
|
+ continue;
|
|
+ if (ctx->stat[0].time > newest)
|
|
+ newest = ctx->stat[0].time;
|
|
+ if (ctx->stat[1].time > newest)
|
|
+ newest = ctx->stat[1].time;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * No samples
|
|
+ */
|
|
+ if (!newest)
|
|
+ break;
|
|
+
|
|
+ if (newest > latest)
|
|
+ latest = newest;
|
|
+
|
|
+ queue_for_each_hw_ctx(q, hctx, i) {
|
|
+ hctx_for_each_ctx(hctx, ctx, j) {
|
|
+ if (ctx->stat[0].time == newest) {
|
|
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
|
|
+ nr++;
|
|
+ }
|
|
+ if (ctx->stat[1].time == newest) {
|
|
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
|
|
+ nr++;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ /*
|
|
+ * If we race on finding an entry, just loop back again.
|
|
+ * Should be very rare.
|
|
+ */
|
|
+ } while (!nr);
|
|
+
|
|
+ dst[0].time = dst[1].time = latest;
|
|
+}
|
|
+
|
|
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
|
|
+{
|
|
+ if (q->mq_ops)
|
|
+ blk_mq_stat_get(q, dst);
|
|
+ else {
|
|
+ blk_stat_flush_batch(&q->rq_stats[0]);
|
|
+ blk_stat_flush_batch(&q->rq_stats[1]);
|
|
+ memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
|
|
+ memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
|
|
+ }
|
|
+}
|
|
+
|
|
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
|
|
+{
|
|
+ struct blk_mq_ctx *ctx;
|
|
+ unsigned int i, nr;
|
|
+
|
|
+ nr = 0;
|
|
+ do {
|
|
+ uint64_t newest = 0;
|
|
+
|
|
+ hctx_for_each_ctx(hctx, ctx, i) {
|
|
+ blk_stat_flush_batch(&ctx->stat[0]);
|
|
+ blk_stat_flush_batch(&ctx->stat[1]);
|
|
+
|
|
+ if (!ctx->stat[0].nr_samples &&
|
|
+ !ctx->stat[1].nr_samples)
|
|
+ continue;
|
|
+
|
|
+ if (ctx->stat[0].time > newest)
|
|
+ newest = ctx->stat[0].time;
|
|
+ if (ctx->stat[1].time > newest)
|
|
+ newest = ctx->stat[1].time;
|
|
+ }
|
|
+
|
|
+ if (!newest)
|
|
+ break;
|
|
+
|
|
+ hctx_for_each_ctx(hctx, ctx, i) {
|
|
+ if (ctx->stat[0].time == newest) {
|
|
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
|
|
+ nr++;
|
|
+ }
|
|
+ if (ctx->stat[1].time == newest) {
|
|
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
|
|
+ nr++;
|
|
+ }
|
|
+ }
|
|
+ /*
|
|
+ * If we race on finding an entry, just loop back again.
|
|
+ * Should be very rare, as the window is only updated
|
|
+ * occasionally
|
|
+ */
|
|
+ } while (!nr);
|
|
+}
|
|
+
|
|
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
|
|
+{
|
|
+ stat->min = -1ULL;
|
|
+ stat->max = stat->nr_samples = stat->mean = 0;
|
|
+ stat->batch = stat->nr_batch = 0;
|
|
+ stat->time = time_now & BLK_STAT_NSEC_MASK;
|
|
+}
|
|
+
|
|
+void blk_stat_init(struct blk_rq_stat *stat)
|
|
+{
|
|
+ __blk_stat_init(stat, ktime_to_ns(ktime_get()));
|
|
+}
|
|
+
|
|
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
|
|
+{
|
|
+ return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
|
|
+}
|
|
+
|
|
+bool blk_stat_is_current(struct blk_rq_stat *stat)
|
|
+{
|
|
+ return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
|
|
+}
|
|
+
|
|
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
|
|
+{
|
|
+ s64 now, value;
|
|
+
|
|
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
|
|
+ if (now < blk_stat_time(&rq->issue_stat))
|
|
+ return;
|
|
+
|
|
+ if (!__blk_stat_is_current(stat, now))
|
|
+ __blk_stat_init(stat, now);
|
|
+
|
|
+ value = now - blk_stat_time(&rq->issue_stat);
|
|
+ if (value > stat->max)
|
|
+ stat->max = value;
|
|
+ if (value < stat->min)
|
|
+ stat->min = value;
|
|
+
|
|
+ if (stat->batch + value < stat->batch ||
|
|
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
|
|
+ blk_stat_flush_batch(stat);
|
|
+
|
|
+ stat->batch += value;
|
|
+ stat->nr_batch++;
|
|
+}
|
|
+
|
|
+void blk_stat_clear(struct request_queue *q)
|
|
+{
|
|
+ if (q->mq_ops) {
|
|
+ struct blk_mq_hw_ctx *hctx;
|
|
+ struct blk_mq_ctx *ctx;
|
|
+ int i, j;
|
|
+
|
|
+ queue_for_each_hw_ctx(q, hctx, i) {
|
|
+ hctx_for_each_ctx(hctx, ctx, j) {
|
|
+ blk_stat_init(&ctx->stat[0]);
|
|
+ blk_stat_init(&ctx->stat[1]);
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ blk_stat_init(&q->rq_stats[0]);
|
|
+ blk_stat_init(&q->rq_stats[1]);
|
|
+ }
|
|
+}
|
|
+
|
|
+void blk_stat_set_issue_time(struct blk_issue_stat *stat)
|
|
+{
|
|
+ stat->time = (stat->time & BLK_STAT_MASK) |
|
|
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
|
|
+}
|
|
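blk_stat_add() above accumulates samples into a batch sum, and blk_stat_flush_batch()/blk_stat_sum() fold that sum into the running mean as a weighted average. A worked example of the same arithmetic (illustrative, userspace-style; the concrete numbers are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Existing window: 100 samples with a mean of 500000 ns (0.5 ms). */
	int64_t mean = 500000, nr_samples = 100;
	/* Unflushed batch: 4 samples totalling 6000000 ns (1.5 ms each). */
	int64_t batch = 6000000, nr_batch = 4;

	/* Same formula as blk_stat_flush_batch(): fold the batch into the mean. */
	mean = (mean * nr_samples + batch) / (nr_samples + nr_batch);
	nr_samples += nr_batch;

	printf("mean=%lld nr=%lld\n", (long long)mean, (long long)nr_samples);
	/* -> mean ~= 538461 ns over 104 samples */
	return 0;
}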
diff --git a/block/blk-stat.h b/block/blk-stat.h
|
|
new file mode 100644
|
|
index 0000000..26b1f45
|
|
--- /dev/null
|
|
+++ b/block/blk-stat.h
|
|
@@ -0,0 +1,37 @@
|
|
+#ifndef BLK_STAT_H
|
|
+#define BLK_STAT_H
|
|
+
|
|
+/*
|
|
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
|
|
+ */
|
|
+#define BLK_STAT_NSEC 134217728ULL
|
|
+#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
|
|
+
|
|
+/*
|
|
+ * Upper 3 bits can be used elsewhere
|
|
+ */
|
|
+#define BLK_STAT_RES_BITS 3
|
|
+#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
|
|
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
|
|
+#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
|
|
+
|
|
+void blk_stat_add(struct blk_rq_stat *, struct request *);
|
|
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
|
|
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
|
|
+void blk_stat_clear(struct request_queue *q);
|
|
+void blk_stat_init(struct blk_rq_stat *);
|
|
+void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
|
|
+bool blk_stat_is_current(struct blk_rq_stat *);
|
|
+void blk_stat_set_issue_time(struct blk_issue_stat *);
|
|
+
|
|
+static inline u64 __blk_stat_time(u64 time)
|
|
+{
|
|
+ return time & BLK_STAT_TIME_MASK;
|
|
+}
|
|
+
|
|
+static inline u64 blk_stat_time(struct blk_issue_stat *stat)
|
|
+{
|
|
+ return __blk_stat_time(stat->time);
|
|
+}
|
|
+
|
|
+#endif
|
|
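The masks above carve the 64-bit issue timestamp into a truncated nanosecond clock (the low 61 bits) plus BLK_STAT_RES_BITS of out-of-band state; blk-wbt.h later in this patch stores its wbt_flags in those top bits. An illustrative userspace sketch of the packing, using a made-up timestamp and the flag value WBT_TRACKED | WBT_KSWAPD (= 5) taken from blk-wbt.h:

#include <stdint.h>
#include <stdio.h>

#define RES_BITS   3
#define SHIFT      (64 - RES_BITS)
#define TIME_MASK  ((1ULL << SHIFT) - 1)

int main(void)
{
	uint64_t now_ns = 1234567890123ULL;	/* stand-in for ktime_to_ns(ktime_get()) */
	uint64_t flags  = 0x5;			/* e.g. WBT_TRACKED | WBT_KSWAPD */

	/* Pack: flags in the top 3 bits, truncated time below them. */
	uint64_t packed = (now_ns & TIME_MASK) | (flags << SHIFT);

	printf("time=%llu flags=%llu\n",
	       (unsigned long long)(packed & TIME_MASK),
	       (unsigned long long)(packed >> SHIFT));
	return 0;
}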
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
|
|
index 9cc8d7c..1dfe5cc 100644
|
|
--- a/block/blk-sysfs.c
|
|
+++ b/block/blk-sysfs.c
|
|
@@ -13,6 +13,7 @@
|
|
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
+#include "blk-wbt.h"
|
|
|
|
struct queue_sysfs_entry {
|
|
struct attribute attr;
|
|
@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
|
|
return count;
|
|
}
|
|
|
|
+static ssize_t queue_var_store64(u64 *var, const char *page)
|
|
+{
|
|
+ int err;
|
|
+ u64 v;
|
|
+
|
|
+ err = kstrtou64(page, 10, &v);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+
|
|
+ *var = v;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static ssize_t queue_requests_show(struct request_queue *q, char *page)
|
|
{
|
|
return queue_var_show(q->nr_requests, (page));
|
|
@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
|
|
return ret;
|
|
}
|
|
|
|
+static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
|
|
+{
|
|
+ if (!q->rq_wb)
|
|
+ return -EINVAL;
|
|
+
|
|
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
|
|
+}
|
|
+
|
|
+static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
|
|
+ size_t count)
|
|
+{
|
|
+ ssize_t ret;
|
|
+ u64 val;
|
|
+
|
|
+ if (!q->rq_wb)
|
|
+ return -EINVAL;
|
|
+
|
|
+ ret = queue_var_store64(&val, page);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ q->rq_wb->win_nsec = val * 1000ULL;
|
|
+ wbt_update_limits(q->rq_wb);
|
|
+ return count;
|
|
+}
|
|
+
|
|
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
|
|
+{
|
|
+ if (!q->rq_wb)
|
|
+ return -EINVAL;
|
|
+
|
|
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
|
|
+}
|
|
+
|
|
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
|
|
+ size_t count)
|
|
+{
|
|
+ ssize_t ret;
|
|
+ u64 val;
|
|
+
|
|
+ if (!q->rq_wb)
|
|
+ return -EINVAL;
|
|
+
|
|
+ ret = queue_var_store64(&val, page);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ q->rq_wb->min_lat_nsec = val * 1000ULL;
|
|
+ wbt_update_limits(q->rq_wb);
|
|
+ return count;
|
|
+}
|
|
+
|
|
static ssize_t queue_wc_show(struct request_queue *q, char *page)
|
|
{
|
|
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
|
|
@@ -384,6 +450,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
|
|
return queue_var_show(blk_queue_dax(q), page);
|
|
}
|
|
|
|
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
|
|
+{
|
|
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
|
|
+ pre, (long long) stat->nr_samples,
|
|
+ (long long) stat->mean, (long long) stat->min,
|
|
+ (long long) stat->max);
|
|
+}
|
|
+
|
|
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
|
|
+{
|
|
+ struct blk_rq_stat stat[2];
|
|
+ ssize_t ret;
|
|
+
|
|
+ blk_queue_stat_get(q, stat);
|
|
+
|
|
+ ret = print_stat(page, &stat[0], "read :");
|
|
+ ret += print_stat(page + ret, &stat[1], "write:");
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static struct queue_sysfs_entry queue_requests_entry = {
|
|
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
|
|
.show = queue_requests_show,
|
|
@@ -526,6 +612,23 @@ static struct queue_sysfs_entry queue_dax_entry = {
|
|
.show = queue_dax_show,
|
|
};
|
|
|
|
+static struct queue_sysfs_entry queue_stats_entry = {
|
|
+ .attr = {.name = "stats", .mode = S_IRUGO },
|
|
+ .show = queue_stats_show,
|
|
+};
|
|
+
|
|
+static struct queue_sysfs_entry queue_wb_lat_entry = {
|
|
+ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
|
|
+ .show = queue_wb_lat_show,
|
|
+ .store = queue_wb_lat_store,
|
|
+};
|
|
+
|
|
+static struct queue_sysfs_entry queue_wb_win_entry = {
|
|
+ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
|
|
+ .show = queue_wb_win_show,
|
|
+ .store = queue_wb_win_store,
|
|
+};
|
|
+
|
|
static struct attribute *default_attrs[] = {
|
|
&queue_requests_entry.attr,
|
|
&queue_ra_entry.attr,
|
|
@@ -553,6 +656,9 @@ static struct attribute *default_attrs[] = {
|
|
&queue_poll_entry.attr,
|
|
&queue_wc_entry.attr,
|
|
&queue_dax_entry.attr,
|
|
+ &queue_stats_entry.attr,
|
|
+ &queue_wb_lat_entry.attr,
|
|
+ &queue_wb_win_entry.attr,
|
|
NULL,
|
|
};
|
|
|
|
@@ -667,6 +773,58 @@ struct kobj_type blk_queue_ktype = {
|
|
.release = blk_release_queue,
|
|
};
|
|
|
|
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
|
|
+{
|
|
+ blk_queue_stat_get(data, stat);
|
|
+}
|
|
+
|
|
+static void blk_wb_stat_clear(void *data)
|
|
+{
|
|
+ blk_stat_clear(data);
|
|
+}
|
|
+
|
|
+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
|
|
+{
|
|
+ return blk_stat_is_current(stat);
|
|
+}
|
|
+
|
|
+static struct wb_stat_ops wb_stat_ops = {
|
|
+ .get = blk_wb_stat_get,
|
|
+ .is_current = blk_wb_stat_is_current,
|
|
+ .clear = blk_wb_stat_clear,
|
|
+};
|
|
+
|
|
+static void blk_wb_init(struct request_queue *q)
|
|
+{
|
|
+ struct rq_wb *rwb;
|
|
+
|
|
+#ifndef CONFIG_BLK_WBT_MQ
|
|
+ if (q->mq_ops)
|
|
+ return;
|
|
+#endif
|
|
+#ifndef CONFIG_BLK_WBT_SQ
|
|
+ if (q->request_fn)
|
|
+ return;
|
|
+#endif
|
|
+
|
|
+ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
|
|
+
|
|
+ /*
|
|
+ * If this fails, we don't get throttling
|
|
+ */
|
|
+ if (IS_ERR_OR_NULL(rwb))
|
|
+ return;
|
|
+
|
|
+ if (blk_queue_nonrot(q))
|
|
+ rwb->min_lat_nsec = 2000000ULL;
|
|
+ else
|
|
+ rwb->min_lat_nsec = 75000000ULL;
|
|
+
|
|
+ wbt_set_queue_depth(rwb, blk_queue_depth(q));
|
|
+ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
|
+ q->rq_wb = rwb;
|
|
+}
|
|
+
|
|
int blk_register_queue(struct gendisk *disk)
|
|
{
|
|
int ret;
|
|
@@ -706,6 +864,8 @@ int blk_register_queue(struct gendisk *disk)
|
|
if (q->mq_ops)
|
|
blk_mq_register_dev(dev, q);
|
|
|
|
+ blk_wb_init(q);
|
|
+
|
|
if (!q->request_fn)
|
|
return 0;
|
|
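The wbt_lat_usec and wbt_window_usec attributes above expose the throttling targets in microseconds while the core keeps them in nanoseconds, and blk_wb_init() seeds min_lat_nsec with 2 ms for non-rotational queues and 75 ms for rotational ones. A small userspace-style sketch of the unit round-trip (the written value is only an example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t usec_written = 75000;			/* echo 75000 > wbt_lat_usec */
	uint64_t min_lat_nsec = usec_written * 1000ULL;	/* queue_wb_lat_store() */
	uint64_t usec_shown   = min_lat_nsec / 1000;	/* queue_wb_lat_show() */

	/* 75000 us == 75000000 ns, the default target blk_wb_init() picks for
	 * rotational devices (2000 us / 2000000 ns for non-rotational ones). */
	printf("%llu ns, shown as %llu us\n",
	       (unsigned long long)min_lat_nsec,
	       (unsigned long long)usec_shown);
	return 0;
}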
|
|
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
|
|
new file mode 100644
|
|
index 0000000..82823c7
|
|
--- /dev/null
|
|
+++ b/block/blk-wbt.c
|
|
@@ -0,0 +1,704 @@
|
|
+/*
|
|
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
|
|
+ * packets for IO scheduling, so the logic is something like this:
|
|
+ *
|
|
+ * - Monitor latencies in a defined window of time.
|
|
+ * - If the minimum latency in the above window exceeds some target, increment
|
|
+ * scaling step and scale down queue depth by a factor of 2x. The monitoring
|
|
+ * window is then shrunk to 100 / sqrt(scaling step + 1).
|
|
+ * - For any window where we don't have solid data on what the latencies
|
|
+ * look like, retain status quo.
|
|
+ * - If latencies look good, decrement scaling step.
|
|
+ * - If we're only doing writes, allow the scaling step to go negative. This
|
|
+ * will temporarily boost write performance, snapping back to a stable
|
|
+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike
|
|
+ * positive scaling steps where we shrink the monitoring window, a negative
|
|
+ * scaling step retains the default step==0 window size.
|
|
+ *
|
|
+ * Copyright (C) 2016 Jens Axboe
|
|
+ *
|
|
+ */
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/blk_types.h>
|
|
+#include <linux/slab.h>
|
|
+#include <linux/backing-dev.h>
|
|
+#include <linux/swap.h>
|
|
+
|
|
+#include "blk-wbt.h"
|
|
+
|
|
+#define CREATE_TRACE_POINTS
|
|
+#include <trace/events/wbt.h>
|
|
+
|
|
+enum {
|
|
+ /*
|
|
+ * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
|
|
+ * from here depending on device stats
|
|
+ */
|
|
+ RWB_DEF_DEPTH = 16,
|
|
+
|
|
+ /*
|
|
+ * 100msec window
|
|
+ */
|
|
+ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
|
|
+
|
|
+ /*
|
|
+ * Disregard stats, if we don't meet this minimum
|
|
+ */
|
|
+ RWB_MIN_WRITE_SAMPLES = 3,
|
|
+
|
|
+ /*
|
|
+ * If we have this number of consecutive windows with not enough
|
|
+ * information to scale up or down, scale up.
|
|
+ */
|
|
+ RWB_UNKNOWN_BUMP = 5,
|
|
+};
|
|
+
|
|
+static inline bool rwb_enabled(struct rq_wb *rwb)
|
|
+{
|
|
+ return rwb && rwb->wb_normal != 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
|
|
+ * false if 'v' + 1 would be bigger than 'below'.
|
|
+ */
|
|
+static bool atomic_inc_below(atomic_t *v, int below)
|
|
+{
|
|
+ int cur = atomic_read(v);
|
|
+
|
|
+ for (;;) {
|
|
+ int old;
|
|
+
|
|
+ if (cur >= below)
|
|
+ return false;
|
|
+ old = atomic_cmpxchg(v, cur, cur + 1);
|
|
+ if (old == cur)
|
|
+ break;
|
|
+ cur = old;
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
|
|
+{
|
|
+ if (rwb_enabled(rwb)) {
|
|
+ const unsigned long cur = jiffies;
|
|
+
|
|
+ if (cur != *var)
|
|
+ *var = cur;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If a task was rate throttled in balance_dirty_pages() within the last
|
|
+ * second or so, use that to indicate a higher cleaning rate.
|
|
+ */
|
|
+static bool wb_recent_wait(struct rq_wb *rwb)
|
|
+{
|
|
+ struct bdi_writeback *wb = &rwb->bdi->wb;
|
|
+
|
|
+ return time_before(jiffies, wb->dirty_sleep + HZ);
|
|
+}
|
|
+
|
|
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
|
|
+{
|
|
+ return &rwb->rq_wait[is_kswapd];
|
|
+}
|
|
+
|
|
+static void rwb_wake_all(struct rq_wb *rwb)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
|
|
+ struct rq_wait *rqw = &rwb->rq_wait[i];
|
|
+
|
|
+ if (waitqueue_active(&rqw->wait))
|
|
+ wake_up_all(&rqw->wait);
|
|
+ }
|
|
+}
|
|
+
|
|
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
|
|
+{
|
|
+ struct rq_wait *rqw;
|
|
+ int inflight, limit;
|
|
+
|
|
+ if (!(wb_acct & WBT_TRACKED))
|
|
+ return;
|
|
+
|
|
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
|
|
+ inflight = atomic_dec_return(&rqw->inflight);
|
|
+
|
|
+ /*
|
|
+ * wbt got disabled with IO in flight. Wake up any potential
|
|
+ * waiters, we don't have to do more than that.
|
|
+ */
|
|
+ if (unlikely(!rwb_enabled(rwb))) {
|
|
+ rwb_wake_all(rwb);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the device does write back caching, drop further down
|
|
+ * before we wake people up.
|
|
+ */
|
|
+ if (rwb->wc && !wb_recent_wait(rwb))
|
|
+ limit = 0;
|
|
+ else
|
|
+ limit = rwb->wb_normal;
|
|
+
|
|
+ /*
|
|
+ * Don't wake anyone up if we are above the normal limit.
|
|
+ */
|
|
+ if (inflight && inflight >= limit)
|
|
+ return;
|
|
+
|
|
+ if (waitqueue_active(&rqw->wait)) {
|
|
+ int diff = limit - inflight;
|
|
+
|
|
+ if (!inflight || diff >= rwb->wb_background / 2)
|
|
+ wake_up(&rqw->wait);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Called on completion of a request. Note that it's also called when
|
|
+ * a merged request gets freed.
|
|
+ */
|
|
+void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+ if (!rwb)
|
|
+ return;
|
|
+
|
|
+ if (!wbt_is_tracked(stat)) {
|
|
+ if (rwb->sync_cookie == stat) {
|
|
+ rwb->sync_issue = 0;
|
|
+ rwb->sync_cookie = NULL;
|
|
+ }
|
|
+
|
|
+ if (wbt_is_read(stat))
|
|
+ wb_timestamp(rwb, &rwb->last_comp);
|
|
+ wbt_clear_state(stat);
|
|
+ } else {
|
|
+ WARN_ON_ONCE(stat == rwb->sync_cookie);
|
|
+ __wbt_done(rwb, wbt_stat_to_mask(stat));
|
|
+ wbt_clear_state(stat);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return true, if we can't increase the depth further by scaling
|
|
+ */
|
|
+static bool calc_wb_limits(struct rq_wb *rwb)
|
|
+{
|
|
+ unsigned int depth;
|
|
+ bool ret = false;
|
|
+
|
|
+ if (!rwb->min_lat_nsec) {
|
|
+ rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * For QD=1 devices, this is a special case. It's important for those
|
|
+ * to have one request ready when one completes, so force a depth of
|
|
+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
|
|
+ * since the device can't have more than that in flight. If we're
|
|
+ * scaling down, then keep a setting of 1/1/1.
|
|
+ */
|
|
+ if (rwb->queue_depth == 1) {
|
|
+ if (rwb->scale_step > 0)
|
|
+ rwb->wb_max = rwb->wb_normal = 1;
|
|
+ else {
|
|
+ rwb->wb_max = rwb->wb_normal = 2;
|
|
+ ret = true;
|
|
+ }
|
|
+ rwb->wb_background = 1;
|
|
+ } else {
|
|
+ /*
|
|
+ * scale_step == 0 is our default state. If we have suffered
|
|
+ * latency spikes, step will be > 0, and we shrink the
|
|
+ * allowed write depths. If step is < 0, we're only doing
|
|
+ * writes, and we allow a temporarily higher depth to
|
|
+ * increase performance.
|
|
+ */
|
|
+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
|
|
+ if (rwb->scale_step > 0)
|
|
+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
|
|
+ else if (rwb->scale_step < 0) {
|
|
+ unsigned int maxd = 3 * rwb->queue_depth / 4;
|
|
+
|
|
+ depth = 1 + ((depth - 1) << -rwb->scale_step);
|
|
+ if (depth > maxd) {
|
|
+ depth = maxd;
|
|
+ ret = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Set our max/normal/bg queue depths based on how far
|
|
+ * we have scaled down (->scale_step).
|
|
+ */
|
|
+ rwb->wb_max = depth;
|
|
+ rwb->wb_normal = (rwb->wb_max + 1) / 2;
|
|
+ rwb->wb_background = (rwb->wb_max + 3) / 4;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline bool stat_sample_valid(struct blk_rq_stat *stat)
|
|
+{
|
|
+ /*
|
|
+ * We need at least one read sample, and a minimum of
|
|
+ * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
|
|
+ * that it's writes impacting us, and not just some sole read on
|
|
+ * a device that is in a lower power state.
|
|
+ */
|
|
+ return stat[0].nr_samples >= 1 &&
|
|
+ stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
|
|
+}
|
|
+
|
|
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
|
|
+{
|
|
+ u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
|
|
+
|
|
+ if (!issue || !rwb->sync_cookie)
|
|
+ return 0;
|
|
+
|
|
+ now = ktime_to_ns(ktime_get());
|
|
+ return now - issue;
|
|
+}
|
|
+
|
|
+enum {
|
|
+ LAT_OK = 1,
|
|
+ LAT_UNKNOWN,
|
|
+ LAT_UNKNOWN_WRITES,
|
|
+ LAT_EXCEEDED,
|
|
+};
|
|
+
|
|
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
|
|
+{
|
|
+ u64 thislat;
|
|
+
|
|
+ /*
|
|
+ * If our stored sync issue exceeds the window size, or it
|
|
+ * exceeds our min target AND we haven't logged any entries,
|
|
+ * flag the latency as exceeded. wbt works off completion latencies,
|
|
+ * but for a flooded device, a single sync IO can take a long time
|
|
+ * to complete after being issued. If this time exceeds our
|
|
+ * monitoring window AND we didn't see any other completions in that
|
|
+ * window, then count that sync IO as a violation of the latency.
|
|
+ */
|
|
+ thislat = rwb_sync_issue_lat(rwb);
|
|
+ if (thislat > rwb->cur_win_nsec ||
|
|
+ (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
|
|
+ trace_wbt_lat(rwb->bdi, thislat);
|
|
+ return LAT_EXCEEDED;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * No read/write mix, if stat isn't valid
|
|
+ */
|
|
+ if (!stat_sample_valid(stat)) {
|
|
+ /*
|
|
+ * If we had writes in this stat window and the window is
|
|
+ * current, we're only doing writes. If a task recently
|
|
+ * waited or still has writes in flight, consider us doing
|
|
+ * just writes as well.
|
|
+ */
|
|
+ if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
|
|
+ wb_recent_wait(rwb) || wbt_inflight(rwb))
|
|
+ return LAT_UNKNOWN_WRITES;
|
|
+ return LAT_UNKNOWN;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the 'min' latency exceeds our target, step down.
|
|
+ */
|
|
+ if (stat[0].min > rwb->min_lat_nsec) {
|
|
+ trace_wbt_lat(rwb->bdi, stat[0].min);
|
|
+ trace_wbt_stat(rwb->bdi, stat);
|
|
+ return LAT_EXCEEDED;
|
|
+ }
|
|
+
|
|
+ if (rwb->scale_step)
|
|
+ trace_wbt_stat(rwb->bdi, stat);
|
|
+
|
|
+ return LAT_OK;
|
|
+}
|
|
+
|
|
+static int latency_exceeded(struct rq_wb *rwb)
|
|
+{
|
|
+ struct blk_rq_stat stat[2];
|
|
+
|
|
+ rwb->stat_ops->get(rwb->ops_data, stat);
|
|
+ return __latency_exceeded(rwb, stat);
|
|
+}
|
|
+
|
|
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
|
|
+{
|
|
+ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
|
|
+ rwb->wb_background, rwb->wb_normal, rwb->wb_max);
|
|
+}
|
|
+
|
|
+static void scale_up(struct rq_wb *rwb)
|
|
+{
|
|
+ /*
|
|
+ * Hit max in previous round, stop here
|
|
+ */
|
|
+ if (rwb->scaled_max)
|
|
+ return;
|
|
+
|
|
+ rwb->scale_step--;
|
|
+ rwb->unknown_cnt = 0;
|
|
+ rwb->stat_ops->clear(rwb->ops_data);
|
|
+
|
|
+ rwb->scaled_max = calc_wb_limits(rwb);
|
|
+
|
|
+ rwb_wake_all(rwb);
|
|
+
|
|
+ rwb_trace_step(rwb, "step up");
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
|
|
+ * had a latency violation.
|
|
+ */
|
|
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
|
|
+{
|
|
+ /*
|
|
+ * Stop scaling down when we've hit the limit. This also prevents
|
|
+ * ->scale_step from going to crazy values, if the device can't
|
|
+ * keep up.
|
|
+ */
|
|
+ if (rwb->wb_max == 1)
|
|
+ return;
|
|
+
|
|
+ if (rwb->scale_step < 0 && hard_throttle)
|
|
+ rwb->scale_step = 0;
|
|
+ else
|
|
+ rwb->scale_step++;
|
|
+
|
|
+ rwb->scaled_max = false;
|
|
+ rwb->unknown_cnt = 0;
|
|
+ rwb->stat_ops->clear(rwb->ops_data);
|
|
+ calc_wb_limits(rwb);
|
|
+ rwb_trace_step(rwb, "step down");
|
|
+}
|
|
+
|
|
+static void rwb_arm_timer(struct rq_wb *rwb)
|
|
+{
|
|
+ unsigned long expires;
|
|
+
|
|
+ if (rwb->scale_step > 0) {
|
|
+ /*
|
|
+ * We should speed this up, using some variant of a fast
|
|
+ * integer inverse square root calculation. Since we only do
|
|
+ * this for every window expiration, it's not a huge deal,
|
|
+ * though.
|
|
+ */
|
|
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
|
|
+ int_sqrt((rwb->scale_step + 1) << 8));
|
|
+ } else {
|
|
+ /*
|
|
+ * For step < 0, we don't want to increase/decrease the
|
|
+ * window size.
|
|
+ */
|
|
+ rwb->cur_win_nsec = rwb->win_nsec;
|
|
+ }
|
|
+
|
|
+ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
|
|
+ mod_timer(&rwb->window_timer, expires);
|
|
+}
|
|
+
|
|
+static void wb_timer_fn(unsigned long data)
|
|
+{
|
|
+ struct rq_wb *rwb = (struct rq_wb *) data;
|
|
+ unsigned int inflight = wbt_inflight(rwb);
|
|
+ int status;
|
|
+
|
|
+ status = latency_exceeded(rwb);
|
|
+
|
|
+ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
|
|
+
|
|
+ /*
|
|
+ * If we exceeded the latency target, step down. If we did not,
|
|
+ * step one level up. If we don't know enough to say either exceeded
|
|
+ * or ok, then don't do anything.
|
|
+ */
|
|
+ switch (status) {
|
|
+ case LAT_EXCEEDED:
|
|
+ scale_down(rwb, true);
|
|
+ break;
|
|
+ case LAT_OK:
|
|
+ scale_up(rwb);
|
|
+ break;
|
|
+ case LAT_UNKNOWN_WRITES:
|
|
+ scale_up(rwb);
|
|
+ break;
|
|
+ case LAT_UNKNOWN:
|
|
+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
|
|
+ break;
|
|
+ /*
|
|
+ * We get here for two reasons:
|
|
+ *
|
|
+ * 1) We previously scaled down the depth, and we currently
|
|
+ * don't have a valid read/write sample. For that case,
|
|
+ * slowly return to center state (step == 0).
|
|
+ * 2) We started at the center step, but don't have a valid
|
|
+ * read/write sample, but we do have writes going on.
|
|
+ * Allow step to go negative, to increase write perf.
|
|
+ */
|
|
+ if (rwb->scale_step > 0)
|
|
+ scale_up(rwb);
|
|
+ else if (rwb->scale_step < 0)
|
|
+ scale_down(rwb, false);
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Re-arm timer, if we have IO in flight
|
|
+ */
|
|
+ if (rwb->scale_step || inflight)
|
|
+ rwb_arm_timer(rwb);
|
|
+}
|
|
+
|
|
+void wbt_update_limits(struct rq_wb *rwb)
|
|
+{
|
|
+ rwb->scale_step = 0;
|
|
+ rwb->scaled_max = false;
|
|
+ calc_wb_limits(rwb);
|
|
+
|
|
+ rwb_wake_all(rwb);
|
|
+}
|
|
+
|
|
+static bool close_io(struct rq_wb *rwb)
|
|
+{
|
|
+ const unsigned long now = jiffies;
|
|
+
|
|
+ return time_before(now, rwb->last_issue + HZ / 10) ||
|
|
+ time_before(now, rwb->last_comp + HZ / 10);
|
|
+}
|
|
+
|
|
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
|
|
+
|
|
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
|
|
+{
|
|
+ unsigned int limit;
|
|
+
|
|
+ /*
|
|
+ * At this point we know it's a buffered write. If this is
|
|
+ * kswapd trying to free memory, or REQ_SYNC is set, then
|
|
+ * it's WB_SYNC_ALL writeback, and we'll use the max limit for
|
|
+ * that. If the write is marked as a background write, then use
|
|
+ * the idle limit, or go to normal if we haven't had competing
|
|
+ * IO for a bit.
|
|
+ */
|
|
+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
|
|
+ limit = rwb->wb_max;
|
|
+ else if ((rw & REQ_BG) || close_io(rwb)) {
|
|
+ /*
|
|
+ * If less than 100ms since we completed unrelated IO,
|
|
+ * limit us to half the depth for background writeback.
|
|
+ */
|
|
+ limit = rwb->wb_background;
|
|
+ } else
|
|
+ limit = rwb->wb_normal;
|
|
+
|
|
+ return limit;
|
|
+}
|
|
+
|
|
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
|
|
+ unsigned long rw)
|
|
+{
|
|
+ /*
|
|
+ * inc it here even if disabled, since we'll dec it at completion.
|
|
+ * this only happens if the task was sleeping in __wbt_wait(),
|
|
+ * and someone turned it off at the same time.
|
|
+ */
|
|
+ if (!rwb_enabled(rwb)) {
|
|
+ atomic_inc(&rqw->inflight);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Block if we will exceed our limit, or if we are currently waiting for
|
|
+ * the timer to kick off queuing again.
|
|
+ */
|
|
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
|
|
+{
|
|
+ struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
|
|
+ DEFINE_WAIT(wait);
|
|
+
|
|
+ if (may_queue(rwb, rqw, rw))
|
|
+ return;
|
|
+
|
|
+ do {
|
|
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+
|
|
+ if (may_queue(rwb, rqw, rw))
|
|
+ break;
|
|
+
|
|
+ if (lock)
|
|
+ spin_unlock_irq(lock);
|
|
+
|
|
+ io_schedule();
|
|
+
|
|
+ if (lock)
|
|
+ spin_lock_irq(lock);
|
|
+ } while (1);
|
|
+
|
|
+ finish_wait(&rqw->wait, &wait);
|
|
+}
|
|
+
|
|
+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
|
|
+{
|
|
+ const int op = rw >> BIO_OP_SHIFT;
|
|
+
|
|
+ /*
|
|
+ * If not a WRITE (or a discard), do nothing
|
|
+ */
|
|
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * Don't throttle WRITE_ODIRECT
|
|
+ */
|
|
+ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns the wbt_flags the IO should be accounted with, or 0 if it is
+ * not tracked.
|
|
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
|
|
+ * in an irq held spinlock, if it holds one when calling this function.
|
|
+ * If we do sleep, we'll release and re-grab it.
|
|
+ */
|
|
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
|
|
+{
|
|
+ unsigned int ret = 0;
|
|
+
|
|
+ if (!rwb_enabled(rwb))
|
|
+ return 0;
|
|
+
|
|
+ if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
|
|
+ ret = WBT_READ;
|
|
+
|
|
+ if (!wbt_should_throttle(rwb, rw)) {
|
|
+ if (ret & WBT_READ)
|
|
+ wb_timestamp(rwb, &rwb->last_issue);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ __wbt_wait(rwb, rw, lock);
|
|
+
|
|
+ if (!timer_pending(&rwb->window_timer))
|
|
+ rwb_arm_timer(rwb);
|
|
+
|
|
+ if (current_is_kswapd())
|
|
+ ret |= WBT_KSWAPD;
|
|
+
|
|
+ return ret | WBT_TRACKED;
|
|
+}
|
|
+
|
|
+void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+ if (!rwb_enabled(rwb))
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Track sync issue, in case it takes a long time to complete. Allows
|
|
+ * us to react quicker, if a sync IO takes a long time to complete.
|
|
+ * Note that this is just a hint. 'stat' can go away when the
|
|
+ * request completes, so it's important we never dereference it. We
|
|
+ * only use the address to compare with, which is why we store the
|
|
+ * sync_issue time locally.
|
|
+ */
|
|
+ if (wbt_is_read(stat) && !rwb->sync_issue) {
|
|
+ rwb->sync_cookie = stat;
|
|
+ rwb->sync_issue = blk_stat_time(stat);
|
|
+ }
|
|
+}
|
|
+
|
|
+void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+ if (!rwb_enabled(rwb))
|
|
+ return;
|
|
+ if (stat == rwb->sync_cookie) {
|
|
+ rwb->sync_issue = 0;
|
|
+ rwb->sync_cookie = NULL;
|
|
+ }
|
|
+}
|
|
+
|
|
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
|
|
+{
|
|
+ if (rwb) {
|
|
+ rwb->queue_depth = depth;
|
|
+ wbt_update_limits(rwb);
|
|
+ }
|
|
+}
|
|
+
|
|
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
|
|
+{
|
|
+ if (rwb)
|
|
+ rwb->wc = write_cache_on;
|
|
+}
|
|
+
|
|
+void wbt_disable(struct rq_wb *rwb)
|
|
+{
|
|
+ if (rwb) {
|
|
+ del_timer_sync(&rwb->window_timer);
|
|
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
|
|
+ wbt_update_limits(rwb);
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(wbt_disable);
|
|
+
|
|
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
|
|
+ void *ops_data)
|
|
+{
|
|
+ struct rq_wb *rwb;
|
|
+ int i;
|
|
+
|
|
+ BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
|
|
+
|
|
+ if (!ops->get || !ops->is_current || !ops->clear)
|
|
+ return ERR_PTR(-EINVAL);
|
|
+
|
|
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
|
|
+ if (!rwb)
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+
|
|
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
|
|
+ atomic_set(&rwb->rq_wait[i].inflight, 0);
|
|
+ init_waitqueue_head(&rwb->rq_wait[i].wait);
|
|
+ }
|
|
+
|
|
+ setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
|
|
+ rwb->wc = 1;
|
|
+ rwb->queue_depth = RWB_DEF_DEPTH;
|
|
+ rwb->last_comp = rwb->last_issue = jiffies;
|
|
+ rwb->bdi = bdi;
|
|
+ rwb->win_nsec = RWB_WINDOW_NSEC;
|
|
+ rwb->stat_ops = ops;
|
|
+ rwb->ops_data = ops_data;
|
|
+ wbt_update_limits(rwb);
|
|
+ return rwb;
|
|
+}
|
|
+
|
|
+void wbt_exit(struct rq_wb *rwb)
|
|
+{
|
|
+ if (rwb) {
|
|
+ del_timer_sync(&rwb->window_timer);
|
|
+ kfree(rwb);
|
|
+ }
|
|
+}
|
|
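The header comment and rwb_arm_timer() above shrink the monitoring window to roughly 100/sqrt(scale_step + 1) percent of the default by computing (win << 4) / int_sqrt((step + 1) << 8). A worked userspace-style example of that arithmetic (sqrt() stands in for the kernel's int_sqrt()):

#include <stdint.h>
#include <stdio.h>
#include <math.h>

int main(void)
{
	uint64_t win_nsec = 100ULL * 1000 * 1000;	/* RWB_WINDOW_NSEC: 100 ms */

	for (int step = 0; step <= 3; step++) {
		/* Fixed-point equivalent of win * 100 / sqrt(step + 1) percent. */
		uint64_t isqrt = (uint64_t)sqrt((double)((step + 1) << 8));
		uint64_t cur = (win_nsec << 4) / isqrt;

		printf("step %d -> window %llu ms\n", step,
		       (unsigned long long)(cur / 1000000));
	}
	/* step 0 -> 100 ms, step 1 -> ~72 ms, step 2 -> ~59 ms, step 3 -> 50 ms */
	return 0;
}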
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
|
|
new file mode 100644
|
|
index 0000000..7af8968
|
|
--- /dev/null
|
|
+++ b/block/blk-wbt.h
|
|
@@ -0,0 +1,166 @@
|
|
+#ifndef WB_THROTTLE_H
|
|
+#define WB_THROTTLE_H
|
|
+
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/atomic.h>
|
|
+#include <linux/wait.h>
|
|
+#include <linux/timer.h>
|
|
+#include <linux/ktime.h>
|
|
+
|
|
+#include "blk-stat.h"
|
|
+
|
|
+enum wbt_flags {
|
|
+ WBT_TRACKED = 1, /* write, tracked for throttling */
|
|
+ WBT_READ = 2, /* read */
|
|
+ WBT_KSWAPD = 4, /* write, from kswapd */
|
|
+
|
|
+ WBT_NR_BITS = 3, /* number of bits */
|
|
+};
|
|
+
|
|
+enum {
|
|
+ WBT_NUM_RWQ = 2,
|
|
+};
|
|
+
|
|
+static inline void wbt_clear_state(struct blk_issue_stat *stat)
|
|
+{
|
|
+ stat->time &= BLK_STAT_TIME_MASK;
|
|
+}
|
|
+
|
|
+static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
|
|
+{
|
|
+ return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
|
|
+}
|
|
+
|
|
+static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
|
|
+{
|
|
+ stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
|
|
+}
|
|
+
|
|
+static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
|
|
+{
|
|
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
|
|
+}
|
|
+
|
|
+static inline bool wbt_is_read(struct blk_issue_stat *stat)
|
|
+{
|
|
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
|
|
+}
|
|
+
|
|
+struct wb_stat_ops {
|
|
+ void (*get)(void *, struct blk_rq_stat *);
|
|
+ bool (*is_current)(struct blk_rq_stat *);
|
|
+ void (*clear)(void *);
|
|
+};
|
|
+
|
|
+struct rq_wait {
|
|
+ wait_queue_head_t wait;
|
|
+ atomic_t inflight;
|
|
+};
|
|
+
|
|
+struct rq_wb {
|
|
+ /*
|
|
+ * Settings that govern how we throttle
|
|
+ */
|
|
+ unsigned int wb_background; /* background writeback */
|
|
+ unsigned int wb_normal; /* normal writeback */
|
|
+ unsigned int wb_max; /* max throughput writeback */
|
|
+ int scale_step;
|
|
+ bool scaled_max;
|
|
+
|
|
+ /*
|
|
+ * Number of consecutive periods where we don't have enough
|
|
+ * information to make a firm scale up/down decision.
|
|
+ */
|
|
+ unsigned int unknown_cnt;
|
|
+
|
|
+ u64 win_nsec; /* default window size */
|
|
+ u64 cur_win_nsec; /* current window size */
|
|
+
|
|
+ struct timer_list window_timer;
|
|
+
|
|
+ s64 sync_issue;
|
|
+ void *sync_cookie;
|
|
+
|
|
+ unsigned int wc;
|
|
+ unsigned int queue_depth;
|
|
+
|
|
+ unsigned long last_issue; /* last non-throttled issue */
|
|
+ unsigned long last_comp; /* last non-throttled comp */
|
|
+ unsigned long min_lat_nsec;
|
|
+ struct backing_dev_info *bdi;
|
|
+ struct rq_wait rq_wait[WBT_NUM_RWQ];
|
|
+
|
|
+ struct wb_stat_ops *stat_ops;
|
|
+ void *ops_data;
|
|
+};
|
|
+
|
|
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
|
|
+{
|
|
+ unsigned int i, ret = 0;
|
|
+
|
|
+ for (i = 0; i < WBT_NUM_RWQ; i++)
|
|
+ ret += atomic_read(&rwb->rq_wait[i].inflight);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct backing_dev_info;
|
|
+
|
|
+#ifdef CONFIG_BLK_WBT
|
|
+
|
|
+void __wbt_done(struct rq_wb *, enum wbt_flags);
|
|
+void wbt_done(struct rq_wb *, struct blk_issue_stat *);
|
|
+enum wbt_flags wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
|
|
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
|
|
+void wbt_exit(struct rq_wb *);
|
|
+void wbt_update_limits(struct rq_wb *);
|
|
+void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
|
|
+void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
|
|
+void wbt_disable(struct rq_wb *);
|
|
+
|
|
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
|
|
+void wbt_set_write_cache(struct rq_wb *, bool);
|
|
+
|
|
+#else
|
|
+
|
|
+static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
|
|
+{
|
|
+}
|
|
+static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+}
|
|
+static inline enum wbt_flags wbt_wait(struct rq_wb *rwb,
|
|
+ unsigned int rw, spinlock_t *lock)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+static inline struct rq_wb *wbt_init(struct backing_dev_info *bdi,
|
|
+ struct wb_stat_ops *ops, void *ops_data)
|
|
+{
|
|
+ return ERR_PTR(-EINVAL);
|
|
+}
|
|
+static inline void wbt_exit(struct rq_wb *rbw)
|
|
+{
|
|
+}
|
|
+static inline void wbt_update_limits(struct rq_wb *rwb)
|
|
+{
|
|
+}
|
|
+static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+}
|
|
+static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
|
|
+{
|
|
+}
|
|
+static inline void wbt_disable(struct rq_wb *rwb)
|
|
+{
|
|
+}
|
|
+static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
|
|
+{
|
|
+}
|
|
+static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
|
|
+{
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_BLK_WBT */
|
|
+
|
|
+#endif
|
|
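The wb_background/wb_normal/wb_max fields above are derived by calc_wb_limits() in blk-wbt.c above: the scaled depth becomes wb_max, roughly half of it wb_normal and a quarter wb_background. A worked example for a device whose queue depth is at least RWB_DEF_DEPTH (illustrative, userspace-style):

#include <stdio.h>

int main(void)
{
	unsigned int base = 16;		/* min(RWB_DEF_DEPTH, queue_depth) */

	for (int step = 0; step <= 3; step++) {
		/* Positive scale_step branch of calc_wb_limits(). */
		unsigned int max = 1 + ((base - 1) >> step);
		unsigned int normal = (max + 1) / 2;
		unsigned int background = (max + 3) / 4;

		printf("step %d: max=%u normal=%u background=%u\n",
		       step, max, normal, background);
	}
	/* step 0: 16/8/4, step 1: 8/4/2, step 2: 4/2/1, step 3: 2/1/1 */
	return 0;
}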
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
|
|
index 3ab6807..1be81bb 100644
|
|
--- a/block/cfq-iosched.c
|
|
+++ b/block/cfq-iosched.c
|
|
@@ -16,6 +16,7 @@
|
|
#include <linux/blktrace_api.h>
|
|
#include <linux/blk-cgroup.h>
|
|
#include "blk.h"
|
|
+#include "blk-wbt.h"
|
|
|
|
/*
|
|
* tunables
|
|
@@ -3771,9 +3772,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
|
|
struct cfq_data *cfqd = cic_to_cfqd(cic);
|
|
struct cfq_queue *cfqq;
|
|
uint64_t serial_nr;
|
|
+ bool nonroot_cg;
|
|
|
|
rcu_read_lock();
|
|
serial_nr = bio_blkcg(bio)->css.serial_nr;
|
|
+ nonroot_cg = bio_blkcg(bio) != &blkcg_root;
|
|
rcu_read_unlock();
|
|
|
|
/*
|
|
@@ -3784,6 +3787,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
|
|
return;
|
|
|
|
/*
|
|
+ * If we have a non-root cgroup, we can depend on that to
|
|
+ * do proper throttling of writes. Turn off wbt for that
|
|
+ * case.
|
|
+ */
|
|
+ if (nonroot_cg) {
|
|
+ struct request_queue *q = cfqd->queue;
|
|
+
|
|
+ wbt_disable(q->rq_wb);
|
|
+ }
|
|
+
|
|
+ /*
|
|
* Drop reference to queues. New queues will be assigned in new
|
|
* group upon arrival of fresh requests.
|
|
*/
|
|
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
|
|
index b5afd49..7d09955 100644
|
|
--- a/drivers/block/swim.c
|
|
+++ b/drivers/block/swim.c
|
|
@@ -332,7 +332,7 @@ static inline void swim_motor(struct swim __iomem *base,
|
|
if (swim_readbit(base, MOTOR_ON))
|
|
break;
|
|
current->state = TASK_INTERRUPTIBLE;
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
} else if (action == OFF) {
|
|
swim_action(base, MOTOR_OFF);
|
|
@@ -351,7 +351,7 @@ static inline void swim_eject(struct swim __iomem *base)
|
|
if (!swim_readbit(base, DISK_IN))
|
|
break;
|
|
current->state = TASK_INTERRUPTIBLE;
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
swim_select(base, RELAX);
|
|
}
|
|
@@ -375,7 +375,7 @@ static inline int swim_step(struct swim __iomem *base)
|
|
for (wait = 0; wait < HZ; wait++) {
|
|
|
|
current->state = TASK_INTERRUPTIBLE;
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
|
|
swim_select(base, RELAX);
|
|
if (!swim_readbit(base, STEP))
|
|
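The swim.c hunks above and most of the driver hunks that follow are mechanical conversions: schedule_timeout(1) becomes schedule_min_hrtimeout(), and schedule_timeout*(msecs_to_jiffies(ms)) becomes schedule_msec_hrtimeout*((ms)). Those helpers are introduced elsewhere in this patch and are not shown in this excerpt; a rough sketch of what a millisecond variant presumably looks like, assuming it is a thin wrapper around schedule_hrtimeout() (example_msec_hrtimeout() is a hypothetical stand-in and the real helpers may differ):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Rough sketch, assuming the helpers sleep with hrtimer resolution instead
 * of rounding the delay up to whole jiffies as schedule_timeout() does. */
static long example_msec_hrtimeout(long msecs)
{
	ktime_t expires = ms_to_ktime(msecs);

	/* Caller has already set the task state, as in the hunks above and
	 * below; returns 0 on expiry, -EINTR if woken early. The real helpers
	 * presumably translate this back into schedule_timeout()'s
	 * "jiffies remaining" convention. */
	return schedule_hrtimeout(&expires, HRTIMER_MODE_REL);
}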
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
|
|
index 6c867fb..87adc94 100644
|
|
--- a/drivers/bluetooth/hci_qca.c
|
|
+++ b/drivers/bluetooth/hci_qca.c
|
|
@@ -883,7 +883,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate)
|
|
* then host can communicate with new baudrate to controller
|
|
*/
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS));
|
|
+ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS));
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
return 0;
|
|
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
|
|
index fcdd886..a018d18 100644
|
|
--- a/drivers/char/ipmi/ipmi_msghandler.c
|
|
+++ b/drivers/char/ipmi/ipmi_msghandler.c
|
|
@@ -2952,7 +2952,7 @@ static void cleanup_smi_msgs(ipmi_smi_t intf)
|
|
/* Current message first, to preserve order */
|
|
while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) {
|
|
/* Wait for the message to clear out. */
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
|
|
/* No need for locks, the interface is down. */
|
|
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
|
|
index 5673fff..8bb6345 100644
|
|
--- a/drivers/char/ipmi/ipmi_ssif.c
|
|
+++ b/drivers/char/ipmi/ipmi_ssif.c
|
|
@@ -1190,7 +1190,7 @@ static int ssif_remove(struct i2c_client *client)
|
|
|
|
/* make sure the driver is not looking for flags any more. */
|
|
while (ssif_info->ssif_state != SSIF_NORMAL)
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
|
|
ssif_info->stopping = true;
|
|
del_timer_sync(&ssif_info->retry_timer);
|
|
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
|
|
index 10e5632..b7a8197 100644
|
|
--- a/drivers/char/snsc.c
|
|
+++ b/drivers/char/snsc.c
|
|
@@ -198,7 +198,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos)
|
|
add_wait_queue(&sd->sd_rq, &wait);
|
|
spin_unlock_irqrestore(&sd->sd_rlock, flags);
|
|
|
|
- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT));
|
|
+ schedule_msec_hrtimeout((SCDRV_TIMEOUT));
|
|
|
|
remove_wait_queue(&sd->sd_rq, &wait);
|
|
if (signal_pending(current)) {
|
|
@@ -294,7 +294,7 @@ scdrv_write(struct file *file, const char __user *buf,
|
|
add_wait_queue(&sd->sd_wq, &wait);
|
|
spin_unlock_irqrestore(&sd->sd_wlock, flags);
|
|
|
|
- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT));
|
|
+ schedule_msec_hrtimeout((SCDRV_TIMEOUT));
|
|
|
|
remove_wait_queue(&sd->sd_wq, &wait);
|
|
if (signal_pending(current)) {
|
|
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
|
|
index b6a0806..b5b02cf 100644
|
|
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
|
|
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
|
|
@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv,
|
|
DRM_ERROR("SVGA device lockup.\n");
|
|
break;
|
|
}
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
if (interruptible && signal_pending(current)) {
|
|
ret = -ERESTARTSYS;
|
|
break;
|
|
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
|
|
index 0c7e172..4c1555c 100644
|
|
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
|
|
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
|
|
@@ -156,7 +156,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv,
|
|
break;
|
|
}
|
|
if (lazy)
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
else if ((++count & 0x0F) == 0) {
|
|
/**
|
|
* FIXME: Use schedule_hr_timeout here for
|
|
diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c
|
|
index 15aa49d..991e8a7 100644
|
|
--- a/drivers/hwmon/fam15h_power.c
|
|
+++ b/drivers/hwmon/fam15h_power.c
|
|
@@ -238,7 +238,7 @@ static ssize_t acc_show_power(struct device *dev,
|
|
prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu];
|
|
}
|
|
|
|
- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period));
|
|
+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period));
|
|
if (leftover)
|
|
return 0;
|
|
|
|
diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c
|
|
index 04598ae..a8c095d 100644
|
|
--- a/drivers/iio/light/tsl2563.c
|
|
+++ b/drivers/iio/light/tsl2563.c
|
|
@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip)
|
|
default:
|
|
delay = 402;
|
|
}
|
|
- /*
|
|
- * TODO: Make sure that we wait at least required delay but why we
|
|
- * have to extend it one tick more?
|
|
- */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2);
|
|
+ schedule_msec_hrtimeout_interruptible(delay + 1);
|
|
}
|
|
|
|
static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc)
|
|
diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c
|
|
index 503b7c4..d790a90 100644
|
|
--- a/drivers/media/i2c/msp3400-driver.c
|
|
+++ b/drivers/media/i2c/msp3400-driver.c
|
|
@@ -184,7 +184,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr)
|
|
break;
|
|
v4l_warn(client, "I/O error #%d (read 0x%02x/0x%02x)\n", err,
|
|
dev, addr);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_interruptible((10));
|
|
}
|
|
if (err == 3) {
|
|
v4l_warn(client, "resetting chip, sound will go off.\n");
|
|
@@ -225,7 +225,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val)
|
|
break;
|
|
v4l_warn(client, "I/O error #%d (write 0x%02x/0x%02x)\n", err,
|
|
dev, addr);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_interruptible((10));
|
|
}
|
|
if (err == 3) {
|
|
v4l_warn(client, "resetting chip, sound will go off.\n");
|
|
diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c
|
|
index 38dc6b8..3cd3098 100644
|
|
--- a/drivers/media/pci/cx18/cx18-gpio.c
|
|
+++ b/drivers/media/pci/cx18/cx18-gpio.c
|
|
@@ -95,11 +95,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi,
|
|
|
|
/* Assert */
|
|
gpio_update(cx, mask, ~active_lo);
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs));
|
|
+ schedule_msec_hrtimeout_uninterruptible((assert_msecs));
|
|
|
|
/* Deassert */
|
|
gpio_update(cx, mask, ~active_hi);
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs));
|
|
+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs));
|
|
}
|
|
|
|
/*
|
|
diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c
|
|
index f752f39..23372af 100644
|
|
--- a/drivers/media/pci/ivtv/ivtv-gpio.c
|
|
+++ b/drivers/media/pci/ivtv/ivtv-gpio.c
|
|
@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv)
|
|
curout = (curout & ~0xF) | 1;
|
|
write_reg(curout, IVTV_REG_GPIO_OUT);
|
|
/* We could use something else for smaller time */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_interruptible((1));
|
|
curout |= 2;
|
|
write_reg(curout, IVTV_REG_GPIO_OUT);
|
|
curdir &= ~0x80;
|
|
@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value)
|
|
curout = read_reg(IVTV_REG_GPIO_OUT);
|
|
curout &= ~(1 << itv->card->xceive_pin);
|
|
write_reg(curout, IVTV_REG_GPIO_OUT);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_interruptible((1));
|
|
|
|
curout |= 1 << itv->card->xceive_pin;
|
|
write_reg(curout, IVTV_REG_GPIO_OUT);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_interruptible((1));
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c
|
|
index 2dc4b20..8e061cf 100644
|
|
--- a/drivers/media/pci/ivtv/ivtv-ioctl.c
|
|
+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c
|
|
@@ -1151,7 +1151,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std)
|
|
TASK_UNINTERRUPTIBLE);
|
|
if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100)
|
|
break;
|
|
- schedule_timeout(msecs_to_jiffies(25));
|
|
+ schedule_msec_hrtimeout((25));
|
|
}
|
|
finish_wait(&itv->vsync_waitq, &wait);
|
|
mutex_lock(&itv->serialize_lock);
|
|
diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c
|
|
index d27c6df..e9ffc4e 100644
|
|
--- a/drivers/media/pci/ivtv/ivtv-streams.c
|
|
+++ b/drivers/media/pci/ivtv/ivtv-streams.c
|
|
@@ -834,7 +834,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end)
|
|
while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) &&
|
|
time_before(jiffies,
|
|
then + msecs_to_jiffies(2000))) {
|
|
- schedule_timeout(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout((10));
|
|
}
|
|
|
|
/* To convert jiffies to ms, we must multiply by 1000
|
|
diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c
|
|
index c2927fd..bdee269 100644
|
|
--- a/drivers/media/radio/radio-mr800.c
|
|
+++ b/drivers/media/radio/radio-mr800.c
|
|
@@ -382,7 +382,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv,
|
|
retval = -ENODATA;
|
|
break;
|
|
}
|
|
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
|
|
+ if (schedule_msec_hrtimeout_interruptible((10))) {
|
|
retval = -ERESTARTSYS;
|
|
break;
|
|
}
|
|
diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c
|
|
index 83fe7ab..aaae5fa 100644
|
|
--- a/drivers/media/radio/radio-tea5777.c
|
|
+++ b/drivers/media/radio/radio-tea5777.c
|
|
@@ -249,7 +249,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait)
|
|
}
|
|
|
|
if (wait) {
|
|
- if (schedule_timeout_interruptible(msecs_to_jiffies(wait)))
|
|
+ if (schedule_msec_hrtimeout_interruptible((wait)))
|
|
return -ERESTARTSYS;
|
|
}
|
|
|
|
diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c
|
|
index 4dc2067..29f4416 100644
|
|
--- a/drivers/media/radio/tea575x.c
|
|
+++ b/drivers/media/radio/tea575x.c
|
|
@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea,
|
|
for (;;) {
|
|
if (time_after(jiffies, timeout))
|
|
break;
|
|
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
|
|
+ if (schedule_msec_hrtimeout_interruptible((10))) {
|
|
/* some signal arrived, stop search */
|
|
tea->val &= ~TEA575X_BIT_SEARCH;
|
|
snd_tea575x_set_freq(tea);
|
|
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
|
|
index d6fb2e1..7ac951b 100644
|
|
--- a/drivers/mfd/ucb1x00-core.c
|
|
+++ b/drivers/mfd/ucb1x00-core.c
|
|
@@ -253,7 +253,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync)
|
|
break;
|
|
/* yield to other processes */
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
|
|
return UCB_ADC_DAT(val);
|
|
diff --git a/drivers/misc/panel.c b/drivers/misc/panel.c
|
|
index 6030ac5..f0c1a10 100644
|
|
--- a/drivers/misc/panel.c
|
|
+++ b/drivers/misc/panel.c
|
|
@@ -760,7 +760,7 @@ static void long_sleep(int ms)
|
|
if (in_interrupt())
|
|
mdelay(ms);
|
|
else
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(ms));
|
|
+ schedule_msec_hrtimeout_interruptible((ms));
|
|
}
|
|
|
|
/*
|
|
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
|
|
index 128d561..38e68e9 100644
|
|
--- a/drivers/misc/sgi-xp/xpc_channel.c
|
|
+++ b/drivers/misc/sgi-xp/xpc_channel.c
|
|
@@ -837,7 +837,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch)
|
|
|
|
atomic_inc(&ch->n_on_msg_allocate_wq);
|
|
prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE);
|
|
- ret = schedule_timeout(1);
|
|
+ ret = schedule_min_hrtimeout();
|
|
finish_wait(&ch->msg_allocate_wq, &wait);
|
|
atomic_dec(&ch->n_on_msg_allocate_wq);
|
|
|
|
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
|
|
index ddabce7..67fb5ce 100644
|
|
--- a/drivers/net/caif/caif_hsi.c
|
|
+++ b/drivers/net/caif/caif_hsi.c
|
|
@@ -944,7 +944,7 @@ static void cfhsi_wake_down(struct work_struct *work)
|
|
break;
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
retry--;
|
|
}
|
|
|
|
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
|
|
index 838545c..34f8972 100644
|
|
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
|
|
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
|
|
@@ -250,7 +250,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff)
|
|
} else {
|
|
/* the PCAN-USB needs time to init */
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT));
|
|
+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT));
|
|
}
|
|
|
|
return err;
|
|
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
|
|
index f33460c..93d335d 100644
|
|
--- a/drivers/net/usb/lan78xx.c
|
|
+++ b/drivers/net/usb/lan78xx.c
|
|
@@ -2361,7 +2361,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev)
|
|
while (!skb_queue_empty(&dev->rxq) &&
|
|
!skb_queue_empty(&dev->txq) &&
|
|
!skb_queue_empty(&dev->done)) {
|
|
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
|
|
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
netif_dbg(dev, ifdown, dev->net,
|
|
"waited for %d urb completions\n", temp);
|
|
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
|
|
index d5071e3..7ca0618 100644
|
|
--- a/drivers/net/usb/usbnet.c
|
|
+++ b/drivers/net/usb/usbnet.c
|
|
@@ -769,7 +769,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q)
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
while (!skb_queue_empty(q)) {
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
|
|
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
}
|
|
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
|
|
index 26fc8ec..378d345 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
|
|
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
|
|
@@ -91,7 +91,6 @@ int ath_descdma_setup(struct ath_softc *sc, struct ath_descdma *dd,
|
|
#define ATH_RXBUF 512
|
|
#define ATH_TXBUF 512
|
|
#define ATH_TXBUF_RESERVE 5
|
|
-#define ATH_MAX_QDEPTH (ATH_TXBUF / 4 - ATH_TXBUF_RESERVE)
|
|
#define ATH_TXMAXTRY 13
|
|
#define ATH_MAX_SW_RETRIES 30
|
|
|
|
@@ -145,7 +144,7 @@ int ath_descdma_setup(struct ath_softc *sc, struct ath_descdma *dd,
|
|
#define BAW_WITHIN(_start, _bawsz, _seqno) \
|
|
((((_seqno) - (_start)) & 4095) < (_bawsz))
|
|
|
|
-#define ATH_AN_2_TID(_an, _tidno) (&(_an)->tid[(_tidno)])
|
|
+#define ATH_AN_2_TID(_an, _tidno) ath_node_to_tid(_an, _tidno)
|
|
|
|
#define IS_HT_RATE(rate) (rate & 0x80)
|
|
#define IS_CCK_RATE(rate) ((rate >= 0x18) && (rate <= 0x1e))
|
|
@@ -164,7 +163,6 @@ struct ath_txq {
|
|
spinlock_t axq_lock;
|
|
u32 axq_depth;
|
|
u32 axq_ampdu_depth;
|
|
- bool stopped;
|
|
bool axq_tx_inprogress;
|
|
struct list_head txq_fifo[ATH_TXFIFO_DEPTH];
|
|
u8 txq_headidx;
|
|
@@ -232,7 +230,6 @@ struct ath_buf {
|
|
|
|
struct ath_atx_tid {
|
|
struct list_head list;
|
|
- struct sk_buff_head buf_q;
|
|
struct sk_buff_head retry_q;
|
|
struct ath_node *an;
|
|
struct ath_txq *txq;
|
|
@@ -247,13 +244,13 @@ struct ath_atx_tid {
|
|
s8 bar_index;
|
|
bool active;
|
|
bool clear_ps_filter;
|
|
+ bool has_queued;
|
|
};
|
|
|
|
struct ath_node {
|
|
struct ath_softc *sc;
|
|
struct ieee80211_sta *sta; /* station struct we're part of */
|
|
struct ieee80211_vif *vif; /* interface with which we're associated */
|
|
- struct ath_atx_tid tid[IEEE80211_NUM_TIDS];
|
|
|
|
u16 maxampdu;
|
|
u8 mpdudensity;
|
|
@@ -276,7 +273,6 @@ struct ath_tx_control {
|
|
struct ath_node *an;
|
|
struct ieee80211_sta *sta;
|
|
u8 paprd;
|
|
- bool force_channel;
|
|
};
|
|
|
|
|
|
@@ -293,7 +289,6 @@ struct ath_tx {
|
|
struct ath_descdma txdma;
|
|
struct ath_txq *txq_map[IEEE80211_NUM_ACS];
|
|
struct ath_txq *uapsdq;
|
|
- u32 txq_max_pending[IEEE80211_NUM_ACS];
|
|
u16 max_aggr_framelen[IEEE80211_NUM_ACS][4][32];
|
|
};
|
|
|
|
@@ -421,6 +416,22 @@ struct ath_offchannel {
|
|
int duration;
|
|
};
|
|
|
|
+static inline struct ath_atx_tid *
|
|
+ath_node_to_tid(struct ath_node *an, u8 tidno)
|
|
+{
|
|
+ struct ieee80211_sta *sta = an->sta;
|
|
+ struct ieee80211_vif *vif = an->vif;
|
|
+ struct ieee80211_txq *txq;
|
|
+
|
|
+ BUG_ON(!vif);
|
|
+ if (sta)
|
|
+ txq = sta->txq[tidno % ARRAY_SIZE(sta->txq)];
|
|
+ else
|
|
+ txq = vif->txq;
|
|
+
|
|
+ return (struct ath_atx_tid *) txq->drv_priv;
|
|
+}
|
|
+
|
|
#define case_rtn_string(val) case val: return #val
|
|
|
|
#define ath_for_each_chanctx(_sc, _ctx) \
|
|
@@ -575,7 +586,6 @@ void ath_tx_edma_tasklet(struct ath_softc *sc);
|
|
int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
|
|
u16 tid, u16 *ssn);
|
|
void ath_tx_aggr_stop(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid);
|
|
-void ath_tx_aggr_resume(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid);
|
|
|
|
void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an);
|
|
void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
|
|
@@ -585,6 +595,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
|
|
u16 tids, int nframes,
|
|
enum ieee80211_frame_release_type reason,
|
|
bool more_data);
|
|
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue);
|
|
|
|
/********/
|
|
/* VIFs */
|
|
diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
|
|
index 57e26a6..929dd70 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/channel.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/channel.c
|
|
@@ -1010,7 +1010,6 @@ static void ath_scan_send_probe(struct ath_softc *sc,
|
|
goto error;
|
|
|
|
txctl.txq = sc->tx.txq_map[IEEE80211_AC_VO];
|
|
- txctl.force_channel = true;
|
|
if (ath_tx_start(sc->hw, skb, &txctl))
|
|
goto error;
|
|
|
|
@@ -1133,7 +1132,6 @@ ath_chanctx_send_vif_ps_frame(struct ath_softc *sc, struct ath_vif *avp,
|
|
memset(&txctl, 0, sizeof(txctl));
|
|
txctl.txq = sc->tx.txq_map[IEEE80211_AC_VO];
|
|
txctl.sta = sta;
|
|
- txctl.force_channel = true;
|
|
if (ath_tx_start(sc->hw, skb, &txctl)) {
|
|
ieee80211_free_txskb(sc->hw, skb);
|
|
return false;
|
|
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
|
|
index c56e40f..89a94dd 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/debug.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/debug.c
|
|
@@ -600,7 +600,6 @@ static int read_file_xmit(struct seq_file *file, void *data)
|
|
PR("MPDUs XRetried: ", xretries);
|
|
PR("Aggregates: ", a_aggr);
|
|
PR("AMPDUs Queued HW:", a_queued_hw);
|
|
- PR("AMPDUs Queued SW:", a_queued_sw);
|
|
PR("AMPDUs Completed:", a_completed);
|
|
PR("AMPDUs Retried: ", a_retries);
|
|
PR("AMPDUs XRetried: ", a_xretries);
|
|
@@ -629,8 +628,7 @@ static void print_queue(struct ath_softc *sc, struct ath_txq *txq,
|
|
seq_printf(file, "%s: %d ", "qnum", txq->axq_qnum);
|
|
seq_printf(file, "%s: %2d ", "qdepth", txq->axq_depth);
|
|
seq_printf(file, "%s: %2d ", "ampdu-depth", txq->axq_ampdu_depth);
|
|
- seq_printf(file, "%s: %3d ", "pending", txq->pending_frames);
|
|
- seq_printf(file, "%s: %d\n", "stopped", txq->stopped);
|
|
+ seq_printf(file, "%s: %3d\n", "pending", txq->pending_frames);
|
|
|
|
ath_txq_unlock(sc, txq);
|
|
}
|
|
@@ -1208,7 +1206,6 @@ static const char ath9k_gstrings_stats[][ETH_GSTRING_LEN] = {
|
|
AMKSTR(d_tx_mpdu_xretries),
|
|
AMKSTR(d_tx_aggregates),
|
|
AMKSTR(d_tx_ampdus_queued_hw),
|
|
- AMKSTR(d_tx_ampdus_queued_sw),
|
|
AMKSTR(d_tx_ampdus_completed),
|
|
AMKSTR(d_tx_ampdu_retries),
|
|
AMKSTR(d_tx_ampdu_xretries),
|
|
@@ -1288,7 +1285,6 @@ void ath9k_get_et_stats(struct ieee80211_hw *hw,
|
|
AWDATA(xretries);
|
|
AWDATA(a_aggr);
|
|
AWDATA(a_queued_hw);
|
|
- AWDATA(a_queued_sw);
|
|
AWDATA(a_completed);
|
|
AWDATA(a_retries);
|
|
AWDATA(a_xretries);
|
|
@@ -1346,14 +1342,6 @@ int ath9k_init_debug(struct ath_hw *ah)
|
|
read_file_xmit);
|
|
debugfs_create_devm_seqfile(sc->dev, "queues", sc->debug.debugfs_phy,
|
|
read_file_queues);
|
|
- debugfs_create_u32("qlen_bk", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
|
|
- &sc->tx.txq_max_pending[IEEE80211_AC_BK]);
|
|
- debugfs_create_u32("qlen_be", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
|
|
- &sc->tx.txq_max_pending[IEEE80211_AC_BE]);
|
|
- debugfs_create_u32("qlen_vi", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
|
|
- &sc->tx.txq_max_pending[IEEE80211_AC_VI]);
|
|
- debugfs_create_u32("qlen_vo", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
|
|
- &sc->tx.txq_max_pending[IEEE80211_AC_VO]);
|
|
debugfs_create_devm_seqfile(sc->dev, "misc", sc->debug.debugfs_phy,
|
|
read_file_misc);
|
|
debugfs_create_devm_seqfile(sc->dev, "reset", sc->debug.debugfs_phy,
|
|
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
|
|
index cd68c5f..a078cdd 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/debug.h
|
|
+++ b/drivers/net/wireless/ath/ath9k/debug.h
|
|
@@ -147,7 +147,6 @@ struct ath_interrupt_stats {
|
|
* @completed: Total MPDUs (non-aggr) completed
|
|
* @a_aggr: Total no. of aggregates queued
|
|
* @a_queued_hw: Total AMPDUs queued to hardware
|
|
- * @a_queued_sw: Total AMPDUs queued to software queues
|
|
* @a_completed: Total AMPDUs completed
|
|
* @a_retries: No. of AMPDUs retried (SW)
|
|
* @a_xretries: No. of AMPDUs dropped due to xretries
|
|
@@ -174,7 +173,6 @@ struct ath_tx_stats {
|
|
u32 xretries;
|
|
u32 a_aggr;
|
|
u32 a_queued_hw;
|
|
- u32 a_queued_sw;
|
|
u32 a_completed;
|
|
u32 a_retries;
|
|
u32 a_xretries;
|
|
diff --git a/drivers/net/wireless/ath/ath9k/debug_sta.c b/drivers/net/wireless/ath/ath9k/debug_sta.c
|
|
index b66cfa9..2a3a3c4 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/debug_sta.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/debug_sta.c
|
|
@@ -52,8 +52,8 @@ static ssize_t read_file_node_aggr(struct file *file, char __user *user_buf,
|
|
"TID", "SEQ_START", "SEQ_NEXT", "BAW_SIZE",
|
|
"BAW_HEAD", "BAW_TAIL", "BAR_IDX", "SCHED", "PAUSED");
|
|
|
|
- for (tidno = 0, tid = &an->tid[tidno];
|
|
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
|
|
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
|
|
+ tid = ath_node_to_tid(an, tidno);
|
|
txq = tid->txq;
|
|
ath_txq_lock(sc, txq);
|
|
if (tid->active) {
|
|
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
|
|
index cfa3fe8..96bba17 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/init.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/init.c
|
|
@@ -358,7 +358,6 @@ static int ath9k_init_queues(struct ath_softc *sc)
|
|
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
|
|
sc->tx.txq_map[i] = ath_txq_setup(sc, ATH9K_TX_QUEUE_DATA, i);
|
|
sc->tx.txq_map[i]->mac80211_qnum = i;
|
|
- sc->tx.txq_max_pending[i] = ATH_MAX_QDEPTH;
|
|
}
|
|
return 0;
|
|
}
|
|
@@ -877,6 +876,7 @@ static void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw)
|
|
hw->max_rate_tries = 10;
|
|
hw->sta_data_size = sizeof(struct ath_node);
|
|
hw->vif_data_size = sizeof(struct ath_vif);
|
|
+ hw->txq_data_size = sizeof(struct ath_atx_tid);
|
|
hw->extra_tx_headroom = 4;
|
|
|
|
hw->wiphy->available_antennas_rx = BIT(ah->caps.max_rxchains) - 1;
|
|
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
|
|
index e9f32b5..59e3bd0 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/main.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/main.c
|
|
@@ -1902,9 +1902,11 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
|
|
bool flush = false;
|
|
int ret = 0;
|
|
struct ieee80211_sta *sta = params->sta;
|
|
+ struct ath_node *an = (struct ath_node *)sta->drv_priv;
|
|
enum ieee80211_ampdu_mlme_action action = params->action;
|
|
u16 tid = params->tid;
|
|
u16 *ssn = ¶ms->ssn;
|
|
+ struct ath_atx_tid *atid;
|
|
|
|
mutex_lock(&sc->mutex);
|
|
|
|
@@ -1937,9 +1939,9 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
|
|
ath9k_ps_restore(sc);
|
|
break;
|
|
case IEEE80211_AMPDU_TX_OPERATIONAL:
|
|
- ath9k_ps_wakeup(sc);
|
|
- ath_tx_aggr_resume(sc, sta, tid);
|
|
- ath9k_ps_restore(sc);
|
|
+ atid = ath_node_to_tid(an, tid);
|
|
+ atid->baw_size = IEEE80211_MIN_AMPDU_BUF <<
|
|
+ sta->ht_cap.ampdu_factor;
|
|
break;
|
|
default:
|
|
ath_err(ath9k_hw_common(sc->sc_ah), "Unknown AMPDU action\n");
|
|
@@ -2701,4 +2703,5 @@ struct ieee80211_ops ath9k_ops = {
|
|
.sw_scan_start = ath9k_sw_scan_start,
|
|
.sw_scan_complete = ath9k_sw_scan_complete,
|
|
.get_txpower = ath9k_get_txpower,
|
|
+ .wake_tx_queue = ath9k_wake_tx_queue,
|
|
};
|
|
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
|
|
index e47286b..4e2f3ac 100644
|
|
--- a/drivers/net/wireless/ath/ath9k/xmit.c
|
|
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
|
|
@@ -67,6 +67,8 @@ static struct ath_buf *ath_tx_setup_buffer(struct ath_softc *sc,
|
|
struct ath_txq *txq,
|
|
struct ath_atx_tid *tid,
|
|
struct sk_buff *skb);
|
|
+static int ath_tx_prepare(struct ieee80211_hw *hw, struct sk_buff *skb,
|
|
+ struct ath_tx_control *txctl);
|
|
|
|
enum {
|
|
MCS_HT20,
|
|
@@ -137,6 +139,26 @@ static void ath_tx_queue_tid(struct ath_softc *sc, struct ath_txq *txq,
|
|
list_add_tail(&tid->list, list);
|
|
}
|
|
|
|
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue)
|
|
+{
|
|
+ struct ath_softc *sc = hw->priv;
|
|
+ struct ath_common *common = ath9k_hw_common(sc->sc_ah);
|
|
+ struct ath_atx_tid *tid = (struct ath_atx_tid *) queue->drv_priv;
|
|
+ struct ath_txq *txq = tid->txq;
|
|
+
|
|
+ ath_dbg(common, QUEUE, "Waking TX queue: %pM (%d)\n",
|
|
+ queue->sta ? queue->sta->addr : queue->vif->addr,
|
|
+ tid->tidno);
|
|
+
|
|
+ ath_txq_lock(sc, txq);
|
|
+
|
|
+ tid->has_queued = true;
|
|
+ ath_tx_queue_tid(sc, txq, tid);
|
|
+ ath_txq_schedule(sc, txq);
|
|
+
|
|
+ ath_txq_unlock(sc, txq);
|
|
+}
|
|
+
|
|
static struct ath_frame_info *get_frame_info(struct sk_buff *skb)
|
|
{
|
|
struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
|
|
@@ -164,7 +186,6 @@ static void ath_set_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta,
|
|
static void ath_txq_skb_done(struct ath_softc *sc, struct ath_txq *txq,
|
|
struct sk_buff *skb)
|
|
{
|
|
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
|
|
struct ath_frame_info *fi = get_frame_info(skb);
|
|
int q = fi->txq;
|
|
|
|
@@ -175,14 +196,6 @@ static void ath_txq_skb_done(struct ath_softc *sc, struct ath_txq *txq,
|
|
if (WARN_ON(--txq->pending_frames < 0))
|
|
txq->pending_frames = 0;
|
|
|
|
- if (txq->stopped &&
|
|
- txq->pending_frames < sc->tx.txq_max_pending[q]) {
|
|
- if (ath9k_is_chanctx_enabled())
|
|
- ieee80211_wake_queue(sc->hw, info->hw_queue);
|
|
- else
|
|
- ieee80211_wake_queue(sc->hw, q);
|
|
- txq->stopped = false;
|
|
- }
|
|
}
|
|
|
|
static struct ath_atx_tid *
|
|
@@ -192,9 +205,48 @@ ath_get_skb_tid(struct ath_softc *sc, struct ath_node *an, struct sk_buff *skb)
|
|
return ATH_AN_2_TID(an, tidno);
|
|
}
|
|
|
|
+static struct sk_buff *
|
|
+ath_tid_pull(struct ath_atx_tid *tid)
|
|
+{
|
|
+ struct ieee80211_txq *txq = container_of((void*)tid, struct ieee80211_txq, drv_priv);
|
|
+ struct ath_softc *sc = tid->an->sc;
|
|
+ struct ieee80211_hw *hw = sc->hw;
|
|
+ struct ath_tx_control txctl = {
|
|
+ .txq = tid->txq,
|
|
+ .sta = tid->an->sta,
|
|
+ };
|
|
+ struct sk_buff *skb;
|
|
+ struct ath_frame_info *fi;
|
|
+ int q;
|
|
+
|
|
+ if (!tid->has_queued)
|
|
+ return NULL;
|
|
+
|
|
+ skb = ieee80211_tx_dequeue(hw, txq);
|
|
+ if (!skb) {
|
|
+ tid->has_queued = false;
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ if (ath_tx_prepare(hw, skb, &txctl)) {
|
|
+ ieee80211_free_txskb(hw, skb);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ q = skb_get_queue_mapping(skb);
|
|
+ if (tid->txq == sc->tx.txq_map[q]) {
|
|
+ fi = get_frame_info(skb);
|
|
+ fi->txq = q;
|
|
+ ++tid->txq->pending_frames;
|
|
+ }
|
|
+
|
|
+ return skb;
|
|
+ }
|
|
+
|
|
+
|
|
static bool ath_tid_has_buffered(struct ath_atx_tid *tid)
|
|
{
|
|
- return !skb_queue_empty(&tid->buf_q) || !skb_queue_empty(&tid->retry_q);
|
|
+ return !skb_queue_empty(&tid->retry_q) || tid->has_queued;
|
|
}
|
|
|
|
static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
|
|
@@ -203,46 +255,11 @@ static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
|
|
|
|
skb = __skb_dequeue(&tid->retry_q);
|
|
if (!skb)
|
|
- skb = __skb_dequeue(&tid->buf_q);
|
|
+ skb = ath_tid_pull(tid);
|
|
|
|
return skb;
|
|
}
|
|
|
|
-/*
|
|
- * ath_tx_tid_change_state:
|
|
- * - clears a-mpdu flag of previous session
|
|
- * - force sequence number allocation to fix next BlockAck Window
|
|
- */
|
|
-static void
|
|
-ath_tx_tid_change_state(struct ath_softc *sc, struct ath_atx_tid *tid)
|
|
-{
|
|
- struct ath_txq *txq = tid->txq;
|
|
- struct ieee80211_tx_info *tx_info;
|
|
- struct sk_buff *skb, *tskb;
|
|
- struct ath_buf *bf;
|
|
- struct ath_frame_info *fi;
|
|
-
|
|
- skb_queue_walk_safe(&tid->buf_q, skb, tskb) {
|
|
- fi = get_frame_info(skb);
|
|
- bf = fi->bf;
|
|
-
|
|
- tx_info = IEEE80211_SKB_CB(skb);
|
|
- tx_info->flags &= ~IEEE80211_TX_CTL_AMPDU;
|
|
-
|
|
- if (bf)
|
|
- continue;
|
|
-
|
|
- bf = ath_tx_setup_buffer(sc, txq, tid, skb);
|
|
- if (!bf) {
|
|
- __skb_unlink(skb, &tid->buf_q);
|
|
- ath_txq_skb_done(sc, txq, skb);
|
|
- ieee80211_free_txskb(sc->hw, skb);
|
|
- continue;
|
|
- }
|
|
- }
|
|
-
|
|
-}
|
|
-
|
|
static void ath_tx_flush_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
|
|
{
|
|
struct ath_txq *txq = tid->txq;
|
|
@@ -883,20 +900,16 @@ static int ath_compute_num_delims(struct ath_softc *sc, struct ath_atx_tid *tid,
|
|
|
|
static struct ath_buf *
|
|
ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
|
|
- struct ath_atx_tid *tid, struct sk_buff_head **q)
|
|
+ struct ath_atx_tid *tid)
|
|
{
|
|
struct ieee80211_tx_info *tx_info;
|
|
struct ath_frame_info *fi;
|
|
- struct sk_buff *skb;
|
|
+ struct sk_buff *skb, *first_skb = NULL;
|
|
struct ath_buf *bf;
|
|
u16 seqno;
|
|
|
|
while (1) {
|
|
- *q = &tid->retry_q;
|
|
- if (skb_queue_empty(*q))
|
|
- *q = &tid->buf_q;
|
|
-
|
|
- skb = skb_peek(*q);
|
|
+ skb = ath_tid_dequeue(tid);
|
|
if (!skb)
|
|
break;
|
|
|
|
@@ -908,7 +921,6 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
|
|
bf->bf_state.stale = false;
|
|
|
|
if (!bf) {
|
|
- __skb_unlink(skb, *q);
|
|
ath_txq_skb_done(sc, txq, skb);
|
|
ieee80211_free_txskb(sc->hw, skb);
|
|
continue;
|
|
@@ -937,8 +949,20 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
|
|
seqno = bf->bf_state.seqno;
|
|
|
|
/* do not step over block-ack window */
|
|
- if (!BAW_WITHIN(tid->seq_start, tid->baw_size, seqno))
|
|
+ if (!BAW_WITHIN(tid->seq_start, tid->baw_size, seqno)) {
|
|
+ __skb_queue_tail(&tid->retry_q, skb);
|
|
+
|
|
+ /* If there are other skbs in the retry q, they are
|
|
+ * probably within the BAW, so loop immediately to get
|
|
+ * one of them. Otherwise the queue can get stuck. */
|
|
+ if (!skb_queue_is_first(&tid->retry_q, skb) &&
|
|
+ !WARN_ON(skb == first_skb)) {
|
|
+ if(!first_skb) /* infinite loop prevention */
|
|
+ first_skb = skb;
|
|
+ continue;
|
|
+ }
|
|
break;
|
|
+ }
|
|
|
|
if (tid->bar_index > ATH_BA_INDEX(tid->seq_start, seqno)) {
|
|
struct ath_tx_status ts = {};
|
|
@@ -946,7 +970,6 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
|
|
|
|
INIT_LIST_HEAD(&bf_head);
|
|
list_add(&bf->list, &bf_head);
|
|
- __skb_unlink(skb, *q);
|
|
ath_tx_update_baw(sc, tid, seqno);
|
|
ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, &ts, 0);
|
|
continue;
|
|
@@ -958,11 +981,10 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
|
|
return NULL;
|
|
}
|
|
|
|
-static bool
|
|
+static int
|
|
ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
struct ath_atx_tid *tid, struct list_head *bf_q,
|
|
- struct ath_buf *bf_first, struct sk_buff_head *tid_q,
|
|
- int *aggr_len)
|
|
+ struct ath_buf *bf_first)
|
|
{
|
|
#define PADBYTES(_len) ((4 - ((_len) % 4)) % 4)
|
|
struct ath_buf *bf = bf_first, *bf_prev = NULL;
|
|
@@ -972,12 +994,13 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
struct ieee80211_tx_info *tx_info;
|
|
struct ath_frame_info *fi;
|
|
struct sk_buff *skb;
|
|
- bool closed = false;
|
|
+
|
|
|
|
bf = bf_first;
|
|
aggr_limit = ath_lookup_rate(sc, bf, tid);
|
|
|
|
- do {
|
|
+ while (bf)
|
|
+ {
|
|
skb = bf->bf_mpdu;
|
|
fi = get_frame_info(skb);
|
|
|
|
@@ -986,12 +1009,12 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
if (nframes) {
|
|
if (aggr_limit < al + bpad + al_delta ||
|
|
ath_lookup_legacy(bf) || nframes >= h_baw)
|
|
- break;
|
|
+ goto stop;
|
|
|
|
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
|
|
if ((tx_info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) ||
|
|
!(tx_info->flags & IEEE80211_TX_CTL_AMPDU))
|
|
- break;
|
|
+ goto stop;
|
|
}
|
|
|
|
/* add padding for previous frame to aggregation length */
|
|
@@ -1013,20 +1036,18 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
ath_tx_addto_baw(sc, tid, bf);
|
|
bf->bf_state.ndelim = ndelim;
|
|
|
|
- __skb_unlink(skb, tid_q);
|
|
list_add_tail(&bf->list, bf_q);
|
|
if (bf_prev)
|
|
bf_prev->bf_next = bf;
|
|
|
|
bf_prev = bf;
|
|
|
|
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
|
|
- if (!bf) {
|
|
- closed = true;
|
|
- break;
|
|
- }
|
|
- } while (ath_tid_has_buffered(tid));
|
|
-
|
|
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
|
|
+ }
|
|
+ goto finish;
|
|
+stop:
|
|
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
|
|
+finish:
|
|
bf = bf_first;
|
|
bf->bf_lastbf = bf_prev;
|
|
|
|
@@ -1037,9 +1058,7 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
TX_STAT_INC(txq->axq_qnum, a_aggr);
|
|
}
|
|
|
|
- *aggr_len = al;
|
|
-
|
|
- return closed;
|
|
+ return al;
|
|
#undef PADBYTES
|
|
}
|
|
|
|
@@ -1416,18 +1435,15 @@ static void ath_tx_fill_desc(struct ath_softc *sc, struct ath_buf *bf,
|
|
static void
|
|
ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
|
|
struct ath_atx_tid *tid, struct list_head *bf_q,
|
|
- struct ath_buf *bf_first, struct sk_buff_head *tid_q)
|
|
+ struct ath_buf *bf_first)
|
|
{
|
|
struct ath_buf *bf = bf_first, *bf_prev = NULL;
|
|
- struct sk_buff *skb;
|
|
int nframes = 0;
|
|
|
|
do {
|
|
struct ieee80211_tx_info *tx_info;
|
|
- skb = bf->bf_mpdu;
|
|
|
|
nframes++;
|
|
- __skb_unlink(skb, tid_q);
|
|
list_add_tail(&bf->list, bf_q);
|
|
if (bf_prev)
|
|
bf_prev->bf_next = bf;
|
|
@@ -1436,13 +1452,15 @@ ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
|
|
if (nframes >= 2)
|
|
break;
|
|
|
|
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
|
|
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
|
|
if (!bf)
|
|
break;
|
|
|
|
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
|
|
- if (tx_info->flags & IEEE80211_TX_CTL_AMPDU)
|
|
+ if (tx_info->flags & IEEE80211_TX_CTL_AMPDU) {
|
|
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
|
|
break;
|
|
+ }
|
|
|
|
ath_set_rates(tid->an->vif, tid->an->sta, bf);
|
|
} while (1);
|
|
@@ -1453,34 +1471,33 @@ static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
|
|
{
|
|
struct ath_buf *bf;
|
|
struct ieee80211_tx_info *tx_info;
|
|
- struct sk_buff_head *tid_q;
|
|
struct list_head bf_q;
|
|
int aggr_len = 0;
|
|
- bool aggr, last = true;
|
|
+ bool aggr;
|
|
|
|
if (!ath_tid_has_buffered(tid))
|
|
return false;
|
|
|
|
INIT_LIST_HEAD(&bf_q);
|
|
|
|
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
|
|
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
|
|
if (!bf)
|
|
return false;
|
|
|
|
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
|
|
aggr = !!(tx_info->flags & IEEE80211_TX_CTL_AMPDU);
|
|
if ((aggr && txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH) ||
|
|
- (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
|
|
+ (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
|
|
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
|
|
*stop = true;
|
|
return false;
|
|
}
|
|
|
|
ath_set_rates(tid->an->vif, tid->an->sta, bf);
|
|
if (aggr)
|
|
- last = ath_tx_form_aggr(sc, txq, tid, &bf_q, bf,
|
|
- tid_q, &aggr_len);
|
|
+ aggr_len = ath_tx_form_aggr(sc, txq, tid, &bf_q, bf);
|
|
else
|
|
- ath_tx_form_burst(sc, txq, tid, &bf_q, bf, tid_q);
|
|
+ ath_tx_form_burst(sc, txq, tid, &bf_q, bf);
|
|
|
|
if (list_empty(&bf_q))
|
|
return false;
|
|
@@ -1523,9 +1540,6 @@ int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
|
|
an->mpdudensity = density;
|
|
}
|
|
|
|
- /* force sequence number allocation for pending frames */
|
|
- ath_tx_tid_change_state(sc, txtid);
|
|
-
|
|
txtid->active = true;
|
|
*ssn = txtid->seq_start = txtid->seq_next;
|
|
txtid->bar_index = -1;
|
|
@@ -1550,7 +1564,6 @@ void ath_tx_aggr_stop(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid)
|
|
ath_txq_lock(sc, txq);
|
|
txtid->active = false;
|
|
ath_tx_flush_tid(sc, txtid);
|
|
- ath_tx_tid_change_state(sc, txtid);
|
|
ath_txq_unlock_complete(sc, txq);
|
|
}
|
|
|
|
@@ -1560,14 +1573,12 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
|
|
struct ath_common *common = ath9k_hw_common(sc->sc_ah);
|
|
struct ath_atx_tid *tid;
|
|
struct ath_txq *txq;
|
|
- bool buffered;
|
|
int tidno;
|
|
|
|
ath_dbg(common, XMIT, "%s called\n", __func__);
|
|
|
|
- for (tidno = 0, tid = &an->tid[tidno];
|
|
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
|
|
-
|
|
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
|
|
+ tid = ath_node_to_tid(an, tidno);
|
|
txq = tid->txq;
|
|
|
|
ath_txq_lock(sc, txq);
|
|
@@ -1577,13 +1588,12 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
|
|
continue;
|
|
}
|
|
|
|
- buffered = ath_tid_has_buffered(tid);
|
|
+ if (!skb_queue_empty(&tid->retry_q))
|
|
+ ieee80211_sta_set_buffered(sta, tid->tidno, true);
|
|
|
|
list_del_init(&tid->list);
|
|
|
|
ath_txq_unlock(sc, txq);
|
|
-
|
|
- ieee80211_sta_set_buffered(sta, tidno, buffered);
|
|
}
|
|
}
|
|
|
|
@@ -1596,49 +1606,20 @@ void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an)
|
|
|
|
ath_dbg(common, XMIT, "%s called\n", __func__);
|
|
|
|
- for (tidno = 0, tid = &an->tid[tidno];
|
|
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
|
|
-
|
|
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
|
|
+ tid = ath_node_to_tid(an, tidno);
|
|
txq = tid->txq;
|
|
|
|
ath_txq_lock(sc, txq);
|
|
tid->clear_ps_filter = true;
|
|
-
|
|
if (ath_tid_has_buffered(tid)) {
|
|
ath_tx_queue_tid(sc, txq, tid);
|
|
ath_txq_schedule(sc, txq);
|
|
}
|
|
-
|
|
ath_txq_unlock_complete(sc, txq);
|
|
}
|
|
}
|
|
|
|
-void ath_tx_aggr_resume(struct ath_softc *sc, struct ieee80211_sta *sta,
|
|
- u16 tidno)
|
|
-{
|
|
- struct ath_common *common = ath9k_hw_common(sc->sc_ah);
|
|
- struct ath_atx_tid *tid;
|
|
- struct ath_node *an;
|
|
- struct ath_txq *txq;
|
|
-
|
|
- ath_dbg(common, XMIT, "%s called\n", __func__);
|
|
-
|
|
- an = (struct ath_node *)sta->drv_priv;
|
|
- tid = ATH_AN_2_TID(an, tidno);
|
|
- txq = tid->txq;
|
|
-
|
|
- ath_txq_lock(sc, txq);
|
|
-
|
|
- tid->baw_size = IEEE80211_MIN_AMPDU_BUF << sta->ht_cap.ampdu_factor;
|
|
-
|
|
- if (ath_tid_has_buffered(tid)) {
|
|
- ath_tx_queue_tid(sc, txq, tid);
|
|
- ath_txq_schedule(sc, txq);
|
|
- }
|
|
-
|
|
- ath_txq_unlock_complete(sc, txq);
|
|
-}
|
|
-
|
|
void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
|
|
struct ieee80211_sta *sta,
|
|
u16 tids, int nframes,
|
|
@@ -1651,7 +1632,6 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
|
|
struct ieee80211_tx_info *info;
|
|
struct list_head bf_q;
|
|
struct ath_buf *bf_tail = NULL, *bf;
|
|
- struct sk_buff_head *tid_q;
|
|
int sent = 0;
|
|
int i;
|
|
|
|
@@ -1666,11 +1646,10 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
|
|
|
|
ath_txq_lock(sc, tid->txq);
|
|
while (nframes > 0) {
|
|
- bf = ath_tx_get_tid_subframe(sc, sc->tx.uapsdq, tid, &tid_q);
|
|
+ bf = ath_tx_get_tid_subframe(sc, sc->tx.uapsdq, tid);
|
|
if (!bf)
|
|
break;
|
|
|
|
- __skb_unlink(bf->bf_mpdu, tid_q);
|
|
list_add_tail(&bf->list, &bf_q);
|
|
ath_set_rates(tid->an->vif, tid->an->sta, bf);
|
|
if (bf_isampdu(bf)) {
|
|
@@ -1685,7 +1664,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
|
|
sent++;
|
|
TX_STAT_INC(txq->axq_qnum, a_queued_hw);
|
|
|
|
- if (an->sta && !ath_tid_has_buffered(tid))
|
|
+ if (an->sta && skb_queue_empty(&tid->retry_q))
|
|
ieee80211_sta_set_buffered(an->sta, i, false);
|
|
}
|
|
ath_txq_unlock_complete(sc, tid->txq);
|
|
@@ -1914,13 +1893,7 @@ bool ath_drain_all_txq(struct ath_softc *sc)
|
|
if (!ATH_TXQ_SETUP(sc, i))
|
|
continue;
|
|
|
|
- /*
|
|
- * The caller will resume queues with ieee80211_wake_queues.
|
|
- * Mark the queue as not stopped to prevent ath_tx_complete
|
|
- * from waking the queue too early.
|
|
- */
|
|
txq = &sc->tx.txq[i];
|
|
- txq->stopped = false;
|
|
ath_draintxq(sc, txq);
|
|
}
|
|
|
|
@@ -2319,16 +2292,14 @@ int ath_tx_start(struct ieee80211_hw *hw, struct sk_buff *skb,
|
|
struct ath_softc *sc = hw->priv;
|
|
struct ath_txq *txq = txctl->txq;
|
|
struct ath_atx_tid *tid = NULL;
|
|
+ struct ath_node *an = NULL;
|
|
struct ath_buf *bf;
|
|
- bool queue, skip_uapsd = false, ps_resp;
|
|
+ bool ps_resp;
|
|
int q, ret;
|
|
|
|
if (vif)
|
|
avp = (void *)vif->drv_priv;
|
|
|
|
- if (info->flags & IEEE80211_TX_CTL_TX_OFFCHAN)
|
|
- txctl->force_channel = true;
|
|
-
|
|
ps_resp = !!(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE);
|
|
|
|
ret = ath_tx_prepare(hw, skb, txctl);
|
|
@@ -2343,63 +2314,18 @@ int ath_tx_start(struct ieee80211_hw *hw, struct sk_buff *skb,
|
|
|
|
q = skb_get_queue_mapping(skb);
|
|
|
|
- ath_txq_lock(sc, txq);
|
|
- if (txq == sc->tx.txq_map[q]) {
|
|
- fi->txq = q;
|
|
- if (++txq->pending_frames > sc->tx.txq_max_pending[q] &&
|
|
- !txq->stopped) {
|
|
- if (ath9k_is_chanctx_enabled())
|
|
- ieee80211_stop_queue(sc->hw, info->hw_queue);
|
|
- else
|
|
- ieee80211_stop_queue(sc->hw, q);
|
|
- txq->stopped = true;
|
|
- }
|
|
- }
|
|
-
|
|
- queue = ieee80211_is_data_present(hdr->frame_control);
|
|
-
|
|
- /* If chanctx, queue all null frames while NOA could be there */
|
|
- if (ath9k_is_chanctx_enabled() &&
|
|
- ieee80211_is_nullfunc(hdr->frame_control) &&
|
|
- !txctl->force_channel)
|
|
- queue = true;
|
|
-
|
|
- /* Force queueing of all frames that belong to a virtual interface on
|
|
- * a different channel context, to ensure that they are sent on the
|
|
- * correct channel.
|
|
- */
|
|
- if (((avp && avp->chanctx != sc->cur_chan) ||
|
|
- sc->cur_chan->stopped) && !txctl->force_channel) {
|
|
- if (!txctl->an)
|
|
- txctl->an = &avp->mcast_node;
|
|
- queue = true;
|
|
- skip_uapsd = true;
|
|
- }
|
|
-
|
|
- if (txctl->an && queue)
|
|
- tid = ath_get_skb_tid(sc, txctl->an, skb);
|
|
-
|
|
- if (!skip_uapsd && ps_resp) {
|
|
- ath_txq_unlock(sc, txq);
|
|
+ if (ps_resp)
|
|
txq = sc->tx.uapsdq;
|
|
- ath_txq_lock(sc, txq);
|
|
- } else if (txctl->an && queue) {
|
|
- WARN_ON(tid->txq != txctl->txq);
|
|
|
|
- if (info->flags & IEEE80211_TX_CTL_CLEAR_PS_FILT)
|
|
- tid->clear_ps_filter = true;
|
|
-
|
|
- /*
|
|
- * Add this frame to software queue for scheduling later
|
|
- * for aggregation.
|
|
- */
|
|
- TX_STAT_INC(txq->axq_qnum, a_queued_sw);
|
|
- __skb_queue_tail(&tid->buf_q, skb);
|
|
- if (!txctl->an->sleeping)
|
|
- ath_tx_queue_tid(sc, txq, tid);
|
|
+ if (txctl->sta) {
|
|
+ an = (struct ath_node *) sta->drv_priv;
|
|
+ tid = ath_get_skb_tid(sc, an, skb);
|
|
+ }
|
|
|
|
- ath_txq_schedule(sc, txq);
|
|
- goto out;
|
|
+ ath_txq_lock(sc, txq);
|
|
+ if (txq == sc->tx.txq_map[q]) {
|
|
+ fi->txq = q;
|
|
+ ++txq->pending_frames;
|
|
}
|
|
|
|
bf = ath_tx_setup_buffer(sc, txq, tid, skb);
|
|
@@ -2892,9 +2818,8 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
|
|
struct ath_atx_tid *tid;
|
|
int tidno, acno;
|
|
|
|
- for (tidno = 0, tid = &an->tid[tidno];
|
|
- tidno < IEEE80211_NUM_TIDS;
|
|
- tidno++, tid++) {
|
|
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
|
|
+ tid = ath_node_to_tid(an, tidno);
|
|
tid->an = an;
|
|
tid->tidno = tidno;
|
|
tid->seq_start = tid->seq_next = 0;
|
|
@@ -2902,11 +2827,14 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
|
|
tid->baw_head = tid->baw_tail = 0;
|
|
tid->active = false;
|
|
tid->clear_ps_filter = true;
|
|
- __skb_queue_head_init(&tid->buf_q);
|
|
+ tid->has_queued = false;
|
|
__skb_queue_head_init(&tid->retry_q);
|
|
INIT_LIST_HEAD(&tid->list);
|
|
acno = TID_TO_WME_AC(tidno);
|
|
tid->txq = sc->tx.txq_map[acno];
|
|
+
|
|
+ if (!an->sta)
|
|
+ break; /* just one multicast ath_atx_tid */
|
|
}
|
|
}
|
|
|
|
@@ -2916,9 +2844,8 @@ void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an)
|
|
struct ath_txq *txq;
|
|
int tidno;
|
|
|
|
- for (tidno = 0, tid = &an->tid[tidno];
|
|
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
|
|
-
|
|
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
|
|
+ tid = ath_node_to_tid(an, tidno);
|
|
txq = tid->txq;
|
|
|
|
ath_txq_lock(sc, txq);
|
|
@@ -2930,6 +2857,9 @@ void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an)
|
|
tid->active = false;
|
|
|
|
ath_txq_unlock(sc, txq);
|
|
+
|
|
+ if (!an->sta)
|
|
+ break; /* just one multicast ath_atx_tid */
|
|
}
|
|
}
|
|
|
|
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
|
|
index bfa542c..c6759c5 100644
|
|
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
|
|
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
|
|
@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv,
|
|
* doesn't seem to have as many firmware restart cycles...
|
|
*
|
|
* As a test, we're sticking in a 1/100s delay here */
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_uninterruptible((10));
|
|
|
|
return 0;
|
|
|
|
@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv)
|
|
IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n");
|
|
i = 5000;
|
|
do {
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(40));
|
|
+ schedule_msec_hrtimeout_uninterruptible((40));
|
|
/* Todo... wait for sync command ... */
|
|
|
|
read_register(priv->net_dev, IPW_REG_INTA, &inta);
|
|
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
|
|
index e75d4fd..7103ac1 100644
|
|
--- a/drivers/ntb/test/ntb_perf.c
|
|
+++ b/drivers/ntb/test/ntb_perf.c
|
|
@@ -306,7 +306,7 @@ static int perf_move_data(struct pthr_ctx *pctx, char __iomem *dst, char *src,
|
|
if (unlikely((jiffies - last_sleep) > 5 * HZ)) {
|
|
last_sleep = jiffies;
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
|
|
if (unlikely(kthread_should_stop()))
|
|
diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
|
|
index f9fd4b3..00ad2f3 100644
|
|
--- a/drivers/parport/ieee1284.c
|
|
+++ b/drivers/parport/ieee1284.c
|
|
@@ -215,7 +215,7 @@ int parport_wait_peripheral(struct parport *port,
|
|
/* parport_wait_event didn't time out, but the
|
|
* peripheral wasn't actually ready either.
|
|
* Wait for another 10ms. */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_interruptible((10));
|
|
}
|
|
}
|
|
|
|
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
|
|
index 2e21af4..da5f240 100644
|
|
--- a/drivers/parport/ieee1284_ops.c
|
|
+++ b/drivers/parport/ieee1284_ops.c
|
|
@@ -536,7 +536,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port,
|
|
/* Yield the port for a while. */
|
|
if (count && dev->port->irq != PARPORT_IRQ_NONE) {
|
|
parport_release (dev);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(40));
|
|
+ schedule_msec_hrtimeout_interruptible((40));
|
|
parport_claim_or_block (dev);
|
|
}
|
|
else
|
|
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c
|
|
index 55663b3..0363fed 100644
|
|
--- a/drivers/platform/x86/intel_ips.c
|
|
+++ b/drivers/platform/x86/intel_ips.c
|
|
@@ -812,7 +812,7 @@ static int ips_adjust(void *data)
|
|
ips_gpu_lower(ips);
|
|
|
|
sleep:
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD));
|
|
+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD));
|
|
} while (!kthread_should_stop());
|
|
|
|
dev_dbg(&ips->dev->dev, "ips-adjust thread stopped\n");
|
|
@@ -991,7 +991,7 @@ static int ips_monitor(void *data)
|
|
seqno_timestamp = get_jiffies_64();
|
|
|
|
old_cpu_power = thm_readl(THM_CEC);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
|
|
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
|
|
|
|
/* Collect an initial average */
|
|
for (i = 0; i < IPS_SAMPLE_COUNT; i++) {
|
|
@@ -1018,7 +1018,7 @@ static int ips_monitor(void *data)
|
|
mchp_samples[i] = mchp;
|
|
}
|
|
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
|
|
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
|
|
if (kthread_should_stop())
|
|
break;
|
|
}
|
|
@@ -1045,7 +1045,7 @@ static int ips_monitor(void *data)
|
|
* us to reduce the sample frequency if the CPU and GPU are idle.
|
|
*/
|
|
old_cpu_power = thm_readl(THM_CEC);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
|
|
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
|
|
last_sample_period = IPS_SAMPLE_PERIOD;
|
|
|
|
setup_deferrable_timer_on_stack(&timer, monitor_timeout,
|
|
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
|
|
index fa247de..f1a28d8 100644
|
|
--- a/drivers/rtc/rtc-wm8350.c
|
|
+++ b/drivers/rtc/rtc-wm8350.c
|
|
@@ -121,7 +121,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm)
|
|
/* Wait until confirmation of stopping */
|
|
do {
|
|
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_uninterruptible((1));
|
|
} while (--retries && !(rtc_ctrl & WM8350_RTC_STS));
|
|
|
|
if (!retries) {
|
|
@@ -204,7 +204,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350)
|
|
/* Wait until confirmation of stopping */
|
|
do {
|
|
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_uninterruptible((1));
|
|
} while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS));
|
|
|
|
if (!(rtc_ctrl & WM8350_RTC_ALMSTS))
|
|
@@ -227,7 +227,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350)
|
|
/* Wait until confirmation */
|
|
do {
|
|
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_uninterruptible((1));
|
|
} while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS);
|
|
|
|
if (rtc_ctrl & WM8350_RTC_ALMSTS)
|
|
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
|
|
index d9fd2f8..ea44afd 100644
|
|
--- a/drivers/scsi/fnic/fnic_scsi.c
|
|
+++ b/drivers/scsi/fnic/fnic_scsi.c
|
|
@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic)
|
|
|
|
/* wait for io cmpl */
|
|
while (atomic_read(&fnic->in_flight))
|
|
- schedule_timeout(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout((1));
|
|
|
|
spin_lock_irqsave(&fnic->wq_copy_lock[0], flags);
|
|
|
|
@@ -2193,7 +2193,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
|
|
}
|
|
}
|
|
|
|
- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov));
|
|
+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov));
|
|
|
|
/* walk again to check, if IOs are still pending in fw */
|
|
if (fnic_is_abts_pending(fnic, lr_sc))
|
|
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
|
|
index d197aa1..8e2fd7b 100644
|
|
--- a/drivers/scsi/lpfc/lpfc_scsi.c
|
|
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
|
|
@@ -5105,7 +5105,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id,
|
|
tgt_id, lun_id, context);
|
|
later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies;
|
|
while (time_after(later, jiffies) && cnt) {
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(20));
|
|
+ schedule_msec_hrtimeout_uninterruptible((20));
|
|
cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context);
|
|
}
|
|
if (cnt) {
|
|
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
|
|
index 1deb6ad..75455d4 100644
|
|
--- a/drivers/scsi/scsi.c
|
|
+++ b/drivers/scsi/scsi.c
|
|
@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
|
|
wmb();
|
|
}
|
|
|
|
+ if (sdev->request_queue)
|
|
+ blk_set_queue_depth(sdev->request_queue, depth);
|
|
+
|
|
return sdev->queue_depth;
|
|
}
|
|
EXPORT_SYMBOL(scsi_change_queue_depth);
|
|
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
|
|
index abada16..0bf30dc 100644
|
|
--- a/drivers/scsi/snic/snic_scsi.c
|
|
+++ b/drivers/scsi/snic/snic_scsi.c
|
|
@@ -2356,7 +2356,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc)
|
|
|
|
/* Wait for all the IOs that are entered in Qcmd */
|
|
while (atomic_read(&snic->ios_inflight))
|
|
- schedule_timeout(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout((1));
|
|
|
|
ret = snic_issue_hba_reset(snic, sc);
|
|
if (ret) {
|
|
diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c
|
|
index 1c967c3..a23726e 100644
|
|
--- a/drivers/staging/comedi/drivers/ni_mio_common.c
|
|
+++ b/drivers/staging/comedi/drivers/ni_mio_common.c
|
|
@@ -4610,7 +4610,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev)
|
|
if ((status & NI67XX_CAL_STATUS_BUSY) == 0)
|
|
break;
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- if (schedule_timeout(1))
|
|
+ if (schedule_min_hrtimeout())
|
|
return -EIO;
|
|
}
|
|
if (i == timeout) {
|
|
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
|
|
index d05c6cc..3f62b6f 100644
|
|
--- a/drivers/staging/lustre/lnet/lnet/lib-eq.c
|
|
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
|
|
@@ -328,7 +328,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock)
|
|
schedule();
|
|
} else {
|
|
now = jiffies;
|
|
- schedule_timeout(msecs_to_jiffies(tms));
|
|
+ schedule_msec_hrtimeout((tms));
|
|
tms -= jiffies_to_msecs(jiffies - now);
|
|
if (tms < 0) /* no more wait but may have new event */
|
|
tms = 0;
|
|
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
|
|
index 5d65a5c..729ab6d 100644
|
|
--- a/drivers/staging/rts5208/rtsx.c
|
|
+++ b/drivers/staging/rts5208/rtsx.c
|
|
@@ -533,7 +533,7 @@ static int rtsx_polling_thread(void *__dev)
|
|
for (;;) {
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL));
|
|
+ schedule_msec_hrtimeout((POLLING_INTERVAL));
|
|
|
|
/* lock the device pointers */
|
|
mutex_lock(&(dev->dev_mutex));
|
|
diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c
|
|
index efb791b..fd02fb2 100644
|
|
--- a/drivers/staging/speakup/speakup_acntpc.c
|
|
+++ b/drivers/staging/speakup/speakup_acntpc.c
|
|
@@ -204,7 +204,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
full_time_val = full_time->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
if (synth_full()) {
|
|
- schedule_timeout(msecs_to_jiffies(full_time_val));
|
|
+ schedule_msec_hrtimeout((full_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
jiffy_delta_val = jiffy_delta->u.n.value;
|
|
delay_time_val = delay_time->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
jiff_max = jiffies+jiffy_delta_val;
|
|
}
|
|
}
|
|
diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c
|
|
index 3cbc8a7..3c17854 100644
|
|
--- a/drivers/staging/speakup/speakup_apollo.c
|
|
+++ b/drivers/staging/speakup/speakup_apollo.c
|
|
@@ -172,7 +172,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
outb(UART_MCR_DTR, speakup_info.port_tts + UART_MCR);
|
|
outb(UART_MCR_DTR | UART_MCR_RTS,
|
|
speakup_info.port_tts + UART_MCR);
|
|
- schedule_timeout(msecs_to_jiffies(full_time_val));
|
|
+ schedule_msec_hrtimeout((full_time_val));
|
|
continue;
|
|
}
|
|
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
|
|
diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c
|
|
index 1a5cf3d..fa2b4e1 100644
|
|
--- a/drivers/staging/speakup/speakup_decext.c
|
|
+++ b/drivers/staging/speakup/speakup_decext.c
|
|
@@ -186,7 +186,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
if (ch == '\n')
|
|
ch = 0x0D;
|
|
if (synth_full() || !spk_serial_out(ch)) {
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c
|
|
index d6479bd..f7554bf 100644
|
|
--- a/drivers/staging/speakup/speakup_decpc.c
|
|
+++ b/drivers/staging/speakup/speakup_decpc.c
|
|
@@ -403,7 +403,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
if (ch == '\n')
|
|
ch = 0x0D;
|
|
if (dt_sendchar(ch)) {
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c
|
|
index 7646567..639192e 100644
|
|
--- a/drivers/staging/speakup/speakup_dectlk.c
|
|
+++ b/drivers/staging/speakup/speakup_dectlk.c
|
|
@@ -251,7 +251,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
if (ch == '\n')
|
|
ch = 0x0D;
|
|
if (synth_full_val || !spk_serial_out(ch)) {
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c
|
|
index 38aa401..1640519 100644
|
|
--- a/drivers/staging/speakup/speakup_dtlk.c
|
|
+++ b/drivers/staging/speakup/speakup_dtlk.c
|
|
@@ -217,7 +217,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
delay_time_val = delay_time->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
if (synth_full()) {
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
@@ -233,7 +233,7 @@ static void do_catch_up(struct spk_synth *synth)
|
|
delay_time_val = delay_time->u.n.value;
|
|
jiffy_delta_val = jiffy_delta->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
jiff_max = jiffies + jiffy_delta_val;
|
|
}
|
|
}
|
|
diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c
|
|
index 5e2170b..30b5df7 100644
|
|
--- a/drivers/staging/speakup/speakup_keypc.c
|
|
+++ b/drivers/staging/speakup/speakup_keypc.c
|
|
@@ -206,7 +206,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags);
|
|
full_time_val = full_time->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
if (synth_full()) {
|
|
- schedule_timeout(msecs_to_jiffies(full_time_val));
|
|
+ schedule_msec_hrtimeout((full_time_val));
|
|
continue;
|
|
}
|
|
set_current_state(TASK_RUNNING);
|
|
@@ -239,7 +239,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags);
|
|
jiffy_delta_val = jiffy_delta->u.n.value;
|
|
delay_time_val = delay_time->u.n.value;
|
|
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
|
|
- schedule_timeout(msecs_to_jiffies(delay_time_val));
|
|
+ schedule_msec_hrtimeout((delay_time_val));
|
|
jiff_max = jiffies+jiffy_delta_val;
|
|
}
|
|
}
|
|
diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
|
|
index 54b2f39..70eebfa 100644
|
|
--- a/drivers/staging/speakup/synth.c
|
|
+++ b/drivers/staging/speakup/synth.c
|
|
@@ -119,7 +119,7 @@ void spk_do_catch_up(struct spk_synth *synth)
|
|
if (ch == '\n')
|
|
ch = synth->procspeech;
|
|
if (!spk_serial_out(ch)) {
|
|
- schedule_timeout(msecs_to_jiffies(full_time_val));
|
|
+ schedule_msec_hrtimeout((full_time_val));
|
|
continue;
|
|
}
|
|
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
|
|
diff --git a/drivers/staging/unisys/visorbus/periodic_work.c b/drivers/staging/unisys/visorbus/periodic_work.c
|
|
new file mode 100644
|
|
index 0000000..b930287
|
|
--- /dev/null
|
|
+++ b/drivers/staging/unisys/visorbus/periodic_work.c
|
|
@@ -0,0 +1,204 @@
|
|
+/* periodic_work.c
|
|
+ *
|
|
+ * Copyright (C) 2010 - 2015 UNISYS CORPORATION
|
|
+ * All rights reserved.
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or modify it
|
|
+ * under the terms and conditions of the GNU General Public License,
|
|
+ * version 2, as published by the Free Software Foundation.
|
|
+ *
|
|
+ * This program is distributed in the hope that it will be useful, but
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
+ * NON INFRINGEMENT. See the GNU General Public License for more
|
|
+ * details.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Helper functions to schedule periodic work in Linux kernel mode.
|
|
+ */
|
|
+#include <linux/sched.h>
|
|
+
|
|
+#include "periodic_work.h"
|
|
+
|
|
+#define MYDRVNAME "periodic_work"
|
|
+
|
|
+struct periodic_work {
|
|
+ rwlock_t lock;
|
|
+ struct delayed_work work;
|
|
+ void (*workfunc)(void *);
|
|
+ void *workfuncarg;
|
|
+ bool is_scheduled;
|
|
+ bool want_to_stop;
|
|
+ ulong jiffy_interval;
|
|
+ struct workqueue_struct *workqueue;
|
|
+ const char *devnam;
|
|
+};
|
|
+
|
|
+static void periodic_work_func(struct work_struct *work)
|
|
+{
|
|
+ struct periodic_work *pw;
|
|
+
|
|
+ pw = container_of(work, struct periodic_work, work.work);
|
|
+ (*pw->workfunc)(pw->workfuncarg);
|
|
+}
|
|
+
|
|
+struct periodic_work
|
|
+*visor_periodic_work_create(ulong jiffy_interval,
|
|
+ struct workqueue_struct *workqueue,
|
|
+ void (*workfunc)(void *),
|
|
+ void *workfuncarg,
|
|
+ const char *devnam)
|
|
+{
|
|
+ struct periodic_work *pw;
|
|
+
|
|
+ pw = kzalloc(sizeof(*pw), GFP_KERNEL | __GFP_NORETRY);
|
|
+ if (!pw)
|
|
+ return NULL;
|
|
+
|
|
+ rwlock_init(&pw->lock);
|
|
+ pw->jiffy_interval = jiffy_interval;
|
|
+ pw->workqueue = workqueue;
|
|
+ pw->workfunc = workfunc;
|
|
+ pw->workfuncarg = workfuncarg;
|
|
+ pw->devnam = devnam;
|
|
+ return pw;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(visor_periodic_work_create);
|
|
+
|
|
+void visor_periodic_work_destroy(struct periodic_work *pw)
|
|
+{
|
|
+ kfree(pw);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(visor_periodic_work_destroy);
|
|
+
|
|
+/** Call this from your periodic work worker function to schedule the next
|
|
+ * call.
|
|
+ * If this function returns false, there was a failure and the
|
|
+ * periodic work is no longer scheduled
|
|
+ */
|
|
+bool visor_periodic_work_nextperiod(struct periodic_work *pw)
|
|
+{
|
|
+ bool rc = false;
|
|
+
|
|
+ write_lock(&pw->lock);
|
|
+ if (pw->want_to_stop) {
|
|
+ pw->is_scheduled = false;
|
|
+ pw->want_to_stop = false;
|
|
+ rc = true; /* yes, true; see visor_periodic_work_stop() */
|
|
+ goto unlock;
|
|
+ } else if (!queue_delayed_work(pw->workqueue, &pw->work,
|
|
+ pw->jiffy_interval)) {
|
|
+ pw->is_scheduled = false;
|
|
+ rc = false;
|
|
+ goto unlock;
|
|
+ }
|
|
+ rc = true;
|
|
+unlock:
|
|
+ write_unlock(&pw->lock);
|
|
+ return rc;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(visor_periodic_work_nextperiod);
|
|
+
|
|
+/** This function returns true iff new periodic work was actually started.
|
|
+ * If this function returns false, then no work was started
|
|
+ * (either because it was already started, or because of a failure).
|
|
+ */
|
|
+bool visor_periodic_work_start(struct periodic_work *pw)
|
|
+{
|
|
+ bool rc = false;
|
|
+
|
|
+ write_lock(&pw->lock);
|
|
+ if (pw->is_scheduled) {
|
|
+ rc = false;
|
|
+ goto unlock;
|
|
+ }
|
|
+ if (pw->want_to_stop) {
|
|
+ rc = false;
|
|
+ goto unlock;
|
|
+ }
|
|
+ INIT_DELAYED_WORK(&pw->work, &periodic_work_func);
|
|
+ if (!queue_delayed_work(pw->workqueue, &pw->work,
|
|
+ pw->jiffy_interval)) {
|
|
+ rc = false;
|
|
+ goto unlock;
|
|
+ }
|
|
+ pw->is_scheduled = true;
|
|
+ rc = true;
|
|
+unlock:
|
|
+ write_unlock(&pw->lock);
|
|
+ return rc;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(visor_periodic_work_start);
|
|
+
|
|
+/** This function returns true iff your call actually stopped the periodic
|
|
+ * work.
|
|
+ *
|
|
+ * -- PAY ATTENTION... this is important --
|
|
+ *
|
|
+ * NO NO #1
|
|
+ *
|
|
+ * Do NOT call this function from some function that is running on the
|
|
+ * same workqueue as the work you are trying to stop might be running
|
|
+ * on! If you violate this rule, visor_periodic_work_stop() MIGHT work,
|
|
+ * but it also MIGHT get hung up in an infinite loop saying
|
|
+ * "waiting for delayed work...". This will happen if the delayed work
|
|
+ * you are trying to cancel has been put in the workqueue list, but can't
|
|
+ * run yet because we are running that same workqueue thread right now.
|
|
+ *
|
|
+ * Bottom line: If you need to call visor_periodic_work_stop() from a
|
|
+ * workitem, be sure the workitem is on a DIFFERENT workqueue than the
|
|
+ * workitem that you are trying to cancel.
|
|
+ *
|
|
+ * If I could figure out some way to check for this "no no" condition in
|
|
+ * the code, I would. It would have saved me the trouble of writing this
|
|
+ * long comment. And also, don't think this is some "theoretical" race
|
|
+ * condition. It is REAL, as I have spent the day chasing it.
|
|
+ *
|
|
+ * NO NO #2
|
|
+ *
|
|
+ * Take close note of the locks that you own when you call this function.
|
|
+ * You must NOT own any locks that are needed by the periodic work
|
|
+ * function that is currently installed. If you DO, a deadlock may result,
|
|
+ * because stopping the periodic work often involves waiting for the last
|
|
+ * iteration of the periodic work function to complete. Again, if you hit
|
|
+ * this deadlock, you will get hung up in an infinite loop saying
|
|
+ * "waiting for delayed work...".
|
|
+ */
|
|
+bool visor_periodic_work_stop(struct periodic_work *pw)
|
|
+{
|
|
+ bool stopped_something = false;
|
|
+
|
|
+ write_lock(&pw->lock);
|
|
+ stopped_something = pw->is_scheduled && (!pw->want_to_stop);
|
|
+ while (pw->is_scheduled) {
|
|
+ pw->want_to_stop = true;
|
|
+ if (cancel_delayed_work(&pw->work)) {
|
|
+ /* We get here if the delayed work was pending as
|
|
+ * delayed work, but was NOT run.
|
|
+ */
|
|
+ WARN_ON(!pw->is_scheduled);
|
|
+ pw->is_scheduled = false;
|
|
+ } else {
|
|
+ /* If we get here, either the delayed work:
|
|
+ * - was run, OR,
|
|
+ * - is running RIGHT NOW on another processor, OR,
|
|
+ * - wasn't even scheduled (there is a miniscule
|
|
+ * timing window where this could be the case)
|
|
+ * flush_workqueue() would make sure it is finished
|
|
+ * executing, but that still isn't very useful, which
|
|
+ * explains the loop...
|
|
+ */
|
|
+ }
|
|
+ if (pw->is_scheduled) {
|
|
+ write_unlock(&pw->lock);
|
|
+ schedule_msec_hrtimeout_interruptible((10));
|
|
+ write_lock(&pw->lock);
|
|
+ } else {
|
|
+ pw->want_to_stop = false;
|
|
+ }
|
|
+ }
|
|
+ write_unlock(&pw->lock);
|
|
+ return stopped_something;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(visor_periodic_work_stop);
|
|
diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c
|
|
index 1367007..910f158 100644
|
|
--- a/drivers/staging/unisys/visornic/visornic_main.c
|
|
+++ b/drivers/staging/unisys/visornic/visornic_main.c
|
|
@@ -468,7 +468,7 @@ visornic_disable_with_timeout(struct net_device *netdev, const int timeout)
|
|
}
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
spin_unlock_irqrestore(&devdata->priv_lock, flags);
|
|
- wait += schedule_timeout(msecs_to_jiffies(10));
|
|
+ wait += schedule_msec_hrtimeout((10));
|
|
spin_lock_irqsave(&devdata->priv_lock, flags);
|
|
}
|
|
|
|
@@ -479,7 +479,7 @@ visornic_disable_with_timeout(struct net_device *netdev, const int timeout)
|
|
while (1) {
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
spin_unlock_irqrestore(&devdata->priv_lock, flags);
|
|
- schedule_timeout(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout((10));
|
|
spin_lock_irqsave(&devdata->priv_lock, flags);
|
|
if (atomic_read(&devdata->usage))
|
|
break;
|
|
@@ -611,7 +611,7 @@ visornic_enable_with_timeout(struct net_device *netdev, const int timeout)
|
|
}
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
spin_unlock_irqrestore(&devdata->priv_lock, flags);
|
|
- wait += schedule_timeout(msecs_to_jiffies(10));
|
|
+ wait += schedule_msec_hrtimeout((10));
|
|
spin_lock_irqsave(&devdata->priv_lock, flags);
|
|
}
|
|
|
|
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
|
|
index 70c143a..2d3cbc9 100644
|
|
--- a/drivers/target/target_core_user.c
|
|
+++ b/drivers/target/target_core_user.c
|
|
@@ -450,7 +450,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
|
|
|
|
pr_debug("sleeping for ring space\n");
|
|
spin_unlock_irq(&udev->cmdr_lock);
|
|
- ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT));
|
|
+ ret = schedule_msec_hrtimeout((TCMU_TIME_OUT));
|
|
finish_wait(&udev->wait_cmdr, &__wait);
|
|
if (!ret) {
|
|
pr_warn("tcmu: command timed out\n");
|
|
diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c
|
|
index a4ee65b..cf38bcb 100644
|
|
--- a/drivers/video/fbdev/omap/hwa742.c
|
|
+++ b/drivers/video/fbdev/omap/hwa742.c
|
|
@@ -926,7 +926,7 @@ static void hwa742_resume(void)
|
|
if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7))
|
|
break;
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
- schedule_timeout(msecs_to_jiffies(5));
|
|
+ schedule_msec_hrtimeout((5));
|
|
}
|
|
hwa742_set_update_mode(hwa742.update_mode_before_suspend);
|
|
}
|
|
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
|
|
index ef73f14..7b5483b 100644
|
|
--- a/drivers/video/fbdev/pxafb.c
|
|
+++ b/drivers/video/fbdev/pxafb.c
|
|
@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg)
|
|
mutex_unlock(&fbi->ctrlr_lock);
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
- schedule_timeout(msecs_to_jiffies(30));
|
|
+ schedule_msec_hrtimeout((30));
|
|
}
|
|
|
|
pr_debug("%s(): task ending\n", __func__);
|
|
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
|
|
index 45a8639..855d08e 100644
|
|
--- a/fs/afs/vlocation.c
|
|
+++ b/fs/afs/vlocation.c
|
|
@@ -129,7 +129,7 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
|
|
if (vl->upd_busy_cnt > 1) {
|
|
/* second+ BUSY - sleep a little bit */
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
}
|
|
continue;
|
|
}
|
|
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
|
|
index 5909ae8..c010e57 100644
|
|
--- a/fs/btrfs/extent-tree.c
|
|
+++ b/fs/btrfs/extent-tree.c
|
|
@@ -5953,7 +5953,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
|
|
|
|
if (flush != BTRFS_RESERVE_NO_FLUSH &&
|
|
btrfs_transaction_in_commit(root->fs_info))
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
|
|
if (delalloc_lock)
|
|
mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
|
|
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
|
|
index d27014b..cd87786 100644
|
|
--- a/fs/btrfs/inode-map.c
|
|
+++ b/fs/btrfs/inode-map.c
|
|
@@ -89,7 +89,7 @@ again:
|
|
btrfs_release_path(path);
|
|
root->ino_cache_progress = last;
|
|
up_read(&fs_info->commit_root_sem);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
goto again;
|
|
} else
|
|
continue;
|
|
diff --git a/fs/buffer.c b/fs/buffer.c
|
|
index b205a62..abb13b0 100644
|
|
--- a/fs/buffer.c
|
|
+++ b/fs/buffer.c
|
|
@@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
|
|
struct buffer_head *bh, *head;
|
|
unsigned int blocksize, bbits;
|
|
int nr_underway = 0;
|
|
- int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
|
|
+ int write_flags = wbc_to_write_flags(wbc);
|
|
|
|
head = create_page_buffers(page, inode,
|
|
(1 << BH_Dirty)|(1 << BH_Uptodate));
|
|
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
|
|
index 14db4b7..2cc48bf 100644
|
|
--- a/fs/f2fs/data.c
|
|
+++ b/fs/f2fs/data.c
|
|
@@ -1251,7 +1251,7 @@ static int f2fs_write_data_page(struct page *page,
|
|
.sbi = sbi,
|
|
.type = DATA,
|
|
.op = REQ_OP_WRITE,
|
|
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
|
|
+ .op_flags = wbc_to_write_flags(wbc),
|
|
.page = page,
|
|
.encrypted_page = NULL,
|
|
};
|
|
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
|
|
index 01177ec..772a123 100644
|
|
--- a/fs/f2fs/node.c
|
|
+++ b/fs/f2fs/node.c
|
|
@@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page,
|
|
.sbi = sbi,
|
|
.type = NODE,
|
|
.op = REQ_OP_WRITE,
|
|
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
|
|
+ .op_flags = wbc_to_write_flags(wbc),
|
|
.page = page,
|
|
.encrypted_page = NULL,
|
|
};
|
|
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
|
|
index 373639a..3223ccf 100644
|
|
--- a/fs/gfs2/meta_io.c
|
|
+++ b/fs/gfs2/meta_io.c
|
|
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
|
|
{
|
|
struct buffer_head *bh, *head;
|
|
int nr_underway = 0;
|
|
- int write_flags = REQ_META | REQ_PRIO |
|
|
- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
|
|
+ int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
|
|
|
|
BUG_ON(!PageLocked(page));
|
|
BUG_ON(!page_has_buffers(page));
|
|
diff --git a/fs/mpage.c b/fs/mpage.c
|
|
index d2413af..d6f1afe 100644
|
|
--- a/fs/mpage.c
|
|
+++ b/fs/mpage.c
|
|
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
|
|
struct buffer_head map_bh;
|
|
loff_t i_size = i_size_read(inode);
|
|
int ret = 0;
|
|
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
|
|
+ int op_flags = wbc_to_write_flags(wbc);
|
|
|
|
if (page_has_buffers(page)) {
|
|
struct buffer_head *head = page_buffers(page);
|
|
diff --git a/fs/proc/base.c b/fs/proc/base.c
|
|
index ca651ac..ee394c8 100644
|
|
--- a/fs/proc/base.c
|
|
+++ b/fs/proc/base.c
|
|
@@ -488,7 +488,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
|
|
seq_printf(m, "0 0 0\n");
|
|
else
|
|
seq_printf(m, "%llu %llu %lu\n",
|
|
- (unsigned long long)task->se.sum_exec_runtime,
|
|
+ (unsigned long long)tsk_seruntime(task),
|
|
(unsigned long long)task->sched_info.run_delay,
|
|
task->sched_info.pcount);
|
|
|
|
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
|
|
index 06763f5..33a9e0b 100644
|
|
--- a/fs/xfs/xfs_aops.c
|
|
+++ b/fs/xfs/xfs_aops.c
|
|
@@ -495,8 +495,8 @@ xfs_submit_ioend(
|
|
|
|
ioend->io_bio->bi_private = ioend;
|
|
ioend->io_bio->bi_end_io = xfs_end_bio;
|
|
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
|
|
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
|
|
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
|
|
+
|
|
/*
|
|
* If we are failing the IO now, just mark the ioend with an
|
|
* error and finish it. This will run IO completion immediately
|
|
@@ -567,8 +567,7 @@ xfs_chain_bio(
|
|
|
|
bio_chain(ioend->io_bio, new);
|
|
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
|
|
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
|
|
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
|
|
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
|
|
submit_bio(ioend->io_bio);
|
|
ioend->io_bio = new;
|
|
}
|
|
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
|
|
index c357f27..dc5f76d 100644
|
|
--- a/include/linux/backing-dev-defs.h
|
|
+++ b/include/linux/backing-dev-defs.h
|
|
@@ -116,6 +116,8 @@ struct bdi_writeback {
|
|
struct list_head work_list;
|
|
struct delayed_work dwork; /* work item used for writeback */
|
|
|
|
+ unsigned long dirty_sleep; /* last wait */
|
|
+
|
|
struct list_head bdi_node; /* anchored at bdi->wb_list */
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
|
|
index cd395ec..cb5d746 100644
|
|
--- a/include/linux/blk_types.h
|
|
+++ b/include/linux/blk_types.h
|
|
@@ -162,6 +162,7 @@ enum rq_flag_bits {
|
|
__REQ_INTEGRITY, /* I/O includes block integrity payload */
|
|
__REQ_FUA, /* forced unit access */
|
|
__REQ_PREFLUSH, /* request for cache flush */
|
|
+ __REQ_BG, /* background activity */
|
|
|
|
/* bio only flags */
|
|
__REQ_RAHEAD, /* read ahead, can fail anytime */
|
|
@@ -205,7 +206,7 @@ enum rq_flag_bits {
|
|
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
|
|
#define REQ_COMMON_MASK \
|
|
(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
|
|
- REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
|
|
+ REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG)
|
|
#define REQ_CLONE_MASK REQ_COMMON_MASK
|
|
|
|
/* This mask is used for both bio and request merge checking */
|
|
@@ -230,6 +231,7 @@ enum rq_flag_bits {
|
|
#define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
|
|
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
|
|
#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
|
|
+#define REQ_BG (1ULL << __REQ_BG)
|
|
#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
|
|
#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
|
|
#define REQ_PM (1ULL << __REQ_PM)
|
|
@@ -271,4 +273,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
|
|
return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
|
|
}
|
|
|
|
+struct blk_issue_stat {
|
|
+ u64 time;
|
|
+};
|
|
+
|
|
+#define BLK_RQ_STAT_BATCH 64
|
|
+
|
|
+struct blk_rq_stat {
|
|
+ s64 mean;
|
|
+ u64 min;
|
|
+ u64 max;
|
|
+ s32 nr_samples;
|
|
+ s32 nr_batch;
|
|
+ u64 batch;
|
|
+ s64 time;
|
|
+};
|
|
+
|
|
#endif /* __LINUX_BLK_TYPES_H */
|
|
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
|
|
index f6a8161..4adcbff 100644
|
|
--- a/include/linux/blkdev.h
|
|
+++ b/include/linux/blkdev.h
|
|
@@ -37,6 +37,7 @@ struct bsg_job;
|
|
struct blkcg_gq;
|
|
struct blk_flush_queue;
|
|
struct pr_ops;
|
|
+struct rq_wb;
|
|
|
|
#define BLKDEV_MIN_RQ 4
|
|
#define BLKDEV_MAX_RQ 128 /* Default maximum */
|
|
@@ -45,7 +46,7 @@ struct pr_ops;
|
|
* Maximum number of blkcg policies allowed to be registered concurrently.
|
|
* Defined here to simplify include dependency.
|
|
*/
|
|
-#define BLKCG_MAX_POLS 2
|
|
+#define BLKCG_MAX_POLS 3
|
|
|
|
typedef void (rq_end_io_fn)(struct request *, int);
|
|
|
|
@@ -151,6 +152,7 @@ struct request {
|
|
struct gendisk *rq_disk;
|
|
struct hd_struct *part;
|
|
unsigned long start_time;
|
|
+ struct blk_issue_stat issue_stat;
|
|
#ifdef CONFIG_BLK_CGROUP
|
|
struct request_list *rl; /* rl this rq is alloced from */
|
|
unsigned long long start_time_ns;
|
|
@@ -302,6 +304,8 @@ struct request_queue {
|
|
int nr_rqs[2]; /* # allocated [a]sync rqs */
|
|
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
|
|
|
|
+ struct rq_wb *rq_wb;
|
|
+
|
|
/*
|
|
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
|
|
* is used, root blkg allocates from @q->root_rl and all other
|
|
@@ -327,6 +331,8 @@ struct request_queue {
|
|
struct blk_mq_ctx __percpu *queue_ctx;
|
|
unsigned int nr_queues;
|
|
|
|
+ unsigned int queue_depth;
|
|
+
|
|
/* hw dispatch queues */
|
|
struct blk_mq_hw_ctx **queue_hw_ctx;
|
|
unsigned int nr_hw_queues;
|
|
@@ -412,6 +418,9 @@ struct request_queue {
|
|
|
|
unsigned int nr_sorted;
|
|
unsigned int in_flight[2];
|
|
+
|
|
+ struct blk_rq_stat rq_stats[2];
|
|
+
|
|
/*
|
|
* Number of active block driver functions for which blk_drain_queue()
|
|
* must wait. Must be incremented around functions that unlock the
|
|
@@ -683,6 +692,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
|
|
return false;
|
|
}
|
|
|
|
+static inline unsigned int blk_queue_depth(struct request_queue *q)
|
|
+{
|
|
+ if (q->queue_depth)
|
|
+ return q->queue_depth;
|
|
+
|
|
+ return q->nr_requests;
|
|
+}
|
|
+
|
|
/*
|
|
* q->prep_rq_fn return values
|
|
*/
|
|
@@ -999,6 +1016,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
|
|
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
|
|
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
|
|
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
|
|
+extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
|
|
extern void blk_set_default_limits(struct queue_limits *lim);
|
|
extern void blk_set_stacking_limits(struct queue_limits *lim);
|
|
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
|
|
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
|
|
index dd03e83..2fda682 100644
|
|
--- a/include/linux/freezer.h
|
|
+++ b/include/linux/freezer.h
|
|
@@ -296,6 +296,7 @@ static inline void set_freezable(void) {}
|
|
#define wait_event_freezekillable_unsafe(wq, condition) \
|
|
wait_event_killable(wq, condition)
|
|
|
|
+#define pm_freezing (false)
|
|
#endif /* !CONFIG_FREEZER */
|
|
|
|
#endif /* FREEZER_H_INCLUDED */
|
|
diff --git a/include/linux/fs.h b/include/linux/fs.h
|
|
index dc0478c..ea1b019 100644
|
|
--- a/include/linux/fs.h
|
|
+++ b/include/linux/fs.h
|
|
@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
|
* WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
|
|
* by a cache flush and data is guaranteed to be on
|
|
* non-volatile media on completion.
|
|
+ * WRITE_BG Background write. This is for background activity like
|
|
+ * the periodic flush and background threshold writeback
|
|
*
|
|
*/
|
|
#define RW_MASK REQ_OP_WRITE
|
|
@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
|
#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
|
|
#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
|
|
#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
|
|
+#define WRITE_BG (REQ_NOIDLE | REQ_BG)
|
|
|
|
/*
|
|
* Attribute flags. These should be or-ed together to figure out what
|
|
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
|
|
index 325f649..ac7a68f 100644
|
|
--- a/include/linux/init_task.h
|
|
+++ b/include/linux/init_task.h
|
|
@@ -159,8 +159,6 @@ extern struct task_group root_task_group;
|
|
# define INIT_VTIME(tsk)
|
|
#endif
|
|
|
|
-#define INIT_TASK_COMM "swapper"
|
|
-
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
# define INIT_RT_MUTEXES(tsk) \
|
|
.pi_waiters = RB_ROOT, \
|
|
@@ -197,6 +195,78 @@ extern struct task_group root_task_group;
|
|
* INIT_TASK is used to set up the first task table, touch at
|
|
* your own risk!. Base=0, limit=0x1fffff (=2MB)
|
|
*/
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#define INIT_TASK_COMM "MuQSS"
|
|
+#define INIT_TASK(tsk) \
|
|
+{ \
|
|
+ INIT_TASK_TI(tsk) \
|
|
+ .state = 0, \
|
|
+ .stack = init_stack, \
|
|
+ .usage = ATOMIC_INIT(2), \
|
|
+ .flags = PF_KTHREAD, \
|
|
+ .prio = NORMAL_PRIO, \
|
|
+ .static_prio = MAX_PRIO-20, \
|
|
+ .normal_prio = NORMAL_PRIO, \
|
|
+ .deadline = 0, \
|
|
+ .policy = SCHED_NORMAL, \
|
|
+ .cpus_allowed = CPU_MASK_ALL, \
|
|
+ .mm = NULL, \
|
|
+ .active_mm = &init_mm, \
|
|
+ .restart_block = { \
|
|
+ .fn = do_no_restart_syscall, \
|
|
+ }, \
|
|
+ .time_slice = 1000000, \
|
|
+ .tasks = LIST_HEAD_INIT(tsk.tasks), \
|
|
+ INIT_PUSHABLE_TASKS(tsk) \
|
|
+ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
|
|
+ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
|
|
+ .real_parent = &tsk, \
|
|
+ .parent = &tsk, \
|
|
+ .children = LIST_HEAD_INIT(tsk.children), \
|
|
+ .sibling = LIST_HEAD_INIT(tsk.sibling), \
|
|
+ .group_leader = &tsk, \
|
|
+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
|
|
+ RCU_POINTER_INITIALIZER(cred, &init_cred), \
|
|
+ .comm = INIT_TASK_COMM, \
|
|
+ .thread = INIT_THREAD, \
|
|
+ .fs = &init_fs, \
|
|
+ .files = &init_files, \
|
|
+ .signal = &init_signals, \
|
|
+ .sighand = &init_sighand, \
|
|
+ .nsproxy = &init_nsproxy, \
|
|
+ .pending = { \
|
|
+ .list = LIST_HEAD_INIT(tsk.pending.list), \
|
|
+ .signal = {{0}}}, \
|
|
+ .blocked = {{0}}, \
|
|
+ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
|
|
+ .journal_info = NULL, \
|
|
+ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
|
|
+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
|
|
+ .timer_slack_ns = 50000, /* 50 usec default slack */ \
|
|
+ .pids = { \
|
|
+ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
|
|
+ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
|
|
+ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
|
|
+ }, \
|
|
+ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \
|
|
+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
|
|
+ INIT_IDS \
|
|
+ INIT_PERF_EVENTS(tsk) \
|
|
+ INIT_TRACE_IRQFLAGS \
|
|
+ INIT_LOCKDEP \
|
|
+ INIT_FTRACE_GRAPH \
|
|
+ INIT_TRACE_RECURSION \
|
|
+ INIT_TASK_RCU_PREEMPT(tsk) \
|
|
+ INIT_TASK_RCU_TASKS(tsk) \
|
|
+ INIT_CPUSET_SEQ(tsk) \
|
|
+ INIT_RT_MUTEXES(tsk) \
|
|
+ INIT_PREV_CPUTIME(tsk) \
|
|
+ INIT_VTIME(tsk) \
|
|
+ INIT_NUMA_BALANCING(tsk) \
|
|
+ INIT_KASAN(tsk) \
|
|
+}
|
|
+#else /* CONFIG_SCHED_MUQSS */
|
|
+#define INIT_TASK_COMM "swapper"
|
|
#define INIT_TASK(tsk) \
|
|
{ \
|
|
INIT_TASK_TI(tsk) \
|
|
@@ -272,7 +342,7 @@ extern struct task_group root_task_group;
|
|
INIT_NUMA_BALANCING(tsk) \
|
|
INIT_KASAN(tsk) \
|
|
}
|
|
-
|
|
+#endif /* CONFIG_SCHED_MUQSS */
|
|
|
|
#define INIT_CPU_TIMERS(cpu_timers) \
|
|
{ \
|
|
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
|
|
index 8c12390..ebe98b9 100644
|
|
--- a/include/linux/ioprio.h
|
|
+++ b/include/linux/ioprio.h
|
|
@@ -51,6 +51,8 @@ enum {
|
|
*/
|
|
static inline int task_nice_ioprio(struct task_struct *task)
|
|
{
|
|
+ if (iso_task(task))
|
|
+ return 0;
|
|
return (task_nice(task) + 20) / 5;
|
|
}
|
|
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 75d9a57..40a31ab 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -59,6 +59,7 @@ struct sched_param {
|
|
#include <linux/gfp.h>
|
|
#include <linux/magic.h>
|
|
#include <linux/cgroup-defs.h>
|
|
+#include <linux/skip_list.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
@@ -176,7 +177,7 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
|
|
|
|
extern void calc_global_load(unsigned long ticks);
|
|
|
|
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
|
|
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
|
|
extern void cpu_load_update_nohz_start(void);
|
|
extern void cpu_load_update_nohz_stop(void);
|
|
#else
|
|
@@ -340,8 +341,6 @@ extern void init_idle_bootup_task(struct task_struct *idle);
|
|
|
|
extern cpumask_var_t cpu_isolated_map;
|
|
|
|
-extern int runqueue_is_locked(int cpu);
|
|
-
|
|
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
|
|
extern void nohz_balance_enter_idle(int cpu);
|
|
extern void set_cpu_sd_state_idle(void);
|
|
@@ -438,6 +437,34 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
|
|
extern signed long schedule_timeout_killable(signed long timeout);
|
|
extern signed long schedule_timeout_uninterruptible(signed long timeout);
|
|
extern signed long schedule_timeout_idle(signed long timeout);
|
|
+
|
|
+#ifdef CONFIG_HIGH_RES_TIMERS
|
|
+extern signed long schedule_msec_hrtimeout(signed long timeout);
|
|
+extern signed long schedule_min_hrtimeout(void);
|
|
+extern signed long schedule_msec_hrtimeout_interruptible(signed long timeout);
|
|
+extern signed long schedule_msec_hrtimeout_uninterruptible(signed long timeout);
|
|
+#else
|
|
+static inline signed long schedule_msec_hrtimeout(signed long timeout)
|
|
+{
|
|
+ return schedule_timeout(msecs_to_jiffies(timeout));
|
|
+}
|
|
+
|
|
+static inline signed long schedule_min_hrtimeout(void)
|
|
+{
|
|
+ return schedule_timeout(1);
|
|
+}
|
|
+
|
|
+static inline signed long schedule_msec_hrtimeout_interruptible(signed long timeout)
|
|
+{
|
|
+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout));
|
|
+}
|
|
+
|
|
+static inline signed long schedule_msec_hrtimeout_uninterruptible(signed long timeout)
|
|
+{
|
|
+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout));
|
|
+}
|
|
+#endif
|
|
+
|
|
asmlinkage void schedule(void);
|
|
extern void schedule_preempt_disabled(void);
|
|
|
|
@@ -1486,9 +1513,11 @@ struct task_struct {
|
|
unsigned int flags; /* per process flags, defined below */
|
|
unsigned int ptrace;
|
|
|
|
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS)
|
|
+ int on_cpu;
|
|
+#endif
|
|
#ifdef CONFIG_SMP
|
|
struct llist_node wake_entry;
|
|
- int on_cpu;
|
|
#ifdef CONFIG_THREAD_INFO_IN_TASK
|
|
unsigned int cpu; /* current CPU */
|
|
#endif
|
|
@@ -1499,12 +1528,26 @@ struct task_struct {
|
|
int wake_cpu;
|
|
#endif
|
|
int on_rq;
|
|
-
|
|
int prio, static_prio, normal_prio;
|
|
unsigned int rt_priority;
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+ int time_slice;
|
|
+ u64 deadline;
|
|
+ skiplist_node node; /* Skip list node */
|
|
+ u64 last_ran;
|
|
+ u64 sched_time; /* sched_clock time spent running */
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+ int smt_bias; /* Policy/nice level bias across smt siblings */
|
|
+#endif
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
+ bool zerobound; /* Bound to CPU0 for hotplug */
|
|
+#endif
|
|
+ unsigned long rt_timeout;
|
|
+#else /* CONFIG_SCHED_MUQSS */
|
|
const struct sched_class *sched_class;
|
|
struct sched_entity se;
|
|
struct sched_rt_entity rt;
|
|
+#endif
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
struct task_group *sched_task_group;
|
|
#endif
|
|
@@ -1628,6 +1671,10 @@ struct task_struct {
|
|
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
|
|
|
|
cputime_t utime, stime, utimescaled, stimescaled;
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+ /* Unbanked cpu time */
|
|
+ unsigned long utime_ns, stime_ns;
|
|
+#endif
|
|
cputime_t gtime;
|
|
struct prev_cputime prev_cputime;
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
@@ -1984,6 +2031,40 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
|
|
}
|
|
#endif
|
|
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#define tsk_seruntime(t) ((t)->sched_time)
|
|
+#define tsk_rttimeout(t) ((t)->rt_timeout)
|
|
+
|
|
+static inline void tsk_cpus_current(struct task_struct *p)
|
|
+{
|
|
+}
|
|
+
|
|
+void print_scheduler_version(void);
|
|
+
|
|
+static inline bool iso_task(struct task_struct *p)
|
|
+{
|
|
+ return (p->policy == SCHED_ISO);
|
|
+}
|
|
+#else /* CFS */
|
|
+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
|
|
+#define tsk_rttimeout(t) ((t)->rt.timeout)
|
|
+
|
|
+static inline void tsk_cpus_current(struct task_struct *p)
|
|
+{
|
|
+ p->nr_cpus_allowed = current->nr_cpus_allowed;
|
|
+}
|
|
+
|
|
+static inline void print_scheduler_version(void)
|
|
+{
|
|
+ printk(KERN_INFO"CFS CPU scheduler.\n");
|
|
+}
|
|
+
|
|
+static inline bool iso_task(struct task_struct *p)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+#endif /* CONFIG_SCHED_MUQSS */
|
|
+
|
|
/* Future-safe accessor for struct task_struct's cpus_allowed. */
|
|
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
|
|
|
|
@@ -2437,7 +2518,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
|
|
}
|
|
#endif
|
|
|
|
-#ifdef CONFIG_NO_HZ_COMMON
|
|
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
|
|
void calc_load_enter_idle(void);
|
|
void calc_load_exit_idle(void);
|
|
#else
|
|
@@ -2538,7 +2619,7 @@ extern unsigned long long
|
|
task_sched_runtime(struct task_struct *task);
|
|
|
|
/* sched_exec is called by processes performing an exec */
|
|
-#ifdef CONFIG_SMP
|
|
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS)
|
|
extern void sched_exec(void);
|
|
#else
|
|
#define sched_exec() {}
|
|
@@ -3503,7 +3584,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
|
|
return 0;
|
|
}
|
|
|
|
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
+static inline void set_task_cpu(struct task_struct *p, int cpu)
|
|
{
|
|
}
|
|
|
|
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
|
|
index d9cf5a5..94d397e 100644
|
|
--- a/include/linux/sched/prio.h
|
|
+++ b/include/linux/sched/prio.h
|
|
@@ -19,8 +19,20 @@
|
|
*/
|
|
|
|
#define MAX_USER_RT_PRIO 100
|
|
+
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+/* Note different MAX_RT_PRIO */
|
|
+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1)
|
|
+
|
|
+#define ISO_PRIO (MAX_RT_PRIO)
|
|
+#define NORMAL_PRIO (MAX_RT_PRIO + 1)
|
|
+#define IDLE_PRIO (MAX_RT_PRIO + 2)
|
|
+#define PRIO_LIMIT ((IDLE_PRIO) + 1)
|
|
+#else /* CONFIG_SCHED_MUQSS */
|
|
#define MAX_RT_PRIO MAX_USER_RT_PRIO
|
|
|
|
+#endif /* CONFIG_SCHED_MUQSS */
|
|
+
|
|
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
|
|
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
|
|
|
|
diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h
|
|
new file mode 100644
|
|
index 0000000..d4be84b
|
|
--- /dev/null
|
|
+++ b/include/linux/skip_list.h
|
|
@@ -0,0 +1,33 @@
|
|
+#ifndef _LINUX_SKIP_LISTS_H
|
|
+#define _LINUX_SKIP_LISTS_H
|
|
+typedef u64 keyType;
|
|
+typedef void *valueType;
|
|
+
|
|
+typedef struct nodeStructure skiplist_node;
|
|
+
|
|
+struct nodeStructure {
|
|
+ int level; /* Levels in this structure */
|
|
+ keyType key;
|
|
+ valueType value;
|
|
+ skiplist_node *next[8];
|
|
+ skiplist_node *prev[8];
|
|
+};
|
|
+
|
|
+typedef struct listStructure {
|
|
+ int entries;
|
|
+ int level; /* Maximum level of the list
|
|
+ (1 more than the number of levels in the list) */
|
|
+ skiplist_node *header; /* pointer to header */
|
|
+} skiplist;
|
|
+
|
|
+void skiplist_init(skiplist_node *slnode);
|
|
+skiplist *new_skiplist(skiplist_node *slnode);
|
|
+void free_skiplist(skiplist *l);
|
|
+void skiplist_node_init(skiplist_node *node);
|
|
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed);
|
|
+void skiplist_delete(skiplist *l, skiplist_node *node);
|
|
+
|
|
+static inline bool skiplist_node_empty(skiplist_node *node) {
|
|
+ return (!node->next[0]);
|
|
+}
|
|
+#endif /* _LINUX_SKIP_LISTS_H */
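For orientation, a minimal sketch of how the skip list API declared above might be driven. The embedded task_struct node and the u64 deadline key mirror the MuQSS hunks later in this patch; the standalone header node, list pointer, randseed value and the sketch function itself are illustrative assumptions, not code from the patch:

	#include <linux/sched.h>
	#include <linux/skip_list.h>

	static skiplist_node sl_header;	/* header node supplied by the caller */
	static skiplist *sl;		/* list allocated around that header */

	static void skiplist_sketch(struct task_struct *p, u64 deadline,
				    unsigned int randseed)
	{
		skiplist_init(&sl_header);
		sl = new_skiplist(&sl_header);

		skiplist_node_init(&p->node);	/* embedded node starts out empty */
		skiplist_insert(sl, &p->node, deadline, p, randseed);

		if (!skiplist_node_empty(&p->node))
			skiplist_delete(sl, &p->node);

		free_skiplist(sl);
	}

The key and value types are fixed by the header (u64 keys, void * values), so the task pointer itself is stored as the value under its deadline key.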
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
|
|
index 797100e..f12f0b3 100644
|
|
--- a/include/linux/writeback.h
|
|
+++ b/include/linux/writeback.h
|
|
@@ -100,6 +100,16 @@ struct writeback_control {
|
|
#endif
|
|
};
|
|
|
|
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
|
|
+{
|
|
+ if (wbc->sync_mode == WB_SYNC_ALL)
|
|
+ return WRITE_SYNC;
|
|
+ else if (wbc->for_kupdate || wbc->for_background)
|
|
+ return WRITE_BG;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
|
|
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
|
|
* and are measured against each other in. There always is one global
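The filesystem hunks earlier in this patch (fs/buffer.c, fs/f2fs, fs/gfs2, fs/mpage.c, fs/xfs) all reduce to the same call pattern around this helper; a condensed sketch mirroring the xfs_aops.c change, with the bio assumed to be already built by the caller:

	static void writeback_submit_sketch(struct writeback_control *wbc,
					    struct bio *bio)
	{
		/*
		 * WRITE_SYNC for data-integrity writeback, WRITE_BG for
		 * kupdate/background writeback, 0 otherwise.
		 */
		int op_flags = wbc_to_write_flags(wbc);

		bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
		submit_bio(bio);
	}

Tagging background writes with REQ_BG is what lets the writeback throttling code added in block/blk-wbt.c hold them to a lower queue depth than foreground I/O.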
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
|
|
new file mode 100644
|
|
index 0000000..3c518e4
|
|
--- /dev/null
|
|
+++ b/include/trace/events/wbt.h
|
|
@@ -0,0 +1,153 @@
|
|
+#undef TRACE_SYSTEM
|
|
+#define TRACE_SYSTEM wbt
|
|
+
|
|
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
+#define _TRACE_WBT_H
|
|
+
|
|
+#include <linux/tracepoint.h>
|
|
+#include "../../../block/blk-wbt.h"
|
|
+
|
|
+/**
|
|
+ * wbt_stat - trace stats for blk_wb
|
|
+ * @stat: array of read/write stats
|
|
+ */
|
|
+TRACE_EVENT(wbt_stat,
|
|
+
|
|
+ TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
|
|
+
|
|
+ TP_ARGS(bdi, stat),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, name, 32)
|
|
+ __field(s64, rmean)
|
|
+ __field(u64, rmin)
|
|
+ __field(u64, rmax)
|
|
+ __field(s64, rnr_samples)
|
|
+ __field(s64, rtime)
|
|
+ __field(s64, wmean)
|
|
+ __field(u64, wmin)
|
|
+ __field(u64, wmax)
|
|
+ __field(s64, wnr_samples)
|
|
+ __field(s64, wtime)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
|
|
+ __entry->rmean = stat[0].mean;
|
|
+ __entry->rmin = stat[0].min;
|
|
+ __entry->rmax = stat[0].max;
|
|
+ __entry->rnr_samples = stat[0].nr_samples;
|
|
+ __entry->wmean = stat[1].mean;
|
|
+ __entry->wmin = stat[1].min;
|
|
+ __entry->wmax = stat[1].max;
|
|
+ __entry->wnr_samples = stat[1].nr_samples;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
|
|
+ "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
|
|
+ __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
|
|
+ __entry->rnr_samples, __entry->wmean, __entry->wmin,
|
|
+ __entry->wmax, __entry->wnr_samples)
|
|
+);
|
|
+
|
|
+/**
|
|
+ * wbt_lat - trace latency event
|
|
+ * @lat: latency trigger
|
|
+ */
|
|
+TRACE_EVENT(wbt_lat,
|
|
+
|
|
+ TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
|
|
+
|
|
+ TP_ARGS(bdi, lat),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, name, 32)
|
|
+ __field(unsigned long, lat)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
|
|
+ __entry->lat = div_u64(lat, 1000);
|
|
+ ),
|
|
+
|
|
+ TP_printk("%s: latency %lluus\n", __entry->name,
|
|
+ (unsigned long long) __entry->lat)
|
|
+);
|
|
+
|
|
+/**
|
|
+ * wbt_step - trace wb event step
|
|
+ * @msg: context message
|
|
+ * @step: the current scale step count
|
|
+ * @window: the current monitoring window
|
|
+ * @bg: the current background queue limit
|
|
+ * @normal: the current normal writeback limit
|
|
+ * @max: the current max throughput writeback limit
|
|
+ */
|
|
+TRACE_EVENT(wbt_step,
|
|
+
|
|
+ TP_PROTO(struct backing_dev_info *bdi, const char *msg,
|
|
+ int step, unsigned long window, unsigned int bg,
|
|
+ unsigned int normal, unsigned int max),
|
|
+
|
|
+ TP_ARGS(bdi, msg, step, window, bg, normal, max),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, name, 32)
|
|
+ __field(const char *, msg)
|
|
+ __field(int, step)
|
|
+ __field(unsigned long, window)
|
|
+ __field(unsigned int, bg)
|
|
+ __field(unsigned int, normal)
|
|
+ __field(unsigned int, max)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
|
|
+ __entry->msg = msg;
|
|
+ __entry->step = step;
|
|
+ __entry->window = div_u64(window, 1000);
|
|
+ __entry->bg = bg;
|
|
+ __entry->normal = normal;
|
|
+ __entry->max = max;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
|
|
+ __entry->name, __entry->msg, __entry->step, __entry->window,
|
|
+ __entry->bg, __entry->normal, __entry->max)
|
|
+);
|
|
+
|
|
+/**
|
|
+ * wbt_timer - trace wb timer event
|
|
+ * @status: timer state status
|
|
+ * @step: the current scale step count
|
|
+ * @inflight: tracked writes inflight
|
|
+ */
|
|
+TRACE_EVENT(wbt_timer,
|
|
+
|
|
+ TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
|
|
+ int step, unsigned int inflight),
|
|
+
|
|
+ TP_ARGS(bdi, status, step, inflight),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, name, 32)
|
|
+ __field(unsigned int, status)
|
|
+ __field(int, step)
|
|
+ __field(unsigned int, inflight)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
|
|
+ __entry->status = status;
|
|
+ __entry->step = step;
|
|
+ __entry->inflight = inflight;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
|
|
+ __entry->status, __entry->step, __entry->inflight)
|
|
+);
|
|
+
|
|
+#endif /* _TRACE_WBT_H */
|
|
+
|
|
+/* This part must be outside protection */
|
|
+#include <trace/define_trace.h>
|
|
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
|
|
index 5f0fe01..950a481 100644
|
|
--- a/include/uapi/linux/sched.h
|
|
+++ b/include/uapi/linux/sched.h
|
|
@@ -36,9 +36,16 @@
|
|
#define SCHED_FIFO 1
|
|
#define SCHED_RR 2
|
|
#define SCHED_BATCH 3
|
|
-/* SCHED_ISO: reserved but not implemented yet */
|
|
+/* SCHED_ISO: Implemented on MuQSS only */
|
|
#define SCHED_IDLE 5
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#define SCHED_ISO 4
|
|
+#define SCHED_IDLEPRIO SCHED_IDLE
|
|
+#define SCHED_MAX (SCHED_IDLEPRIO)
|
|
+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
|
|
+#else /* CONFIG_SCHED_MUQSS */
|
|
#define SCHED_DEADLINE 6
|
|
+#endif /* CONFIG_SCHED_MUQSS */
|
|
|
|
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
|
|
#define SCHED_RESET_ON_FORK 0x40000000
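With SCHED_ISO now an accepted policy value (4) under MuQSS, userspace can request it through the ordinary sched_setscheduler() interface. A small sketch, spelling out the numeric value because C libraries do not define SCHED_ISO:

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		/* 4 == SCHED_ISO on a CONFIG_SCHED_MUQSS kernel */
		if (sched_setscheduler(0, 4, &sp) == -1)
			perror("sched_setscheduler");
		return 0;
	}

Per the sched_iso_cpu comment in the MuQSS code, such tasks run as (unprivileged) real time tasks only up to a tunable percentage of total CPU over its averaging period, after which they fall back to normal scheduling.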
diff --git a/init/Kconfig b/init/Kconfig
|
|
index 8963254..c69df07 100644
|
|
--- a/init/Kconfig
|
|
+++ b/init/Kconfig
|
|
@@ -38,6 +38,18 @@ config THREAD_INFO_IN_TASK
|
|
|
|
menu "General setup"
|
|
|
|
+config SCHED_MUQSS
|
|
+ bool "MuQSS cpu scheduler"
|
|
+ select HIGH_RES_TIMERS
|
|
+ ---help---
|
|
+ The Multiple Queue Skiplist Scheduler for excellent interactivity and
|
|
+ responsiveness on the desktop and highly scalable deterministic
|
|
+ low latency on any hardware.
|
|
+
|
|
+ Say Y here.
|
|
+ default y
|
|
+
|
|
+
|
|
config BROKEN
|
|
bool
|
|
|
|
@@ -550,7 +562,7 @@ config CONTEXT_TRACKING
|
|
config CONTEXT_TRACKING_FORCE
|
|
bool "Force context tracking"
|
|
depends on CONTEXT_TRACKING
|
|
- default y if !NO_HZ_FULL
|
|
+ default y if !NO_HZ_FULL && !SCHED_MUQSS
|
|
help
|
|
The major pre-requirement for full dynticks to work is to
|
|
support the context tracking subsystem. But there are also
|
|
@@ -940,6 +952,7 @@ config NUMA_BALANCING
|
|
depends on ARCH_SUPPORTS_NUMA_BALANCING
|
|
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
|
|
depends on SMP && NUMA && MIGRATION
|
|
+ depends on !SCHED_MUQSS
|
|
help
|
|
This option adds support for automatic NUMA aware memory/task placement.
|
|
The mechanism is quite primitive and is based on migrating memory when
|
|
@@ -1042,9 +1055,13 @@ menuconfig CGROUP_SCHED
|
|
help
|
|
This feature lets CPU scheduler recognize task groups and control CPU
|
|
bandwidth allocation to such task groups. It uses cgroups to group
|
|
- tasks.
|
|
+ tasks. In combination with MuQSS this is purely a STUB to create the
|
|
+ files associated with the CPU controller cgroup but most of the
|
|
+ controls do nothing. This is useful for working in environments and
|
|
+ with applications that will only work if this control group is
|
|
+ present.
|
|
|
|
-if CGROUP_SCHED
|
|
+if CGROUP_SCHED && !SCHED_MUQSS
|
|
config FAIR_GROUP_SCHED
|
|
bool "Group scheduling for SCHED_OTHER"
|
|
depends on CGROUP_SCHED
|
|
@@ -1140,6 +1157,7 @@ config CGROUP_DEVICE
|
|
|
|
config CGROUP_CPUACCT
|
|
bool "Simple CPU accounting controller"
|
|
+ depends on !SCHED_MUQSS
|
|
help
|
|
Provides a simple controller for monitoring the
|
|
total CPU consumed by the tasks in a cgroup.
|
|
@@ -1238,6 +1256,7 @@ endif # NAMESPACES
|
|
|
|
config SCHED_AUTOGROUP
|
|
bool "Automatic process group scheduling"
|
|
+ depends on !SCHED_MUQSS
|
|
select CGROUPS
|
|
select CGROUP_SCHED
|
|
select FAIR_GROUP_SCHED
|
|
diff --git a/init/main.c b/init/main.c
|
|
index 2858be7..5ec4c43 100644
|
|
--- a/init/main.c
|
|
+++ b/init/main.c
|
|
@@ -793,7 +793,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
|
|
return ret;
|
|
}
|
|
|
|
-
|
|
extern initcall_t __initcall_start[];
|
|
extern initcall_t __initcall0_start[];
|
|
extern initcall_t __initcall1_start[];
|
|
@@ -952,6 +951,8 @@ static int __ref kernel_init(void *unused)
|
|
|
|
rcu_end_inkernel_boot();
|
|
|
|
+ print_scheduler_version();
|
|
+
|
|
if (ramdisk_execute_command) {
|
|
ret = run_init_process(ramdisk_execute_command);
|
|
if (!ret)
|
|
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
|
|
index 2a202a8..f97168d 100644
|
|
--- a/kernel/Kconfig.hz
|
|
+++ b/kernel/Kconfig.hz
|
|
@@ -4,7 +4,8 @@
|
|
|
|
choice
|
|
prompt "Timer frequency"
|
|
- default HZ_250
|
|
+ default HZ_100_MUQSS if SCHED_MUQSS
|
|
+ default HZ_250 if !SCHED_MUQSS
|
|
help
|
|
Allows the configuration of the timer frequency. It is customary
|
|
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
|
|
@@ -16,15 +17,30 @@ choice
|
|
per second.
|
|
|
|
|
|
+ config HZ_100_MUQSS
|
|
+ bool "100 HZ" if SCHED_MUQSS
|
|
+ help
|
|
+ 100 Hz is a suitable choice in combination with MuQSS which does
|
|
+ not rely on ticks for rescheduling interrupts, and is not Hz limited
|
|
+ for timeouts and sleeps from both the kernel and userspace.
|
|
+ This allows us to benefit from the lower overhead and higher
|
|
+ throughput of fewer timer ticks.
|
|
+
|
|
config HZ_100
|
|
- bool "100 HZ"
|
|
+ bool "100 HZ" if !SCHED_MUQSS
|
|
help
|
|
100 Hz is a typical choice for servers, SMP and NUMA systems
|
|
with lots of processors that may show reduced performance if
|
|
too many timer interrupts are occurring.
|
|
|
|
+ config HZ_250_MUQSS
|
|
+ bool "250 HZ" if SCHED_MUQSS
|
|
+ help
|
|
+ 250 Hz is the default choice for the mainline scheduler but not
|
|
+ advantageous in combination with MuQSS.
|
|
+
|
|
config HZ_250
|
|
- bool "250 HZ"
|
|
+ bool "250 HZ" if !SCHED_MUQSS
|
|
help
|
|
250 Hz is a good compromise choice allowing server performance
|
|
while also showing good interactive responsiveness even
|
|
@@ -49,7 +65,9 @@ endchoice
|
|
|
|
config HZ
|
|
int
|
|
+ default 100 if HZ_100_MUQSS
|
|
default 100 if HZ_100
|
|
+ default 250 if HZ_250_MUQSS
|
|
default 250 if HZ_250
|
|
default 300 if HZ_300
|
|
default 1000 if HZ_1000
|
|
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
|
|
index 3f9c974..1dc79ec 100644
|
|
--- a/kernel/Kconfig.preempt
|
|
+++ b/kernel/Kconfig.preempt
|
|
@@ -1,7 +1,7 @@
|
|
|
|
choice
|
|
prompt "Preemption Model"
|
|
- default PREEMPT_NONE
|
|
+ default PREEMPT
|
|
|
|
config PREEMPT_NONE
|
|
bool "No Forced Preemption (Server)"
|
|
@@ -17,7 +17,7 @@ config PREEMPT_NONE
|
|
latencies.
|
|
|
|
config PREEMPT_VOLUNTARY
|
|
- bool "Voluntary Kernel Preemption (Desktop)"
|
|
+ bool "Voluntary Kernel Preemption (Nothing)"
|
|
help
|
|
This option reduces the latency of the kernel by adding more
|
|
"explicit preemption points" to the kernel code. These new
|
|
@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY
|
|
applications to run more 'smoothly' even when the system is
|
|
under load.
|
|
|
|
- Select this if you are building a kernel for a desktop system.
|
|
+ Select this for no system in particular (choose Preemptible
|
|
+ instead on a desktop if you know what's good for you).
|
|
|
|
config PREEMPT
|
|
bool "Preemptible Kernel (Low-Latency Desktop)"
|
|
diff --git a/kernel/Makefile b/kernel/Makefile
|
|
index e0c2268..cffa2c7 100644
|
|
--- a/kernel/Makefile
|
|
+++ b/kernel/Makefile
|
|
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
|
|
extable.o params.o \
|
|
kthread.o sys_ni.o nsproxy.o \
|
|
notifier.o ksysfs.o cred.o reboot.o \
|
|
- async.o range.o smpboot.o ucount.o
|
|
+ async.o range.o smpboot.o ucount.o skip_list.o
|
|
|
|
obj-$(CONFIG_MULTIUSER) += groups.o
|
|
|
|
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
|
|
index 435c14a..a80d56d 100644
|
|
--- a/kernel/delayacct.c
|
|
+++ b/kernel/delayacct.c
|
|
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
|
|
*/
|
|
t1 = tsk->sched_info.pcount;
|
|
t2 = tsk->sched_info.run_delay;
|
|
- t3 = tsk->se.sum_exec_runtime;
|
|
+ t3 = tsk_seruntime(tsk);
|
|
|
|
d->cpu_count += t1;
|
|
|
|
diff --git a/kernel/exit.c b/kernel/exit.c
|
|
index 3076f30..d749749 100644
|
|
--- a/kernel/exit.c
|
|
+++ b/kernel/exit.c
|
|
@@ -134,7 +134,7 @@ static void __exit_signal(struct task_struct *tsk)
|
|
sig->inblock += task_io_get_inblock(tsk);
|
|
sig->oublock += task_io_get_oublock(tsk);
|
|
task_io_accounting_add(&sig->ioac, &tsk->ioac);
|
|
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
|
|
+ sig->sum_sched_runtime += tsk_seruntime(tsk);
|
|
sig->nr_threads--;
|
|
__unhash_process(tsk, group_dead);
|
|
write_sequnlock(&sig->stats_lock);
|
|
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
|
|
index 3bbfd6a..351bf16 100644
|
|
--- a/kernel/irq/Kconfig
|
|
+++ b/kernel/irq/Kconfig
|
|
@@ -95,6 +95,20 @@ config IRQ_DOMAIN_DEBUG
|
|
config IRQ_FORCED_THREADING
|
|
bool
|
|
|
|
+config FORCE_IRQ_THREADING
|
|
+ bool "Make IRQ threading compulsory"
|
|
+ depends on IRQ_FORCED_THREADING
|
|
+ default y
|
|
+ ---help---
|
|
+
|
|
+ Make IRQ threading mandatory for any IRQ handlers that support it
|
|
+ instead of being optional and requiring the threadirqs kernel
|
|
+ parameter. Instead they can be optionally disabled with the
|
|
+ nothreadirqs kernel parameter.
|
|
+
|
|
+ Enable if you are building for a desktop or low latency system,
|
|
+ otherwise say N.
|
|
+
|
|
config SPARSE_IRQ
|
|
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
|
|
---help---
|
|
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
|
|
index 6b66959..6b3fb17 100644
|
|
--- a/kernel/irq/manage.c
|
|
+++ b/kernel/irq/manage.c
|
|
@@ -22,7 +22,17 @@
|
|
#include "internals.h"
|
|
|
|
#ifdef CONFIG_IRQ_FORCED_THREADING
|
|
+#ifdef CONFIG_FORCE_IRQ_THREADING
|
|
+__read_mostly bool force_irqthreads = true;
|
|
+#else
|
|
__read_mostly bool force_irqthreads;
|
|
+#endif
|
|
+static int __init setup_noforced_irqthreads(char *arg)
|
|
+{
|
|
+ force_irqthreads = false;
|
|
+ return 0;
|
|
+}
|
|
+early_param("nothreadirqs", setup_noforced_irqthreads);
|
|
|
|
static int __init setup_forced_irqthreads(char *arg)
|
|
{
|
|
diff --git a/kernel/kthread.c b/kernel/kthread.c
|
|
index be2cc1f..665e3bd 100644
|
|
--- a/kernel/kthread.c
|
|
+++ b/kernel/kthread.c
|
|
@@ -381,6 +381,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
|
|
}
|
|
EXPORT_SYMBOL(kthread_bind);
|
|
|
|
+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP)
|
|
+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
|
|
+
|
|
+/*
|
|
+ * new_kthread_bind is a special variant of __kthread_bind_mask.
|
|
+ * For new threads to work on muqss we want to call do_set_cpus_allowed
|
|
+ * without the task_cpu being set and the task rescheduled until they're
|
|
+ * rescheduled on their own so we call __do_set_cpus_allowed directly which
|
|
+ * only changes the cpumask. This is particularly important for smpboot threads
|
|
+ * to work.
|
|
+ */
|
|
+static void new_kthread_bind(struct task_struct *p, unsigned int cpu)
|
|
+{
|
|
+ unsigned long flags;
|
|
+
|
|
+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)))
|
|
+ return;
|
|
+
|
|
+ /* It's safe because the task is inactive. */
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
|
|
+ __do_set_cpus_allowed(p, cpumask_of(cpu));
|
|
+ p->flags |= PF_NO_SETAFFINITY;
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
+}
|
|
+#else
|
|
+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu)
|
|
+#endif
|
|
+
|
|
/**
|
|
* kthread_create_on_cpu - Create a cpu bound kthread
|
|
* @threadfn: the function to run until signal_pending(current).
|
|
@@ -402,7 +430,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
|
|
cpu);
|
|
if (IS_ERR(p))
|
|
return p;
|
|
- kthread_bind(p, cpu);
|
|
+ new_kthread_bind(p, cpu);
|
|
/* CPU hotplug need to bind once again when unparking the thread. */
|
|
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
|
|
to_kthread(p)->cpu = cpu;
|
|
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
|
|
index 5e59b83..77bdf98 100644
|
|
--- a/kernel/sched/Makefile
|
|
+++ b/kernel/sched/Makefile
|
|
@@ -15,13 +15,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
|
|
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
|
|
endif
|
|
|
|
-obj-y += core.o loadavg.o clock.o cputime.o
|
|
+ifdef CONFIG_SCHED_MUQSS
|
|
+obj-y += MuQSS.o clock.o
|
|
+else
|
|
+obj-y += core.o loadavg.o clock.o
|
|
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
|
|
-obj-y += wait.o swait.o completion.o idle.o
|
|
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
|
|
+obj-$(CONFIG_SMP) += cpudeadline.o
|
|
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
|
|
-obj-$(CONFIG_SCHEDSTATS) += stats.o
|
|
obj-$(CONFIG_SCHED_DEBUG) += debug.o
|
|
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
|
|
+endif
|
|
+obj-y += wait.o swait.o completion.o idle.o cputime.o
|
|
+obj-$(CONFIG_SMP) += cpupri.o
|
|
+obj-$(CONFIG_SCHEDSTATS) += stats.o
|
|
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
|
|
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
|
|
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
|
|
new file mode 100644
|
|
index 0000000..7617ae4
|
|
--- /dev/null
|
|
+++ b/kernel/sched/MuQSS.c
|
|
@@ -0,0 +1,8033 @@
|
|
+/*
|
|
+ * kernel/sched/MuQSS.c, was kernel/sched.c
|
|
+ *
|
|
+ * Kernel scheduler and related syscalls
|
|
+ *
|
|
+ * Copyright (C) 1991-2002 Linus Torvalds
|
|
+ *
|
|
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
|
|
+ * make semaphores SMP safe
|
|
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
|
|
+ * by Andrea Arcangeli
|
|
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
|
|
+ * hybrid priority-list and round-robin design with
|
|
+ * an array-switch method of distributing timeslices
|
|
+ * and per-CPU runqueues. Cleanups and useful suggestions
|
|
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
|
|
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
|
|
+ * 2004-04-02 Scheduler domains code by Nick Piggin
|
|
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
|
|
+ * fair scheduling design by Con Kolivas.
|
|
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
|
|
+ * by Peter Williams
|
|
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
|
|
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
|
|
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
|
|
+ * Thomas Gleixner, Mike Kravetz
|
|
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
|
|
+ * a whole lot of those previous things.
|
|
+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS
|
|
+ * scheduler by Con Kolivas.
|
|
+ */
|
|
+
|
|
+#include <linux/kasan.h>
|
|
+#include <linux/mm.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/nmi.h>
|
|
+#include <linux/init.h>
|
|
+#include <asm/uaccess.h>
|
|
+#include <linux/highmem.h>
|
|
+#include <linux/mmu_context.h>
|
|
+#include <linux/interrupt.h>
|
|
+#include <linux/capability.h>
|
|
+#include <linux/completion.h>
|
|
+#include <linux/kernel_stat.h>
|
|
+#include <linux/debug_locks.h>
|
|
+#include <linux/perf_event.h>
|
|
+#include <linux/security.h>
|
|
+#include <linux/notifier.h>
|
|
+#include <linux/profile.h>
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/vmalloc.h>
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/delay.h>
|
|
+#include <linux/smp.h>
|
|
+#include <linux/threads.h>
|
|
+#include <linux/timer.h>
|
|
+#include <linux/rcupdate.h>
|
|
+#include <linux/cpu.h>
|
|
+#include <linux/cpuset.h>
|
|
+#include <linux/cpumask.h>
|
|
+#include <linux/percpu.h>
|
|
+#include <linux/proc_fs.h>
|
|
+#include <linux/seq_file.h>
|
|
+#include <linux/syscalls.h>
|
|
+#include <linux/sched/sysctl.h>
|
|
+#include <linux/times.h>
|
|
+#include <linux/tsacct_kern.h>
|
|
+#include <linux/kprobes.h>
|
|
+#include <linux/delayacct.h>
|
|
+#include <linux/log2.h>
|
|
+#include <linux/bootmem.h>
|
|
+#include <linux/ftrace.h>
|
|
+#include <linux/slab.h>
|
|
+#include <linux/init_task.h>
|
|
+#include <linux/binfmts.h>
|
|
+#include <linux/context_tracking.h>
|
|
+#include <linux/sched/prio.h>
|
|
+#include <linux/tick.h>
|
|
+#include <linux/skip_list.h>
|
|
+
|
|
+#include <asm/irq_regs.h>
|
|
+#include <asm/switch_to.h>
|
|
+#include <asm/tlb.h>
|
|
+#include <asm/unistd.h>
|
|
+#include <asm/mutex.h>
|
|
+#ifdef CONFIG_PARAVIRT
|
|
+#include <asm/paravirt.h>
|
|
+#endif
|
|
+
|
|
+#include "cpupri.h"
|
|
+#include "../workqueue_internal.h"
|
|
+#include "../smpboot.h"
|
|
+
|
|
+#define CREATE_TRACE_POINTS
|
|
+#include <trace/events/sched.h>
|
|
+
|
|
+#include "MuQSS.h"
|
|
+
|
|
+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
|
|
+#define rt_task(p) rt_prio((p)->prio)
|
|
+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
|
|
+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
|
|
+ (policy) == SCHED_RR)
|
|
+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
|
|
+
|
|
+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
|
|
+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
|
|
+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
|
|
+
|
|
+#define is_iso_policy(policy) ((policy) == SCHED_ISO)
|
|
+#define iso_task(p) unlikely(is_iso_policy((p)->policy))
|
|
+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
|
|
+
|
|
+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
|
|
+
|
|
+#define ISO_PERIOD (5 * HZ)
|
|
+
|
|
+#define STOP_PRIO (MAX_RT_PRIO - 1)
|
|
+
|
|
+/*
|
|
+ * Some helpers for converting to/from various scales. Use shifts to get
|
|
+ * approximate multiples of ten for less overhead.
|
|
+ */
|
|
+#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ))
|
|
+#define JIFFY_NS (1073741824 / HZ)
|
|
+#define JIFFY_US (1048576 / HZ)
|
|
+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
|
|
+#define HALF_JIFFY_NS (1073741824 / HZ / 2)
|
|
+#define HALF_JIFFY_US (1048576 / HZ / 2)
|
|
+#define MS_TO_NS(TIME) ((TIME) << 20)
|
|
+#define MS_TO_US(TIME) ((TIME) << 10)
|
|
+#define NS_TO_MS(TIME) ((TIME) >> 20)
|
|
+#define NS_TO_US(TIME) ((TIME) >> 10)
|
|
+#define US_TO_NS(TIME) ((TIME) << 10)
|
|
+
|
|
+#define RESCHED_US (100) /* Reschedule if less than this many μs left */
|
|
+
|
|
+void print_scheduler_version(void)
|
|
+{
|
|
+ printk(KERN_INFO "MuQSS CPU scheduler v0.150 by Con Kolivas.\n");
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is the time all tasks within the same priority round robin.
|
|
+ * Value is in ms and set to a minimum of 6ms.
|
|
+ * Tunable via /proc interface.
|
|
+ */
|
|
+int rr_interval __read_mostly = 6;
|
|
+
|
|
+/*
|
|
+ * Tunable to choose whether to prioritise latency or throughput, simple
|
|
+ * binary yes or no
|
|
+ */
|
|
+int sched_interactive __read_mostly = 1;
|
|
+
|
|
+/*
|
|
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
|
|
+ * are allowed to run five seconds as real time tasks. This is the total over
|
|
+ * all online cpus.
|
|
+ */
|
|
+int sched_iso_cpu __read_mostly = 70;
|
|
+
|
|
+/*
|
|
+ * sched_yield_type - Choose what sort of yield sched_yield will perform.
|
|
+ * 0: No yield.
|
|
+ * 1: Yield only to better priority/deadline tasks. (default)
|
|
+ * 2: Expire timeslice and recalculate deadline.
|
|
+ */
|
|
+int sched_yield_type __read_mostly = 1;
|
|
+
|
|
+/*
|
|
+ * The relative length of deadline for each priority(nice) level.
|
|
+ */
|
|
+static int prio_ratios[NICE_WIDTH] __read_mostly;
|
|
+
|
|
+/*
|
|
+ * The quota handed out to tasks of all priority levels when refilling their
|
|
+ * time_slice.
|
|
+ */
|
|
+static inline int timeslice(void)
|
|
+{
|
|
+ return MS_TO_US(rr_interval);
|
|
+}
|
|
+
|
|
+static bool sched_smp_initialized __read_mostly;
|
|
+
|
|
+/*
|
|
+ * The global runqueue data that all CPUs work off. Contains either atomic
|
|
+ * variables and a cpu bitmap set atomically.
|
|
+ */
|
|
+struct global_rq {
|
|
+#ifdef CONFIG_SMP
|
|
+ atomic_t nr_running ____cacheline_aligned_in_smp;
|
|
+ atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
|
|
+ atomic64_t nr_switches ____cacheline_aligned_in_smp;
|
|
+ cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
|
|
+#else
|
|
+ atomic_t nr_running ____cacheline_aligned;
|
|
+ atomic_t nr_uninterruptible ____cacheline_aligned;
|
|
+ atomic64_t nr_switches ____cacheline_aligned;
|
|
+#endif
|
|
+};
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+/*
|
|
+ * We add the notion of a root-domain which will be used to define per-domain
|
|
+ * variables. Each exclusive cpuset essentially defines an island domain by
|
|
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
|
|
+ * exclusive cpuset is created, we also create and attach a new root-domain
|
|
+ * object.
|
|
+ *
|
|
+ */
|
|
+struct root_domain {
|
|
+ atomic_t refcount;
|
|
+ atomic_t rto_count;
|
|
+ struct rcu_head rcu;
|
|
+ cpumask_var_t span;
|
|
+ cpumask_var_t online;
|
|
+
|
|
+ /*
|
|
+ * The "RT overload" flag: it gets set if a CPU has more than
|
|
+ * one runnable RT task.
|
|
+ */
|
|
+ cpumask_var_t rto_mask;
|
|
+ struct cpupri cpupri;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * By default the system creates a single root-domain with all cpus as
|
|
+ * members (mimicking the global state we have today).
|
|
+ */
|
|
+static struct root_domain def_root_domain;
|
|
+
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+/* There can be only one */
|
|
+#ifdef CONFIG_SMP
|
|
+static struct global_rq grq ____cacheline_aligned_in_smp;
|
|
+#else
|
|
+static struct global_rq grq ____cacheline_aligned;
|
|
+#endif
|
|
+
|
|
+static DEFINE_MUTEX(sched_hotcpu_mutex);
|
|
+
|
|
+/* cpus with isolated domains */
|
|
+cpumask_var_t cpu_isolated_map;
|
|
+
|
|
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
|
+#ifdef CONFIG_SMP
|
|
+struct rq *cpu_rq(int cpu)
|
|
+{
|
|
+ return &per_cpu(runqueues, (cpu));
|
|
+}
|
|
+#define task_rq(p) cpu_rq(task_cpu(p))
|
|
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
|
|
+/*
|
|
+ * sched_domains_mutex serialises calls to init_sched_domains,
|
|
+ * detach_destroy_domains and partition_sched_domains.
|
|
+ */
|
|
+DEFINE_MUTEX(sched_domains_mutex);
|
|
+
|
|
+/*
|
|
+ * By default the system creates a single root-domain with all cpus as
|
|
+ * members (mimicking the global state we have today).
|
|
+ */
|
|
+static struct root_domain def_root_domain;
|
|
+
|
|
+int __weak arch_sd_sibling_asym_packing(void)
|
|
+{
|
|
+ return 0*SD_ASYM_PACKING;
|
|
+}
|
|
+#else
|
|
+struct rq *uprq;
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+static inline int cpu_of(struct rq *rq)
|
|
+{
|
|
+ return rq->cpu;
|
|
+}
|
|
+#else /* CONFIG_SMP */
|
|
+static inline int cpu_of(struct rq *rq)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#include "stats.h"
|
|
+
|
|
+#ifndef prepare_arch_switch
|
|
+# define prepare_arch_switch(next) do { } while (0)
|
|
+#endif
|
|
+#ifndef finish_arch_switch
|
|
+# define finish_arch_switch(prev) do { } while (0)
|
|
+#endif
|
|
+#ifndef finish_arch_post_lock_switch
|
|
+# define finish_arch_post_lock_switch() do { } while (0)
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * All common locking functions performed on rq->lock. rq->clock is local to
|
|
+ * the CPU accessing it so it can be modified just with interrupts disabled
|
|
+ * when we're not updating niffies.
|
|
+ * Looking up task_rq must be done under rq->lock to be safe.
|
|
+ */
|
|
+static void update_rq_clock_task(struct rq *rq, s64 delta);
|
|
+
|
|
+static inline void update_rq_clock(struct rq *rq)
|
|
+{
|
|
+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
|
|
+
|
|
+ if (unlikely(delta < 0))
|
|
+ return;
|
|
+ rq->clock += delta;
|
|
+ update_rq_clock_task(rq, delta);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Niffies are a globally increasing nanosecond counter. They're only used by
|
|
+ * update_load_avg and time_slice_expired, however deadlines are based on them
|
|
+ * across CPUs. Update them whenever we will call one of those functions, and
|
|
+ * synchronise them across CPUs whenever we hold both runqueue locks.
|
|
+ */
|
|
+static inline void update_clocks(struct rq *rq)
|
|
+{
|
|
+ s64 ndiff, minndiff;
|
|
+ long jdiff;
|
|
+
|
|
+ update_rq_clock(rq);
|
|
+ ndiff = rq->clock - rq->old_clock;
|
|
+ rq->old_clock = rq->clock;
|
|
+ jdiff = jiffies - rq->last_jiffy;
|
|
+
|
|
+ /* Subtract any niffies added by balancing with other rqs */
|
|
+ ndiff -= rq->niffies - rq->last_niffy;
|
|
+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
|
|
+ if (minndiff < 0)
|
|
+ minndiff = 0;
|
|
+ ndiff = max(ndiff, minndiff);
|
|
+ rq->niffies += ndiff;
|
|
+ rq->last_niffy = rq->niffies;
|
|
+ if (jdiff) {
|
|
+ rq->last_jiffy += jdiff;
|
|
+ rq->last_jiffy_niffies = rq->niffies;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline int task_current(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ return rq->curr == p;
|
|
+}
|
|
+
|
|
+static inline int task_running(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+#ifdef CONFIG_SMP
|
|
+ return p->on_cpu;
|
|
+#else
|
|
+ return task_current(rq, p);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline int task_on_rq_queued(struct task_struct *p)
|
|
+{
|
|
+ return p->on_rq == TASK_ON_RQ_QUEUED;
|
|
+}
|
|
+
|
|
+static inline int task_on_rq_migrating(struct task_struct *p)
|
|
+{
|
|
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
|
|
+}
|
|
+
|
|
+static inline void rq_lock(struct rq *rq)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ raw_spin_lock(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline int rq_trylock(struct rq *rq)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ return raw_spin_trylock(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline void rq_unlock(struct rq *rq)
|
|
+ __releases(rq->lock)
|
|
+{
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline struct rq *this_rq_lock(void)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ struct rq *rq;
|
|
+
|
|
+ local_irq_disable();
|
|
+ rq = this_rq();
|
|
+ raw_spin_lock(&rq->lock);
|
|
+
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Any time we have two runqueues locked we use that as an opportunity to
|
|
+ * synchronise niffies to the highest value as idle ticks may have artificially
|
|
+ * kept niffies low on one CPU and the truth can only be later.
|
|
+ */
|
|
+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
|
|
+{
|
|
+ if (rq1->niffies > rq2->niffies)
|
|
+ rq2->niffies = rq1->niffies;
|
|
+ else
|
|
+ rq1->niffies = rq2->niffies;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * double_rq_lock - safely lock two runqueues
|
|
+ *
|
|
+ * Note this does not disable interrupts like task_rq_lock,
|
|
+ * you need to do so manually before calling.
|
|
+ */
|
|
+
|
|
+/* For when we know rq1 != rq2 */
|
|
+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
|
|
+ __acquires(rq1->lock)
|
|
+ __acquires(rq2->lock)
|
|
+{
|
|
+ if (rq1 < rq2) {
|
|
+ raw_spin_lock(&rq1->lock);
|
|
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
|
|
+ } else {
|
|
+ raw_spin_lock(&rq2->lock);
|
|
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
|
|
+ __acquires(rq1->lock)
|
|
+ __acquires(rq2->lock)
|
|
+{
|
|
+ BUG_ON(!irqs_disabled());
|
|
+ if (rq1 == rq2) {
|
|
+ raw_spin_lock(&rq1->lock);
|
|
+ __acquire(rq2->lock); /* Fake it out ;) */
|
|
+ } else
|
|
+ __double_rq_lock(rq1, rq2);
|
|
+ synchronise_niffies(rq1, rq2);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * double_rq_unlock - safely unlock two runqueues
|
|
+ *
|
|
+ * Note this does not restore interrupts like task_rq_unlock,
|
|
+ * you need to do so manually after calling.
|
|
+ */
|
|
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
|
|
+ __releases(rq1->lock)
|
|
+ __releases(rq2->lock)
|
|
+{
|
|
+ raw_spin_unlock(&rq1->lock);
|
|
+ if (rq1 != rq2)
|
|
+ raw_spin_unlock(&rq2->lock);
|
|
+ else
|
|
+ __release(rq2->lock);
|
|
+}
|
|
+
|
|
+/* Must be sure rq1 != rq2 and irqs are disabled */
|
|
+static inline void lock_second_rq(struct rq *rq1, struct rq *rq2)
|
|
+ __releases(rq1->lock)
|
|
+ __acquires(rq1->lock)
|
|
+ __acquires(rq2->lock)
|
|
+{
|
|
+ BUG_ON(!irqs_disabled());
|
|
+ if (unlikely(!raw_spin_trylock(&rq2->lock))) {
|
|
+ raw_spin_unlock(&rq1->lock);
|
|
+ __double_rq_lock(rq1, rq2);
|
|
+ }
|
|
+ synchronise_niffies(rq1, rq2);
|
|
+}
|
|
+
|
|
+static inline void lock_all_rqs(void)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ preempt_disable();
|
|
+ for_each_possible_cpu(cpu) {
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+
|
|
+ do_raw_spin_lock(&rq->lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void unlock_all_rqs(void)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu) {
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+
|
|
+ do_raw_spin_unlock(&rq->lock);
|
|
+ }
|
|
+ preempt_enable();
|
|
+}
|
|
+
|
|
+/* Specially nest trylock an rq */
|
|
+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
|
|
+{
|
|
+ if (unlikely(!do_raw_spin_trylock(&rq->lock)))
|
|
+ return false;
|
|
+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
|
|
+ synchronise_niffies(this_rq, rq);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Unlock a specially nested trylocked rq */
|
|
+static inline void unlock_rq(struct rq *rq)
|
|
+{
|
|
+ spin_release(&rq->lock.dep_map, 1, _RET_IP_);
|
|
+ do_raw_spin_unlock(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline void rq_lock_irq(struct rq *rq)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ raw_spin_lock_irq(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline void rq_unlock_irq(struct rq *rq)
|
|
+ __releases(rq->lock)
|
|
+{
|
|
+ raw_spin_unlock_irq(&rq->lock);
|
|
+}
|
|
+
|
|
+static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ raw_spin_lock_irqsave(&rq->lock, *flags);
|
|
+}
|
|
+
|
|
+static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags)
|
|
+ __releases(rq->lock)
|
|
+{
|
|
+ raw_spin_unlock_irqrestore(&rq->lock, *flags);
|
|
+}
|
|
+
|
|
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
|
|
+ __acquires(p->pi_lock)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ struct rq *rq;
|
|
+
|
|
+ while (42) {
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
|
|
+ rq = task_rq(p);
|
|
+ raw_spin_lock(&rq->lock);
|
|
+ if (likely(rq == task_rq(p)))
|
|
+ break;
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
|
|
+ }
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
|
|
+ __releases(rq->lock)
|
|
+ __releases(p->pi_lock)
|
|
+{
|
|
+ rq_unlock(rq);
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
|
|
+}
|
|
+
|
|
+static inline struct rq *__task_rq_lock(struct task_struct *p)
|
|
+ __acquires(rq->lock)
|
|
+{
|
|
+ struct rq *rq;
|
|
+
|
|
+ lockdep_assert_held(&p->pi_lock);
|
|
+
|
|
+ while (42) {
|
|
+ rq = task_rq(p);
|
|
+ raw_spin_lock(&rq->lock);
|
|
+ if (likely(rq == task_rq(p)))
|
|
+ break;
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+ }
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+static inline void __task_rq_unlock(struct rq *rq)
|
|
+{
|
|
+ rq_unlock(rq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * cmpxchg based fetch_or, macro so it works for different integer types
|
|
+ */
|
|
+#define fetch_or(ptr, mask) \
|
|
+ ({ \
|
|
+ typeof(ptr) _ptr = (ptr); \
|
|
+ typeof(mask) _mask = (mask); \
|
|
+ typeof(*_ptr) _old, _val = *_ptr; \
|
|
+ \
|
|
+ for (;;) { \
|
|
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
|
|
+ if (_old == _val) \
|
|
+ break; \
|
|
+ _val = _old; \
|
|
+ } \
|
|
+ _old; \
|
|
+})
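As a rough userspace equivalent of the loop above, here is a fetch_or built on GCC's __atomic builtins (an assumption of this sketch; the kernel macro additionally works for any integer type via typeof):

    /* Repeatedly try to install old|mask until the compare-exchange
     * succeeds, then return the value seen before the OR took effect. */
    static unsigned long fetch_or_ulong(unsigned long *ptr, unsigned long mask)
    {
            unsigned long old = __atomic_load_n(ptr, __ATOMIC_RELAXED);

            while (!__atomic_compare_exchange_n(ptr, &old, old | mask, 1,
                                                __ATOMIC_SEQ_CST, __ATOMIC_RELAXED))
                    ;       /* on failure 'old' is refreshed with the current value */
            return old;
    }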
|
|
+
|
|
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
|
|
+/*
|
|
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
|
|
+ * this avoids any races wrt polling state changes and thereby avoids
|
|
+ * spurious IPIs.
|
|
+ */
|
|
+static bool set_nr_and_not_polling(struct task_struct *p)
|
|
+{
|
|
+ struct thread_info *ti = task_thread_info(p);
|
|
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
|
|
+ *
|
|
+ * If this returns true, then the idle task promises to call
|
|
+ * sched_ttwu_pending() and reschedule soon.
|
|
+ */
|
|
+static bool set_nr_if_polling(struct task_struct *p)
|
|
+{
|
|
+ struct thread_info *ti = task_thread_info(p);
|
|
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
|
|
+
|
|
+ for (;;) {
|
|
+ if (!(val & _TIF_POLLING_NRFLAG))
|
|
+ return false;
|
|
+ if (val & _TIF_NEED_RESCHED)
|
|
+ return true;
|
|
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
|
|
+ if (old == val)
|
|
+ break;
|
|
+ val = old;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#else
|
|
+static bool set_nr_and_not_polling(struct task_struct *p)
|
|
+{
|
|
+ set_tsk_need_resched(p);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+static bool set_nr_if_polling(struct task_struct *p)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
|
|
+{
|
|
+ struct wake_q_node *node = &task->wake_q;
|
|
+
|
|
+ /*
|
|
+ * Atomically grab the task, if ->wake_q is !nil already it means
|
|
+ * it's already queued (either by us or someone else) and will get the

|
|
+ * wakeup due to that.
|
|
+ *
|
|
+ * This cmpxchg() implies a full barrier, which pairs with the write
|
|
+ * barrier implied by the wakeup in wake_up_q().
|
|
+ */
|
|
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
|
|
+ return;
|
|
+
|
|
+ get_task_struct(task);
|
|
+
|
|
+ /*
|
|
+ * The head is context local, there can be no concurrency.
|
|
+ */
|
|
+ *head->lastp = node;
|
|
+ head->lastp = &node->next;
|
|
+}
|
|
+
|
|
+void wake_up_q(struct wake_q_head *head)
|
|
+{
|
|
+ struct wake_q_node *node = head->first;
|
|
+
|
|
+ while (node != WAKE_Q_TAIL) {
|
|
+ struct task_struct *task;
|
|
+
|
|
+ task = container_of(node, struct task_struct, wake_q);
|
|
+ BUG_ON(!task);
|
|
+ /* task can safely be re-inserted now */
|
|
+ node = node->next;
|
|
+ task->wake_q.next = NULL;
|
|
+
|
|
+ /*
|
|
+ * wake_up_process() implies a wmb() to pair with the queueing
|
|
+ * in wake_q_add() so as not to miss wakeups.
|
|
+ */
|
|
+ wake_up_process(task);
|
|
+ put_task_struct(task);
|
|
+ }
|
|
+}
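The comment in wake_q_add() is the whole trick: a node counts as queued exactly while its next pointer is non-NULL, so a single compare-exchange from NULL both claims the task and tells a racing caller it lost. A stripped-down sketch of just that claim step, assuming GCC __atomic builtins and leaving out the reference counting and the actual wakeup:

    #include <stdbool.h>
    #include <stddef.h>

    struct wq_node {
            struct wq_node *next;
    };

    #define WQ_TAIL ((struct wq_node *)0x1)

    /* Returns false if somebody else already queued this node. */
    static bool wq_claim(struct wq_node *node)
    {
            struct wq_node *expected = NULL;

            /* Only one caller can move next from NULL to the tail marker. */
            return __atomic_compare_exchange_n(&node->next, &expected, WQ_TAIL,
                                               false, __ATOMIC_ACQ_REL,
                                               __ATOMIC_RELAXED);
    }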
|
|
+
|
|
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
|
|
+{
|
|
+ next->on_cpu = 1;
|
|
+}
|
|
+
|
|
+static inline void smp_sched_reschedule(int cpu)
|
|
+{
|
|
+ if (likely(cpu_online(cpu)))
|
|
+ smp_send_reschedule(cpu);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * resched_task - mark a task 'to be rescheduled now'.
|
|
+ *
|
|
+ * On UP this means the setting of the need_resched flag, on SMP it
|
|
+ * might also involve a cross-CPU call to trigger the scheduler on
|
|
+ * the target CPU.
|
|
+ */
|
|
+void resched_task(struct task_struct *p)
|
|
+{
|
|
+ int cpu;
|
|
+#ifdef CONFIG_LOCKDEP
|
|
+ struct rq *rq = task_rq(p);
|
|
+
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+#endif
|
|
+ if (test_tsk_need_resched(p))
|
|
+ return;
|
|
+
|
|
+ cpu = task_cpu(p);
|
|
+ if (cpu == smp_processor_id()) {
|
|
+ set_tsk_need_resched(p);
|
|
+ set_preempt_need_resched();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (set_nr_and_not_polling(p))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ else
|
|
+ trace_sched_wake_idle_without_ipi(cpu);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * A task that is not running or queued will not have a node set.
|
|
+ * A task that is queued but not running will have a node set.
|
|
+ * A task that is currently running will have ->on_cpu set but no node set.
|
|
+ */
|
|
+static inline bool task_queued(struct task_struct *p)
|
|
+{
|
|
+ return !skiplist_node_empty(&p->node);
|
|
+}
|
|
+
|
|
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
|
|
+static inline void resched_if_idle(struct rq *rq);
|
|
+
|
|
+/* Dodgy workaround till we figure out where the softirqs are going */
|
|
+static inline void do_pending_softirq(struct rq *rq, struct task_struct *next)
|
|
+{
|
|
+ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt()))
|
|
+ do_softirq_own_stack();
|
|
+}
|
|
+
|
|
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
|
|
+{
|
|
+#ifdef CONFIG_SMP
|
|
+ /*
|
|
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
|
|
+ * We must ensure this doesn't happen until the switch is completely
|
|
+ * finished.
|
|
+ *
|
|
+ * In particular, the load of prev->state in finish_task_switch() must
|
|
+ * happen before this.
|
|
+ *
|
|
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
|
|
+ */
|
|
+ smp_store_release(&prev->on_cpu, 0);
|
|
+#endif
|
|
+#ifdef CONFIG_DEBUG_SPINLOCK
|
|
+ /* this is a valid case when another task releases the spinlock */
|
|
+ rq->lock.owner = current;
|
|
+#endif
|
|
+ /*
|
|
+ * If we are tracking spinlock dependencies then we have to
|
|
+ * fix up the runqueue lock - which gets 'carried over' from
|
|
+ * prev into current:
|
|
+ */
|
|
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ /*
|
|
+ * If prev was marked as migrating to another CPU in return_task, drop
|
|
+ * the local runqueue lock but leave interrupts disabled and grab the
|
|
+ * remote lock we're migrating it to before enabling them.
|
|
+ */
|
|
+ if (unlikely(task_on_rq_migrating(prev))) {
|
|
+ sched_info_dequeued(rq, prev);
|
|
+ /*
|
|
+ * We move the ownership of prev to the new cpu now. ttwu can't
|
|
+ * activate prev to the wrong cpu since it has to grab this
|
|
+ * runqueue in ttwu_remote.
|
|
+ */
|
|
+#ifdef CONFIG_THREAD_INFO_IN_TASK
|
|
+ prev->cpu = prev->wake_cpu;
|
|
+#else
|
|
+ task_thread_info(prev)->cpu = prev->wake_cpu;
|
|
+#endif
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+
|
|
+ raw_spin_lock(&prev->pi_lock);
|
|
+ rq = __task_rq_lock(prev);
|
|
+ /* Check that someone else hasn't already queued prev */
|
|
+ if (likely(!task_queued(prev))) {
|
|
+ enqueue_task(rq, prev, 0);
|
|
+ prev->on_rq = TASK_ON_RQ_QUEUED;
|
|
+ /* Wake up the CPU if it's not already running */
|
|
+ resched_if_idle(rq);
|
|
+ }
|
|
+ raw_spin_unlock(&prev->pi_lock);
|
|
+ }
|
|
+#endif
|
|
+ rq_unlock(rq);
|
|
+
|
|
+ do_pending_softirq(rq, current);
|
|
+
|
|
+ local_irq_enable();
|
|
+}
|
|
+
|
|
+static inline bool deadline_before(u64 deadline, u64 time)
|
|
+{
|
|
+ return (deadline < time);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
|
|
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
|
|
+ * same nice value, it proportions cpu according to nice level, it means the
|
|
+ * task that last woke up the longest ago has the earliest deadline, thus
|
|
+ * ensuring that interactive tasks get low latency on wake up. The CPU
|
|
+ * proportion works out to the square of the virtual deadline difference, so
|
|
+ * this equation gives nice 19 about 3% of the CPU of nice 0.
|
|
+ */
|
|
+static inline u64 prio_deadline_diff(int user_prio)
|
|
+{
|
|
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
|
|
+}
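To put numbers on the comment above: with the default 6ms rr_interval and a prio_ratios table that starts at 128 and grows roughly 10% per nice level (both defined elsewhere in the patch; treat the exact values here as assumptions), nice 0 gets a ~6ms deadline offset, nice 19 roughly 36ms, and the squared ratio (128/763)^2 is about 3%, which is where the "nice 19 about 3%" figure comes from. A throwaway calculation:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long rr_interval_ms = 6;  /* assumed default */
            unsigned long long ratio = 128;         /* assumed prio_ratios[0] */
            int nice;

            for (nice = 0; nice <= 19; nice++) {
                    /* same formula as prio_deadline_diff(), in nanoseconds */
                    unsigned long long offset =
                            ratio * rr_interval_ms * (1000000ULL / 128);

                    if (nice == 0 || nice == 19)
                            printf("nice %2d: deadline offset %llu ns\n",
                                   nice, offset);
                    ratio = ratio * 11 / 10;        /* ~10% per nice level */
            }
            return 0;
    }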
|
|
+
|
|
+static inline u64 task_deadline_diff(struct task_struct *p)
|
|
+{
|
|
+ return prio_deadline_diff(TASK_USER_PRIO(p));
|
|
+}
|
|
+
|
|
+static inline u64 static_deadline_diff(int static_prio)
|
|
+{
|
|
+ return prio_deadline_diff(USER_PRIO(static_prio));
|
|
+}
|
|
+
|
|
+static inline int longest_deadline_diff(void)
|
|
+{
|
|
+ return prio_deadline_diff(39);
|
|
+}
|
|
+
|
|
+static inline int ms_longest_deadline_diff(void)
|
|
+{
|
|
+ return NS_TO_MS(longest_deadline_diff());
|
|
+}
|
|
+
|
|
+static inline int rq_load(struct rq *rq)
|
|
+{
|
|
+ return rq->sl->entries + !rq_idle(rq);
|
|
+}
|
|
+
|
|
+static inline bool rq_local(struct rq *rq);
|
|
+
|
|
+/*
|
|
+ * Update the load average for feeding into cpu frequency governors. Use a
|
|
+ * rough estimate of a rolling average with ~ time constant of 32ms.
|
|
+ * The decay works out to 80/128 (~0.63) per 32768us, ie * 80 / 32768 / 128 == * 5 / 262144 per us.
|
|
+ * Make sure a call to update_clocks has been made before calling this to get
|
|
+ * an updated rq->niffies.
|
|
+ */
|
|
+static void update_load_avg(struct rq *rq, unsigned int flags)
|
|
+{
|
|
+ unsigned long us_interval;
|
|
+ long load, curload;
|
|
+
|
|
+ if (unlikely(rq->niffies <= rq->load_update))
|
|
+ return;
|
|
+
|
|
+ us_interval = NS_TO_US(rq->niffies - rq->load_update);
|
|
+ curload = rq_load(rq);
|
|
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
|
|
+ if (unlikely(load < 0))
|
|
+ load = 0;
|
|
+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
|
|
+ /* If this CPU has all the load, make it ramp up quickly */
|
|
+ if (curload > load && curload >= atomic_read(&grq.nr_running))
|
|
+ load = curload;
|
|
+ rq->load_avg = load;
|
|
+
|
|
+ rq->load_update = rq->niffies;
|
|
+ if (likely(rq_local(rq)))
|
|
+ cpufreq_trigger(rq->niffies, flags);
|
|
+}
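The 5/262144 factor is just 80/(128 * 32768): it is scaled by the elapsed microseconds, so a full 32768us interval sheds 32768 * 5 / 262144 = 62.5% (~80/128) of the old average, which is the ~32ms time constant the comment mentions. A quick check:

    #include <stdio.h>

    int main(void)
    {
            double per_us = 5.0 / 262144.0;         /* decay per microsecond */
            double per_window = per_us * 32768.0;   /* decay per ~32ms window */

            printf("decay per 32768us window: %.3f (80/128 = %.3f)\n",
                   per_window, 80.0 / 128.0);
            return 0;
    }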
|
|
+
|
|
+/*
|
|
+ * Removing from the runqueue. Enter with rq locked. Deleting a task
|
|
+ * from the skip list is done via the stored node reference in the task struct
|
|
+ * and does not require a full look up. Thus it occurs in O(k) time where k
|
|
+ * is the "level" of the list the task was stored at - usually < 4, max 8.
|
|
+ */
|
|
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
|
+{
|
|
+ skiplist_delete(rq->sl, &p->node);
|
|
+ rq->best_key = rq->node.next[0]->key;
|
|
+ update_clocks(rq);
|
|
+ if (!(flags & DEQUEUE_SAVE))
|
|
+ sched_info_dequeued(task_rq(p), p);
|
|
+ update_load_avg(rq, flags);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_PREEMPT_RCU
|
|
+static bool rcu_read_critical(struct task_struct *p)
|
|
+{
|
|
+ return p->rcu_read_unlock_special.b.blocked;
|
|
+}
|
|
+#else /* CONFIG_PREEMPT_RCU */
|
|
+#define rcu_read_critical(p) (false)
|
|
+#endif /* CONFIG_PREEMPT_RCU */
|
|
+
|
|
+/*
|
|
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
|
|
+ * an idle task, we ensure none of the following conditions are met.
|
|
+ */
|
|
+static bool idleprio_suitable(struct task_struct *p)
|
|
+{
|
|
+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
|
|
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
|
|
+ * that the iso_refractory flag is not set.
|
|
+ */
|
|
+static inline bool isoprio_suitable(struct rq *rq)
|
|
+{
|
|
+ return !rq->iso_refractory;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Adding to the runqueue. Enter with rq locked.
|
|
+ */
|
|
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
|
+{
|
|
+ unsigned int randseed, cflags = 0;
|
|
+ u64 sl_id;
|
|
+
|
|
+ if (!rt_task(p)) {
|
|
+ /* Check it hasn't gotten rt from PI */
|
|
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
|
|
+ (iso_task(p) && isoprio_suitable(rq)))
|
|
+ p->prio = p->normal_prio;
|
|
+ else
|
|
+ p->prio = NORMAL_PRIO;
|
|
+ }
|
|
+ /*
|
|
+ * The sl_id key passed to the skiplist generates a sorted list.
|
|
+ * Realtime and sched iso tasks run FIFO so they only need be sorted
|
|
+ * according to priority. The skiplist will put tasks of the same
|
|
+ * key inserted later in FIFO order. Tasks of sched normal, batch
|
|
+ * and idleprio are sorted according to their deadlines. Idleprio
|
|
+ * tasks are offset by an impossibly large deadline value ensuring
|
|
+ * they get sorted into last positions, but still according to their
|
|
+ * own deadlines. This creates a "landscape" of skiplists running
|
|
+ * from priority 0 realtime in first place to the lowest priority
|
|
+ * idleprio tasks last. Skiplist insertion is an O(log n) process.
|
|
+ */
|
|
+ if (p->prio <= ISO_PRIO) {
|
|
+ sl_id = p->prio;
|
|
+ cflags = SCHED_CPUFREQ_RT;
|
|
+ } else {
|
|
+ sl_id = p->deadline;
|
|
+ if (idleprio_task(p)) {
|
|
+ if (p->prio == IDLE_PRIO)
|
|
+ sl_id |= 0xF000000000000000;
|
|
+ else
|
|
+ sl_id += longest_deadline_diff();
|
|
+ }
|
|
+ }
|
|
+ /*
|
|
+ * Some architectures don't have better than microsecond resolution
|
|
+ * so use niffies at ~microsecond resolution as the random seed for skiplist insertion.
|
|
+ */
|
|
+ update_clocks(rq);
|
|
+ if (!(flags & ENQUEUE_RESTORE))
|
|
+ sched_info_queued(rq, p);
|
|
+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
|
|
+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
|
|
+ rq->best_key = rq->node.next[0]->key;
|
|
+ if (p->in_iowait)
|
|
+ cflags |= SCHED_CPUFREQ_IOWAIT;
|
|
+ update_load_avg(rq, cflags);
|
|
+}
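A toy illustration of the key "landscape" the comment above describes, with made-up numbers standing in for the real priorities and deadlines (none of the values below come from the patch): realtime/ISO tasks key on their priority, normal tasks on their deadline, and idleprio tasks on a deadline pushed out past everything else, so a plain unsigned comparison of the 64-bit keys yields the intended ordering.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t rt_key      = 10;                      /* p->prio of an RT task */
            uint64_t normal_key  = 5000000;                 /* a deadline in niffies */
            uint64_t idle_key    = 5000000 + 36000000;      /* + longest_deadline_diff() */
            uint64_t forced_idle = 5000000 | 0xF000000000000000ULL; /* IDLE_PRIO case */

            printf("rt < normal < idleprio < forced-idle: %d\n",
                   rt_key < normal_key && normal_key < idle_key &&
                   idle_key < forced_idle);
            return 0;
    }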
|
|
+
|
|
+/*
|
|
+ * Returns the relative length of deadline all compared to the shortest
|
|
+ * deadline which is that of nice -20.
|
|
+ */
|
|
+static inline int task_prio_ratio(struct task_struct *p)
|
|
+{
|
|
+ return prio_ratios[TASK_USER_PRIO(p)];
|
|
+}
|
|
+
|
|
+/*
|
|
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
|
|
+ * length. CPU distribution is handled by giving different deadlines to
|
|
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
|
|
+ */
|
|
+static inline int task_timeslice(struct task_struct *p)
|
|
+{
|
|
+ return (rr_interval * task_prio_ratio(p) / 128);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+/* Entered with rq locked */
|
|
+static inline void resched_if_idle(struct rq *rq)
|
|
+{
|
|
+ if (rq_idle(rq))
|
|
+ resched_task(rq->curr);
|
|
+}
|
|
+
|
|
+static inline bool rq_local(struct rq *rq)
|
|
+{
|
|
+ return (rq->cpu == smp_processor_id());
|
|
+}
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+static const cpumask_t *thread_cpumask(int cpu);
|
|
+
|
|
+/* Find the best real time priority running on any SMT siblings of cpu and if
|
|
+ * none are running, the static priority of the best deadline task running.
|
|
+ * The lookups to the other runqueues are done locklessly as the occasional wrong
|
|
+ * value would be harmless. */
|
|
+static int best_smt_bias(struct rq *this_rq)
|
|
+{
|
|
+ int other_cpu, best_bias = 0;
|
|
+
|
|
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
|
|
+ struct rq *rq = cpu_rq(other_cpu);
|
|
+
|
|
+ if (rq_idle(rq))
|
|
+ continue;
|
|
+ if (unlikely(!rq->online))
|
|
+ continue;
|
|
+ if (!rq->rq_mm)
|
|
+ continue;
|
|
+ if (likely(rq->rq_smt_bias > best_bias))
|
|
+ best_bias = rq->rq_smt_bias;
|
|
+ }
|
|
+ return best_bias;
|
|
+}
|
|
+
|
|
+static int task_prio_bias(struct task_struct *p)
|
|
+{
|
|
+ if (rt_task(p))
|
|
+ return 1 << 30;
|
|
+ else if (task_running_iso(p))
|
|
+ return 1 << 29;
|
|
+ else if (task_running_idle(p))
|
|
+ return 0;
|
|
+ return MAX_PRIO - p->static_prio;
|
|
+}
|
|
+
|
|
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
|
|
+{
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
|
|
+
|
|
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
|
|
+ * nice reasons. */
|
|
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
|
|
+{
|
|
+ int best_bias, task_bias;
|
|
+
|
|
+ /* Kernel threads always run */
|
|
+ if (unlikely(!p->mm))
|
|
+ return true;
|
|
+ if (rt_task(p))
|
|
+ return true;
|
|
+ if (!idleprio_suitable(p))
|
|
+ return true;
|
|
+ best_bias = best_smt_bias(this_rq);
|
|
+ /* The smt siblings are all idle or running IDLEPRIO */
|
|
+ if (best_bias < 1)
|
|
+ return true;
|
|
+ task_bias = task_prio_bias(p);
|
|
+ if (task_bias < 1)
|
|
+ return false;
|
|
+ if (task_bias >= best_bias)
|
|
+ return true;
|
|
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
|
|
+ if (best_bias % 4 == 1)
|
|
+ return true;
|
|
+ /* Sorry, you lose */
|
|
+ return false;
|
|
+}
|
|
+#else /* CONFIG_SMT_NICE */
|
|
+#define smt_schedule(p, this_rq) (true)
|
|
+#endif /* CONFIG_SMT_NICE */
|
|
+
|
|
+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
|
|
+{
|
|
+ set_bit(cpu, (volatile unsigned long *)cpumask);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
|
|
+ * allow easy lookup of whether any suitable idle CPUs are available.
|
|
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
|
|
+ * idle_cpus variable than to do a full bitmask check when we are busy. The
|
|
+ * bits are set atomically but read locklessly as an occasional false positive /
|
|
+ * negative is harmless.
|
|
+ */
|
|
+static inline void set_cpuidle_map(int cpu)
|
|
+{
|
|
+ if (likely(cpu_online(cpu)))
|
|
+ atomic_set_cpu(cpu, &grq.cpu_idle_map);
|
|
+}
|
|
+
|
|
+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
|
|
+{
|
|
+ clear_bit(cpu, (volatile unsigned long *)cpumask);
|
|
+}
|
|
+
|
|
+static inline void clear_cpuidle_map(int cpu)
|
|
+{
|
|
+ atomic_clear_cpu(cpu, &grq.cpu_idle_map);
|
|
+}
|
|
+
|
|
+static bool suitable_idle_cpus(struct task_struct *p)
|
|
+{
|
|
+ return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Resched current on rq. We don't know if rq is local to this CPU nor if it
|
|
+ * is locked so we do not use an intermediate variable for the task to avoid
|
|
+ * having it dereferenced.
|
|
+ */
|
|
+static void resched_curr(struct rq *rq)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ if (test_tsk_need_resched(rq->curr))
|
|
+ return;
|
|
+
|
|
+ rq->preempt = rq->curr;
|
|
+ cpu = rq->cpu;
|
|
+
|
|
+ /* We're doing this without holding the rq lock if it's not task_rq */
|
|
+
|
|
+ if (cpu == smp_processor_id()) {
|
|
+ set_tsk_need_resched(rq->curr);
|
|
+ set_preempt_need_resched();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (set_nr_and_not_polling(rq->curr))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ else
|
|
+ trace_sched_wake_idle_without_ipi(cpu);
|
|
+}
|
|
+
|
|
+#define CPUIDLE_DIFF_THREAD (1)
|
|
+#define CPUIDLE_DIFF_CORE (2)
|
|
+#define CPUIDLE_CACHE_BUSY (4)
|
|
+#define CPUIDLE_DIFF_CPU (8)
|
|
+#define CPUIDLE_THREAD_BUSY (16)
|
|
+#define CPUIDLE_DIFF_NODE (32)
|
|
+
|
|
+/*
|
|
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
|
|
+ * lowest value would give the most suitable CPU to schedule p onto next. The
|
|
+ * order works out to be the following:
|
|
+ *
|
|
+ * Same thread, idle or busy cache, idle or busy threads
|
|
+ * Other core, same cache, idle or busy cache, idle threads.
|
|
+ * Same node, other CPU, idle cache, idle threads.
|
|
+ * Same node, other CPU, busy cache, idle threads.
|
|
+ * Other core, same cache, busy threads.
|
|
+ * Same node, other CPU, busy threads.
|
|
+ * Other node, other CPU, idle cache, idle threads.
|
|
+ * Other node, other CPU, busy cache, idle threads.
|
|
+ * Other node, other CPU, busy threads.
|
|
+ */
|
|
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
|
|
+{
|
|
+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
|
|
+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
|
|
+ CPUIDLE_DIFF_THREAD;
|
|
+ int cpu_tmp;
|
|
+
|
|
+ if (cpumask_test_cpu(best_cpu, tmpmask))
|
|
+ goto out;
|
|
+
|
|
+ for_each_cpu(cpu_tmp, tmpmask) {
|
|
+ int ranking, locality;
|
|
+ struct rq *tmp_rq;
|
|
+
|
|
+ ranking = 0;
|
|
+ tmp_rq = cpu_rq(cpu_tmp);
|
|
+
|
|
+ locality = rq->cpu_locality[cpu_tmp];
|
|
+#ifdef CONFIG_NUMA
|
|
+ if (locality > 3)
|
|
+ ranking |= CPUIDLE_DIFF_NODE;
|
|
+ else
|
|
+#endif
|
|
+ if (locality > 2)
|
|
+ ranking |= CPUIDLE_DIFF_CPU;
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+ else if (locality == 2)
|
|
+ ranking |= CPUIDLE_DIFF_CORE;
|
|
+ else if (!(tmp_rq->cache_idle(tmp_rq)))
|
|
+ ranking |= CPUIDLE_CACHE_BUSY;
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ if (locality == 1)
|
|
+ ranking |= CPUIDLE_DIFF_THREAD;
|
|
+ if (!(tmp_rq->siblings_idle(tmp_rq)))
|
|
+ ranking |= CPUIDLE_THREAD_BUSY;
|
|
+#endif
|
|
+ if (ranking < best_ranking) {
|
|
+ best_cpu = cpu_tmp;
|
|
+ best_ranking = ranking;
|
|
+ }
|
|
+ }
|
|
+out:
|
|
+ return best_cpu;
|
|
+}
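Because the CPUIDLE_* values are distinct powers of two, each candidate's ranking is a small bitmask and a plain integer comparison reproduces exactly the preference order listed in the comment above; for example an idle core sharing our cache outranks a busy CPU on another node:

    #include <stdio.h>

    #define CPUIDLE_DIFF_THREAD     (1)
    #define CPUIDLE_DIFF_CORE       (2)
    #define CPUIDLE_CACHE_BUSY      (4)
    #define CPUIDLE_DIFF_CPU        (8)
    #define CPUIDLE_THREAD_BUSY     (16)
    #define CPUIDLE_DIFF_NODE       (32)

    int main(void)
    {
            int same_cache_other_core = CPUIDLE_DIFF_CORE;
            int other_node_busy = CPUIDLE_DIFF_NODE | CPUIDLE_DIFF_CPU |
                                  CPUIDLE_THREAD_BUSY;

            /* lower ranking wins in best_mask_cpu() */
            printf("prefer the same-cache core: %d\n",
                   same_cache_other_core < other_node_busy);
            return 0;
    }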
|
|
+
|
|
+bool cpus_share_cache(int this_cpu, int that_cpu)
|
|
+{
|
|
+ struct rq *this_rq = cpu_rq(this_cpu);
|
|
+
|
|
+ return (this_rq->cpu_locality[that_cpu] < 3);
|
|
+}
|
|
+
|
|
+/* As per resched_curr but only will resched idle task */
|
|
+static inline void resched_idle(struct rq *rq)
|
|
+{
|
|
+ if (test_tsk_need_resched(rq->idle))
|
|
+ return;
|
|
+
|
|
+ rq->preempt = rq->idle;
|
|
+
|
|
+ set_tsk_need_resched(rq->idle);
|
|
+
|
|
+ if (rq_local(rq)) {
|
|
+ set_preempt_need_resched();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ smp_sched_reschedule(rq->cpu);
|
|
+}
|
|
+
|
|
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
|
|
+{
|
|
+ cpumask_t tmpmask;
|
|
+ struct rq *rq;
|
|
+ int best_cpu;
|
|
+
|
|
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
|
|
+ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
|
|
+ rq = cpu_rq(best_cpu);
|
|
+ if (!smt_schedule(p, rq))
|
|
+ return NULL;
|
|
+ resched_idle(rq);
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+static inline void resched_suitable_idle(struct task_struct *p)
|
|
+{
|
|
+ if (suitable_idle_cpus(p))
|
|
+ resched_best_idle(p, task_cpu(p));
|
|
+}
|
|
+
|
|
+static inline struct rq *rq_order(struct rq *rq, int cpu)
|
|
+{
|
|
+ return rq->rq_order[cpu];
|
|
+}
|
|
+#else /* CONFIG_SMP */
|
|
+static inline void set_cpuidle_map(int cpu)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void clear_cpuidle_map(int cpu)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline bool suitable_idle_cpus(struct task_struct *p)
|
|
+{
|
|
+ return uprq->curr == uprq->idle;
|
|
+}
|
|
+
|
|
+static inline void resched_suitable_idle(struct task_struct *p)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void resched_curr(struct rq *rq)
|
|
+{
|
|
+ resched_task(rq->curr);
|
|
+}
|
|
+
|
|
+static inline void resched_if_idle(struct rq *rq)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline bool rq_local(struct rq *rq)
|
|
+{
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline struct rq *rq_order(struct rq *rq, int cpu)
|
|
+{
|
|
+ return rq;
|
|
+}
|
|
+
|
|
+static inline bool smt_schedule(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ return true;
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+static inline int normal_prio(struct task_struct *p)
|
|
+{
|
|
+ if (has_rt_policy(p))
|
|
+ return MAX_RT_PRIO - 1 - p->rt_priority;
|
|
+ if (idleprio_task(p))
|
|
+ return IDLE_PRIO;
|
|
+ if (iso_task(p))
|
|
+ return ISO_PRIO;
|
|
+ return NORMAL_PRIO;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Calculate the current priority, i.e. the priority
|
|
+ * taken into account by the scheduler. This value might
|
|
+ * be boosted by RT tasks as it will be RT if the task got
|
|
+ * RT-boosted. If not then it returns p->normal_prio.
|
|
+ */
|
|
+static int effective_prio(struct task_struct *p)
|
|
+{
|
|
+ p->normal_prio = normal_prio(p);
|
|
+ /*
|
|
+ * If we are RT tasks or we were boosted to RT priority,
|
|
+ * keep the priority unchanged. Otherwise, update priority
|
|
+ * to the normal priority:
|
|
+ */
|
|
+ if (!rt_prio(p->prio))
|
|
+ return p->normal_prio;
|
|
+ return p->prio;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * activate_task - move a task to the runqueue. Enter with rq locked.
|
|
+ */
|
|
+static void activate_task(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ resched_if_idle(rq);
|
|
+
|
|
+ /*
|
|
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
|
|
+ * milliseconds-range estimation of the amount of time that the task
|
|
+ * spent sleeping:
|
|
+ */
|
|
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
|
|
+ if (p->state == TASK_UNINTERRUPTIBLE)
|
|
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
|
|
+ (rq->niffies - p->last_ran) >> 20);
|
|
+ }
|
|
+
|
|
+ p->prio = effective_prio(p);
|
|
+ if (task_contributes_to_load(p))
|
|
+ atomic_dec(&grq.nr_uninterruptible);
|
|
+
|
|
+ enqueue_task(rq, p, 0);
|
|
+ p->on_rq = TASK_ON_RQ_QUEUED;
|
|
+ atomic_inc(&grq.nr_running);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
|
|
+ * decrement the nr_running. Enter with rq locked.
|
|
+ */
|
|
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ if (task_contributes_to_load(p))
|
|
+ atomic_inc(&grq.nr_uninterruptible);
|
|
+
|
|
+ p->on_rq = 0;
|
|
+ atomic_dec(&grq.nr_running);
|
|
+ sched_info_dequeued(rq, p);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
+{
|
|
+ struct rq *rq = task_rq(p);
|
|
+ bool queued;
|
|
+
|
|
+#ifdef CONFIG_LOCKDEP
|
|
+ /*
|
|
+ * The caller should hold either p->pi_lock or rq->lock, when changing
|
|
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
|
|
+ *
|
|
+ * Furthermore, all task_rq users should acquire both locks, see
|
|
+ * task_rq_lock().
|
|
+ */
|
|
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
|
|
+ lockdep_is_held(&task_rq(p)->lock)));
|
|
+#endif
|
|
+ if (task_cpu(p) == cpu)
|
|
+ return;
|
|
+ trace_sched_migrate_task(p, cpu);
|
|
+ perf_event_task_migrate(p);
|
|
+
|
|
+ /*
|
|
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
|
|
+ * successfully executed on another CPU. We must ensure that updates of
|
|
+ * per-task data have been completed by this moment.
|
|
+ */
|
|
+ smp_wmb();
|
|
+
|
|
+ if (task_running(rq, p)) {
|
|
+ /*
|
|
+ * We should only be calling this on a running task if we're
|
|
+ * holding rq lock.
|
|
+ */
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+
|
|
+ /*
|
|
+ * We can't change the task_thread_info cpu on a running task
|
|
+ * as p will still be protected by the rq lock of the cpu it
|
|
+ * is still running on, so we set the wake_cpu for it to be
|
|
+ * lazily updated once off the cpu.
|
|
+ */
|
|
+ p->wake_cpu = cpu;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if ((queued = task_queued(p)))
|
|
+ dequeue_task(rq, p, 0);
|
|
+#ifdef CONFIG_THREAD_INFO_IN_TASK
|
|
+ p->cpu = cpu;
|
|
+#else
|
|
+ task_thread_info(p)->cpu = cpu;
|
|
+#endif
|
|
+ p->wake_cpu = cpu;
|
|
+ if (queued)
|
|
+ enqueue_task(cpu_rq(cpu), p, 0);
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+/*
|
|
+ * Move a task off the runqueue and take it to a cpu where it will
|
|
+ * become the running task.
|
|
+ */
|
|
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
|
|
+{
|
|
+ struct rq *p_rq = task_rq(p);
|
|
+
|
|
+ dequeue_task(p_rq, p, DEQUEUE_SAVE);
|
|
+ if (p_rq != rq) {
|
|
+ sched_info_dequeued(p_rq, p);
|
|
+ sched_info_queued(rq, p);
|
|
+ }
|
|
+ set_task_cpu(p, cpu);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns a descheduling task to the runqueue unless it is being
|
|
+ * deactivated.
|
|
+ */
|
|
+static inline void return_task(struct task_struct *p, struct rq *rq,
|
|
+ int cpu, bool deactivate)
|
|
+{
|
|
+ if (deactivate)
|
|
+ deactivate_task(p, rq);
|
|
+ else {
|
|
+#ifdef CONFIG_SMP
|
|
+ /*
|
|
+ * set_task_cpu was called on the running task that doesn't
|
|
+ * want to deactivate so it has to be enqueued to a different
|
|
+ * CPU and we need its lock. Tag it to be moved; the move is
|
|
+ * done as the lock is dropped in finish_lock_switch.
|
|
+ */
|
|
+ if (unlikely(p->wake_cpu != cpu))
|
|
+ p->on_rq = TASK_ON_RQ_MIGRATING;
|
|
+ else
|
|
+#endif
|
|
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Enter with rq lock held. We know p is on the local cpu */
|
|
+static inline void __set_tsk_resched(struct task_struct *p)
|
|
+{
|
|
+ set_tsk_need_resched(p);
|
|
+ set_preempt_need_resched();
|
|
+}
|
|
+
|
|
+/**
|
|
+ * task_curr - is this task currently executing on a CPU?
|
|
+ * @p: the task in question.
|
|
+ *
|
|
+ * Return: 1 if the task is currently executing. 0 otherwise.
|
|
+ */
|
|
+inline int task_curr(const struct task_struct *p)
|
|
+{
|
|
+ return cpu_curr(task_cpu(p)) == p;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+/*
|
|
+ * wait_task_inactive - wait for a thread to unschedule.
|
|
+ *
|
|
+ * If @match_state is nonzero, it's the @p->state value just checked and
|
|
+ * not expected to change. If it changes, i.e. @p might have woken up,
|
|
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
|
|
+ * we return a positive number (its total switch count). If a second call
|
|
+ * a short while later returns the same number, the caller can be sure that
|
|
+ * @p has remained unscheduled the whole time.
|
|
+ *
|
|
+ * The caller must ensure that the task *will* unschedule sometime soon,
|
|
+ * else this function might spin for a *long* time. This function can't
|
|
+ * be called with interrupts off, or it may introduce deadlock with
|
|
+ * smp_call_function() if an IPI is sent by the same process we are
|
|
+ * waiting to become inactive.
|
|
+ */
|
|
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
|
|
+{
|
|
+ int running, queued;
|
|
+ unsigned long flags;
|
|
+ unsigned long ncsw;
|
|
+ struct rq *rq;
|
|
+
|
|
+ for (;;) {
|
|
+ rq = task_rq(p);
|
|
+
|
|
+ /*
|
|
+ * If the task is actively running on another CPU
|
|
+ * still, just relax and busy-wait without holding
|
|
+ * any locks.
|
|
+ *
|
|
+ * NOTE! Since we don't hold any locks, it's not
|
|
+ * even sure that "rq" stays as the right runqueue!
|
|
+ * But we don't care, since this will return false
|
|
+ * if the runqueue has changed and p is actually now
|
|
+ * running somewhere else!
|
|
+ */
|
|
+ while (task_running(rq, p)) {
|
|
+ if (match_state && unlikely(p->state != match_state))
|
|
+ return 0;
|
|
+ cpu_relax();
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Ok, time to look more closely! We need the rq
|
|
+ * lock now, to be *sure*. If we're wrong, we'll
|
|
+ * just go back and repeat.
|
|
+ */
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+ trace_sched_wait_task(p);
|
|
+ running = task_running(rq, p);
|
|
+ queued = task_on_rq_queued(p);
|
|
+ ncsw = 0;
|
|
+ if (!match_state || p->state == match_state)
|
|
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+
|
|
+ /*
|
|
+ * If it changed from the expected state, bail out now.
|
|
+ */
|
|
+ if (unlikely(!ncsw))
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * Was it really running after all now that we
|
|
+ * checked with the proper locks actually held?
|
|
+ *
|
|
+ * Oops. Go back and try again..
|
|
+ */
|
|
+ if (unlikely(running)) {
|
|
+ cpu_relax();
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * It's not enough that it's not actively running,
|
|
+ * it must be off the runqueue _entirely_, and not
|
|
+ * preempted!
|
|
+ *
|
|
+ * So if it was still runnable (but just not actively
|
|
+ * running right now), it's preempted, and we should
|
|
+ * yield - it could be a while.
|
|
+ */
|
|
+ if (unlikely(queued)) {
|
|
+ ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
|
|
+
|
|
+ set_current_state(TASK_UNINTERRUPTIBLE);
|
|
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Ahh, all good. It wasn't running, and it wasn't
|
|
+ * runnable, which means that it will never become
|
|
+ * running in the future either. We're all done!
|
|
+ */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return ncsw;
|
|
+}
|
|
+
|
|
+/***
|
|
+ * kick_process - kick a running thread to enter/exit the kernel
|
|
+ * @p: the to-be-kicked thread
|
|
+ *
|
|
+ * Cause a process which is running on another CPU to enter
|
|
+ * kernel-mode, without any delay. (to get signals handled.)
|
|
+ *
|
|
+ * NOTE: this function doesn't have to take the runqueue lock,
|
|
+ * because all it wants to ensure is that the remote task enters
|
|
+ * the kernel. If the IPI races and the task has been migrated
|
|
+ * to another CPU then no harm is done and the purpose has been
|
|
+ * achieved as well.
|
|
+ */
|
|
+void kick_process(struct task_struct *p)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ preempt_disable();
|
|
+ cpu = task_cpu(p);
|
|
+ if ((cpu != smp_processor_id()) && task_curr(p))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ preempt_enable();
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(kick_process);
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
|
|
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
|
|
+ * between themselves, they cooperatively multitask. An idle rq scores as
|
|
+ * prio PRIO_LIMIT so it is always preempted.
|
|
+ */
|
|
+static inline bool
|
|
+can_preempt(struct task_struct *p, int prio, u64 deadline)
|
|
+{
|
|
+ /* Better static priority RT task or better policy preemption */
|
|
+ if (p->prio < prio)
|
|
+ return true;
|
|
+ if (p->prio > prio)
|
|
+ return false;
|
|
+ if (p->policy == SCHED_BATCH)
|
|
+ return false;
|
|
+ /* SCHED_NORMAL and ISO will preempt based on deadline */
|
|
+ if (!deadline_before(p->deadline, deadline))
|
|
+ return false;
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+/*
|
|
+ * Check to see if p can run on cpu, and if not, whether there are any online
|
|
+ * CPUs it can run on instead.
|
|
+ */
|
|
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
|
|
+{
|
|
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
|
|
+ return true;
|
|
+ return false;
|
|
+}
|
|
+#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
|
|
+
|
|
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
|
|
+{
|
|
+ int i, this_entries = rq_load(this_rq);
|
|
+ cpumask_t tmp;
|
|
+
|
|
+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p)))
|
|
+ return;
|
|
+
|
|
+ /* IDLEPRIO tasks never preempt anything but idle */
|
|
+ if (p->policy == SCHED_IDLEPRIO)
|
|
+ return;
|
|
+
|
|
+ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
|
|
+
|
|
+ for (i = 0; i < num_possible_cpus(); i++) {
|
|
+ struct rq *rq = this_rq->rq_order[i];
|
|
+
|
|
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
|
|
+ continue;
|
|
+
|
|
+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries)
|
|
+ continue;
|
|
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
|
|
+ resched_curr(rq);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static int __set_cpus_allowed_ptr(struct task_struct *p,
|
|
+ const struct cpumask *new_mask, bool check);
|
|
+#else /* CONFIG_SMP */
|
|
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
|
|
+{
|
|
+ if (p->policy == SCHED_IDLEPRIO)
|
|
+ return;
|
|
+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
|
|
+ resched_curr(uprq);
|
|
+}
|
|
+
|
|
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
|
|
+ const struct cpumask *new_mask, bool check)
|
|
+{
|
|
+ return set_cpus_allowed_ptr(p, new_mask);
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+/*
|
|
+ * wake flags
|
|
+ */
|
|
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
|
|
+#define WF_FORK 0x02 /* child wakeup after fork */
|
|
+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
|
|
+
|
|
+static void
|
|
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
|
|
+{
|
|
+ struct rq *rq;
|
|
+
|
|
+ if (!schedstat_enabled())
|
|
+ return;
|
|
+
|
|
+ rq = this_rq();
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ if (cpu == rq->cpu)
|
|
+ schedstat_inc(rq->ttwu_local);
|
|
+ else {
|
|
+ struct sched_domain *sd;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_domain(rq->cpu, sd) {
|
|
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
|
|
+ schedstat_inc(sd->ttwu_wake_remote);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
+
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+ schedstat_inc(rq->ttwu_count);
|
|
+}
|
|
+
|
|
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ activate_task(p, rq);
|
|
+
|
|
+ /* if a worker is waking up, notify workqueue */
|
|
+ if (p->flags & PF_WQ_WORKER)
|
|
+ wq_worker_waking_up(p, cpu_of(rq));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark the task runnable and perform wakeup-preemption.
|
|
+ */
|
|
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
|
|
+{
|
|
+ /*
|
|
+ * Sync wakeups (i.e. those types of wakeups where the waker
|
|
+ * has indicated that it will leave the CPU in short order)
|
|
+ * don't trigger a preemption if there are no idle cpus,
|
|
+ * instead waiting for current to deschedule.
|
|
+ */
|
|
+ if (wake_flags & WF_SYNC)
|
|
+ resched_suitable_idle(p);
|
|
+ else
|
|
+ try_preempt(p, rq);
|
|
+ p->state = TASK_RUNNING;
|
|
+ trace_sched_wakeup(p);
|
|
+}
|
|
+
|
|
+static void
|
|
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
|
|
+{
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ if (p->sched_contributes_to_load)
|
|
+ atomic_dec(&grq.nr_uninterruptible);
|
|
+#endif
|
|
+
|
|
+ ttwu_activate(rq, p);
|
|
+ ttwu_do_wakeup(rq, p, wake_flags);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Called in case the task @p isn't fully descheduled from its runqueue;
|
|
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
|
|
+ * since all we need to do is flip p->state to TASK_RUNNING, since
|
|
+ * the task is still ->on_rq.
|
|
+ */
|
|
+static int ttwu_remote(struct task_struct *p, int wake_flags)
|
|
+{
|
|
+ struct rq *rq;
|
|
+ int ret = 0;
|
|
+
|
|
+ rq = __task_rq_lock(p);
|
|
+ if (likely(task_on_rq_queued(p))) {
|
|
+ ttwu_do_wakeup(rq, p, wake_flags);
|
|
+ ret = 1;
|
|
+ }
|
|
+ __task_rq_unlock(rq);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+void sched_ttwu_pending(void)
|
|
+{
|
|
+ struct rq *rq = this_rq();
|
|
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
|
|
+ struct task_struct *p;
|
|
+ unsigned long flags;
|
|
+
|
|
+ if (!llist)
|
|
+ return;
|
|
+
|
|
+ raw_spin_lock_irqsave(&rq->lock, flags);
|
|
+
|
|
+ while (llist) {
|
|
+ int wake_flags = 0;
|
|
+
|
|
+ p = llist_entry(llist, struct task_struct, wake_entry);
|
|
+ llist = llist_next(llist);
|
|
+
|
|
+ ttwu_do_activate(rq, p, wake_flags);
|
|
+ }
|
|
+
|
|
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
+}
|
|
+
|
|
+void scheduler_ipi(void)
|
|
+{
|
|
+ /*
|
|
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
|
|
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
|
|
+ * this IPI.
|
|
+ */
|
|
+ preempt_fold_need_resched();
|
|
+
|
|
+ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
|
|
+ * traditionally all their work was done from the interrupt return
|
|
+ * path. Now that we actually do some work, we need to make sure
|
|
+ * we do call them.
|
|
+ *
|
|
+ * Some archs already do call them, luckily irq_enter/exit nest
|
|
+ * properly.
|
|
+ *
|
|
+ * Arguably we should visit all archs and update all handlers,
|
|
+ * however a fair share of IPIs are still resched only so this would
|
|
+ * somewhat pessimize the simple resched case.
|
|
+ */
|
|
+ irq_enter();
|
|
+ sched_ttwu_pending();
|
|
+ irq_exit();
|
|
+}
|
|
+
|
|
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+
|
|
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
|
|
+ if (!set_nr_if_polling(rq->idle))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ else
|
|
+ trace_sched_wake_idle_without_ipi(cpu);
|
|
+ }
|
|
+}
|
|
+
|
|
+void wake_up_if_idle(int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ unsigned long flags;
|
|
+
|
|
+ rcu_read_lock();
|
|
+
|
|
+ if (!is_idle_task(rcu_dereference(rq->curr)))
|
|
+ goto out;
|
|
+
|
|
+ if (set_nr_if_polling(rq->idle)) {
|
|
+ trace_sched_wake_idle_without_ipi(cpu);
|
|
+ } else {
|
|
+ rq_lock_irqsave(rq, &flags);
|
|
+ if (likely(is_idle_task(rq->curr)))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ /* Else cpu is not in idle, do nothing here */
|
|
+ rq_unlock_irqrestore(rq, &flags);
|
|
+ }
|
|
+
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+static int valid_task_cpu(struct task_struct *p)
|
|
+{
|
|
+ cpumask_t valid_mask;
|
|
+
|
|
+ if (p->flags & PF_KTHREAD)
|
|
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask);
|
|
+ else
|
|
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask);
|
|
+
|
|
+ if (unlikely(!cpumask_weight(&valid_mask))) {
|
|
+ /* Hotplug boot threads do this before the CPU is up */
|
|
+ WARN_ON(sched_smp_initialized);
|
|
+ return cpumask_any(tsk_cpus_allowed(p));
|
|
+ }
|
|
+ return cpumask_any(&valid_mask);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For a task that's just being woken up we have a valuable balancing
|
|
+ * opportunity so choose the nearest cache most lightly loaded runqueue.
|
|
+ * Entered with rq locked and returns with the chosen runqueue locked.
|
|
+ */
|
|
+static inline int select_best_cpu(struct task_struct *p)
|
|
+{
|
|
+ unsigned int idlest = ~0U;
|
|
+ struct rq *rq = NULL;
|
|
+ int i;
|
|
+
|
|
+ if (suitable_idle_cpus(p)) {
|
|
+ int cpu = task_cpu(p);
|
|
+
|
|
+ if (unlikely(needs_other_cpu(p, cpu)))
|
|
+ cpu = valid_task_cpu(p);
|
|
+ rq = resched_best_idle(p, cpu);
|
|
+ if (likely(rq))
|
|
+ return rq->cpu;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < num_possible_cpus(); i++) {
|
|
+ struct rq *other_rq = task_rq(p)->rq_order[i];
|
|
+ int entries;
|
|
+
|
|
+ if (!other_rq->online)
|
|
+ continue;
|
|
+ if (needs_other_cpu(p, other_rq->cpu))
|
|
+ continue;
|
|
+ entries = rq_load(other_rq);
|
|
+ if (entries >= idlest)
|
|
+ continue;
|
|
+ idlest = entries;
|
|
+ rq = other_rq;
|
|
+ }
|
|
+ if (unlikely(!rq))
|
|
+ return task_cpu(p);
|
|
+ return rq->cpu;
|
|
+}
|
|
+#else /* CONFIG_SMP */
|
|
+static int valid_task_cpu(struct task_struct *p)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int select_best_cpu(struct task_struct *p)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
|
|
+{
|
|
+ return NULL;
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+
|
|
+#if defined(CONFIG_SMP)
|
|
+ if (!cpus_share_cache(smp_processor_id(), cpu)) {
|
|
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
|
|
+ ttwu_queue_remote(p, cpu, wake_flags);
|
|
+ return;
|
|
+ }
|
|
+#endif
|
|
+ rq_lock(rq);
|
|
+ ttwu_do_activate(rq, p, wake_flags);
|
|
+ rq_unlock(rq);
|
|
+}
|
|
+
|
|
+/***
|
|
+ * try_to_wake_up - wake up a thread
|
|
+ * @p: the thread to be awakened
|
|
+ * @state: the mask of task states that can be woken
|
|
+ * @wake_flags: wake modifier flags (WF_*)
|
|
+ *
|
|
+ * Put it on the run-queue if it's not already there. The "current"
|
|
+ * thread is always on the run-queue (except when the actual
|
|
+ * re-schedule is in progress), and as such you're allowed to do
|
|
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
|
|
+ * runnable without the overhead of this.
|
|
+ *
|
|
+ * Return: %true if @p was woken up, %false if it was already running,
|
|
+ * or @state didn't match @p's state.
|
|
+ */
|
|
+static int
|
|
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ int cpu, success = 0;
|
|
+
|
|
+ /*
|
|
+ * If we are going to wake up a thread waiting for CONDITION we
|
|
+ * need to ensure that CONDITION=1 done by the caller can not be
|
|
+ * reordered with p->state check below. This pairs with mb() in
|
|
+ * set_current_state() the waiting thread does.
|
|
+ */
|
|
+ smp_mb__before_spinlock();
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
|
|
+ /* state is a volatile long; why, I don't know */
|
|
+ if (!((unsigned int)p->state & state))
|
|
+ goto out;
|
|
+
|
|
+ trace_sched_waking(p);
|
|
+
|
|
+ success = 1; /* we're going to change ->state */
|
|
+ cpu = task_cpu(p);
|
|
+
|
|
+ /*
|
|
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
|
|
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
|
|
+ * in smp_cond_load_acquire() below.
|
|
+ *
|
|
+ * sched_ttwu_pending() try_to_wake_up()
|
|
+ * [S] p->on_rq = 1; [L] P->state
|
|
+ * UNLOCK rq->lock -----.
|
|
+ * \
|
|
+ * +--- RMB
|
|
+ * schedule() /
|
|
+ * LOCK rq->lock -----'
|
|
+ * UNLOCK rq->lock
|
|
+ *
|
|
+ * [task p]
|
|
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
|
|
+ *
|
|
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
|
|
+ * last wakeup of our task and the schedule that got our task
|
|
+ * current.
|
|
+ */
|
|
+ smp_rmb();
|
|
+ if (p->on_rq && ttwu_remote(p, wake_flags))
|
|
+ goto stat;
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ /*
|
|
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
|
|
+ * possible to, falsely, observe p->on_cpu == 0.
|
|
+ *
|
|
+ * One must be running (->on_cpu == 1) in order to remove oneself
|
|
+ * from the runqueue.
|
|
+ *
|
|
+ * [S] ->on_cpu = 1; [L] ->on_rq
|
|
+ * UNLOCK rq->lock
|
|
+ * RMB
|
|
+ * LOCK rq->lock
|
|
+ * [S] ->on_rq = 0; [L] ->on_cpu
|
|
+ *
|
|
+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
|
|
+ * from the consecutive calls to schedule(); the first switching to our
|
|
+ * task, the second putting it to sleep.
|
|
+ */
|
|
+ smp_rmb();
|
|
+
|
|
+ /*
|
|
+ * If the owning (remote) cpu is still in the middle of schedule() with
|
|
+ * this task as prev, wait until it's done referencing the task.
|
|
+ *
|
|
+ * Pairs with the smp_store_release() in finish_lock_switch().
|
|
+ *
|
|
+ * This ensures that tasks getting woken will be fully ordered against
|
|
+ * their previous state and preserve Program Order.
|
|
+ */
|
|
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
|
|
+
|
|
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
|
+ p->state = TASK_WAKING;
|
|
+
|
|
+ cpu = select_best_cpu(p);
|
|
+ if (task_cpu(p) != cpu)
|
|
+ set_task_cpu(p, cpu);
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+ ttwu_queue(p, cpu, wake_flags);
|
|
+stat:
|
|
+ ttwu_stat(p, cpu, wake_flags);
|
|
+out:
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
+
|
|
+ return success;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
|
|
+ * @p: the thread to be awakened
|
|
+ *
|
|
+ * Put @p on the run-queue if it's not already there. The caller must
|
|
+ * ensure that rq is locked and @p is not the current task.
|
|
+ * rq stays locked over invocation.
|
|
+ */
|
|
+static void try_to_wake_up_local(struct task_struct *p)
|
|
+{
|
|
+ struct rq *rq = task_rq(p);
|
|
+
|
|
+ if (WARN_ON_ONCE(rq != this_rq()) ||
|
|
+ WARN_ON_ONCE(p == current))
|
|
+ return;
|
|
+
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+
|
|
+ if (!raw_spin_trylock(&p->pi_lock)) {
|
|
+ /*
|
|
+ * This is OK, because current is on_cpu, which avoids it being
|
|
+ * picked for load-balance and preemption/IRQs are still
|
|
+ * disabled avoiding further scheduler activity on it and we've
|
|
+ * not yet picked a replacement task.
|
|
+ */
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+ raw_spin_lock(&p->pi_lock);
|
|
+ raw_spin_lock(&rq->lock);
|
|
+ }
|
|
+
|
|
+ if (!(p->state & TASK_NORMAL))
|
|
+ goto out;
|
|
+
|
|
+ trace_sched_waking(p);
|
|
+
|
|
+ if (!task_on_rq_queued(p))
|
|
+ ttwu_activate(rq, p);
|
|
+
|
|
+ ttwu_do_wakeup(rq, p, 0);
|
|
+ ttwu_stat(p, smp_processor_id(), 0);
|
|
+out:
|
|
+ raw_spin_unlock(&p->pi_lock);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * wake_up_process - Wake up a specific process
|
|
+ * @p: The process to be woken up.
|
|
+ *
|
|
+ * Attempt to wake up the nominated process and move it to the set of runnable
|
|
+ * processes.
|
|
+ *
|
|
+ * Return: 1 if the process was woken up, 0 if it was already running.
|
|
+ *
|
|
+ * It may be assumed that this function implies a write memory barrier before
|
|
+ * changing the task state if and only if any tasks are woken up.
|
|
+ */
|
|
+int wake_up_process(struct task_struct *p)
|
|
+{
|
|
+ return try_to_wake_up(p, TASK_NORMAL, 0);
|
|
+}
|
|
+EXPORT_SYMBOL(wake_up_process);
|
|
+
|
|
+int wake_up_state(struct task_struct *p, unsigned int state)
|
|
+{
|
|
+ return try_to_wake_up(p, state, 0);
|
|
+}
|
|
+
|
|
+static void time_slice_expired(struct task_struct *p, struct rq *rq);
|
|
+
|
|
+/*
|
|
+ * Perform scheduler related setup for a newly forked process p.
|
|
+ * p is forked by current.
|
|
+ */
|
|
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ int cpu = get_cpu();
|
|
+
|
|
+#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
|
|
+#endif
|
|
+ /*
|
|
+ * We mark the process as NEW here. This guarantees that
|
|
+ * nobody will actually run it, and a signal or other external
|
|
+ * event cannot wake it up and insert it on the runqueue either.
|
|
+ */
|
|
+ p->state = TASK_NEW;
|
|
+
|
|
+ /* Should be reset in fork.c but done here for ease of MuQSS patching */
|
|
+ p->on_cpu =
|
|
+ p->on_rq =
|
|
+ p->utime =
|
|
+ p->stime =
|
|
+ p->utimescaled =
|
|
+ p->stimescaled =
|
|
+ p->sched_time =
|
|
+ p->stime_ns =
|
|
+ p->utime_ns = 0;
|
|
+ skiplist_node_init(&p->node);
|
|
+
|
|
+ /*
|
|
+ * Revert to default priority/policy on fork if requested.
|
|
+ */
|
|
+ if (unlikely(p->sched_reset_on_fork)) {
|
|
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
|
|
+ p->policy = SCHED_NORMAL;
|
|
+ p->normal_prio = normal_prio(p);
|
|
+ }
|
|
+
|
|
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
|
|
+ p->static_prio = NICE_TO_PRIO(0);
|
|
+ p->normal_prio = p->static_prio;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We don't need the reset flag anymore after the fork. It has
|
|
+ * fulfilled its duty:
|
|
+ */
|
|
+ p->sched_reset_on_fork = 0;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Silence PROVE_RCU.
|
|
+ */
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
|
|
+ set_task_cpu(p, cpu);
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
+
|
|
+#ifdef CONFIG_SCHED_INFO
|
|
+ if (unlikely(sched_info_on()))
|
|
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
|
|
+#endif
|
|
+ init_task_preempt_count(p);
|
|
+
|
|
+ put_cpu();
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SCHEDSTATS
|
|
+
|
|
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
|
|
+static bool __initdata __sched_schedstats = false;
|
|
+
|
|
+static void set_schedstats(bool enabled)
|
|
+{
|
|
+ if (enabled)
|
|
+ static_branch_enable(&sched_schedstats);
|
|
+ else
|
|
+ static_branch_disable(&sched_schedstats);
|
|
+}
|
|
+
|
|
+void force_schedstat_enabled(void)
|
|
+{
|
|
+ if (!schedstat_enabled()) {
|
|
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
|
|
+ static_branch_enable(&sched_schedstats);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int __init setup_schedstats(char *str)
|
|
+{
|
|
+ int ret = 0;
|
|
+ if (!str)
|
|
+ goto out;
|
|
+
|
|
+ /*
|
|
+ * This code is called before jump labels have been set up, so we can't
|
|
+ * change the static branch directly just yet. Instead set a temporary
|
|
+ * variable so init_schedstats() can do it later.
|
|
+ */
|
|
+ if (!strcmp(str, "enable")) {
|
|
+ __sched_schedstats = true;
|
|
+ ret = 1;
|
|
+ } else if (!strcmp(str, "disable")) {
|
|
+ __sched_schedstats = false;
|
|
+ ret = 1;
|
|
+ }
|
|
+out:
|
|
+ if (!ret)
|
|
+ pr_warn("Unable to parse schedstats=\n");
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+__setup("schedstats=", setup_schedstats);
|
|
+
|
|
+static void __init init_schedstats(void)
|
|
+{
|
|
+ set_schedstats(__sched_schedstats);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_PROC_SYSCTL
|
|
+int sysctl_schedstats(struct ctl_table *table, int write,
|
|
+ void __user *buffer, size_t *lenp, loff_t *ppos)
|
|
+{
|
|
+ struct ctl_table t;
|
|
+ int err;
|
|
+ int state = static_branch_likely(&sched_schedstats);
|
|
+
|
|
+ if (write && !capable(CAP_SYS_ADMIN))
|
|
+ return -EPERM;
|
|
+
|
|
+ t = *table;
|
|
+ t.data = &state;
|
|
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ if (write)
|
|
+ set_schedstats(state);
|
|
+ return err;
|
|
+}
|
|
+#endif /* CONFIG_PROC_SYSCTL */
|
|
+#else /* !CONFIG_SCHEDSTATS */
|
|
+static inline void init_schedstats(void) {}
|
|
+#endif /* CONFIG_SCHEDSTATS */
|
|
+
|
|
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
|
|
+
|
|
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ update_clocks(rq);
|
|
+ /* This isn't really a context switch but accounting is the same */
|
|
+ update_cpu_clock_switch(rq, p);
|
|
+ p->last_ran = rq->niffies;
|
|
+}
|
|
+
|
|
+static inline int hrexpiry_enabled(struct rq *rq)
|
|
+{
|
|
+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
|
|
+ return 0;
|
|
+ return hrtimer_is_hres_active(&rq->hrexpiry_timer);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Use HR-timers to deliver accurate preemption points.
|
|
+ */
|
|
+static inline void hrexpiry_clear(struct rq *rq)
|
|
+{
|
|
+ if (!hrexpiry_enabled(rq))
|
|
+ return;
|
|
+ if (hrtimer_active(&rq->hrexpiry_timer))
|
|
+ hrtimer_cancel(&rq->hrexpiry_timer);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * High-resolution time_slice expiry.
|
|
+ * Runs from hardirq context with interrupts disabled.
|
|
+ */
|
|
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
|
|
+{
|
|
+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
|
|
+ struct task_struct *p;
|
|
+
|
|
+ /* This can happen during CPU hotplug / resume */
|
|
+ if (unlikely(cpu_of(rq) != smp_processor_id()))
|
|
+ goto out;
|
|
+
|
|
+ /*
|
|
+ * We're doing this without the runqueue lock but this should always
|
|
+ * be run on the local CPU. Time slice should run out in __schedule
|
|
+ * but we set it to zero here in case niffies is slightly less.
|
|
+ */
|
|
+ p = rq->curr;
|
|
+ p->time_slice = 0;
|
|
+ __set_tsk_resched(p);
|
|
+out:
|
|
+ return HRTIMER_NORESTART;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Called to set the hrexpiry timer state.
|
|
+ *
|
|
+ * called with irqs disabled from the local CPU only
|
|
+ */
|
|
+static void hrexpiry_start(struct rq *rq, u64 delay)
|
|
+{
|
|
+ if (!hrexpiry_enabled(rq))
|
|
+ return;
|
|
+
|
|
+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
|
|
+ HRTIMER_MODE_REL_PINNED);
|
|
+}
|
|
+
|
|
+static void init_rq_hrexpiry(struct rq *rq)
|
|
+{
|
|
+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
+ rq->hrexpiry_timer.function = hrexpiry;
|
|
+}
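For comparison, the same "arm a one-shot high-resolution timer for the remaining slice" pattern in userspace might look like the timerfd sketch below (purely illustrative; the scheduler uses a CPU-pinned hrtimer, not a file descriptor):

    #include <stdint.h>
    #include <sys/timerfd.h>

    /* Arm a one-shot CLOCK_MONOTONIC timer delay_ns from now. */
    static int arm_expiry(uint64_t delay_ns)
    {
            struct itimerspec its = {
                    .it_value.tv_sec  = delay_ns / 1000000000ULL,
                    .it_value.tv_nsec = delay_ns % 1000000000ULL,
            };
            int fd = timerfd_create(CLOCK_MONOTONIC, 0);

            if (fd >= 0)
                    timerfd_settime(fd, 0, &its, NULL);
            return fd;
    }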
|
|
+
|
|
+static inline int rq_dither(struct rq *rq)
|
|
+{
|
|
+ if (!hrexpiry_enabled(rq))
|
|
+ return HALF_JIFFY_US;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * wake_up_new_task - wake up a newly created task for the first time.
|
|
+ *
|
|
+ * This function will do some initial scheduler statistics housekeeping
|
|
+ * that must be done for every newly created context, then puts the task
|
|
+ * on the runqueue and wakes it.
|
|
+ */
|
|
+void wake_up_new_task(struct task_struct *p)
|
|
+{
|
|
+ struct task_struct *parent, *rq_curr;
|
|
+ struct rq *rq, *new_rq;
|
|
+ unsigned long flags;
|
|
+
|
|
+ parent = p->parent;
|
|
+
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
|
|
+ p->state = TASK_RUNNING;
|
|
+ /* Task_rq can't change yet on a new task */
|
|
+ new_rq = rq = task_rq(p);
|
|
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
|
|
+ set_task_cpu(p, valid_task_cpu(p));
|
|
+ new_rq = task_rq(p);
|
|
+ }
|
|
+
|
|
+ double_rq_lock(rq, new_rq);
|
|
+ rq_curr = rq->curr;
|
|
+
|
|
+ /*
|
|
+ * Make sure we do not leak PI boosting priority to the child.
|
|
+ */
|
|
+ p->prio = rq_curr->normal_prio;
|
|
+
|
|
+ trace_sched_wakeup_new(p);
|
|
+
|
|
+ /*
|
|
+ * Share the timeslice between parent and child, thus the
|
|
+ * total amount of pending timeslices in the system doesn't change,
|
|
+ * resulting in more scheduling fairness. If it's negative, it won't
|
|
+ * matter since that's the same as being 0. rq->rq_deadline is only
|
|
+ * modified within schedule() so it is always equal to
|
|
+ * current->deadline.
|
|
+ */
|
|
+ account_task_cpu(rq, rq_curr);
|
|
+ p->last_ran = rq_curr->last_ran;
|
|
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
|
|
+ rq_curr->time_slice /= 2;
|
|
+ if (rq_curr->time_slice < RESCHED_US) {
|
|
+ /*
|
|
+ * Forking task has run out of timeslice. Reschedule it and
|
|
+ * start its child with a new time slice and deadline. The
|
|
+ * child will end up running first because its deadline will
|
|
+ * be slightly earlier.
|
|
+ */
|
|
+ __set_tsk_resched(rq_curr);
|
|
+ time_slice_expired(p, new_rq);
|
|
+ if (suitable_idle_cpus(p))
|
|
+ resched_best_idle(p, task_cpu(p));
|
|
+ else if (unlikely(rq != new_rq))
|
|
+ try_preempt(p, new_rq);
|
|
+ } else {
|
|
+ p->time_slice = rq_curr->time_slice;
|
|
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
|
|
+ /*
|
|
+ * The VM isn't cloned, so we're in a good position to
|
|
+ * do child-runs-first in anticipation of an exec. This
|
|
+ * usually avoids a lot of COW overhead.
|
|
+ */
|
|
+ __set_tsk_resched(rq_curr);
|
|
+ } else {
|
|
+ /*
|
|
+ * Adjust the hrexpiry since rq_curr will keep
|
|
+ * running and its timeslice has been shortened.
|
|
+ */
|
|
+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice));
|
|
+ try_preempt(p, new_rq);
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ time_slice_expired(p, new_rq);
|
|
+ try_preempt(p, new_rq);
|
|
+ }
|
|
+ activate_task(p, new_rq);
|
|
+ double_rq_unlock(rq, new_rq);
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
+
|
|
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
|
|
+
|
|
+void preempt_notifier_inc(void)
|
|
+{
|
|
+ static_key_slow_inc(&preempt_notifier_key);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
|
|
+
|
|
+void preempt_notifier_dec(void)
|
|
+{
|
|
+ static_key_slow_dec(&preempt_notifier_key);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
|
|
+
|
|
+/**
|
|
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
|
|
+ * @notifier: notifier struct to register
|
|
+ */
|
|
+void preempt_notifier_register(struct preempt_notifier *notifier)
|
|
+{
|
|
+ if (!static_key_false(&preempt_notifier_key))
|
|
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
|
|
+
|
|
+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
|
|
+
|
|
+/**
|
|
+ * preempt_notifier_unregister - no longer interested in preemption notifications
|
|
+ * @notifier: notifier struct to unregister
|
|
+ *
|
|
+ * This is *not* safe to call from within a preemption notifier.
|
|
+ */
|
|
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
|
|
+{
|
|
+ hlist_del(¬ifier->link);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
|
|
+
|
|
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
|
|
+{
|
|
+ struct preempt_notifier *notifier;
|
|
+
|
|
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
|
|
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
|
|
+}
|
|
+
|
|
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
|
|
+{
|
|
+ if (static_key_false(&preempt_notifier_key))
|
|
+ __fire_sched_in_preempt_notifiers(curr);
|
|
+}
|
|
+
|
|
+static void
|
|
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+ struct preempt_notifier *notifier;
|
|
+
|
|
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
|
|
+ notifier->ops->sched_out(notifier, next);
|
|
+}
|
|
+
|
|
+static __always_inline void
|
|
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+ if (static_key_false(&preempt_notifier_key))
|
|
+ __fire_sched_out_preempt_notifiers(curr, next);
|
|
+}
|
|
+
|
|
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
|
|
+
|
|
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void
|
|
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
|
|
+
|
|
+/**
|
|
+ * prepare_task_switch - prepare to switch tasks
|
|
+ * @rq: the runqueue preparing to switch
|
|
+ * @next: the task we are going to switch to.
|
|
+ *
|
|
+ * This is called with the rq lock held and interrupts off. It must
|
|
+ * be paired with a subsequent finish_task_switch after the context
|
|
+ * switch.
|
|
+ *
|
|
+ * prepare_task_switch sets up locking and calls architecture specific
|
|
+ * hooks.
|
|
+ */
|
|
+static inline void
|
|
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+ sched_info_switch(rq, prev, next);
|
|
+ perf_event_task_sched_out(prev, next);
|
|
+ fire_sched_out_preempt_notifiers(prev, next);
|
|
+ prepare_lock_switch(rq, next);
|
|
+ prepare_arch_switch(next);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * finish_task_switch - clean up after a task-switch
|
|
+ * @rq: runqueue associated with task-switch
|
|
+ * @prev: the thread we just switched away from.
|
|
+ *
|
|
+ * finish_task_switch must be called after the context switch, paired
|
|
+ * with a prepare_task_switch call before the context switch.
|
|
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
|
|
+ * and do any other architecture-specific cleanup actions.
|
|
+ *
|
|
+ * Note that we may have delayed dropping an mm in context_switch(). If
|
|
+ * so, we finish that here outside of the runqueue lock. (Doing it
|
|
+ * with the lock held can cause deadlocks; see schedule() for
|
|
+ * details.)
|
|
+ *
|
|
+ * The context switch has flipped the stack from under us and restored the
|
|
+ * local variables which were saved when this task called schedule() in the
|
|
+ * past. prev == current is still correct but we need to recalculate this_rq
|
|
+ * because prev may have moved to another CPU.
|
|
+ */
|
|
+static void finish_task_switch(struct task_struct *prev)
|
|
+ __releases(rq->lock)
|
|
+{
|
|
+ struct rq *rq = this_rq();
|
|
+ struct mm_struct *mm = rq->prev_mm;
|
|
+ long prev_state;
|
|
+
|
|
+ /*
|
|
+ * The previous task will have left us with a preempt_count of 2
|
|
+ * because it left us after:
|
|
+ *
|
|
+ * schedule()
|
|
+ * preempt_disable(); // 1
|
|
+ * __schedule()
|
|
+ * raw_spin_lock_irq(&rq->lock) // 2
|
|
+ *
|
|
+ * Also, see FORK_PREEMPT_COUNT.
|
|
+ */
|
|
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
|
|
+ "corrupted preempt_count: %s/%d/0x%x\n",
|
|
+ current->comm, current->pid, preempt_count()))
|
|
+ preempt_count_set(FORK_PREEMPT_COUNT);
|
|
+
|
|
+ rq->prev_mm = NULL;
|
|
+
|
|
+ /*
|
|
+ * A task struct has one reference for the use as "current".
|
|
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
|
|
+ * schedule one last time. The schedule call will never return, and
|
|
+ * the scheduled task must drop that reference.
|
|
+ *
|
|
+ * We must observe prev->state before clearing prev->on_cpu (in
|
|
+ * finish_lock_switch), otherwise a concurrent wakeup can get prev
|
|
+ * running on another CPU and we could race with its RUNNING -> DEAD
|
|
+ * transition, resulting in a double drop.
|
|
+ */
|
|
+ prev_state = prev->state;
|
|
+ vtime_task_switch(prev);
|
|
+ perf_event_task_sched_in(prev, current);
|
|
+ finish_lock_switch(rq, prev);
|
|
+ finish_arch_post_lock_switch();
|
|
+
|
|
+ fire_sched_in_preempt_notifiers(current);
|
|
+ if (mm)
|
|
+ mmdrop(mm);
|
|
+ if (unlikely(prev_state == TASK_DEAD)) {
|
|
+ /*
|
|
+ * Remove function-return probe instances associated with this
|
|
+ * task and put them back on the free list.
|
|
+ */
|
|
+ kprobe_flush_task(prev);
|
|
+
|
|
+ /* Task is done with its stack. */
|
|
+ put_task_stack(prev);
|
|
+
|
|
+ put_task_struct(prev);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * schedule_tail - first thing a freshly forked thread must call.
|
|
+ * @prev: the thread we just switched away from.
|
|
+ */
|
|
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
|
|
+{
|
|
+ /*
|
|
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
|
|
+ * finish_task_switch() for details.
|
|
+ *
|
|
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
|
|
+ * and the preempt_enable() will end up enabling preemption (on
|
|
+ * PREEMPT_COUNT kernels).
|
|
+ */
|
|
+
|
|
+ finish_task_switch(prev);
|
|
+ preempt_enable();
|
|
+
|
|
+ if (current->set_child_tid)
|
|
+ put_user(task_pid_vnr(current), current->set_child_tid);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * context_switch - switch to the new MM and the new thread's register state.
|
|
+ */
|
|
+static __always_inline void
|
|
+context_switch(struct rq *rq, struct task_struct *prev,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+ struct mm_struct *mm, *oldmm;
|
|
+
|
|
+ prepare_task_switch(rq, prev, next);
|
|
+
|
|
+ mm = next->mm;
|
|
+ oldmm = prev->active_mm;
|
|
+ /*
|
|
+ * For paravirt, this is coupled with an exit in switch_to to
|
|
+ * combine the page table reload and the switch backend into
|
|
+ * one hypercall.
|
|
+ */
|
|
+ arch_start_context_switch(prev);
|
|
+
|
|
+ if (!mm) {
|
|
+ next->active_mm = oldmm;
|
|
+ atomic_inc(&oldmm->mm_count);
|
|
+ enter_lazy_tlb(oldmm, next);
|
|
+ } else
|
|
+ switch_mm_irqs_off(oldmm, mm, next);
|
|
+
|
|
+ if (!prev->mm) {
|
|
+ prev->active_mm = NULL;
|
|
+ rq->prev_mm = oldmm;
|
|
+ }
|
|
+ /*
|
|
+ * The runqueue lock will be released by the next
|
|
+ * task (which is an invalid locking op but in the case
|
|
+ * of the scheduler it's an obvious special-case), so we
|
|
+ * do an early lockdep release here:
|
|
+ */
|
|
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
|
+
|
|
+ /* Here we just switch the register state and the stack. */
|
|
+ switch_to(prev, next, prev);
|
|
+ barrier();
|
|
+
|
|
+ finish_task_switch(prev);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * nr_running, nr_uninterruptible and nr_context_switches:
|
|
+ *
|
|
+ * externally visible scheduler statistics: current number of runnable
|
|
+ * threads, total number of context switches performed since bootup.
|
|
+ */
|
|
+unsigned long nr_running(void)
|
|
+{
|
|
+ return atomic_read(&grq.nr_running);
|
|
+}
|
|
+
|
|
+static unsigned long nr_uninterruptible(void)
|
|
+{
|
|
+ return atomic_read(&grq.nr_uninterruptible);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check if only the current task is running on the cpu.
|
|
+ *
|
|
+ * Caution: this function does not check that the caller has disabled
|
|
+ * preemption, thus the result might have a time-of-check-to-time-of-use
|
|
+ * race. The caller is responsible for using it correctly, for example:
|
|
+ *
|
|
+ * - from a non-preemptable section (of course)
|
|
+ *
|
|
+ * - from a thread that is bound to a single CPU
|
|
+ *
|
|
+ * - in a loop with very short iterations (e.g. a polling loop)
|
|
+ */
|
|
+bool single_task_running(void)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(smp_processor_id());
|
|
+
|
|
+ if (rq_load(rq) == 1)
|
|
+ return true;
|
|
+ else
|
|
+ return false;
|
|
+}
|
|
+EXPORT_SYMBOL(single_task_running);
|
|
+
|
|
+unsigned long long nr_context_switches(void)
|
|
+{
|
|
+ return (unsigned long long)atomic64_read(&grq.nr_switches);
|
|
+}
|
|
+
|
|
+unsigned long nr_iowait(void)
|
|
+{
|
|
+ unsigned long i, sum = 0;
|
|
+
|
|
+ for_each_possible_cpu(i)
|
|
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
|
|
+
|
|
+ return sum;
|
|
+}
|
|
+
|
|
+unsigned long nr_iowait_cpu(int cpu)
|
|
+{
|
|
+ struct rq *this = cpu_rq(cpu);
|
|
+ return atomic_read(&this->nr_iowait);
|
|
+}
|
|
+
|
|
+unsigned long nr_active(void)
|
|
+{
|
|
+ return nr_running() + nr_uninterruptible();
|
|
+}
|
|
+
|
|
+/*
|
|
+ * I/O wait is the number of running or queued tasks with their ->rq pointer
|
|
+ * set to this CPU, that being the CPU they're most likely to run on.
|
|
+ */
|
|
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
|
|
+{
|
|
+ struct rq *rq = this_rq();
|
|
+
|
|
+ *nr_waiters = atomic_read(&rq->nr_iowait);
|
|
+ *load = rq_load(rq);
|
|
+}
|
|
+
|
|
+/* Variables and functions for calc_load */
|
|
+static unsigned long calc_load_update;
|
|
+unsigned long avenrun[3];
|
|
+EXPORT_SYMBOL(avenrun);
|
|
+
|
|
+/**
|
|
+ * get_avenrun - get the load average array
|
|
+ * @loads: pointer to dest load array
|
|
+ * @offset: offset to add
|
|
+ * @shift: shift count to shift the result left
|
|
+ *
|
|
+ * These values are estimates at best, so no need for locking.
|
|
+ */
|
|
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
|
|
+{
|
|
+ loads[0] = (avenrun[0] + offset) << shift;
|
|
+ loads[1] = (avenrun[1] + offset) << shift;
|
|
+ loads[2] = (avenrun[2] + offset) << shift;
|
|
+}
|
|
+
|
|
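+/*
+ * Fixed-point exponential moving average:
+ * new = (load * exp + active * (FIXED_1 - exp)) / FIXED_1,
+ * rounded up while the load is rising.
+ */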
+static unsigned long
|
|
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
|
|
+{
|
|
+ unsigned long newload;
|
|
+
|
|
+ newload = load * exp + active * (FIXED_1 - exp);
|
|
+ if (active >= load)
|
|
+ newload += FIXED_1-1;
|
|
+
|
|
+ return newload / FIXED_1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * calc_global_load - update the avenrun load estimates every LOAD_FREQ seconds.
|
|
+ */
|
|
+void calc_global_load(unsigned long ticks)
|
|
+{
|
|
+ long active;
|
|
+
|
|
+ if (time_before(jiffies, calc_load_update))
|
|
+ return;
|
|
+ active = nr_active() * FIXED_1;
|
|
+
|
|
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
|
|
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
|
|
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
|
|
+
|
|
+ calc_load_update = jiffies + LOAD_FREQ;
|
|
+}
|
|
+
|
|
+DEFINE_PER_CPU(struct kernel_stat, kstat);
|
|
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
|
|
+
|
|
+EXPORT_PER_CPU_SYMBOL(kstat);
|
|
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
|
|
+
|
|
+#ifdef CONFIG_PARAVIRT
|
|
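+/* Convert a nanosecond steal time delta into a number of whole ticks. */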
+static inline u64 steal_ticks(u64 steal)
|
|
+{
|
|
+ if (unlikely(steal > NSEC_PER_SEC))
|
|
+ return div_u64(steal, TICK_NSEC);
|
|
+
|
|
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void update_rq_clock_task(struct rq *rq, s64 delta)
|
|
+{
|
|
+/*
|
|
+ * In theory, the compiler should just see 0 here, and optimize out the call
|
|
+ * to sched_rt_avg_update. But I don't trust it...
|
|
+ */
|
|
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
|
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
|
|
+
|
|
+ /*
|
|
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
|
|
+ * this case when a previous update_rq_clock() happened inside a
|
|
+ * {soft,}irq region.
|
|
+ *
|
|
+ * When this happens, we stop ->clock_task and only update the
|
|
+ * prev_irq_time stamp to account for the part that fit, so that a next
|
|
+ * update will consume the rest. This ensures ->clock_task is
|
|
+ * monotonic.
|
|
+ *
|
|
+ * It does however cause some slight misattribution of {soft,}irq
|
|
+ * time, a more accurate solution would be to update the irq_time using
|
|
+ * the current rq->clock timestamp, except that would require using
|
|
+ * atomic ops.
|
|
+ */
|
|
+ if (irq_delta > delta)
|
|
+ irq_delta = delta;
|
|
+
|
|
+ rq->prev_irq_time += irq_delta;
|
|
+ delta -= irq_delta;
|
|
+#endif
|
|
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
|
|
+ if (static_key_false((¶virt_steal_rq_enabled))) {
|
|
+ s64 steal = paravirt_steal_clock(cpu_of(rq));
|
|
+
|
|
+ steal -= rq->prev_steal_time_rq;
|
|
+
|
|
+ if (unlikely(steal > delta))
|
|
+ steal = delta;
|
|
+
|
|
+ rq->prev_steal_time_rq += steal;
|
|
+
|
|
+ delta -= steal;
|
|
+ }
|
|
+#endif
|
|
+ rq->clock_task += delta;
|
|
+}
|
|
+
|
|
+#ifndef nsecs_to_cputime
|
|
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * On each tick, add the number of nanoseconds to the unbanked variables and
|
|
+ * once one tick's worth has accumulated, account it allowing for accurate
|
|
+ * sub-tick accounting and totals.
|
|
+ */
|
|
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
|
|
+{
|
|
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
|
|
+ unsigned long ticks;
|
|
+
|
|
+ if (atomic_read(&rq->nr_iowait) > 0) {
|
|
+ rq->iowait_ns += ns;
|
|
+ if (rq->iowait_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->iowait_ns);
|
|
+ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->iowait_ns %= JIFFY_NS;
|
|
+ }
|
|
+ } else {
|
|
+ rq->idle_ns += ns;
|
|
+ if (rq->idle_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->idle_ns);
|
|
+ cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->idle_ns %= JIFFY_NS;
|
|
+ }
|
|
+ }
|
|
+ acct_update_integrals(idle);
|
|
+}
|
|
+
|
|
+static void pc_system_time(struct rq *rq, struct task_struct *p,
|
|
+ int hardirq_offset, unsigned long ns)
|
|
+{
|
|
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
|
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
|
|
+ unsigned long ticks;
|
|
+
|
|
+ p->stime_ns += ns;
|
|
+ if (p->stime_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(p->stime_ns);
|
|
+ p->stime_ns %= JIFFY_NS;
|
|
+ p->stime += (__force u64)cputime_one_jiffy * ticks;
|
|
+ p->stimescaled += one_jiffy_scaled * ticks;
|
|
+ account_group_system_time(p, cputime_one_jiffy * ticks);
|
|
+ }
|
|
+ p->sched_time += ns;
|
|
+ account_group_exec_runtime(p, ns);
|
|
+
|
|
+ if (hardirq_count() - hardirq_offset) {
|
|
+ rq->irq_ns += ns;
|
|
+ if (rq->irq_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->irq_ns);
|
|
+ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->irq_ns %= JIFFY_NS;
|
|
+ }
|
|
+ } else if (in_serving_softirq()) {
|
|
+ rq->softirq_ns += ns;
|
|
+ if (rq->softirq_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
|
|
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->softirq_ns %= JIFFY_NS;
|
|
+ }
|
|
+ } else {
|
|
+ rq->system_ns += ns;
|
|
+ if (rq->system_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->system_ns);
|
|
+ cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->system_ns %= JIFFY_NS;
|
|
+ }
|
|
+ }
|
|
+ acct_update_integrals(p);
|
|
+}
|
|
+
|
|
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
|
|
+{
|
|
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
|
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
|
|
+ unsigned long ticks;
|
|
+
|
|
+ p->utime_ns += ns;
|
|
+ if (p->utime_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(p->utime_ns);
|
|
+ p->utime_ns %= JIFFY_NS;
|
|
+ p->utime += (__force u64)cputime_one_jiffy * ticks;
|
|
+ p->utimescaled += one_jiffy_scaled * ticks;
|
|
+ account_group_user_time(p, cputime_one_jiffy * ticks);
|
|
+ }
|
|
+ p->sched_time += ns;
|
|
+ account_group_exec_runtime(p, ns);
|
|
+
|
|
+ if (this_cpu_ksoftirqd() == p) {
|
|
+ /*
|
|
+ * ksoftirqd time does not get accounted in cpu_softirq_time.
|
|
+ * So, we have to handle it separately here.
|
|
+ */
|
|
+ rq->softirq_ns += ns;
|
|
+ if (rq->softirq_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
|
|
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->softirq_ns %= JIFFY_NS;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (task_nice(p) > 0 || idleprio_task(p)) {
|
|
+ rq->nice_ns += ns;
|
|
+ if (rq->nice_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->nice_ns);
|
|
+ cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->nice_ns %= JIFFY_NS;
|
|
+ }
|
|
+ } else {
|
|
+ rq->user_ns += ns;
|
|
+ if (rq->user_ns >= JIFFY_NS) {
|
|
+ ticks = NS_TO_JIFFIES(rq->user_ns);
|
|
+ cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks;
|
|
+ rq->user_ns %= JIFFY_NS;
|
|
+ }
|
|
+ }
|
|
+ acct_update_integrals(p);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is called on clock ticks.
|
|
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
|
|
+ * CPU scheduler quota accounting is also performed here in microseconds.
|
|
+ */
|
|
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ s64 account_ns = rq->niffies - p->last_ran;
|
|
+ struct task_struct *idle = rq->idle;
|
|
+
|
|
+ /* Accurate tick timekeeping */
|
|
+ if (user_mode(get_irq_regs()))
|
|
+ pc_user_time(rq, p, account_ns);
|
|
+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
|
|
+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
|
|
+ } else
|
|
+ pc_idle_time(rq, idle, account_ns);
|
|
+
|
|
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
|
|
+ if (p->policy != SCHED_FIFO && p != idle)
|
|
+ p->time_slice -= NS_TO_US(account_ns);
|
|
+
|
|
+ p->last_ran = rq->niffies;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is called on context switches.
|
|
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
|
|
+ * CPU scheduler quota accounting is also performed here in microseconds.
|
|
+ */
|
|
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ s64 account_ns = rq->niffies - p->last_ran;
|
|
+ struct task_struct *idle = rq->idle;
|
|
+
|
|
+ /* Accurate subtick timekeeping */
|
|
+ if (p != idle)
|
|
+ pc_user_time(rq, p, account_ns);
|
|
+ else
|
|
+ pc_idle_time(rq, idle, account_ns);
|
|
+
|
|
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
|
|
+ if (p->policy != SCHED_FIFO && p != idle)
|
|
+ p->time_slice -= NS_TO_US(account_ns);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return any ns on the sched_clock that have not yet been accounted in
|
|
+ * @p in case that task is currently running.
|
|
+ *
|
|
+ * Called with task_rq_lock(p) held.
|
|
+ */
|
|
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ u64 ns = 0;
|
|
+
|
|
+ /*
|
|
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
|
|
+ * project cycles that may never be accounted to this
|
|
+ * thread, breaking clock_gettime().
|
|
+ */
|
|
+ if (p == rq->curr && task_on_rq_queued(p)) {
|
|
+ update_clocks(rq);
|
|
+ ns = rq->niffies - p->last_ran;
|
|
+ }
|
|
+
|
|
+ return ns;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return accounted runtime for the task.
|
|
+ * Separately return the currently running task's pending runtime that has
|
|
+ * not been accounted yet.
|
|
+ *
|
|
+ */
|
|
+unsigned long long task_sched_runtime(struct task_struct *p)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ struct rq *rq;
|
|
+ u64 ns;
|
|
+
|
|
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
|
|
+ /*
|
|
+ * 64-bit doesn't need locks to atomically read a 64-bit value.
|
|
+ * So we have an optimization chance when the task's delta_exec is 0.
|
|
+ * Reading ->on_cpu is racy, but this is ok.
|
|
+ *
|
|
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
|
|
+ * If we race with it entering cpu, unaccounted time is 0. This is
|
|
+ * indistinguishable from the read occurring a few cycles earlier.
|
|
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
|
|
+ * been accounted, so we're correct here as well.
|
|
+ */
|
|
+ if (!p->on_cpu || !task_on_rq_queued(p))
|
|
+ return tsk_seruntime(p);
|
|
+#endif
|
|
+
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+ ns = p->sched_time + do_task_delta_exec(p, rq);
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+
|
|
+ return ns;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Functions to test for when SCHED_ISO tasks have used their allocated
|
|
+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
|
|
+ * data is modified only by the local runqueue during scheduler_tick with
|
|
+ * interrupts disabled.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
|
|
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
|
|
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
|
|
+ * slow division.
|
|
+ */
|
|
+static inline void iso_tick(struct rq *rq)
|
|
+{
|
|
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
|
|
+ rq->iso_ticks += 100;
|
|
+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
|
|
+ rq->iso_refractory = true;
|
|
+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
|
|
+ rq->iso_ticks = ISO_PERIOD * 100;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
|
|
+static inline void no_iso_tick(struct rq *rq, int ticks)
|
|
+{
|
|
+ if (rq->iso_ticks > 0 || rq->iso_refractory) {
|
|
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
|
|
+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
|
|
+ rq->iso_refractory = false;
|
|
+ if (unlikely(rq->iso_ticks < 0))
|
|
+ rq->iso_ticks = 0;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
|
|
+static void task_running_tick(struct rq *rq)
|
|
+{
|
|
+ struct task_struct *p = rq->curr;
|
|
+
|
|
+ /*
|
|
+ * If a SCHED_ISO task is running we increment the iso_ticks. In
|
|
+ * order to prevent SCHED_ISO tasks from causing starvation in the
|
|
+ * presence of true RT tasks we account those as iso_ticks as well.
|
|
+ */
|
|
+ if (rt_task(p) || task_running_iso(p))
|
|
+ iso_tick(rq);
|
|
+ else
|
|
+ no_iso_tick(rq, 1);
|
|
+
|
|
+ /* SCHED_FIFO tasks never run out of timeslice. */
|
|
+ if (p->policy == SCHED_FIFO)
|
|
+ return;
|
|
+
|
|
+ if (iso_task(p)) {
|
|
+ if (task_running_iso(p)) {
|
|
+ if (rq->iso_refractory) {
|
|
+ /*
|
|
+ * SCHED_ISO task is running as RT and limit
|
|
+ * has been hit. Force it to reschedule as
|
|
+ * SCHED_NORMAL by zeroing its time_slice
|
|
+ */
|
|
+ p->time_slice = 0;
|
|
+ }
|
|
+ } else if (!rq->iso_refractory) {
|
|
+ /* Can now run as ISO again. Reschedule to pick up the prio. */
|
|
+ goto out_resched;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Tasks that were scheduled in the first half of a tick are not
|
|
+ * allowed to run into the 2nd half of the next tick if they will
|
|
+ * run out of time slice in the interim. Otherwise, if they have
|
|
+ * less than RESCHED_US μs of time slice left they will be rescheduled.
|
|
+ * Dither is used as a backup for when hrexpiry is disabled or high
|
|
+ * resolution timers are not configured in.
|
|
+ */
|
|
+ if (p->time_slice - rq->dither >= RESCHED_US)
|
|
+ return;
|
|
+out_resched:
|
|
+ rq_lock(rq);
|
|
+ __set_tsk_resched(p);
|
|
+ rq_unlock(rq);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_NO_HZ_FULL
|
|
+/*
|
|
+ * We can stop the timer tick any time highres timers are active since
|
|
+ * we rely entirely on highres timeouts for task expiry rescheduling.
|
|
+ */
|
|
+static void sched_stop_tick(struct rq *rq, int cpu)
|
|
+{
|
|
+ if (!hrexpiry_enabled(rq))
|
|
+ return;
|
|
+ if (!tick_nohz_full_enabled())
|
|
+ return;
|
|
+ if (!tick_nohz_full_cpu(cpu))
|
|
+ return;
|
|
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
|
|
+}
|
|
+
|
|
+static inline void sched_start_tick(struct rq *rq, int cpu)
|
|
+{
|
|
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * scheduler_tick_max_deferment
|
|
+ *
|
|
+ * Keep at least one tick per second when a single
|
|
+ * active task is running.
|
|
+ *
|
|
+ * This makes sure that uptime continues to move forward, even
|
|
+ * with a very low granularity.
|
|
+ *
|
|
+ * Return: Maximum deferment in nanoseconds.
|
|
+ */
|
|
+u64 scheduler_tick_max_deferment(void)
|
|
+{
|
|
+ struct rq *rq = this_rq();
|
|
+ unsigned long next, now = READ_ONCE(jiffies);
|
|
+
|
|
+ next = rq->last_jiffy + HZ;
|
|
+
|
|
+ if (time_before_eq(next, now))
|
|
+ return 0;
|
|
+
|
|
+ return jiffies_to_nsecs(next - now);
|
|
+}
|
|
+#else
|
|
+static inline void sched_stop_tick(struct rq *rq, int cpu)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void sched_start_tick(struct rq *rq, int cpu)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * This function gets called by the timer code, with HZ frequency.
|
|
+ * We call it with interrupts disabled.
|
|
+ */
|
|
+void scheduler_tick(void)
|
|
+{
|
|
+ int cpu __maybe_unused = smp_processor_id();
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+
|
|
+ sched_clock_tick();
|
|
+ update_clocks(rq);
|
|
+ update_load_avg(rq, 0);
|
|
+ update_cpu_clock_tick(rq, rq->curr);
|
|
+ if (!rq_idle(rq))
|
|
+ task_running_tick(rq);
|
|
+ else
|
|
+ no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy);
|
|
+ rq->last_scheduler_tick = rq->last_jiffy;
|
|
+ rq->last_tick = rq->clock;
|
|
+ perf_event_task_tick();
|
|
+ sched_stop_tick(rq, cpu);
|
|
+}
|
|
+
|
|
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
|
|
+ defined(CONFIG_PREEMPT_TRACER))
|
|
+/*
|
|
+ * If the value passed in is equal to the current preempt count
|
|
+ * then we just disabled preemption. Start timing the latency.
|
|
+ */
|
|
+static inline void preempt_latency_start(int val)
|
|
+{
|
|
+ if (preempt_count() == val) {
|
|
+ unsigned long ip = get_lock_parent_ip();
|
|
+#ifdef CONFIG_DEBUG_PREEMPT
|
|
+ current->preempt_disable_ip = ip;
|
|
+#endif
|
|
+ trace_preempt_off(CALLER_ADDR0, ip);
|
|
+ }
|
|
+}
|
|
+
|
|
+void preempt_count_add(int val)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_PREEMPT
|
|
+ /*
|
|
+ * Underflow?
|
|
+ */
|
|
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
|
|
+ return;
|
|
+#endif
|
|
+ __preempt_count_add(val);
|
|
+#ifdef CONFIG_DEBUG_PREEMPT
|
|
+ /*
|
|
+ * Spinlock count overflowing soon?
|
|
+ */
|
|
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
|
|
+ PREEMPT_MASK - 10);
|
|
+#endif
|
|
+ preempt_latency_start(val);
|
|
+}
|
|
+EXPORT_SYMBOL(preempt_count_add);
|
|
+NOKPROBE_SYMBOL(preempt_count_add);
|
|
+
|
|
+/*
|
|
+ * If the value passed in is equal to the current preempt count
|
|
+ * then we just enabled preemption. Stop timing the latency.
|
|
+ */
|
|
+static inline void preempt_latency_stop(int val)
|
|
+{
|
|
+ if (preempt_count() == val)
|
|
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
|
|
+}
|
|
+
|
|
+void preempt_count_sub(int val)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_PREEMPT
|
|
+ /*
|
|
+ * Underflow?
|
|
+ */
|
|
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
|
|
+ return;
|
|
+ /*
|
|
+ * Is the spinlock portion underflowing?
|
|
+ */
|
|
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
|
|
+ !(preempt_count() & PREEMPT_MASK)))
|
|
+ return;
|
|
+#endif
|
|
+
|
|
+ preempt_latency_stop(val);
|
|
+ __preempt_count_sub(val);
|
|
+}
|
|
+EXPORT_SYMBOL(preempt_count_sub);
|
|
+NOKPROBE_SYMBOL(preempt_count_sub);
|
|
+
|
|
+#else
|
|
+static inline void preempt_latency_start(int val) { }
|
|
+static inline void preempt_latency_stop(int val) { }
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * The time_slice is only refilled when it is empty and that is when we set a
|
|
+ * new deadline. Make sure update_clocks has been called recently to update
|
|
+ * rq->niffies.
|
|
+ */
|
|
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ p->time_slice = timeslice();
|
|
+ p->deadline = rq->niffies + task_deadline_diff(p);
|
|
+#ifdef CONFIG_SMT_NICE
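+ /*
+ * smt_bias is the weight used by SMT nice when comparing this task with
+ * tasks on sibling threads: realtime highest, then ISO, with kernel
+ * threads and idle idleprio tasks lowest; normal tasks count down and
+ * refill from their static priority.
+ */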
|
|
+ if (!p->mm)
|
|
+ p->smt_bias = 0;
|
|
+ else if (rt_task(p))
|
|
+ p->smt_bias = 1 << 30;
|
|
+ else if (task_running_iso(p))
|
|
+ p->smt_bias = 1 << 29;
|
|
+ else if (idleprio_task(p)) {
|
|
+ if (task_running_idle(p))
|
|
+ p->smt_bias = 0;
|
|
+ else
|
|
+ p->smt_bias = 1;
|
|
+ } else if (--p->smt_bias < 1)
|
|
+ p->smt_bias = MAX_PRIO - p->static_prio;
|
|
+#endif
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
|
|
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
|
|
+ * have been flagged as not latency sensitive and likely to be fully CPU
|
|
+ * bound so every time they're rescheduled they have their time_slice
|
|
+ * refilled, but get a new later deadline to have little effect on
|
|
+ * SCHED_NORMAL tasks.
|
|
+ */
|
|
+static inline void check_deadline(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ if (p->time_slice < RESCHED_US || batch_task(p))
|
|
+ time_slice_expired(p, rq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Task selection with skiplists is a simple matter of picking off the first
|
|
+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1),
|
|
+ * bounded by the number of processors.
|
|
+ *
|
|
+ * Runqueues are selectively locked based on their unlocked data and then
|
|
+ * unlocked if not needed. At most 3 locks will be held at any time and are
|
|
+ * released as soon as they're no longer needed. All balancing between CPUs
|
|
+ * is thus done here in an extremely simple first come best fit manner.
|
|
+ *
|
|
+ * This iterates over runqueues in cache locality order. In interactive mode
|
|
+ * it iterates over all CPUs and finds the task with the best key/deadline.
|
|
+ * In non-interactive mode it will only take a task if it's from the current
|
|
+ * runqueue or a runqueue with more tasks than the current one with a better
|
|
+ * key/deadline.
|
|
+ */
|
|
+#ifdef CONFIG_SMP
|
|
+static inline struct task_struct
|
|
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
|
|
+{
|
|
+ struct rq *locked = NULL, *chosen = NULL;
|
|
+ struct task_struct *edt = idle;
|
|
+ int i, best_entries = 0;
|
|
+ u64 best_key = ~0ULL;
|
|
+
|
|
+ for (i = 0; i < num_possible_cpus(); i++) {
|
|
+ struct rq *other_rq = rq_order(rq, i);
|
|
+ int entries = other_rq->sl->entries;
|
|
+ skiplist_node *next;
|
|
+
|
|
+ /*
|
|
+ * Check for queued entries locklessly first. The local runqueue
|
|
+ * is locked so entries will always be accurate.
|
|
+ */
|
|
+ if (!sched_interactive) {
|
|
+ /*
|
|
+ * Don't reschedule balance across nodes unless the CPU
|
|
+ * is idle.
|
|
+ */
|
|
+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3)
|
|
+ break;
|
|
+ if (entries <= best_entries)
|
|
+ continue;
|
|
+ } else if (!entries)
|
|
+ continue;
|
|
+
|
|
+ /* if (i) implies other_rq != rq */
|
|
+ if (i) {
|
|
+ /* Check the best key locklessly first */
|
|
+ if (other_rq->best_key >= best_key)
|
|
+ continue;
|
|
+
|
|
+ if (unlikely(!trylock_rq(rq, other_rq)))
|
|
+ continue;
|
|
+
|
|
+ /* Need to reevaluate entries after locking */
|
|
+ entries = other_rq->sl->entries;
|
|
+ if (unlikely(!entries)) {
|
|
+ unlock_rq(other_rq);
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ next = &other_rq->node;
|
|
+ /*
|
|
+ * In interactive mode we check beyond the best entry on other
|
|
+ * runqueues if we can't get the best for smt or affinity
|
|
+ * reasons.
|
|
+ */
|
|
+ while ((next = next->next[0]) != &other_rq->node) {
|
|
+ struct task_struct *p;
|
|
+ u64 key = next->key;
|
|
+
|
|
+ /* Reevaluate key after locking */
|
|
+ if (key >= best_key)
|
|
+ break;
|
|
+
|
|
+ p = next->value;
|
|
+ if (!smt_schedule(p, rq)) {
|
|
+ if (i && !sched_interactive)
|
|
+ break;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Make sure affinity is ok */
|
|
+ if (i) {
|
|
+ if (needs_other_cpu(p, cpu)) {
|
|
+ if (sched_interactive)
|
|
+ continue;
|
|
+ break;
|
|
+ }
|
|
+ /* From this point on p is the best so far */
|
|
+ if (locked)
|
|
+ unlock_rq(locked);
|
|
+ chosen = locked = other_rq;
|
|
+ }
|
|
+ best_entries = entries;
|
|
+ best_key = key;
|
|
+ edt = p;
|
|
+ break;
|
|
+ }
|
|
+ if (i && other_rq != chosen)
|
|
+ unlock_rq(other_rq);
|
|
+ }
|
|
+
|
|
+ if (likely(edt != idle))
|
|
+ take_task(rq, cpu, edt);
|
|
+
|
|
+ if (locked)
|
|
+ unlock_rq(locked);
|
|
+
|
|
+ return edt;
|
|
+}
|
|
+#else /* CONFIG_SMP */
|
|
+static inline struct task_struct
|
|
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
|
|
+{
|
|
+ struct task_struct *edt;
|
|
+
|
|
+ if (unlikely(!rq->sl->entries))
|
|
+ return idle;
|
|
+ edt = rq->node.next[0]->value;
|
|
+ take_task(rq, cpu, edt);
|
|
+ return edt;
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+/*
|
|
+ * Print scheduling while atomic bug:
|
|
+ */
|
|
+static noinline void __schedule_bug(struct task_struct *prev)
|
|
+{
|
|
+ /* Save this before calling printk(), since that will clobber it */
|
|
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
|
|
+
|
|
+ if (oops_in_progress)
|
|
+ return;
|
|
+
|
|
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
|
|
+ prev->comm, prev->pid, preempt_count());
|
|
+
|
|
+ debug_show_held_locks(prev);
|
|
+ print_modules();
|
|
+ if (irqs_disabled())
|
|
+ print_irqtrace_events(prev);
|
|
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
|
|
+ && in_atomic_preempt_off()) {
|
|
+ pr_err("Preemption disabled at:");
|
|
+ print_ip_sym(preempt_disable_ip);
|
|
+ pr_cont("\n");
|
|
+ }
|
|
+ dump_stack();
|
|
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Various schedule()-time debugging checks and statistics:
|
|
+ */
|
|
+static inline void schedule_debug(struct task_struct *prev)
|
|
+{
|
|
+#ifdef CONFIG_SCHED_STACK_END_CHECK
|
|
+ if (task_stack_end_corrupted(prev))
|
|
+ panic("corrupted stack end detected inside scheduler\n");
|
|
+#endif
|
|
+
|
|
+ if (unlikely(in_atomic_preempt_off())) {
|
|
+ __schedule_bug(prev);
|
|
+ preempt_count_set(PREEMPT_DISABLED);
|
|
+ }
|
|
+ rcu_sleep_check();
|
|
+
|
|
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
|
|
+
|
|
+ schedstat_inc(this_rq()->sched_count);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The currently running task's information is all stored in rq local data
|
|
+ * which is only modified by the local CPU.
|
|
+ */
|
|
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ if (p == rq->idle || p->policy == SCHED_FIFO)
|
|
+ hrexpiry_clear(rq);
|
|
+ else
|
|
+ hrexpiry_start(rq, US_TO_NS(p->time_slice));
|
|
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
|
|
+ rq->dither = 0;
|
|
+ else
|
|
+ rq->dither = rq_dither(rq);
|
|
+
|
|
+ rq->rq_deadline = p->deadline;
|
|
+ rq->rq_prio = p->prio;
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+ rq->rq_mm = p->mm;
|
|
+ rq->rq_smt_bias = p->smt_bias;
|
|
+#endif
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
|
|
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
|
|
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
|
|
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
|
|
+
|
|
+/* Iterate over smt siblings when we've scheduled a process on cpu and decide
|
|
+ * whether they should continue running or be descheduled. */
|
|
+static void check_smt_siblings(struct rq *this_rq)
|
|
+{
|
|
+ int other_cpu;
|
|
+
|
|
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
|
|
+ struct task_struct *p;
|
|
+ struct rq *rq;
|
|
+
|
|
+ rq = cpu_rq(other_cpu);
|
|
+ if (rq_idle(rq))
|
|
+ continue;
|
|
+ p = rq->curr;
|
|
+ if (!smt_schedule(p, this_rq))
|
|
+ resched_curr(rq);
|
|
+ }
|
|
+}
|
|
+
|
|
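+/* Wake any idle SMT siblings of this_rq so they can reschedule. */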
+static void wake_smt_siblings(struct rq *this_rq)
|
|
+{
|
|
+ int other_cpu;
|
|
+
|
|
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
|
|
+ struct rq *rq;
|
|
+
|
|
+ rq = cpu_rq(other_cpu);
|
|
+ if (rq_idle(rq))
|
|
+ resched_idle(rq);
|
|
+ }
|
|
+}
|
|
+#else
|
|
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
|
|
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * schedule() is the main scheduler function.
|
|
+ *
|
|
+ * The main means of driving the scheduler and thus entering this function are:
|
|
+ *
|
|
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
|
|
+ *
|
|
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
|
|
+ * paths. For example, see arch/x86/entry_64.S.
|
|
+ *
|
|
+ * To drive preemption between tasks, the scheduler sets the flag in timer
|
|
+ * interrupt handler scheduler_tick().
|
|
+ *
|
|
+ * 3. Wakeups don't really cause entry into schedule(). They add a
|
|
+ * task to the run-queue and that's it.
|
|
+ *
|
|
+ * Now, if the new task added to the run-queue preempts the current
|
|
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
|
|
+ * called on the nearest possible occasion:
|
|
+ *
|
|
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
|
|
+ *
|
|
+ * - in syscall or exception context, at the next outermost
|
|
+ * preempt_enable(). (this might be as soon as the wake_up()'s
|
|
+ * spin_unlock()!)
|
|
+ *
|
|
+ * - in IRQ context, return from interrupt-handler to
|
|
+ * preemptible context
|
|
+ *
|
|
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
|
|
+ * then at the next:
|
|
+ *
|
|
+ * - cond_resched() call
|
|
+ * - explicit schedule() call
|
|
+ * - return from syscall or exception to user-space
|
|
+ * - return from interrupt-handler to user-space
|
|
+ *
|
|
+ * WARNING: must be called with preemption disabled!
|
|
+ */
|
|
+static void __sched notrace __schedule(bool preempt)
|
|
+{
|
|
+ struct task_struct *prev, *next, *idle;
|
|
+ unsigned long *switch_count;
|
|
+ bool deactivate = false;
|
|
+ struct rq *rq;
|
|
+ u64 niffies;
|
|
+ int cpu;
|
|
+
|
|
+ cpu = smp_processor_id();
|
|
+ rq = cpu_rq(cpu);
|
|
+ prev = rq->curr;
|
|
+ idle = rq->idle;
|
|
+
|
|
+ schedule_debug(prev);
|
|
+
|
|
+ local_irq_disable();
|
|
+ rcu_note_context_switch();
|
|
+
|
|
+ /*
|
|
+ * Make sure that signal_pending_state()->signal_pending() below
|
|
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
|
|
+ * done by the caller to avoid the race with signal_wake_up().
|
|
+ */
|
|
+ smp_mb__before_spinlock();
|
|
+ rq_lock(rq);
|
|
+#ifdef CONFIG_SMP
|
|
+ if (rq->preempt) {
|
|
+ /*
|
|
+ * Make sure resched_curr hasn't triggered a preemption
|
|
+ * locklessly on a task that has since scheduled away. Spurious
|
|
+ * wakeup of idle is okay though.
|
|
+ */
|
|
+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) {
|
|
+ rq->preempt = NULL;
|
|
+ clear_preempt_need_resched();
|
|
+ rq_unlock_irq(rq);
|
|
+ return;
|
|
+ }
|
|
+ rq->preempt = NULL;
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ switch_count = &prev->nivcsw;
|
|
+ if (!preempt && prev->state) {
|
|
+ if (unlikely(signal_pending_state(prev->state, prev))) {
|
|
+ prev->state = TASK_RUNNING;
|
|
+ } else {
|
|
+ deactivate = true;
|
|
+ prev->on_rq = 0;
|
|
+
|
|
+ /*
|
|
+ * If a worker is going to sleep, notify and
|
|
+ * ask workqueue whether it wants to wake up a
|
|
+ * task to maintain concurrency. If so, wake
|
|
+ * up the task.
|
|
+ */
|
|
+ if (prev->flags & PF_WQ_WORKER) {
|
|
+ struct task_struct *to_wakeup;
|
|
+
|
|
+ to_wakeup = wq_worker_sleeping(prev);
|
|
+ if (to_wakeup)
|
|
+ try_to_wake_up_local(to_wakeup);
|
|
+ }
|
|
+ }
|
|
+ switch_count = &prev->nvcsw;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Store the niffy value here for use by the next task's last_ran
|
|
+ * below to avoid losing niffies due to update_clocks being called
|
|
+ * again after this point.
|
|
+ */
|
|
+ update_clocks(rq);
|
|
+ niffies = rq->niffies;
|
|
+ update_cpu_clock_switch(rq, prev);
|
|
+
|
|
+ clear_tsk_need_resched(prev);
|
|
+ clear_preempt_need_resched();
|
|
+
|
|
+ if (idle != prev) {
|
|
+ check_deadline(prev, rq);
|
|
+ return_task(prev, rq, cpu, deactivate);
|
|
+ }
|
|
+
|
|
+ next = earliest_deadline_task(rq, cpu, idle);
|
|
+ if (likely(next->prio != PRIO_LIMIT))
|
|
+ clear_cpuidle_map(cpu);
|
|
+ else {
|
|
+ set_cpuidle_map(cpu);
|
|
+ update_load_avg(rq, 0);
|
|
+ }
|
|
+
|
|
+ set_rq_task(rq, next);
|
|
+ next->last_ran = niffies;
|
|
+
|
|
+ if (likely(prev != next)) {
|
|
+ /*
|
|
+ * Don't reschedule an idle task or deactivated tasks
|
|
+ */
|
|
+ if (prev != idle && !deactivate)
|
|
+ resched_suitable_idle(prev);
|
|
+ if (next != idle)
|
|
+ check_siblings(rq);
|
|
+ else
|
|
+ wake_siblings(rq);
|
|
+ atomic64_inc(&grq.nr_switches);
|
|
+ rq->curr = next;
|
|
+ ++*switch_count;
|
|
+
|
|
+ trace_sched_switch(preempt, prev, next);
|
|
+ context_switch(rq, prev, next); /* unlocks the rq */
|
|
+ } else {
|
|
+ check_siblings(rq);
|
|
+ rq_unlock(rq);
|
|
+ do_pending_softirq(rq, next);
|
|
+ local_irq_enable();
|
|
+ }
|
|
+}
|
|
+
|
|
+void __noreturn do_task_dead(void)
|
|
+{
|
|
+ /*
|
|
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
|
|
+ * when the following two conditions become true.
|
|
+ * - There is a race condition on mmap_sem (it is acquired by
|
|
+ * exit_mm()), and
|
|
+ * - SMI occurs before setting TASK_RUNNING.
|
|
+ * (or the hypervisor of a virtual machine switches to another guest)
|
|
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
|
|
+ *
|
|
+ * To avoid it, we have to wait for tsk->pi_lock to be released, which
|
|
+ * is held by try_to_wake_up()
|
|
+ */
|
|
+ smp_mb();
|
|
+ raw_spin_unlock_wait(¤t->pi_lock);
|
|
+
|
|
+ /* causes final put_task_struct in finish_task_switch(). */
|
|
+ __set_current_state(TASK_DEAD);
|
|
+ current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
|
|
+ __schedule(false);
|
|
+ BUG();
|
|
+ /* Avoid "noreturn function does return". */
|
|
+ for (;;)
|
|
+ cpu_relax(); /* For when BUG is null */
|
|
+}
|
|
+
|
|
+static inline void sched_submit_work(struct task_struct *tsk)
|
|
+{
|
|
+ if (!tsk->state || tsk_is_pi_blocked(tsk) ||
|
|
+ preempt_count() ||
|
|
+ signal_pending_state(tsk->state, tsk))
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * If we are going to sleep and we have plugged IO queued,
|
|
+ * make sure to submit it to avoid deadlocks.
|
|
+ */
|
|
+ if (blk_needs_flush_plug(tsk))
|
|
+ blk_schedule_flush_plug(tsk);
|
|
+}
|
|
+
|
|
+asmlinkage __visible void __sched schedule(void)
|
|
+{
|
|
+ struct task_struct *tsk = current;
|
|
+
|
|
+ sched_submit_work(tsk);
|
|
+ do {
|
|
+ preempt_disable();
|
|
+ __schedule(false);
|
|
+ sched_preempt_enable_no_resched();
|
|
+ } while (need_resched());
|
|
+}
|
|
+
|
|
+EXPORT_SYMBOL(schedule);
|
|
+
|
|
+#ifdef CONFIG_CONTEXT_TRACKING
|
|
+asmlinkage __visible void __sched schedule_user(void)
|
|
+{
|
|
+ /*
|
|
+ * If we come here after a random call to set_need_resched(),
|
|
+ * or we have been woken up remotely but the IPI has not yet arrived,
|
|
+ * we haven't yet exited the RCU idle mode. Do it here manually until
|
|
+ * we find a better solution.
|
|
+ *
|
|
+ * NB: There are buggy callers of this function. Ideally we
|
|
+ * should warn if prev_state != IN_USER, but that will trigger
|
|
+ * too frequently to make sense yet.
|
|
+ */
|
|
+ enum ctx_state prev_state = exception_enter();
|
|
+ schedule();
|
|
+ exception_exit(prev_state);
|
|
+}
|
|
+#endif
|
|
+
|
|
+/**
|
|
+ * schedule_preempt_disabled - called with preemption disabled
|
|
+ *
|
|
+ * Returns with preemption disabled. Note: preempt_count must be 1
|
|
+ */
|
|
+void __sched schedule_preempt_disabled(void)
|
|
+{
|
|
+ sched_preempt_enable_no_resched();
|
|
+ schedule();
|
|
+ preempt_disable();
|
|
+}
|
|
+
|
|
+static void __sched notrace preempt_schedule_common(void)
|
|
+{
|
|
+ do {
|
|
+ /*
|
|
+ * Because the function tracer can trace preempt_count_sub()
|
|
+ * and it also uses preempt_enable/disable_notrace(), if
|
|
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
|
|
+ * by the function tracer will call this function again and
|
|
+ * cause infinite recursion.
|
|
+ *
|
|
+ * Preemption must be disabled here before the function
|
|
+ * tracer can trace. Break up preempt_disable() into two
|
|
+ * calls. One to disable preemption without fear of being
|
|
+ * traced. The other to still record the preemption latency,
|
|
+ * which can also be traced by the function tracer.
|
|
+ */
|
|
+ preempt_disable_notrace();
|
|
+ preempt_latency_start(1);
|
|
+ __schedule(true);
|
|
+ preempt_latency_stop(1);
|
|
+ preempt_enable_no_resched_notrace();
|
|
+
|
|
+ /*
|
|
+ * Check again in case we missed a preemption opportunity
|
|
+ * between schedule and now.
|
|
+ */
|
|
+ } while (need_resched());
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_PREEMPT
|
|
+/*
|
|
+ * this is the entry point to schedule() from in-kernel preemption
|
|
+ * off of preempt_enable. Kernel preemptions off return from interrupt
|
|
+ * occur there and call schedule directly.
|
|
+ */
|
|
+asmlinkage __visible void __sched notrace preempt_schedule(void)
|
|
+{
|
|
+ /*
|
|
+ * If there is a non-zero preempt_count or interrupts are disabled,
|
|
+ * we do not want to preempt the current task. Just return..
|
|
+ */
|
|
+ if (likely(!preemptible()))
|
|
+ return;
|
|
+
|
|
+ preempt_schedule_common();
|
|
+}
|
|
+NOKPROBE_SYMBOL(preempt_schedule);
|
|
+EXPORT_SYMBOL(preempt_schedule);
|
|
+
|
|
+/**
|
|
+ * preempt_schedule_notrace - preempt_schedule called by tracing
|
|
+ *
|
|
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
|
|
+ * recursion and tracing preempt enabling caused by the tracing
|
|
+ * infrastructure itself. But as tracing can happen in areas coming
|
|
+ * from userspace or just about to enter userspace, a preempt enable
|
|
+ * can occur before user_exit() is called. This will cause the scheduler
|
|
+ * to be called when the system is still in usermode.
|
|
+ *
|
|
+ * To prevent this, the preempt_enable_notrace will use this function
|
|
+ * instead of preempt_schedule() to exit user context if needed before
|
|
+ * calling the scheduler.
|
|
+ */
|
|
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
|
|
+{
|
|
+ enum ctx_state prev_ctx;
|
|
+
|
|
+ if (likely(!preemptible()))
|
|
+ return;
|
|
+
|
|
+ do {
|
|
+ /*
|
|
+ * Because the function tracer can trace preempt_count_sub()
|
|
+ * and it also uses preempt_enable/disable_notrace(), if
|
|
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
|
|
+ * by the function tracer will call this function again and
|
|
+ * cause infinite recursion.
|
|
+ *
|
|
+ * Preemption must be disabled here before the function
|
|
+ * tracer can trace. Break up preempt_disable() into two
|
|
+ * calls. One to disable preemption without fear of being
|
|
+ * traced. The other to still record the preemption latency,
|
|
+ * which can also be traced by the function tracer.
|
|
+ */
|
|
+ preempt_disable_notrace();
|
|
+ preempt_latency_start(1);
|
|
+ /*
|
|
+ * Needs preempt disabled in case user_exit() is traced
|
|
+ * and the tracer calls preempt_enable_notrace() causing
|
|
+ * an infinite recursion.
|
|
+ */
|
|
+ prev_ctx = exception_enter();
|
|
+ __schedule(true);
|
|
+ exception_exit(prev_ctx);
|
|
+
|
|
+ preempt_latency_stop(1);
|
|
+ preempt_enable_no_resched_notrace();
|
|
+ } while (need_resched());
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
|
|
+
|
|
+#endif /* CONFIG_PREEMPT */
|
|
+
|
|
+/*
|
|
+ * this is the entry point to schedule() from kernel preemption
|
|
+ * off of irq context.
|
|
+ * Note that this is called and returns with irqs disabled. This will
|
|
+ * protect us against recursive calling from irq.
|
|
+ */
|
|
+asmlinkage __visible void __sched preempt_schedule_irq(void)
|
|
+{
|
|
+ enum ctx_state prev_state;
|
|
+
|
|
+ /* Catch callers which need to be fixed */
|
|
+ BUG_ON(preempt_count() || !irqs_disabled());
|
|
+
|
|
+ prev_state = exception_enter();
|
|
+
|
|
+ do {
|
|
+ preempt_disable();
|
|
+ local_irq_enable();
|
|
+ __schedule(true);
|
|
+ local_irq_disable();
|
|
+ sched_preempt_enable_no_resched();
|
|
+ } while (need_resched());
|
|
+
|
|
+ exception_exit(prev_state);
|
|
+}
|
|
+
|
|
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
|
|
+ void *key)
|
|
+{
|
|
+ return try_to_wake_up(curr->private, mode, wake_flags);
|
|
+}
|
|
+EXPORT_SYMBOL(default_wake_function);
|
|
+
|
|
+#ifdef CONFIG_RT_MUTEXES
|
|
+
|
|
+/*
|
|
+ * rt_mutex_setprio - set the current priority of a task
|
|
+ * @p: task
|
|
+ * @prio: prio value (kernel-internal form)
|
|
+ *
|
|
+ * This function changes the 'effective' priority of a task. It does
|
|
+ * not touch ->normal_prio like __setscheduler().
|
|
+ *
|
|
+ * Used by the rt_mutex code to implement priority inheritance
|
|
+ * logic. Call site only calls if the priority of the task changed.
|
|
+ */
|
|
+void rt_mutex_setprio(struct task_struct *p, int prio)
|
|
+{
|
|
+ struct rq *rq;
|
|
+ int oldprio;
|
|
+
|
|
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
|
|
+
|
|
+ rq = __task_rq_lock(p);
|
|
+
|
|
+ /*
|
|
+ * Idle task boosting is a no-no in general. There is one
|
|
+ * exception, when PREEMPT_RT and NOHZ is active:
|
|
+ *
|
|
+ * The idle task calls get_next_timer_interrupt() and holds
|
|
+ * the timer wheel base->lock on the CPU and another CPU wants
|
|
+ * to access the timer (probably to cancel it). We can safely
|
|
+ * ignore the boosting request, as the idle CPU runs this code
|
|
+ * with interrupts disabled and will complete the lock
|
|
+ * protected section without being interrupted. So there is no
|
|
+ * real need to boost.
|
|
+ */
|
|
+ if (unlikely(p == rq->idle)) {
|
|
+ WARN_ON(p != rq->curr);
|
|
+ WARN_ON(p->pi_blocked_on);
|
|
+ goto out_unlock;
|
|
+ }
|
|
+
|
|
+ trace_sched_pi_setprio(p, prio);
|
|
+ oldprio = p->prio;
|
|
+ p->prio = prio;
|
|
+ if (task_running(rq, p)) {
|
|
+ if (prio > oldprio)
|
|
+ resched_task(p);
|
|
+ } else if (task_queued(p)) {
|
|
+ dequeue_task(rq, p, DEQUEUE_SAVE);
|
|
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
|
|
+ if (prio < oldprio)
|
|
+ try_preempt(p, rq);
|
|
+ }
|
|
+out_unlock:
|
|
+ __task_rq_unlock(rq);
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Adjust the deadline for when the priority is to change, before it's
|
|
+ * changed.
|
|
+ */
|
|
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
|
|
+{
|
|
+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
|
|
+}
|
|
+
|
|
+void set_user_nice(struct task_struct *p, long nice)
|
|
+{
|
|
+ int new_static, old_static;
|
|
+ unsigned long flags;
|
|
+ struct rq *rq;
|
|
+
|
|
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
|
|
+ return;
|
|
+ new_static = NICE_TO_PRIO(nice);
|
|
+ /*
|
|
+ * We have to be careful, if called from sys_setpriority(),
|
|
+ * the task might be in the middle of scheduling on another CPU.
|
|
+ */
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+ /*
|
|
+ * The RT priorities are set via sched_setscheduler(), but we still
|
|
+ * allow the 'normal' nice value to be set - but as expected
|
|
+ * it won't have any effect on scheduling while the task is
|
|
+ * not SCHED_NORMAL/SCHED_BATCH:
|
|
+ */
|
|
+ if (has_rt_policy(p)) {
|
|
+ p->static_prio = new_static;
|
|
+ goto out_unlock;
|
|
+ }
|
|
+
|
|
+ adjust_deadline(p, new_static);
|
|
+ old_static = p->static_prio;
|
|
+ p->static_prio = new_static;
|
|
+ p->prio = effective_prio(p);
|
|
+
|
|
+ if (task_queued(p)) {
|
|
+ dequeue_task(rq, p, DEQUEUE_SAVE);
|
|
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
|
|
+ if (new_static < old_static)
|
|
+ try_preempt(p, rq);
|
|
+ } else if (task_running(rq, p)) {
|
|
+ set_rq_task(rq, p);
|
|
+ if (old_static < new_static)
|
|
+ resched_task(p);
|
|
+ }
|
|
+out_unlock:
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+}
|
|
+EXPORT_SYMBOL(set_user_nice);
|
|
+
|
|
+/*
|
|
+ * can_nice - check if a task can reduce its nice value
|
|
+ * @p: task
|
|
+ * @nice: nice value
|
|
+ */
|
|
+int can_nice(const struct task_struct *p, const int nice)
|
|
+{
|
|
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
|
|
+ int nice_rlim = nice_to_rlimit(nice);
|
|
+
|
|
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
|
|
+ capable(CAP_SYS_NICE));
|
|
+}
|
|
+
|
|
+#ifdef __ARCH_WANT_SYS_NICE
|
|
+
|
|
+/*
|
|
+ * sys_nice - change the priority of the current process.
|
|
+ * @increment: priority increment
|
|
+ *
|
|
+ * sys_setpriority is a more generic, but much slower function that
|
|
+ * does similar things.
|
|
+ */
|
|
+SYSCALL_DEFINE1(nice, int, increment)
|
|
+{
|
|
+ long nice, retval;
|
|
+
|
|
+ /*
|
|
+ * Setpriority might change our priority at the same moment.
|
|
+ * We don't have to worry. Conceptually one call occurs first
|
|
+ * and we have a single winner.
|
|
+ */
|
|
+
|
|
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
|
|
+ nice = task_nice(current) + increment;
|
|
+
|
|
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
|
|
+ if (increment < 0 && !can_nice(current, nice))
|
|
+ return -EPERM;
|
|
+
|
|
+ retval = security_task_setnice(current, nice);
|
|
+ if (retval)
|
|
+ return retval;
|
|
+
|
|
+ set_user_nice(current, nice);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+/**
|
|
+ * task_prio - return the priority value of a given task.
|
|
+ * @p: the task in question.
|
|
+ *
|
|
+ * Return: The priority value as seen by users in /proc.
|
|
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
|
|
+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
|
|
+ */
|
|
+int task_prio(const struct task_struct *p)
|
|
+{
|
|
+ int delta, prio = p->prio - MAX_RT_PRIO;
|
|
+
|
|
+ /* rt tasks and iso tasks */
|
|
+ if (prio <= 0)
|
|
+ goto out;
|
|
+
|
|
+ /* Convert to ms to avoid overflows */
|
|
+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies);
|
|
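+ /* Scale so that the longest possible deadline difference maps to 40 */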
+ delta = delta * 40 / ms_longest_deadline_diff();
|
|
+ if (delta > 0 && delta <= 80)
|
|
+ prio += delta;
|
|
+ if (idleprio_task(p))
|
|
+ prio += 40;
|
|
+out:
|
|
+ return prio;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * idle_cpu - is a given cpu idle currently?
|
|
+ * @cpu: the processor in question.
|
|
+ *
|
|
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
|
|
+ */
|
|
+int idle_cpu(int cpu)
|
|
+{
|
|
+ return cpu_curr(cpu) == cpu_rq(cpu)->idle;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * idle_task - return the idle task for a given cpu.
|
|
+ * @cpu: the processor in question.
|
|
+ *
|
|
+ * Return: The idle task for the cpu @cpu.
|
|
+ */
|
|
+struct task_struct *idle_task(int cpu)
|
|
+{
|
|
+ return cpu_rq(cpu)->idle;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * find_process_by_pid - find a process with a matching PID value.
|
|
+ * @pid: the pid in question.
|
|
+ *
|
|
+ * The task of @pid, if found. %NULL otherwise.
|
|
+ */
|
|
+static inline struct task_struct *find_process_by_pid(pid_t pid)
|
|
+{
|
|
+ return pid ? find_task_by_vpid(pid) : current;
|
|
+}
|
|
+
|
|
+/* Actually do priority change: must hold rq lock. */
|
|
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
|
|
+ int prio, bool keep_boost)
|
|
+{
|
|
+ int oldrtprio, oldprio;
|
|
+
|
|
+ p->policy = policy;
|
|
+ oldrtprio = p->rt_priority;
|
|
+ p->rt_priority = prio;
|
|
+ p->normal_prio = normal_prio(p);
|
|
+ oldprio = p->prio;
|
|
+ /*
|
|
+ * Keep a potential priority boosting if called from
|
|
+ * sched_setscheduler().
|
|
+ */
|
|
+ if (keep_boost) {
|
|
+ /*
|
|
+ * Take priority boosted tasks into account. If the new
|
|
+ * effective priority is unchanged, we just store the new
|
|
+ * normal parameters and do not touch the scheduler class and
|
|
+ * the runqueue. This will be done when the task deboost
|
|
+ * itself.
|
|
+ */
|
|
+ p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
|
|
+ } else
|
|
+ p->prio = p->normal_prio;
|
|
+
|
|
+ if (task_running(rq, p)) {
|
|
+ set_rq_task(rq, p);
|
|
+ resched_task(p);
|
|
+ } else if (task_queued(p)) {
|
|
+ dequeue_task(rq, p, DEQUEUE_SAVE);
|
|
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
|
|
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
|
|
+ try_preempt(p, rq);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * check the target process has a UID that matches the current process's
|
|
+ */
|
|
+static bool check_same_owner(struct task_struct *p)
|
|
+{
|
|
+ const struct cred *cred = current_cred(), *pcred;
|
|
+ bool match;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ pcred = __task_cred(p);
|
|
+ match = (uid_eq(cred->euid, pcred->euid) ||
|
|
+ uid_eq(cred->euid, pcred->uid));
|
|
+ rcu_read_unlock();
|
|
+ return match;
|
|
+}
|
|
+
|
|
+static int
|
|
+__sched_setscheduler(struct task_struct *p, int policy,
|
|
+ const struct sched_param *param, bool user, bool pi)
|
|
+{
|
|
+ struct sched_param zero_param = { .sched_priority = 0 };
|
|
+ unsigned long flags, rlim_rtprio = 0;
|
|
+ int retval, oldpolicy = -1;
|
|
+ int reset_on_fork;
|
|
+ struct rq *rq;
|
|
+
|
|
+ /* may grab non-irq protected spin_locks */
|
|
+ BUG_ON(in_interrupt());
|
|
+
|
|
+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
|
|
+ unsigned long lflags;
|
|
+
|
|
+ if (!lock_task_sighand(p, &lflags))
|
|
+ return -ESRCH;
|
|
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
|
|
+ unlock_task_sighand(p, &lflags);
|
|
+ if (rlim_rtprio)
|
|
+ goto recheck;
|
|
+ /*
|
|
+ * If the caller requested an RT policy without having the
|
|
+ * necessary rights, we downgrade the policy to SCHED_ISO.
|
|
+ * We also set the parameter to zero to pass the checks.
|
|
+ */
|
|
+ policy = SCHED_ISO;
|
|
+ param = &zero_param;
|
|
+ }
|
|
+recheck:
|
|
+ /* double check policy once rq lock held */
|
|
+ if (policy < 0) {
|
|
+ reset_on_fork = p->sched_reset_on_fork;
|
|
+ policy = oldpolicy = p->policy;
|
|
+ } else {
|
|
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
|
|
+ policy &= ~SCHED_RESET_ON_FORK;
|
|
+
|
|
+ if (!SCHED_RANGE(policy))
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
|
|
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
|
|
+ * SCHED_BATCH is 0.
|
|
+ */
|
|
+ if (param->sched_priority < 0 ||
|
|
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
|
|
+ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
|
|
+ return -EINVAL;
|
|
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
|
|
+ return -EINVAL;
|
|
+
|
|
+ /*
|
|
+ * Allow unprivileged RT tasks to decrease priority:
|
|
+ */
|
|
+ if (user && !capable(CAP_SYS_NICE)) {
|
|
+ if (is_rt_policy(policy)) {
|
|
+ unsigned long rlim_rtprio =
|
|
+ task_rlimit(p, RLIMIT_RTPRIO);
|
|
+
|
|
+ /* can't set/change the rt policy */
|
|
+ if (policy != p->policy && !rlim_rtprio)
|
|
+ return -EPERM;
|
|
+
|
|
+ /* can't increase priority */
|
|
+ if (param->sched_priority > p->rt_priority &&
|
|
+ param->sched_priority > rlim_rtprio)
|
|
+ return -EPERM;
|
|
+ } else {
|
|
+ switch (p->policy) {
|
|
+ /*
|
|
+ * Can only downgrade policies but not back to
|
|
+ * SCHED_NORMAL
|
|
+ */
|
|
+ case SCHED_ISO:
|
|
+ if (policy == SCHED_ISO)
|
|
+ goto out;
|
|
+ if (policy != SCHED_NORMAL)
|
|
+ return -EPERM;
|
|
+ break;
|
|
+ case SCHED_BATCH:
|
|
+ if (policy == SCHED_BATCH)
|
|
+ goto out;
|
|
+ if (policy != SCHED_IDLEPRIO)
|
|
+ return -EPERM;
|
|
+ break;
|
|
+ case SCHED_IDLEPRIO:
|
|
+ if (policy == SCHED_IDLEPRIO)
|
|
+ goto out;
|
|
+ return -EPERM;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* can't change other user's priorities */
|
|
+ if (!check_same_owner(p))
|
|
+ return -EPERM;
|
|
+
|
|
+ /* Normal users shall not reset the sched_reset_on_fork flag */
|
|
+ if (p->sched_reset_on_fork && !reset_on_fork)
|
|
+ return -EPERM;
|
|
+ }
|
|
+
|
|
+ if (user) {
|
|
+ retval = security_task_setscheduler(p);
|
|
+ if (retval)
|
|
+ return retval;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * make sure no PI-waiters arrive (or leave) while we are
|
|
+ * changing the priority of the task:
|
|
+ *
|
|
+ * To be able to change p->policy safely, the runqueue lock must be
|
|
+ * held.
|
|
+ */
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+
|
|
+ /*
|
|
+ * Changing the policy of the stop thread is a very bad idea.
|
|
+ */
|
|
+ if (p == rq->stop) {
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If not changing anything there's no need to proceed further:
|
|
+ */
|
|
+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
|
|
+ param->sched_priority == p->rt_priority))) {
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ /* recheck policy now with rq lock held */
|
|
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
|
|
+ policy = oldpolicy = -1;
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+ goto recheck;
|
|
+ }
|
|
+ p->sched_reset_on_fork = reset_on_fork;
|
|
+
|
|
+ __setscheduler(p, rq, policy, param->sched_priority, pi);
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+
|
|
+ if (pi)
|
|
+ rt_mutex_adjust_pi(p);
|
|
+out:
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
|
|
+ * @p: the task in question.
|
|
+ * @policy: new policy.
|
|
+ * @param: structure containing the new RT priority.
|
|
+ *
|
|
+ * Return: 0 on success. An error code otherwise.
|
|
+ *
|
|
+ * NOTE that the task may be already dead.
|
|
+ */
|
|
+int sched_setscheduler(struct task_struct *p, int policy,
|
|
+ const struct sched_param *param)
|
|
+{
|
|
+ return __sched_setscheduler(p, policy, param, true, true);
|
|
+}
|
|
+
|
|
+EXPORT_SYMBOL_GPL(sched_setscheduler);
|
|
+
|
|
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
|
|
+{
|
|
+ const struct sched_param param = { .sched_priority = attr->sched_priority };
|
|
+ int policy = attr->sched_policy;
|
|
+
|
|
+ return __sched_setscheduler(p, policy, ¶m, true, true);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(sched_setattr);
|
|
+
|
|
+/**
|
|
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
|
|
+ * @p: the task in question.
|
|
+ * @policy: new policy.
|
|
+ * @param: structure containing the new RT priority.
|
|
+ *
|
|
+ * Just like sched_setscheduler, only don't bother checking if the
|
|
+ * current context has permission. For example, this is needed in
|
|
+ * stop_machine(): we create temporary high priority worker threads,
|
|
+ * but our caller might not have that capability.
|
|
+ *
|
|
+ * Return: 0 on success. An error code otherwise.
|
|
+ */
|
|
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
|
|
+ const struct sched_param *param)
|
|
+{
|
|
+ return __sched_setscheduler(p, policy, param, false, true);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
|
|
+
|
|
+static int
|
|
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
|
|
+{
|
|
+ struct sched_param lparam;
|
|
+ struct task_struct *p;
|
|
+ int retval;
|
|
+
|
|
+ if (!param || pid < 0)
|
|
+ return -EINVAL;
|
|
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
|
|
+ return -EFAULT;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ retval = -ESRCH;
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (p != NULL)
|
|
+ retval = sched_setscheduler(p, policy, &lparam);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mimics kernel/events/core.c perf_copy_attr().
|
|
+ */
|
|
+static int sched_copy_attr(struct sched_attr __user *uattr,
|
|
+ struct sched_attr *attr)
|
|
+{
|
|
+ u32 size;
|
|
+ int ret;
|
|
+
|
|
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
|
|
+ return -EFAULT;
|
|
+
|
|
+ /*
|
|
+ * zero the full structure, so that a short copy will be nice.
|
|
+ */
|
|
+ memset(attr, 0, sizeof(*attr));
|
|
+
|
|
+ ret = get_user(size, &uattr->size);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (size > PAGE_SIZE) /* silly large */
|
|
+ goto err_size;
|
|
+
|
|
+ if (!size) /* abi compat */
|
|
+ size = SCHED_ATTR_SIZE_VER0;
|
|
+
|
|
+ if (size < SCHED_ATTR_SIZE_VER0)
|
|
+ goto err_size;
|
|
+
|
|
+ /*
|
|
+ * If we're handed a bigger struct than we know of,
|
|
+ * ensure all the unknown bits are 0 - i.e. new
|
|
+ * user-space does not rely on any kernel feature
|
|
+ * extensions we don't know about yet.
|
|
+ */
|
|
+ if (size > sizeof(*attr)) {
|
|
+ unsigned char __user *addr;
|
|
+ unsigned char __user *end;
|
|
+ unsigned char val;
|
|
+
|
|
+ addr = (void __user *)uattr + sizeof(*attr);
|
|
+ end = (void __user *)uattr + size;
|
|
+
|
|
+ for (; addr < end; addr++) {
|
|
+ ret = get_user(val, addr);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ if (val)
|
|
+ goto err_size;
|
|
+ }
|
|
+ size = sizeof(*attr);
|
|
+ }
|
|
+
|
|
+ ret = copy_from_user(attr, uattr, size);
|
|
+ if (ret)
|
|
+ return -EFAULT;
|
|
+
|
|
+ /*
|
|
+ * XXX: do we want to be lenient like existing syscalls; or do we want
|
|
+ * to be strict and return an error on out-of-bounds values?
|
|
+ */
|
|
+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
|
|
+
|
|
+ /* sched/core.c uses zero here but we already know ret is zero */
|
|
+ return 0;
|
|
+
|
|
+err_size:
|
|
+ put_user(sizeof(*attr), &uattr->size);
|
|
+ return -E2BIG;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
|
|
+ * @pid: the pid in question.
|
|
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
|
|
+ */
|
|
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
|
|
+ struct sched_param __user *param)
|
|
+{
|
|
+ /* negative values for policy are not valid */
|
|
+ if (policy < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ return do_sched_setscheduler(pid, policy, param);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * sched_setparam() passes in -1 for its policy, to let the functions
|
|
+ * it calls know not to change it.
|
|
+ */
|
|
+#define SETPARAM_POLICY -1
|
|
+
|
|
+/**
|
|
+ * sys_sched_setparam - set/change the RT priority of a thread
|
|
+ * @pid: the pid in question.
|
|
+ * @param: structure containing the new RT priority.
|
|
+ *
|
|
+ * Return: 0 on success. An error code otherwise.
|
|
+ */
|
|
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
|
|
+{
|
|
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_setattr - same as above, but with extended sched_attr
|
|
+ * @pid: the pid in question.
|
|
+ * @uattr: structure containing the extended parameters.
|
|
+ */
|
|
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
|
|
+ unsigned int, flags)
|
|
+{
|
|
+ struct sched_attr attr;
|
|
+ struct task_struct *p;
|
|
+ int retval;
|
|
+
|
|
+ if (!uattr || pid < 0 || flags)
|
|
+ return -EINVAL;
|
|
+
|
|
+ retval = sched_copy_attr(uattr, &attr);
|
|
+ if (retval)
|
|
+ return retval;
|
|
+
|
|
+ if ((int)attr.sched_policy < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ retval = -ESRCH;
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (p != NULL)
|
|
+ retval = sched_setattr(p, &attr);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
|
|
+ * @pid: the pid in question.
|
|
+ *
|
|
+ * Return: On success, the policy of the thread. Otherwise, a negative error
|
|
+ * code.
|
|
+ */
|
|
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+ int retval = -EINVAL;
|
|
+
|
|
+ if (pid < 0)
|
|
+ goto out_nounlock;
|
|
+
|
|
+ retval = -ESRCH;
|
|
+ rcu_read_lock();
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (p) {
|
|
+ retval = security_task_getscheduler(p);
|
|
+ if (!retval)
|
|
+ retval = p->policy;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+out_nounlock:
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_getparam - get the RT priority of a thread
|
|
+ * @pid: the pid in question.
|
|
+ * @param: structure containing the RT priority.
|
|
+ *
|
|
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
|
|
+ * code.
|
|
+ */
|
|
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
|
|
+{
|
|
+ struct sched_param lp = { .sched_priority = 0 };
|
|
+ struct task_struct *p;
|
|
+ int retval = -EINVAL;
|
|
+
|
|
+ if (!param || pid < 0)
|
|
+ goto out_nounlock;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ p = find_process_by_pid(pid);
|
|
+ retval = -ESRCH;
|
|
+ if (!p)
|
|
+ goto out_unlock;
|
|
+
|
|
+ retval = security_task_getscheduler(p);
|
|
+ if (retval)
|
|
+ goto out_unlock;
|
|
+
|
|
+ if (has_rt_policy(p))
|
|
+ lp.sched_priority = p->rt_priority;
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ /*
|
|
+ * This one might sleep, we cannot do it with a spinlock held ...
|
|
+ */
|
|
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
|
|
+
|
|
+out_nounlock:
|
|
+ return retval;
|
|
+
|
|
+out_unlock:
|
|
+ rcu_read_unlock();
|
|
+ return retval;
|
|
+}
|
|
+
|
|
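+/*
+ * Copy a sched_attr back to userspace, trimming it to the size the caller
+ * supplied and failing with -EFBIG if that would drop non-zero fields.
+ */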
+static int sched_read_attr(struct sched_attr __user *uattr,
|
|
+ struct sched_attr *attr,
|
|
+ unsigned int usize)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
|
|
+ return -EFAULT;
|
|
+
|
|
+ /*
|
|
+ * If we're handed a smaller struct than we know of,
|
|
+ * ensure all the unknown bits are 0 - i.e. old
|
|
+ * user-space does not get incomplete information.
|
|
+ */
|
|
+ if (usize < sizeof(*attr)) {
|
|
+ unsigned char *addr;
|
|
+ unsigned char *end;
|
|
+
|
|
+ addr = (void *)attr + usize;
|
|
+ end = (void *)attr + sizeof(*attr);
|
|
+
|
|
+ for (; addr < end; addr++) {
|
|
+ if (*addr)
|
|
+ return -EFBIG;
|
|
+ }
|
|
+
|
|
+ attr->size = usize;
|
|
+ }
|
|
+
|
|
+ ret = copy_to_user(uattr, attr, attr->size);
|
|
+ if (ret)
|
|
+ return -EFAULT;
|
|
+
|
|
+ /* sched/core.c uses zero here but we already know ret is zero */
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
|
|
+ * @pid: the pid in question.
|
|
+ * @uattr: structure containing the extended parameters.
|
|
+ * @size: sizeof(attr) for fwd/bwd comp.
|
|
+ * @flags: for future extension.
|
|
+ */
|
|
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
|
+ unsigned int, size, unsigned int, flags)
|
|
+{
|
|
+ struct sched_attr attr = {
|
|
+ .size = sizeof(struct sched_attr),
|
|
+ };
|
|
+ struct task_struct *p;
|
|
+ int retval;
|
|
+
|
|
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
|
|
+ size < SCHED_ATTR_SIZE_VER0 || flags)
|
|
+ return -EINVAL;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ p = find_process_by_pid(pid);
|
|
+ retval = -ESRCH;
|
|
+ if (!p)
|
|
+ goto out_unlock;
|
|
+
|
|
+ retval = security_task_getscheduler(p);
|
|
+ if (retval)
|
|
+ goto out_unlock;
|
|
+
|
|
+ attr.sched_policy = p->policy;
|
|
+ if (rt_task(p))
|
|
+ attr.sched_priority = p->rt_priority;
|
|
+ else
|
|
+ attr.sched_nice = task_nice(p);
|
|
+
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ retval = sched_read_attr(uattr, &attr, size);
|
|
+ return retval;
|
|
+
|
|
+out_unlock:
|
|
+ rcu_read_unlock();
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
|
|
+{
|
|
+ cpumask_var_t cpus_allowed, new_mask;
|
|
+ struct task_struct *p;
|
|
+ int retval;
|
|
+
|
|
+ rcu_read_lock();
|
|
+
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (!p) {
|
|
+ rcu_read_unlock();
|
|
+ return -ESRCH;
|
|
+ }
|
|
+
|
|
+ /* Prevent p going away */
|
|
+ get_task_struct(p);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (p->flags & PF_NO_SETAFFINITY) {
|
|
+ retval = -EINVAL;
|
|
+ goto out_put_task;
|
|
+ }
|
|
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
|
|
+ retval = -ENOMEM;
|
|
+ goto out_put_task;
|
|
+ }
|
|
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
|
|
+ retval = -ENOMEM;
|
|
+ goto out_free_cpus_allowed;
|
|
+ }
|
|
+ retval = -EPERM;
|
|
+ if (!check_same_owner(p)) {
|
|
+ rcu_read_lock();
|
|
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
|
|
+ rcu_read_unlock();
|
|
+ goto out_unlock;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
+
|
|
+ retval = security_task_setscheduler(p);
|
|
+ if (retval)
|
|
+ goto out_unlock;
|
|
+
|
|
+ cpuset_cpus_allowed(p, cpus_allowed);
|
|
+ cpumask_and(new_mask, in_mask, cpus_allowed);
|
|
+again:
|
|
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
|
|
+
|
|
+ if (!retval) {
|
|
+ cpuset_cpus_allowed(p, cpus_allowed);
|
|
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
|
|
+ /*
|
|
+ * We must have raced with a concurrent cpuset
|
|
+ * update. Just reset the cpus_allowed to the
|
|
+ * cpuset's cpus_allowed
|
|
+ */
|
|
+ cpumask_copy(new_mask, cpus_allowed);
|
|
+ goto again;
|
|
+ }
|
|
+ }
|
|
+out_unlock:
|
|
+ free_cpumask_var(new_mask);
|
|
+out_free_cpus_allowed:
|
|
+ free_cpumask_var(cpus_allowed);
|
|
+out_put_task:
|
|
+ put_task_struct(p);
|
|
+ return retval;
|
|
+}
|
|
+
|
|
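+/*
+ * Copy a user supplied affinity bitmask, zero padding short masks and
+ * truncating long ones to sizeof(cpumask_t).
+ */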
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
|
|
+ cpumask_t *new_mask)
|
|
+{
|
|
+ if (len < sizeof(cpumask_t)) {
|
|
+ memset(new_mask, 0, sizeof(cpumask_t));
|
|
+ } else if (len > sizeof(cpumask_t)) {
|
|
+ len = sizeof(cpumask_t);
|
|
+ }
|
|
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * sys_sched_setaffinity - set the cpu affinity of a process
|
|
+ * @pid: pid of the process
|
|
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
+ * @user_mask_ptr: user-space pointer to the new cpu mask
|
|
+ *
|
|
+ * Return: 0 on success. An error code otherwise.
|
|
+ */
|
|
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
|
|
+ unsigned long __user *, user_mask_ptr)
|
|
+{
|
|
+ cpumask_var_t new_mask;
|
|
+ int retval;
|
|
+
|
|
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
|
|
+ if (retval == 0)
|
|
+ retval = sched_setaffinity(pid, new_mask);
|
|
+ free_cpumask_var(new_mask);
|
|
+ return retval;
|
|
+}
|
|
+
|
|
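+/*
+ * Return in @mask the intersection of the task's allowed CPUs and the
+ * currently active CPUs.
+ */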
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+ unsigned long flags;
|
|
+ int retval;
|
|
+
|
|
+ get_online_cpus();
|
|
+ rcu_read_lock();
|
|
+
|
|
+ retval = -ESRCH;
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (!p)
|
|
+ goto out_unlock;
|
|
+
|
|
+ retval = security_task_getscheduler(p);
|
|
+ if (retval)
|
|
+ goto out_unlock;
|
|
+
|
|
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
|
|
+ cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
|
|
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
+
|
|
+out_unlock:
|
|
+ rcu_read_unlock();
|
|
+ put_online_cpus();
|
|
+
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_getaffinity - get the cpu affinity of a process
|
|
+ * @pid: pid of the process
|
|
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
|
|
+ *
|
|
+ * Return: 0 on success. An error code otherwise.
|
|
+ */
|
|
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
|
|
+ unsigned long __user *, user_mask_ptr)
|
|
+{
|
|
+ int ret;
|
|
+ cpumask_var_t mask;
|
|
+
|
|
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
|
|
+ return -EINVAL;
|
|
+ if (len & (sizeof(unsigned long)-1))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ret = sched_getaffinity(pid, mask);
|
|
+ if (ret == 0) {
|
|
+ size_t retlen = min_t(size_t, len, cpumask_size());
|
|
+
|
|
+ if (copy_to_user(user_mask_ptr, mask, retlen))
|
|
+ ret = -EFAULT;
|
|
+ else
|
|
+ ret = retlen;
|
|
+ }
|
|
+ free_cpumask_var(mask);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_yield - yield the current processor to other threads.
|
|
+ *
|
|
+ * This function yields the current CPU to other tasks. It does this by
|
|
+ * scheduling away the current task. If it still has the earliest deadline
|
|
+ * it will be scheduled again as the next task.
|
|
+ *
|
|
+ * Return: 0.
|
|
+ */
|
|
+SYSCALL_DEFINE0(sched_yield)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+ struct rq *rq;
|
|
+
|
|
+ if (!sched_yield_type)
|
|
+ goto out;
|
|
+ p = current;
|
|
+ rq = this_rq_lock();
|
|
+ if (sched_yield_type > 1)
|
|
+ time_slice_expired(p, rq);
|
|
+ schedstat_inc(rq->yld_count);
|
|
+
|
|
+ /*
|
|
+ * Since we are going to call schedule() anyway, there's
|
|
+ * no need to preempt or enable interrupts:
|
|
+ */
|
|
+ __release(rq->lock);
|
|
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
|
+ do_raw_spin_unlock(&rq->lock);
|
|
+ sched_preempt_enable_no_resched();
|
|
+
|
|
+ schedule();
|
|
+out:
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifndef CONFIG_PREEMPT
|
|
+int __sched _cond_resched(void)
|
|
+{
|
|
+ if (should_resched(0)) {
|
|
+ preempt_schedule_common();
|
|
+ return 1;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+EXPORT_SYMBOL(_cond_resched);
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
|
|
+ * call schedule, and on return reacquire the lock.
|
|
+ *
|
|
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
|
|
+ * operations here to prevent schedule() from being called twice (once via
|
|
+ * spin_unlock(), once by hand).
|
|
+ */
|
|
+int __cond_resched_lock(spinlock_t *lock)
|
|
+{
|
|
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
|
|
+ int ret = 0;
|
|
+
|
|
+ lockdep_assert_held(lock);
|
|
+
|
|
+ if (spin_needbreak(lock) || resched) {
|
|
+ spin_unlock(lock);
|
|
+ if (resched)
|
|
+ preempt_schedule_common();
|
|
+ else
|
|
+ cpu_relax();
|
|
+ ret = 1;
|
|
+ spin_lock(lock);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL(__cond_resched_lock);
|
|
+
|
|
+int __sched __cond_resched_softirq(void)
|
|
+{
|
|
+ BUG_ON(!in_softirq());
|
|
+
|
|
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
|
|
+ local_bh_enable();
|
|
+ preempt_schedule_common();
|
|
+ local_bh_disable();
|
|
+ return 1;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+EXPORT_SYMBOL(__cond_resched_softirq);
|
|
+
|
|
+/**
|
|
+ * yield - yield the current processor to other threads.
|
|
+ *
|
|
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
|
|
+ *
|
|
+ * The scheduler is at all times free to pick the calling task as the most
|
|
+ * eligible task to run, if removing the yield() call from your code breaks
|
|
+ * it, it's already broken.
|
|
+ *
|
|
+ * Typical broken usage is:
|
|
+ *
|
|
+ * while (!event)
|
|
+ * yield();
|
|
+ *
|
|
+ * where one assumes that yield() will let 'the other' process run that will
|
|
+ * make event true. If the current task is a SCHED_FIFO task that will never
|
|
+ * happen. Never use yield() as a progress guarantee!!
|
|
+ *
|
|
+ * If you want to use yield() to wait for something, use wait_event().
|
|
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
|
|
+ * If you still want to use yield(), do not!
|
|
+ */
|
|
+void __sched yield(void)
|
|
+{
|
|
+ set_current_state(TASK_RUNNING);
|
|
+ sys_sched_yield();
|
|
+}
|
|
+EXPORT_SYMBOL(yield);
|
|
+
|
|
+/**
|
|
+ * yield_to - yield the current processor to another thread in
|
|
+ * your thread group, or accelerate that thread toward the
|
|
+ * processor it's on.
|
|
+ * @p: target task
|
|
+ * @preempt: whether task preemption is allowed or not
|
|
+ *
|
|
+ * It's the caller's job to ensure that the target task struct
|
|
+ * can't go away on us before we can do any checks.
|
|
+ *
|
|
+ * Return:
|
|
+ * true (>0) if we indeed boosted the target task.
|
|
+ * false (0) if we failed to boost the target.
|
|
+ * -ESRCH if there's no task to yield to.
|
|
+ */
|
|
+int __sched yield_to(struct task_struct *p, bool preempt)
|
|
+{
|
|
+ struct task_struct *rq_p;
|
|
+ struct rq *rq, *p_rq;
|
|
+ unsigned long flags;
|
|
+ int yielded = 0;
|
|
+
|
|
+ local_irq_save(flags);
|
|
+ rq = this_rq();
|
|
+
|
|
+again:
|
|
+ p_rq = task_rq(p);
|
|
+ /*
|
|
+ * If the target task is already running, or is not runnable at all,
+ * there is no task we can usefully yield to.
|
|
+ */
|
|
+ if (task_running(p_rq, p) || p->state) {
|
|
+ yielded = -ESRCH;
|
|
+ goto out_irq;
|
|
+ }
|
|
+
|
|
+ double_rq_lock(rq, p_rq);
|
|
+ if (unlikely(task_rq(p) != p_rq)) {
|
|
+ double_rq_unlock(rq, p_rq);
|
|
+ goto again;
|
|
+ }
|
|
+
|
|
+ yielded = 1;
|
|
+ schedstat_inc(rq->yld_count);
|
|
+ rq_p = rq->curr;
|
|
+ if (p->deadline > rq_p->deadline)
|
|
+ p->deadline = rq_p->deadline;
|
|
+ p->time_slice += rq_p->time_slice;
|
|
+ if (p->time_slice > timeslice())
|
|
+ p->time_slice = timeslice();
|
|
+ time_slice_expired(rq_p, rq);
|
|
+ if (preempt && rq != p_rq)
|
|
+ resched_task(p_rq->curr);
|
|
+ double_rq_unlock(rq, p_rq);
|
|
+out_irq:
|
|
+ local_irq_restore(flags);
|
|
+
|
|
+ if (yielded > 0)
|
|
+ schedule();
|
|
+ return yielded;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(yield_to);
|
|
+
|
|
+/*
|
|
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
|
|
+ * that process accounting knows that this is a task in IO wait state.
|
|
+ *
|
|
+ * But don't do that if it is a deliberate, throttling IO wait (this task
|
|
+ * has set its backing_dev_info: the queue against which it should throttle)
|
|
+ */
|
|
+
|
|
+long __sched io_schedule_timeout(long timeout)
|
|
+{
|
|
+ int old_iowait = current->in_iowait;
|
|
+ struct rq *rq;
|
|
+ long ret;
|
|
+
|
|
+ current->in_iowait = 1;
|
|
+ blk_schedule_flush_plug(current);
|
|
+
|
|
+ delayacct_blkio_start();
|
|
+ rq = raw_rq();
|
|
+ atomic_inc(&rq->nr_iowait);
|
|
+ ret = schedule_timeout(timeout);
|
|
+ current->in_iowait = old_iowait;
|
|
+ atomic_dec(&rq->nr_iowait);
|
|
+ delayacct_blkio_end();
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL(io_schedule_timeout);
|
|
+
|
|
+/**
|
|
+ * sys_sched_get_priority_max - return maximum RT priority.
|
|
+ * @policy: scheduling class.
|
|
+ *
|
|
+ * Return: On success, this syscall returns the maximum
|
|
+ * rt_priority that can be used by a given scheduling class.
|
|
+ * On failure, a negative error code is returned.
|
|
+ */
|
|
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
|
|
+{
|
|
+ int ret = -EINVAL;
|
|
+
|
|
+ switch (policy) {
|
|
+ case SCHED_FIFO:
|
|
+ case SCHED_RR:
|
|
+ ret = MAX_USER_RT_PRIO-1;
|
|
+ break;
|
|
+ case SCHED_NORMAL:
|
|
+ case SCHED_BATCH:
|
|
+ case SCHED_ISO:
|
|
+ case SCHED_IDLEPRIO:
|
|
+ ret = 0;
|
|
+ break;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_get_priority_min - return minimum RT priority.
|
|
+ * @policy: scheduling class.
|
|
+ *
|
|
+ * Return: On success, this syscall returns the minimum
|
|
+ * rt_priority that can be used by a given scheduling class.
|
|
+ * On failure, a negative error code is returned.
|
|
+ */
|
|
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
|
|
+{
|
|
+ int ret = -EINVAL;
|
|
+
|
|
+ switch (policy) {
|
|
+ case SCHED_FIFO:
|
|
+ case SCHED_RR:
|
|
+ ret = 1;
|
|
+ break;
|
|
+ case SCHED_NORMAL:
|
|
+ case SCHED_BATCH:
|
|
+ case SCHED_ISO:
|
|
+ case SCHED_IDLEPRIO:
|
|
+ ret = 0;
|
|
+ break;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
|
|
+ * @pid: pid of the process.
|
|
+ * @interval: userspace pointer to the timeslice value.
|
|
+ *
|
|
+ *
|
|
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
|
|
+ * an error code.
|
|
+ */
|
|
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
|
|
+ struct timespec __user *, interval)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+ unsigned int time_slice;
|
|
+ unsigned long flags;
|
|
+ struct timespec t;
|
|
+ struct rq *rq;
|
|
+ int retval;
|
|
+
|
|
+ if (pid < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ retval = -ESRCH;
|
|
+ rcu_read_lock();
|
|
+ p = find_process_by_pid(pid);
|
|
+ if (!p)
|
|
+ goto out_unlock;
|
|
+
|
|
+ retval = security_task_getscheduler(p);
|
|
+ if (retval)
|
|
+ goto out_unlock;
|
|
+
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ t = ns_to_timespec(time_slice);
|
|
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
|
|
+ return retval;
|
|
+
|
|
+out_unlock:
|
|
+ rcu_read_unlock();
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
|
|
+
|
|
+void sched_show_task(struct task_struct *p)
|
|
+{
|
|
+ unsigned long free = 0;
|
|
+ int ppid;
|
|
+ unsigned long state = p->state;
|
|
+
|
|
+ if (!try_get_task_stack(p))
|
|
+ return;
|
|
+ if (state)
|
|
+ state = __ffs(state) + 1;
|
|
+ printk(KERN_INFO "%-15.15s %c", p->comm,
|
|
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
|
|
+ if (state == TASK_RUNNING)
|
|
+ printk(KERN_CONT " running task ");
|
|
+#ifdef CONFIG_DEBUG_STACK_USAGE
|
|
+ free = stack_not_used(p);
|
|
+#endif
|
|
+ ppid = 0;
|
|
+ rcu_read_lock();
|
|
+ if (pid_alive(p))
|
|
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
|
|
+ rcu_read_unlock();
|
|
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
|
|
+ task_pid_nr(p), ppid,
|
|
+ (unsigned long)task_thread_info(p)->flags);
|
|
+
|
|
+ print_worker_info(KERN_INFO, p);
|
|
+ show_stack(p, NULL);
|
|
+ put_task_stack(p);
|
|
+}
|
|
+
|
|
+void show_state_filter(unsigned long state_filter)
|
|
+{
|
|
+ struct task_struct *g, *p;
|
|
+
|
|
+#if BITS_PER_LONG == 32
|
|
+ printk(KERN_INFO
|
|
+ " task PC stack pid father\n");
|
|
+#else
|
|
+ printk(KERN_INFO
|
|
+ " task PC stack pid father\n");
|
|
+#endif
|
|
+ rcu_read_lock();
|
|
+ for_each_process_thread(g, p) {
|
|
+ /*
|
|
+ * reset the NMI-timeout, listing all files on a slow
|
|
+ * console might take a lot of time:
|
|
+ * Also, reset softlockup watchdogs on all CPUs, because
|
|
+ * another CPU might be blocked waiting for us to process
|
|
+ * an IPI.
|
|
+ */
|
|
+ touch_nmi_watchdog();
|
|
+ touch_all_softlockup_watchdogs();
|
|
+ if (!state_filter || (p->state & state_filter))
|
|
+ sched_show_task(p);
|
|
+ }
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ /*
|
|
+ * Only show locks if all tasks are dumped:
|
|
+ */
|
|
+ if (!state_filter)
|
|
+ debug_show_all_locks();
|
|
+}
|
|
+
|
|
+void dump_cpu_task(int cpu)
|
|
+{
|
|
+ pr_info("Task dump for CPU %d:\n", cpu);
|
|
+ sched_show_task(cpu_curr(cpu));
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
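+/* Copy the new affinity mask and cache its weight in nr_cpus_allowed. */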
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
|
|
+{
|
|
+ cpumask_copy(&p->cpus_allowed, new_mask);
|
|
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
|
|
+}
|
|
+
|
|
+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
|
+{
|
|
+ struct rq *rq = task_rq(p);
|
|
+
|
|
+ lockdep_assert_held(&p->pi_lock);
|
|
+
|
|
+ cpumask_copy(tsk_cpus_allowed(p), new_mask);
|
|
+
|
|
+ if (task_queued(p)) {
|
|
+ /*
|
|
+ * Because __kthread_bind() calls this on blocked tasks without
|
|
+ * holding rq->lock.
|
|
+ */
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Calling do_set_cpus_allowed from outside the scheduler code can leave the
+ * task unable to run on its current CPU, so we reschedule it here.
|
|
+ */
|
|
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
|
+{
|
|
+ __do_set_cpus_allowed(p, new_mask);
|
|
+ if (needs_other_cpu(p, task_cpu(p))) {
|
|
+ struct rq *rq;
|
|
+
|
|
+ set_task_cpu(p, valid_task_cpu(p));
|
|
+ rq = __task_rq_lock(p);
|
|
+ resched_task(p);
|
|
+ __task_rq_unlock(rq);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For internal scheduler calls to do_set_cpus_allowed which will resched
|
|
+ * themselves if needed.
|
|
+ */
|
|
+static void _do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
|
+{
|
|
+ __do_set_cpus_allowed(p, new_mask);
|
|
+ /* __set_cpus_allowed_ptr will handle the reschedule in this variant */
|
|
+ if (needs_other_cpu(p, task_cpu(p)))
|
|
+ set_task_cpu(p, valid_task_cpu(p));
|
|
+}
|
|
+#endif
|
|
+
|
|
+/**
|
|
+ * init_idle - set up an idle thread for a given CPU
|
|
+ * @idle: task in question
|
|
+ * @cpu: cpu the idle task belongs to
|
|
+ *
|
|
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
|
|
+ * flag, to make booting more robust.
|
|
+ */
|
|
+void init_idle(struct task_struct *idle, int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ unsigned long flags;
|
|
+
|
|
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
|
|
+ raw_spin_lock(&rq->lock);
|
|
+ idle->last_ran = rq->niffies;
|
|
+ time_slice_expired(idle, rq);
|
|
+ idle->state = TASK_RUNNING;
|
|
+ /* Setting prio to illegal value shouldn't matter when never queued */
|
|
+ idle->prio = PRIO_LIMIT;
|
|
+
|
|
+ kasan_unpoison_task_stack(idle);
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ /*
|
|
+ * It's possible that init_idle() gets called multiple times on a task,
|
|
+ * in that case do_set_cpus_allowed() will not do the right thing.
|
|
+ *
|
|
+ * And since this is boot we can forgo the serialisation.
|
|
+ */
|
|
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+ idle->smt_bias = 0;
|
|
+#endif
|
|
+#endif
|
|
+ set_rq_task(rq, idle);
|
|
+
|
|
+ /* Silence PROVE_RCU */
|
|
+ rcu_read_lock();
|
|
+ set_task_cpu(idle, cpu);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ rq->curr = rq->idle = idle;
|
|
+ idle->on_rq = TASK_ON_RQ_QUEUED;
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
|
|
+
|
|
+ /* Set the preempt count _outside_ the spinlocks! */
|
|
+ init_idle_preempt_count(idle, cpu);
|
|
+
|
|
+ ftrace_graph_init_idle_task(idle, cpu);
|
|
+ vtime_init_idle(idle, cpu);
|
|
+#ifdef CONFIG_SMP
|
|
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
|
|
+#endif
|
|
+}
|
|
+
|
|
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
|
|
+ const struct cpumask __maybe_unused *trial)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+int task_can_attach(struct task_struct *p,
|
|
+ const struct cpumask *cs_cpus_allowed)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ /*
|
|
+ * Kthreads which disallow setaffinity shouldn't be moved
|
|
+ * to a new cpuset; we don't want to change their cpu
|
|
+ * affinity and isolating such threads by their set of
|
|
+ * allowed nodes is unnecessary. Thus, cpusets are not
|
|
+ * applicable for such threads. This prevents checking for
|
|
+ * success of set_cpus_allowed_ptr() on all attached tasks
|
|
+ * before cpus_allowed may be changed.
|
|
+ */
|
|
+ if (p->flags & PF_NO_SETAFFINITY)
|
|
+ ret = -EINVAL;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
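+/* Force a reschedule of whatever is currently running on @cpu. */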
+void resched_cpu(int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ unsigned long flags;
|
|
+
|
|
+ rq_lock_irqsave(rq, &flags);
|
|
+ resched_task(cpu_curr(cpu));
|
|
+ rq_unlock_irqrestore(rq, &flags);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+#ifdef CONFIG_NO_HZ_COMMON
|
|
+void nohz_balance_enter_idle(int cpu)
|
|
+{
|
|
+}
|
|
+
|
|
+void select_nohz_load_balancer(int stop_tick)
|
|
+{
|
|
+}
|
|
+
|
|
+void set_cpu_sd_state_idle(void) {}
|
|
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
|
+/**
|
|
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
|
|
+ * @cpu: The cpu whose lowest level of sched domain is to
|
|
+ * be returned.
|
|
+ * @flag: The flag to check for the lowest sched_domain
|
|
+ * for the given cpu.
|
|
+ *
|
|
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
|
|
+ */
|
|
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
|
|
+{
|
|
+ struct sched_domain *sd;
|
|
+
|
|
+ for_each_domain(cpu, sd)
|
|
+ if (sd && (sd->flags & flag))
|
|
+ break;
|
|
+
|
|
+ return sd;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
|
|
+ * @cpu: The cpu whose domains we're iterating over.
|
|
+ * @sd: variable holding the value of the power_savings_sd
|
|
+ * for cpu.
|
|
+ * @flag: The flag to filter the sched_domains to be iterated.
|
|
+ *
|
|
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
|
|
+ * set, starting from the lowest sched_domain to the highest.
|
|
+ */
|
|
+#define for_each_flag_domain(cpu, sd, flag) \
|
|
+ for (sd = lowest_flag_domain(cpu, flag); \
|
|
+ (sd && (sd->flags & flag)); sd = sd->parent)
|
|
+
|
|
+#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
|
|
+
|
|
+/*
|
|
+ * In the semi idle case, use the nearest busy cpu for migrating timers
|
|
+ * from an idle cpu. This is good for power-savings.
|
|
+ *
|
|
+ * We don't do a similar optimization for a completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be up to date wrt jiffies etc).
|
|
+ */
|
|
+int get_nohz_timer_target(void)
|
|
+{
|
|
+ int i, cpu = smp_processor_id();
|
|
+ struct sched_domain *sd;
|
|
+
|
|
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
|
|
+ return cpu;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_domain(cpu, sd) {
|
|
+ for_each_cpu(i, sched_domain_span(sd)) {
|
|
+ if (cpu == i)
|
|
+ continue;
|
|
+
|
|
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
|
|
+ cpu = i;
|
|
+ goto unlock;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!is_housekeeping_cpu(cpu))
|
|
+ cpu = housekeeping_any_cpu();
|
|
+unlock:
|
|
+ rcu_read_unlock();
|
|
+ return cpu;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * When add_timer_on() enqueues a timer into the timer wheel of an
|
|
+ * idle CPU then this timer might expire before the next timer event
|
|
+ * which is scheduled to wake up that CPU. In case of a completely
|
|
+ * idle system the next event might even be infinite time into the
|
|
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
|
|
+ * leaves the inner idle loop so the newly added timer is taken into
|
|
+ * account when the CPU goes back to idle and evaluates the timer
|
|
+ * wheel for the next timer event.
|
|
+ */
|
|
+void wake_up_idle_cpu(int cpu)
|
|
+{
|
|
+ if (cpu == smp_processor_id())
|
|
+ return;
|
|
+
|
|
+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
|
|
+ smp_sched_reschedule(cpu);
|
|
+ else
|
|
+ trace_sched_wake_idle_without_ipi(cpu);
|
|
+}
|
|
+
|
|
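+/*
+ * Returns true if the CPU was handled here (offline or nohz_full), false if
+ * the caller should fall back to wake_up_idle_cpu().
+ */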
+static bool wake_up_full_nohz_cpu(int cpu)
|
|
+{
|
|
+ /*
|
|
+ * We just need the target to call irq_exit() and re-evaluate
|
|
+ * the next tick. The nohz full kick at least implies that.
|
|
+ * If needed we can still optimize that later with an
|
|
+ * empty IRQ.
|
|
+ */
|
|
+ if (cpu_is_offline(cpu))
|
|
+ return true; /* Don't try to wake offline CPUs. */
|
|
+ if (tick_nohz_full_cpu(cpu)) {
|
|
+ if (cpu != smp_processor_id() ||
|
|
+ tick_nohz_tick_stopped())
|
|
+ tick_nohz_full_kick_cpu(cpu);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Wake up the specified CPU. If the CPU is going offline, it is the
|
|
+ * caller's responsibility to deal with the lost wakeup, for example,
|
|
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
|
|
+ */
|
|
+void wake_up_nohz_cpu(int cpu)
|
|
+{
|
|
+ if (!wake_up_full_nohz_cpu(cpu))
|
|
+ wake_up_idle_cpu(cpu);
|
|
+}
|
|
+#endif /* CONFIG_NO_HZ_COMMON */
|
|
+
|
|
+/*
|
|
+ * Change a given task's CPU affinity. Migrate the thread to a
|
|
+ * proper CPU and schedule it away if the CPU it's executing on
|
|
+ * is removed from the allowed bitmask.
|
|
+ *
|
|
+ * NOTE: the caller must have a valid reference to the task, the
|
|
+ * task must not exit() & deallocate itself prematurely. The
|
|
+ * call is not atomic; no spinlocks may be held.
|
|
+ */
|
|
+static int __set_cpus_allowed_ptr(struct task_struct *p,
|
|
+ const struct cpumask *new_mask, bool check)
|
|
+{
|
|
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
|
|
+ bool queued = false, running_wrong = false, kthread;
|
|
+ struct cpumask old_mask;
|
|
+ unsigned long flags;
|
|
+ struct rq *rq;
|
|
+ int ret = 0;
|
|
+
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+
|
|
+ kthread = !!(p->flags & PF_KTHREAD);
|
|
+ if (kthread) {
|
|
+ /*
|
|
+ * Kernel threads are allowed on online && !active CPUs
|
|
+ */
|
|
+ cpu_valid_mask = cpu_online_mask;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Must re-check here, to close a race against __kthread_bind(),
|
|
+ * sched_setaffinity() is not guaranteed to observe the flag.
|
|
+ */
|
|
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
|
|
+ ret = -EINVAL;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ cpumask_copy(&old_mask, tsk_cpus_allowed(p));
|
|
+ if (cpumask_equal(&old_mask, new_mask))
|
|
+ goto out;
|
|
+
|
|
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
|
|
+ ret = -EINVAL;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ queued = task_queued(p);
|
|
+
|
|
+ _do_set_cpus_allowed(p, new_mask);
|
|
+
|
|
+ if (kthread) {
|
|
+ /*
|
|
+ * For kernel threads that do indeed end up on online &&
|
|
+ * !active we want to ensure they are strict per-cpu threads.
|
|
+ */
|
|
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
|
|
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
|
|
+ tsk_nr_cpus_allowed(p) != 1);
|
|
+ }
|
|
+
|
|
+ /* Can the task run on the task's current CPU? If so, we're done */
|
|
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
|
|
+ goto out;
|
|
+
|
|
+ if (task_running(rq, p)) {
|
|
+ /* Task is running on the wrong cpu now, reschedule it. */
|
|
+ if (rq == this_rq()) {
|
|
+ set_tsk_need_resched(p);
|
|
+ running_wrong = kthread;
|
|
+ } else
|
|
+ resched_task(p);
|
|
+ } else {
|
|
+ int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
|
|
+ struct rq *dest_rq = cpu_rq(dest_cpu);
|
|
+
|
|
+ /* Switch rq locks here */
|
|
+ lock_second_rq(rq, dest_rq);
|
|
+ set_task_cpu(p, dest_cpu);
|
|
+ rq_unlock(rq);
|
|
+
|
|
+ rq = dest_rq;
|
|
+ }
|
|
+out:
|
|
+ if (queued && !cpumask_subset(new_mask, &old_mask))
|
|
+ try_preempt(p, rq);
|
|
+ if (running_wrong)
|
|
+ preempt_disable();
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+
|
|
+ if (running_wrong) {
|
|
+ __schedule(true);
|
|
+ preempt_enable();
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
|
|
+{
|
|
+ return __set_cpus_allowed_ptr(p, new_mask, false);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
|
|
+
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
+/*
|
|
+ * Run through task list and find tasks affined to the dead cpu, then remove
|
|
+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
|
|
+ * cpu 0 and src_cpu's runqueue locks.
|
|
+ */
|
|
+static void bind_zero(int src_cpu)
|
|
+{
|
|
+ struct task_struct *p, *t;
|
|
+ int bound = 0;
|
|
+
|
|
+ if (src_cpu == 0)
|
|
+ return;
|
|
+
|
|
+ do_each_thread(t, p) {
|
|
+ if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
|
|
+ bool local = (task_cpu(p) == src_cpu);
|
|
+
|
|
+ /* task_running is the cpu stopper thread */
|
|
+ if (local && task_running(task_rq(p), p))
|
|
+ continue;
|
|
+ atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p));
|
|
+ atomic_set_cpu(0, tsk_cpus_allowed(p));
|
|
+ p->zerobound = true;
|
|
+ bound++;
|
|
+ if (local)
|
|
+ set_task_cpu(p, 0);
|
|
+ }
|
|
+ } while_each_thread(t, p);
|
|
+
|
|
+ if (bound) {
|
|
+ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
|
|
+ bound, src_cpu);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Find processes with the zerobound flag and reenable their affinity for the
|
|
+ * CPU coming alive. */
|
|
+static void unbind_zero(int src_cpu)
|
|
+{
|
|
+ int unbound = 0, zerobound = 0;
|
|
+ struct task_struct *p, *t;
|
|
+
|
|
+ if (src_cpu == 0)
|
|
+ return;
|
|
+
|
|
+ do_each_thread(t, p) {
|
|
+ if (!p->mm)
|
|
+ p->zerobound = false;
|
|
+ if (p->zerobound) {
|
|
+ unbound++;
|
|
+ cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
|
|
+ /* Once every CPU affinity has been re-enabled, remove
|
|
+ * the zerobound flag */
|
|
+ if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
|
|
+ p->zerobound = false;
|
|
+ zerobound++;
|
|
+ }
|
|
+ }
|
|
+ } while_each_thread(t, p);
|
|
+
|
|
+ if (unbound) {
|
|
+ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
|
|
+ unbound, src_cpu);
|
|
+ }
|
|
+ if (zerobound) {
|
|
+ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
|
|
+ zerobound);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Ensures that the idle task is using init_mm right before its cpu goes
|
|
+ * offline.
|
|
+ */
|
|
+void idle_task_exit(void)
|
|
+{
|
|
+ struct mm_struct *mm = current->active_mm;
|
|
+
|
|
+ BUG_ON(cpu_online(smp_processor_id()));
|
|
+
|
|
+ if (mm != &init_mm) {
|
|
+ switch_mm_irqs_off(mm, &init_mm, current);
|
|
+ finish_arch_post_lock_switch();
|
|
+ }
|
|
+ mmdrop(mm);
|
|
+}
|
|
+#else /* CONFIG_HOTPLUG_CPU */
|
|
+static void unbind_zero(int src_cpu) {}
|
|
+#endif /* CONFIG_HOTPLUG_CPU */
|
|
+
|
|
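+/*
+ * Install or remove the per-cpu stop task, giving it SCHED_FIFO priority
+ * while installed and restoring SCHED_NORMAL on the task it replaces.
+ */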
+void sched_set_stop_task(int cpu, struct task_struct *stop)
|
|
+{
|
|
+ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
|
|
+ struct sched_param start_param = { .sched_priority = 0 };
|
|
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
|
|
+
|
|
+ if (stop) {
|
|
+ /*
|
|
+ * Make it appear like a SCHED_FIFO task, it's something
|
|
+ * userspace knows about and won't get confused about.
|
|
+ *
|
|
+ * Also, it will make PI more or less work without too
|
|
+ * much confusion -- but then, stop work should not
|
|
+ * rely on PI working anyway.
|
|
+ */
|
|
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
|
|
+ }
|
|
+
|
|
+ cpu_rq(cpu)->stop = stop;
|
|
+
|
|
+ if (old_stop) {
|
|
+ /*
|
|
+ * Reset it back to a normal scheduling policy so that
|
|
+ * it can die in pieces.
|
|
+ */
|
|
+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
|
|
+ }
|
|
+}
|
|
+
|
|
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
|
|
+
|
|
+static struct ctl_table sd_ctl_dir[] = {
|
|
+ {
|
|
+ .procname = "sched_domain",
|
|
+ .mode = 0555,
|
|
+ },
|
|
+ {}
|
|
+};
|
|
+
|
|
+static struct ctl_table sd_ctl_root[] = {
|
|
+ {
|
|
+ .procname = "kernel",
|
|
+ .mode = 0555,
|
|
+ .child = sd_ctl_dir,
|
|
+ },
|
|
+ {}
|
|
+};
|
|
+
|
|
+static struct ctl_table *sd_alloc_ctl_entry(int n)
|
|
+{
|
|
+ struct ctl_table *entry =
|
|
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
|
|
+
|
|
+ return entry;
|
|
+}
|
|
+
|
|
+static void sd_free_ctl_entry(struct ctl_table **tablep)
|
|
+{
|
|
+ struct ctl_table *entry;
|
|
+
|
|
+ /*
|
|
+ * In the intermediate directories, both the child directory and
|
|
+ * procname are dynamically allocated and could fail but the mode
|
|
+ * will always be set. In the lowest directory the names are
|
|
+ * static strings and all have proc handlers.
|
|
+ */
|
|
+ for (entry = *tablep; entry->mode; entry++) {
|
|
+ if (entry->child)
|
|
+ sd_free_ctl_entry(&entry->child);
|
|
+ if (entry->proc_handler == NULL)
|
|
+ kfree(entry->procname);
|
|
+ }
|
|
+
|
|
+ kfree(*tablep);
|
|
+ *tablep = NULL;
|
|
+}
|
|
+
|
|
+#define CPU_LOAD_IDX_MAX 5
|
|
+static int min_load_idx = 0;
|
|
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
|
|
+
|
|
+static void
|
|
+set_table_entry(struct ctl_table *entry,
|
|
+ const char *procname, void *data, int maxlen,
|
|
+ umode_t mode, proc_handler *proc_handler,
|
|
+ bool load_idx)
|
|
+{
|
|
+ entry->procname = procname;
|
|
+ entry->data = data;
|
|
+ entry->maxlen = maxlen;
|
|
+ entry->mode = mode;
|
|
+ entry->proc_handler = proc_handler;
|
|
+
|
|
+ if (load_idx) {
|
|
+ entry->extra1 = &min_load_idx;
|
|
+ entry->extra2 = &max_load_idx;
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct ctl_table *
|
|
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
|
+{
|
|
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
|
|
+
|
|
+ if (table == NULL)
|
|
+ return NULL;
|
|
+
|
|
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
|
|
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
|
|
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
|
|
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
|
|
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, true);
|
|
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, true);
|
|
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, true);
|
|
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, true);
|
|
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, true);
|
|
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, false);
|
|
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, false);
|
|
+ set_table_entry(&table[9], "cache_nice_tries",
|
|
+ &sd->cache_nice_tries,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, false);
|
|
+ set_table_entry(&table[10], "flags", &sd->flags,
|
|
+ sizeof(int), 0644, proc_dointvec_minmax, false);
|
|
+ set_table_entry(&table[11], "max_newidle_lb_cost",
|
|
+ &sd->max_newidle_lb_cost,
|
|
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
|
|
+ set_table_entry(&table[12], "name", sd->name,
|
|
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
|
|
+ /* &table[13] is terminator */
|
|
+
|
|
+ return table;
|
|
+}
|
|
+
|
|
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
|
|
+{
|
|
+ struct ctl_table *entry, *table;
|
|
+ struct sched_domain *sd;
|
|
+ int domain_num = 0, i;
|
|
+ char buf[32];
|
|
+
|
|
+ for_each_domain(cpu, sd)
|
|
+ domain_num++;
|
|
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
|
|
+ if (table == NULL)
|
|
+ return NULL;
|
|
+
|
|
+ i = 0;
|
|
+ for_each_domain(cpu, sd) {
|
|
+ snprintf(buf, 32, "domain%d", i);
|
|
+ entry->procname = kstrdup(buf, GFP_KERNEL);
|
|
+ entry->mode = 0555;
|
|
+ entry->child = sd_alloc_ctl_domain_table(sd);
|
|
+ entry++;
|
|
+ i++;
|
|
+ }
|
|
+ return table;
|
|
+}
|
|
+
|
|
+static struct ctl_table_header *sd_sysctl_header;
|
|
+void register_sched_domain_sysctl(void)
|
|
+{
|
|
+ int i, cpu_num = num_possible_cpus();
|
|
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
|
|
+ char buf[32];
|
|
+
|
|
+ WARN_ON(sd_ctl_dir[0].child);
|
|
+ sd_ctl_dir[0].child = entry;
|
|
+
|
|
+ if (entry == NULL)
|
|
+ return;
|
|
+
|
|
+ for_each_possible_cpu(i) {
|
|
+ snprintf(buf, 32, "cpu%d", i);
|
|
+ entry->procname = kstrdup(buf, GFP_KERNEL);
|
|
+ entry->mode = 0555;
|
|
+ entry->child = sd_alloc_ctl_cpu_table(i);
|
|
+ entry++;
|
|
+ }
|
|
+
|
|
+ WARN_ON(sd_sysctl_header);
|
|
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
|
|
+}
|
|
+
|
|
+/* may be called multiple times per register */
|
|
+void unregister_sched_domain_sysctl(void)
|
|
+{
|
|
+ unregister_sysctl_table(sd_sysctl_header);
|
|
+ sd_sysctl_header = NULL;
|
|
+ if (sd_ctl_dir[0].child)
|
|
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
|
|
+}
|
|
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
|
|
+
|
|
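+/* Mark this runqueue's CPU as online in its root domain. */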
+static void set_rq_online(struct rq *rq)
|
|
+{
|
|
+ if (!rq->online) {
|
|
+ cpumask_set_cpu(cpu_of(rq), rq->rd->online);
|
|
+ rq->online = true;
|
|
+ }
|
|
+}
|
|
+
|
|
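+/*
+ * Mark this runqueue's CPU as offline in its root domain and clear its
+ * entry in the cpuidle map.
+ */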
+static void set_rq_offline(struct rq *rq)
|
|
+{
|
|
+ if (rq->online) {
|
|
+ int cpu = cpu_of(rq);
|
|
+
|
|
+ cpumask_clear_cpu(cpu, rq->rd->online);
|
|
+ rq->online = false;
|
|
+ clear_cpuidle_map(cpu);
|
|
+ }
|
|
+}
|
|
+
|
|
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
|
|
+
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+
|
|
+static __read_mostly int sched_debug_enabled;
|
|
+
|
|
+static int __init sched_debug_setup(char *str)
|
|
+{
|
|
+ sched_debug_enabled = 1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+early_param("sched_debug", sched_debug_setup);
|
|
+
|
|
+static inline bool sched_debug(void)
|
|
+{
|
|
+ return sched_debug_enabled;
|
|
+}
|
|
+
|
|
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
|
|
+ struct cpumask *groupmask)
|
|
+{
|
|
+ cpumask_clear(groupmask);
|
|
+
|
|
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
|
|
+
|
|
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
|
|
+ printk("does not load-balance\n");
|
|
+ if (sd->parent)
|
|
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
|
|
+ " has parent");
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ printk(KERN_CONT "span %*pbl level %s\n",
|
|
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
|
|
+
|
|
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
|
|
+ printk(KERN_ERR "ERROR: domain->span does not contain "
|
|
+ "CPU%d\n", cpu);
|
|
+ }
|
|
+
|
|
+ printk(KERN_CONT "\n");
|
|
+
|
|
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
|
|
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
|
|
+
|
|
+ if (sd->parent &&
|
|
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
|
|
+ printk(KERN_ERR "ERROR: parent span is not a superset "
|
|
+ "of domain->span\n");
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
|
|
+{
|
|
+ int level = 0;
|
|
+
|
|
+ if (!sched_debug_enabled)
|
|
+ return;
|
|
+
|
|
+ if (!sd) {
|
|
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
|
|
+
|
|
+ for (;;) {
|
|
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
|
|
+ break;
|
|
+ level++;
|
|
+ sd = sd->parent;
|
|
+ if (!sd)
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+#else /* !CONFIG_SCHED_DEBUG */
|
|
+
|
|
+# define sched_debug_enabled 0
|
|
+# define sched_domain_debug(sd, cpu) do { } while (0)
|
|
+static inline bool sched_debug(void)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+#endif /* CONFIG_SCHED_DEBUG */
|
|
+
|
|
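+/*
+ * A domain is degenerate if it spans only one CPU or carries none of the
+ * flags (here only SD_WAKE_AFFINE) that make it worth keeping.
+ */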
+static int sd_degenerate(struct sched_domain *sd)
|
|
+{
|
|
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
|
|
+ return 1;
|
|
+
|
|
+ /* Following flags don't use groups */
|
|
+ if (sd->flags & (SD_WAKE_AFFINE))
|
|
+ return 0;
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static int
|
|
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
|
+{
|
|
+ unsigned long cflags = sd->flags, pflags = parent->flags;
|
|
+
|
|
+ if (sd_degenerate(parent))
|
|
+ return 1;
|
|
+
|
|
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
|
|
+ return 0;
|
|
+
|
|
+ if (~cflags & pflags)
|
|
+ return 0;
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static void free_rootdomain(struct rcu_head *rcu)
|
|
+{
|
|
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
|
|
+
|
|
+ cpupri_cleanup(&rd->cpupri);
|
|
+ free_cpumask_var(rd->rto_mask);
|
|
+ free_cpumask_var(rd->online);
|
|
+ free_cpumask_var(rd->span);
|
|
+ kfree(rd);
|
|
+}
|
|
+
|
|
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
|
|
+{
|
|
+ struct root_domain *old_rd = NULL;
|
|
+ unsigned long flags;
|
|
+
|
|
+ rq_lock_irqsave(rq, &flags);
|
|
+
|
|
+ if (rq->rd) {
|
|
+ old_rd = rq->rd;
|
|
+
|
|
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
|
|
+ set_rq_offline(rq);
|
|
+
|
|
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
|
|
+
|
|
+ /*
|
|
+ * If we dont want to free the old_rd yet then
|
|
+ * set old_rd to NULL to skip the freeing later
|
|
+ * in this function:
|
|
+ */
|
|
+ if (!atomic_dec_and_test(&old_rd->refcount))
|
|
+ old_rd = NULL;
|
|
+ }
|
|
+
|
|
+ atomic_inc(&rd->refcount);
|
|
+ rq->rd = rd;
|
|
+
|
|
+ cpumask_set_cpu(rq->cpu, rd->span);
|
|
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
|
|
+ set_rq_online(rq);
|
|
+
|
|
+ rq_unlock_irqrestore(rq, &flags);
|
|
+
|
|
+ if (old_rd)
|
|
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
|
|
+}
|
|
+
|
|
+static int init_rootdomain(struct root_domain *rd)
|
|
+{
|
|
+ memset(rd, 0, sizeof(*rd));
|
|
+
|
|
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
|
|
+ goto out;
|
|
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
|
|
+ goto free_span;
|
|
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
|
|
+ goto free_online;
|
|
+
|
|
+ if (cpupri_init(&rd->cpupri) != 0)
|
|
+ goto free_rto_mask;
|
|
+ return 0;
|
|
+
|
|
+free_rto_mask:
|
|
+ free_cpumask_var(rd->rto_mask);
|
|
+free_online:
|
|
+ free_cpumask_var(rd->online);
|
|
+free_span:
|
|
+ free_cpumask_var(rd->span);
|
|
+out:
|
|
+ return -ENOMEM;
|
|
+}
|
|
+
|
|
+static void init_defrootdomain(void)
|
|
+{
|
|
+ init_rootdomain(&def_root_domain);
|
|
+
|
|
+ atomic_set(&def_root_domain.refcount, 1);
|
|
+}
|
|
+
|
|
+static struct root_domain *alloc_rootdomain(void)
|
|
+{
|
|
+ struct root_domain *rd;
|
|
+
|
|
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
|
|
+ if (!rd)
|
|
+ return NULL;
|
|
+
|
|
+ if (init_rootdomain(rd) != 0) {
|
|
+ kfree(rd);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ return rd;
|
|
+}
|
|
+
|
|
+static void destroy_sched_domain(struct sched_domain *sd)
|
|
+{
|
|
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
|
|
+ kfree(sd->shared);
|
|
+ kfree(sd);
|
|
+}
|
|
+
|
|
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
|
|
+{
|
|
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
|
|
+
|
|
+ while (sd) {
|
|
+ struct sched_domain *parent = sd->parent;
|
|
+ destroy_sched_domain(sd);
|
|
+ sd = parent;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void destroy_sched_domains(struct sched_domain *sd)
|
|
+{
|
|
+ if (sd)
|
|
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
|
|
+ * hold the hotplug lock.
|
|
+ */
|
|
+static void
|
|
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ struct sched_domain *tmp;
|
|
+
|
|
+ /* Remove the sched domains which do not contribute to scheduling. */
|
|
+ for (tmp = sd; tmp; ) {
|
|
+ struct sched_domain *parent = tmp->parent;
|
|
+ if (!parent)
|
|
+ break;
|
|
+
|
|
+ if (sd_parent_degenerate(tmp, parent)) {
|
|
+ tmp->parent = parent->parent;
|
|
+ if (parent->parent)
|
|
+ parent->parent->child = tmp;
|
|
+ /*
|
|
+ * Transfer SD_PREFER_SIBLING down in case of a
|
|
+ * degenerate parent; the spans match for this
|
|
+ * so the property transfers.
|
|
+ */
|
|
+ if (parent->flags & SD_PREFER_SIBLING)
|
|
+ tmp->flags |= SD_PREFER_SIBLING;
|
|
+ destroy_sched_domain(parent);
|
|
+ } else
|
|
+ tmp = tmp->parent;
|
|
+ }
|
|
+
|
|
+ if (sd && sd_degenerate(sd)) {
|
|
+ tmp = sd;
|
|
+ sd = sd->parent;
|
|
+ destroy_sched_domain(tmp);
|
|
+ if (sd)
|
|
+ sd->child = NULL;
|
|
+ }
|
|
+
|
|
+ sched_domain_debug(sd, cpu);
|
|
+
|
|
+ rq_attach_root(rq, rd);
|
|
+ tmp = rq->sd;
|
|
+ rcu_assign_pointer(rq->sd, sd);
|
|
+ destroy_sched_domains(tmp);
|
|
+}
|
|
+
|
|
+/* Setup the mask of cpus configured for isolated domains */
|
|
+static int __init isolated_cpu_setup(char *str)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
|
|
+ ret = cpulist_parse(str, cpu_isolated_map);
|
|
+ if (ret) {
|
|
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
|
|
+ return 0;
|
|
+ }
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+__setup("isolcpus=", isolated_cpu_setup);
|
|
+
|
|
+struct s_data {
|
|
+ struct sched_domain ** __percpu sd;
|
|
+ struct root_domain *rd;
|
|
+};
|
|
+
|
|
+enum s_alloc {
|
|
+ sa_rootdomain,
|
|
+ sa_sd,
|
|
+ sa_sd_storage,
|
|
+ sa_none,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Initializers for schedule domains
|
|
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
|
+ */
|
|
+
|
|
+static int default_relax_domain_level = -1;
|
|
+int sched_domain_level_max;
|
|
+
|
|
+static int __init setup_relax_domain_level(char *str)
|
|
+{
|
|
+ if (kstrtoint(str, 0, &default_relax_domain_level))
|
|
+ pr_warn("Unable to set relax_domain_level\n");
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+__setup("relax_domain_level=", setup_relax_domain_level);
|
|
+
|
|
+static void set_domain_attribute(struct sched_domain *sd,
|
|
+ struct sched_domain_attr *attr)
|
|
+{
|
|
+ int request;
|
|
+
|
|
+ if (!attr || attr->relax_domain_level < 0) {
|
|
+ if (default_relax_domain_level < 0)
|
|
+ return;
|
|
+ else
|
|
+ request = default_relax_domain_level;
|
|
+ } else
|
|
+ request = attr->relax_domain_level;
|
|
+ if (request < sd->level) {
|
|
+ /* turn off idle balance on this domain */
|
|
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
|
+ } else {
|
|
+ /* turn on idle balance on this domain */
|
|
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void __sdt_free(const struct cpumask *cpu_map);
|
|
+static int __sdt_alloc(const struct cpumask *cpu_map);
|
|
+
|
|
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
|
|
+ const struct cpumask *cpu_map)
|
|
+{
|
|
+ switch (what) {
|
|
+ case sa_rootdomain:
|
|
+ if (!atomic_read(&d->rd->refcount))
|
|
+ free_rootdomain(&d->rd->rcu); /* fall through */
|
|
+ case sa_sd:
|
|
+ free_percpu(d->sd); /* fall through */
|
|
+ case sa_sd_storage:
|
|
+ __sdt_free(cpu_map); /* fall through */
|
|
+ case sa_none:
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
|
|
+ const struct cpumask *cpu_map)
|
|
+{
|
|
+ memset(d, 0, sizeof(*d));
|
|
+
|
|
+ if (__sdt_alloc(cpu_map))
|
|
+ return sa_sd_storage;
|
|
+ d->sd = alloc_percpu(struct sched_domain *);
|
|
+ if (!d->sd)
|
|
+ return sa_sd_storage;
|
|
+ d->rd = alloc_rootdomain();
|
|
+ if (!d->rd)
|
|
+ return sa_sd;
|
|
+ return sa_rootdomain;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * NULL the sd_data elements we've used to build the sched_domain
|
|
+ * structure so that the subsequent __free_domain_allocs()
|
|
+ * will not free the data we're using.
|
|
+ */
|
|
+static void claim_allocations(int cpu, struct sched_domain *sd)
|
|
+{
|
|
+ struct sd_data *sdd = sd->private;
|
|
+
|
|
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
|
|
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
|
|
+
|
|
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
|
|
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_NUMA
|
|
+static int sched_domains_numa_levels;
|
|
+static int *sched_domains_numa_distance;
|
|
+static struct cpumask ***sched_domains_numa_masks;
|
|
+static int sched_domains_curr_level;
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * SD_flags allowed in topology descriptions.
|
|
+ *
|
|
+ * These flags are purely descriptive of the topology and do not prescribe
|
|
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
|
|
+ * function:
|
|
+ *
|
|
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
|
|
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
|
|
+ * SD_NUMA - describes NUMA topologies
|
|
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
|
|
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
|
|
+ *
|
|
+ * Odd one out, which beside describing the topology has a quirk also
|
|
+ * prescribes the desired behaviour that goes along with it:
|
|
+ *
|
|
+ * SD_ASYM_PACKING - describes SMT quirks
|
|
+ */
|
|
+#define TOPOLOGY_SD_FLAGS \
|
|
+ (SD_SHARE_CPUCAPACITY | \
|
|
+ SD_SHARE_PKG_RESOURCES | \
|
|
+ SD_NUMA | \
|
|
+ SD_ASYM_PACKING | \
|
|
+ SD_ASYM_CPUCAPACITY | \
|
|
+ SD_SHARE_POWERDOMAIN)
|
|
+
|
|
+static struct sched_domain *
|
|
+sd_init(struct sched_domain_topology_level *tl,
|
|
+ const struct cpumask *cpu_map,
|
|
+ struct sched_domain *child, int cpu)
|
|
+{
|
|
+ struct sd_data *sdd = &tl->data;
|
|
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
|
|
+ int sd_id, sd_weight, sd_flags = 0;
|
|
+
|
|
+#ifdef CONFIG_NUMA
|
|
+ /*
|
|
+ * Ugly hack to pass state to sd_numa_mask()...
|
|
+ */
|
|
+ sched_domains_curr_level = tl->numa_level;
|
|
+#endif
|
|
+
|
|
+ sd_weight = cpumask_weight(tl->mask(cpu));
|
|
+
|
|
+ if (tl->sd_flags)
|
|
+ sd_flags = (*tl->sd_flags)();
|
|
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
|
|
+ "wrong sd_flags in topology description\n"))
|
|
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
|
|
+
|
|
+ *sd = (struct sched_domain){
|
|
+ .min_interval = sd_weight,
|
|
+ .max_interval = 2*sd_weight,
|
|
+ .busy_factor = 32,
|
|
+ .imbalance_pct = 125,
|
|
+
|
|
+ .cache_nice_tries = 0,
|
|
+ .busy_idx = 0,
|
|
+ .idle_idx = 0,
|
|
+ .newidle_idx = 0,
|
|
+ .wake_idx = 0,
|
|
+ .forkexec_idx = 0,
|
|
+
|
|
+ .flags = 1*SD_LOAD_BALANCE
|
|
+ | 1*SD_BALANCE_NEWIDLE
|
|
+ | 1*SD_BALANCE_EXEC
|
|
+ | 1*SD_BALANCE_FORK
|
|
+ | 0*SD_BALANCE_WAKE
|
|
+ | 1*SD_WAKE_AFFINE
|
|
+ | 0*SD_SHARE_CPUCAPACITY
|
|
+ | 0*SD_SHARE_PKG_RESOURCES
|
|
+ | 0*SD_SERIALIZE
|
|
+ | 0*SD_PREFER_SIBLING
|
|
+ | 0*SD_NUMA
|
|
+ | sd_flags
|
|
+ ,
|
|
+
|
|
+ .last_balance = jiffies,
|
|
+ .balance_interval = sd_weight,
|
|
+ .smt_gain = 0,
|
|
+ .max_newidle_lb_cost = 0,
|
|
+ .next_decay_max_lb_cost = jiffies,
|
|
+ .child = child,
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+ .name = tl->name,
|
|
+#endif
|
|
+ };
|
|
+
|
|
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
|
|
+ sd_id = cpumask_first(sched_domain_span(sd));
|
|
+
|
|
+ /*
|
|
+ * Convert topological properties into behaviour.
|
|
+ */
|
|
+
|
|
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
|
|
+ struct sched_domain *t = sd;
|
|
+
|
|
+ for_each_lower_domain(t)
|
|
+ t->flags |= SD_BALANCE_WAKE;
|
|
+ }
|
|
+
|
|
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
|
|
+ sd->flags |= SD_PREFER_SIBLING;
|
|
+ sd->imbalance_pct = 110;
|
|
+ sd->smt_gain = 1178; /* ~15% */
|
|
+
|
|
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
|
|
+ sd->imbalance_pct = 117;
|
|
+ sd->cache_nice_tries = 1;
|
|
+ sd->busy_idx = 2;
|
|
+
|
|
+#ifdef CONFIG_NUMA
|
|
+ } else if (sd->flags & SD_NUMA) {
|
|
+ sd->cache_nice_tries = 2;
|
|
+ sd->busy_idx = 3;
|
|
+ sd->idle_idx = 2;
|
|
+
|
|
+ sd->flags |= SD_SERIALIZE;
|
|
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
|
|
+ sd->flags &= ~(SD_BALANCE_EXEC |
|
|
+ SD_BALANCE_FORK |
|
|
+ SD_WAKE_AFFINE);
|
|
+ }
|
|
+
|
|
+#endif
|
|
+ } else {
|
|
+ sd->flags |= SD_PREFER_SIBLING;
|
|
+ sd->cache_nice_tries = 1;
|
|
+ sd->busy_idx = 2;
|
|
+ sd->idle_idx = 1;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * For all levels sharing cache; connect a sched_domain_shared
|
|
+ * instance.
|
|
+ */
|
|
+ if (sd->flags & SD_SHARE_PKG_RESOURCES) {
|
|
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
|
|
+ atomic_inc(&sd->shared->ref);
|
|
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
|
|
+ }
|
|
+
|
|
+ sd->private = sdd;
|
|
+
|
|
+ return sd;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Topology list, bottom-up.
|
|
+ */
|
|
+static struct sched_domain_topology_level default_topology[] = {
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
|
|
+#endif
|
|
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
|
|
+ { NULL, },
|
|
+};
|
|
+
|
|
+static struct sched_domain_topology_level *sched_domain_topology =
|
|
+ default_topology;
|
|
+
|
|
+#define for_each_sd_topology(tl) \
|
|
+ for (tl = sched_domain_topology; tl->mask; tl++)
|
|
+
|
|
+void set_sched_topology(struct sched_domain_topology_level *tl)
|
|
+{
|
|
+ if (WARN_ON_ONCE(sched_smp_initialized))
|
|
+ return;
|
|
+
|
|
+ sched_domain_topology = tl;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_NUMA
|
|
+
|
|
+static const struct cpumask *sd_numa_mask(int cpu)
|
|
+{
|
|
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
|
|
+}
|
|
+
|
|
+static void sched_numa_warn(const char *str)
|
|
+{
|
|
+ static int done = false;
|
|
+ int i,j;
|
|
+
|
|
+ if (done)
|
|
+ return;
|
|
+
|
|
+ done = true;
|
|
+
|
|
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
|
|
+
|
|
+ for (i = 0; i < nr_node_ids; i++) {
|
|
+ printk(KERN_WARNING " ");
|
|
+ for (j = 0; j < nr_node_ids; j++)
|
|
+ printk(KERN_CONT "%02d ", node_distance(i,j));
|
|
+ printk(KERN_CONT "\n");
|
|
+ }
|
|
+ printk(KERN_WARNING "\n");
|
|
+}
|
|
+
|
|
+static bool find_numa_distance(int distance)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ if (distance == node_distance(0, 0))
|
|
+ return true;
|
|
+
|
|
+ for (i = 0; i < sched_domains_numa_levels; i++) {
|
|
+ if (sched_domains_numa_distance[i] == distance)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void sched_init_numa(void)
|
|
+{
|
|
+ int next_distance, curr_distance = node_distance(0, 0);
|
|
+ struct sched_domain_topology_level *tl;
|
|
+ int level = 0;
|
|
+ int i, j, k;
|
|
+
|
|
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
|
|
+ if (!sched_domains_numa_distance)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
|
|
+ * unique distances in the node_distance() table.
|
|
+ *
|
|
+ * Assumes node_distance(0,j) includes all distances in
|
|
+ * node_distance(i,j) in order to avoid cubic time.
|
|
+ */
|
|
+ next_distance = curr_distance;
|
|
+ for (i = 0; i < nr_node_ids; i++) {
|
|
+ for (j = 0; j < nr_node_ids; j++) {
|
|
+ for (k = 0; k < nr_node_ids; k++) {
|
|
+ int distance = node_distance(i, k);
|
|
+
|
|
+ if (distance > curr_distance &&
|
|
+ (distance < next_distance ||
|
|
+ next_distance == curr_distance))
|
|
+ next_distance = distance;
|
|
+
|
|
+ /*
|
|
+ * While not a strong assumption it would be nice to know
|
|
+ * about cases where if node A is connected to B, B is not
|
|
+ * equally connected to A.
|
|
+ */
|
|
+ if (sched_debug() && node_distance(k, i) != distance)
|
|
+ sched_numa_warn("Node-distance not symmetric");
|
|
+
|
|
+ if (sched_debug() && i && !find_numa_distance(distance))
|
|
+ sched_numa_warn("Node-0 not representative");
|
|
+ }
|
|
+ if (next_distance != curr_distance) {
|
|
+ sched_domains_numa_distance[level++] = next_distance;
|
|
+ sched_domains_numa_levels = level;
|
|
+ curr_distance = next_distance;
|
|
+ } else break;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * In case of sched_debug() we verify the above assumption.
|
|
+ */
|
|
+ if (!sched_debug())
|
|
+ break;
|
|
+ }
|
|
+ /*
|
|
+ * 'level' contains the number of unique distances, excluding the
|
|
+ * identity distance node_distance(i,i).
|
|
+ *
|
|
+ * The sched_domains_numa_distance[] array includes the actual distance
|
|
+ * numbers.
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
|
|
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+	 * the array will contain less than 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
|
|
+ * in other functions.
|
|
+ *
|
|
+ * We reset it to 'level' at the end of this function.
|
|
+ */
|
|
+ sched_domains_numa_levels = 0;
|
|
+
|
|
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
|
|
+ if (!sched_domains_numa_masks)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Now for each level, construct a mask per node which contains all
|
|
+ * cpus of nodes that are that many hops away from us.
|
|
+ */
|
|
+ for (i = 0; i < level; i++) {
|
|
+ sched_domains_numa_masks[i] =
|
|
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
|
|
+ if (!sched_domains_numa_masks[i])
|
|
+ return;
|
|
+
|
|
+ for (j = 0; j < nr_node_ids; j++) {
|
|
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
|
|
+ if (!mask)
|
|
+ return;
|
|
+
|
|
+ sched_domains_numa_masks[i][j] = mask;
|
|
+
|
|
+ for_each_node(k) {
|
|
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
|
|
+ continue;
|
|
+
|
|
+ cpumask_or(mask, mask, cpumask_of_node(k));
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Compute default topology size */
|
|
+ for (i = 0; sched_domain_topology[i].mask; i++);
|
|
+
|
|
+ tl = kzalloc((i + level + 1) *
|
|
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
|
|
+ if (!tl)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Copy the default topology bits..
|
|
+ */
|
|
+ for (i = 0; sched_domain_topology[i].mask; i++)
|
|
+ tl[i] = sched_domain_topology[i];
|
|
+
|
|
+ /*
|
|
+ * .. and append 'j' levels of NUMA goodness.
|
|
+ */
|
|
+ for (j = 0; j < level; i++, j++) {
|
|
+ tl[i] = (struct sched_domain_topology_level){
|
|
+ .mask = sd_numa_mask,
|
|
+ .sd_flags = cpu_numa_flags,
|
|
+ .flags = SDTL_OVERLAP,
|
|
+ .numa_level = j,
|
|
+ SD_INIT_NAME(NUMA)
|
|
+ };
|
|
+ }
|
|
+
|
|
+ sched_domain_topology = tl;
|
|
+
|
|
+ sched_domains_numa_levels = level;
|
|
+}
|
|
+
|
|
+static void sched_domains_numa_masks_set(int cpu)
|
|
+{
|
|
+ int node = cpu_to_node(cpu);
|
|
+ int i, j;
|
|
+
|
|
+ for (i = 0; i < sched_domains_numa_levels; i++) {
|
|
+ for (j = 0; j < nr_node_ids; j++) {
|
|
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
|
|
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void sched_domains_numa_masks_clear(int cpu)
|
|
+{
|
|
+ int i, j;
|
|
+
|
|
+ for (i = 0; i < sched_domains_numa_levels; i++) {
|
|
+ for (j = 0; j < nr_node_ids; j++)
|
|
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
|
|
+ }
|
|
+}
|
|
+
|
|
+#else
|
|
+static inline void sched_init_numa(void) { }
|
|
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
|
|
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
|
|
+#endif /* CONFIG_NUMA */
|
|
+
|
|
+static int __sdt_alloc(const struct cpumask *cpu_map)
|
|
+{
|
|
+ struct sched_domain_topology_level *tl;
|
|
+ int j;
|
|
+
|
|
+ for_each_sd_topology(tl) {
|
|
+ struct sd_data *sdd = &tl->data;
|
|
+
|
|
+ sdd->sd = alloc_percpu(struct sched_domain *);
|
|
+ if (!sdd->sd)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
|
|
+ if (!sdd->sds)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ for_each_cpu(j, cpu_map) {
|
|
+ struct sched_domain *sd;
|
|
+ struct sched_domain_shared *sds;
|
|
+
|
|
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
|
|
+ GFP_KERNEL, cpu_to_node(j));
|
|
+ if (!sd)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ *per_cpu_ptr(sdd->sd, j) = sd;
|
|
+
|
|
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
|
|
+ GFP_KERNEL, cpu_to_node(j));
|
|
+ if (!sds)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ *per_cpu_ptr(sdd->sds, j) = sds;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void __sdt_free(const struct cpumask *cpu_map)
|
|
+{
|
|
+ struct sched_domain_topology_level *tl;
|
|
+ int j;
|
|
+
|
|
+ for_each_sd_topology(tl) {
|
|
+ struct sd_data *sdd = &tl->data;
|
|
+
|
|
+ for_each_cpu(j, cpu_map) {
|
|
+ struct sched_domain *sd;
|
|
+
|
|
+ if (sdd->sd) {
|
|
+ sd = *per_cpu_ptr(sdd->sd, j);
|
|
+ kfree(*per_cpu_ptr(sdd->sd, j));
|
|
+ }
|
|
+
|
|
+ if (sdd->sds)
|
|
+ kfree(*per_cpu_ptr(sdd->sds, j));
|
|
+ }
|
|
+ free_percpu(sdd->sd);
|
|
+ sdd->sd = NULL;
|
|
+ free_percpu(sdd->sds);
|
|
+ sdd->sds = NULL;
|
|
+ }
|
|
+}
|
|
+
|
|
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
|
|
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
|
|
+ struct sched_domain *child, int cpu)
|
|
+{
|
|
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
|
|
+
|
|
+ if (child) {
|
|
+ sd->level = child->level + 1;
|
|
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
|
|
+ child->parent = sd;
|
|
+
|
|
+ if (!cpumask_subset(sched_domain_span(child),
|
|
+ sched_domain_span(sd))) {
|
|
+ pr_err("BUG: arch topology borken\n");
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+ pr_err(" the %s domain not a subset of the %s domain\n",
|
|
+ child->name, sd->name);
|
|
+#endif
|
|
+ /* Fixup, ensure @sd has at least @child cpus. */
|
|
+ cpumask_or(sched_domain_span(sd),
|
|
+ sched_domain_span(sd),
|
|
+ sched_domain_span(child));
|
|
+ }
|
|
+
|
|
+ }
|
|
+ set_domain_attribute(sd, attr);
|
|
+
|
|
+ return sd;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Build sched domains for a given set of cpus and attach the sched domains
|
|
+ * to the individual cpus
|
|
+ */
|
|
+static int build_sched_domains(const struct cpumask *cpu_map,
|
|
+ struct sched_domain_attr *attr)
|
|
+{
|
|
+ enum s_alloc alloc_state;
|
|
+ struct sched_domain *sd;
|
|
+ struct s_data d;
|
|
+ int i, ret = -ENOMEM;
|
|
+
|
|
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
|
|
+ if (alloc_state != sa_rootdomain)
|
|
+ goto error;
|
|
+
|
|
+ /* Set up domains for cpus specified by the cpu_map. */
|
|
+ for_each_cpu(i, cpu_map) {
|
|
+ struct sched_domain_topology_level *tl;
|
|
+
|
|
+ sd = NULL;
|
|
+ for_each_sd_topology(tl) {
|
|
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
|
|
+ if (tl == sched_domain_topology)
|
|
+ *per_cpu_ptr(d.sd, i) = sd;
|
|
+ if (tl->flags & SDTL_OVERLAP)
|
|
+ sd->flags |= SD_OVERLAP;
|
|
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Calculate CPU capacity for physical packages and nodes */
|
|
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
|
|
+ if (!cpumask_test_cpu(i, cpu_map))
|
|
+ continue;
|
|
+
|
|
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
|
+ claim_allocations(i, sd);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Attach the domains */
|
|
+ rcu_read_lock();
|
|
+ for_each_cpu(i, cpu_map) {
|
|
+ sd = *per_cpu_ptr(d.sd, i);
|
|
+ cpu_attach_domain(sd, d.rd, i);
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ ret = 0;
|
|
+error:
|
|
+ __free_domain_allocs(&d, alloc_state, cpu_map);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static cpumask_var_t *doms_cur; /* current sched domains */
|
|
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
|
|
+static struct sched_domain_attr *dattr_cur;
+				/* attributes of custom domains in 'doms_cur' */
+
|
|
+/*
|
|
+ * Special case: If a kmalloc of a doms_cur partition (array of
|
|
+ * cpumask) fails, then fallback to a single sched domain,
|
|
+ * as determined by the single cpumask fallback_doms.
|
|
+ */
|
|
+static cpumask_var_t fallback_doms;
|
|
+
|
|
+/*
|
|
+ * arch_update_cpu_topology lets virtualized architectures update the
|
|
+ * cpu core maps. It is supposed to return 1 if the topology changed
|
|
+ * or 0 if it stayed the same.
|
|
+ */
|
|
+int __weak arch_update_cpu_topology(void)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
|
|
+{
|
|
+ int i;
|
|
+ cpumask_var_t *doms;
|
|
+
|
|
+ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
|
|
+ if (!doms)
|
|
+ return NULL;
|
|
+ for (i = 0; i < ndoms; i++) {
|
|
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
|
|
+ free_sched_domains(doms, i);
|
|
+ return NULL;
|
|
+ }
|
|
+ }
|
|
+ return doms;
|
|
+}
|
|
+
|
|
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
|
|
+{
|
|
+ unsigned int i;
|
|
+ for (i = 0; i < ndoms; i++)
|
|
+ free_cpumask_var(doms[i]);
|
|
+ kfree(doms);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
|
+ * For now this just excludes isolated cpus, but could be used to
|
|
+ * exclude other special cases in the future.
|
|
+ */
|
|
+static int init_sched_domains(const struct cpumask *cpu_map)
|
|
+{
|
|
+ int err;
|
|
+
|
|
+ arch_update_cpu_topology();
|
|
+ ndoms_cur = 1;
|
|
+ doms_cur = alloc_sched_domains(ndoms_cur);
|
|
+ if (!doms_cur)
|
|
+ doms_cur = &fallback_doms;
|
|
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
|
|
+ err = build_sched_domains(doms_cur[0], NULL);
|
|
+ register_sched_domain_sysctl();
|
|
+
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Detach sched domains from a group of cpus specified in cpu_map
|
|
+ * These cpus will now be attached to the NULL domain
|
|
+ */
|
|
+static void detach_destroy_domains(const struct cpumask *cpu_map)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_cpu(i, cpu_map)
|
|
+ cpu_attach_domain(NULL, &def_root_domain, i);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+/* handle null as "default" */
|
|
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
|
|
+ struct sched_domain_attr *new, int idx_new)
|
|
+{
|
|
+ struct sched_domain_attr tmp;
|
|
+
|
|
+ /* fast path */
|
|
+ if (!new && !cur)
|
|
+ return 1;
|
|
+
|
|
+ tmp = SD_ATTR_INIT;
|
|
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
|
|
+ new ? (new + idx_new) : &tmp,
|
|
+ sizeof(struct sched_domain_attr));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Partition sched domains as specified by the 'ndoms_new'
|
|
+ * cpumasks in the array doms_new[] of cpumasks. This compares
|
|
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
|
|
+ * It destroys each deleted domain and builds each new domain.
|
|
+ *
|
|
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap). We should set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
|
|
+ * not be load balanced. If the same cpumask appears both in the
|
|
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
|
|
+ * it as it is.
|
|
+ *
|
|
+ * The passed in 'doms_new' should be allocated using
|
|
+ * alloc_sched_domains. This routine takes ownership of it and will
|
|
+ * free_sched_domains it when done with it. If the caller failed the
|
|
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fall back to the single partition
+ * 'fallback_doms'; this also forces the domains to be rebuilt.
+ *
|
|
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
|
|
+ * ndoms_new == 0 is a special case for destroying existing domains,
|
|
+ * and it will not create the default domain.
|
|
+ *
|
|
+ * Call with hotplug lock held
|
|
+ */
|
|
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
|
+ struct sched_domain_attr *dattr_new)
|
|
+{
|
|
+ int i, j, n;
|
|
+ int new_topology;
|
|
+
|
|
+ mutex_lock(&sched_domains_mutex);
|
|
+
|
|
+ /* always unregister in case we don't destroy any domains */
|
|
+ unregister_sched_domain_sysctl();
|
|
+
|
|
+ /* Let architecture update cpu core mappings. */
|
|
+ new_topology = arch_update_cpu_topology();
|
|
+
|
|
+ n = doms_new ? ndoms_new : 0;
|
|
+
|
|
+ /* Destroy deleted domains */
|
|
+ for (i = 0; i < ndoms_cur; i++) {
|
|
+ for (j = 0; j < n && !new_topology; j++) {
|
|
+ if (cpumask_equal(doms_cur[i], doms_new[j])
|
|
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
|
|
+ goto match1;
|
|
+ }
|
|
+ /* no match - a current sched domain not in new doms_new[] */
|
|
+ detach_destroy_domains(doms_cur[i]);
|
|
+match1:
|
|
+ ;
|
|
+ }
|
|
+
|
|
+ n = ndoms_cur;
|
|
+ if (doms_new == NULL) {
|
|
+ n = 0;
|
|
+ doms_new = &fallback_doms;
|
|
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
|
|
+ WARN_ON_ONCE(dattr_new);
|
|
+ }
|
|
+
|
|
+ /* Build new domains */
|
|
+ for (i = 0; i < ndoms_new; i++) {
|
|
+ for (j = 0; j < n && !new_topology; j++) {
|
|
+ if (cpumask_equal(doms_new[i], doms_cur[j])
|
|
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
|
|
+ goto match2;
|
|
+ }
|
|
+ /* no match - add a new doms_new */
|
|
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
|
|
+match2:
|
|
+ ;
|
|
+ }
|
|
+
|
|
+ /* Remember the new sched domains */
|
|
+ if (doms_cur != &fallback_doms)
|
|
+ free_sched_domains(doms_cur, ndoms_cur);
|
|
+ kfree(dattr_cur); /* kfree(NULL) is safe */
|
|
+ doms_cur = doms_new;
|
|
+ dattr_cur = dattr_new;
|
|
+ ndoms_cur = ndoms_new;
|
|
+
|
|
+ register_sched_domain_sysctl();
|
|
+
|
|
+ mutex_unlock(&sched_domains_mutex);
|
|
+}
|
|
+
|
|
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
|
|
+
|
|
+/*
|
|
+ * Update cpusets according to cpu_active mask. If cpusets are
|
|
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
|
|
+ * around partition_sched_domains().
|
|
+ *
|
|
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
|
|
+ * want to restore it back to its original state upon resume anyway.
|
|
+ */
|
|
+static void cpuset_cpu_active(void)
|
|
+{
|
|
+ if (cpuhp_tasks_frozen) {
|
|
+ /*
+		 * num_cpus_frozen tracks how many CPUs are involved in the suspend/
+		 * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
|
|
+ * domain, ignoring cpusets.
|
|
+ */
|
|
+ num_cpus_frozen--;
|
|
+ if (likely(num_cpus_frozen)) {
|
|
+ partition_sched_domains(1, NULL, NULL);
|
|
+ return;
|
|
+ }
|
|
+ /*
|
|
+ * This is the last CPU online operation. So fall through and
|
|
+ * restore the original sched domains by considering the
|
|
+ * cpuset configurations.
|
|
+ */
|
|
+ }
|
|
+
|
|
+ cpuset_update_active_cpus(true);
|
|
+}
|
|
+
|
|
+static int cpuset_cpu_inactive(unsigned int cpu)
|
|
+{
|
|
+ if (!cpuhp_tasks_frozen) {
|
|
+ cpuset_update_active_cpus(false);
|
|
+ } else {
|
|
+ num_cpus_frozen++;
|
|
+ partition_sched_domains(1, NULL, NULL);
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int sched_cpu_activate(unsigned int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ unsigned long flags;
|
|
+
|
|
+ set_cpu_active(cpu, true);
|
|
+
|
|
+ if (sched_smp_initialized) {
|
|
+ sched_domains_numa_masks_set(cpu);
|
|
+ cpuset_cpu_active();
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Put the rq online, if not already. This happens:
|
|
+ *
|
|
+ * 1) In the early boot process, because we build the real domains
|
|
+ * after all cpus have been brought up.
|
|
+ *
|
|
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
|
|
+ * domains.
|
|
+ */
|
|
+ rq_lock_irqsave(rq, &flags);
|
|
+ if (rq->rd) {
|
|
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
|
+ set_rq_online(rq);
|
|
+ }
|
|
+ unbind_zero(cpu);
|
|
+ rq_unlock_irqrestore(rq, &flags);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int sched_cpu_deactivate(unsigned int cpu)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ set_cpu_active(cpu, false);
|
|
+ /*
|
|
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
|
|
+ * users of this state to go away such that all new such users will
|
|
+ * observe it.
|
|
+ *
|
|
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
|
|
+ * not imply sync_sched(), so wait for both.
|
|
+ *
+	 * Do sync before parking smpboot threads to take care of the RCU boost case.
+ */
|
|
+ if (IS_ENABLED(CONFIG_PREEMPT))
|
|
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
|
|
+ else
|
|
+ synchronize_rcu();
|
|
+
|
|
+ if (!sched_smp_initialized)
|
|
+ return 0;
|
|
+
|
|
+ ret = cpuset_cpu_inactive(cpu);
|
|
+ if (ret) {
|
|
+ set_cpu_active(cpu, true);
|
|
+ return ret;
|
|
+ }
|
|
+ sched_domains_numa_masks_clear(cpu);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int sched_cpu_starting(unsigned int __maybe_unused cpu)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
+int sched_cpu_dying(unsigned int cpu)
|
|
+{
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
+ unsigned long flags;
|
|
+
|
|
+ local_irq_save(flags);
|
|
+ double_rq_lock(rq, cpu_rq(0));
|
|
+ if (rq->rd) {
|
|
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
|
+ set_rq_offline(rq);
|
|
+ }
|
|
+ bind_zero(cpu);
|
|
+ double_rq_unlock(rq, cpu_rq(0));
|
|
+ sched_start_tick(rq, cpu);
|
|
+ hrexpiry_clear(rq);
|
|
+ local_irq_restore(flags);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
|
|
+/*
|
|
+ * Cheaper version of the below functions in case support for SMT and MC is
|
|
+ * compiled in but CPUs have no siblings.
|
|
+ */
|
|
+static bool sole_cpu_idle(struct rq *rq)
|
|
+{
|
|
+ return rq_idle(rq);
|
|
+}
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+static const cpumask_t *thread_cpumask(int cpu)
|
|
+{
|
|
+ return topology_sibling_cpumask(cpu);
|
|
+}
|
|
+/* All this CPU's SMT siblings are idle */
|
|
+static bool siblings_cpu_idle(struct rq *rq)
|
|
+{
|
|
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
|
|
+}
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+static const cpumask_t *core_cpumask(int cpu)
|
|
+{
|
|
+ return topology_core_cpumask(cpu);
|
|
+}
|
|
+/* All this CPU's shared cache siblings are idle */
|
|
+static bool cache_cpu_idle(struct rq *rq)
|
|
+{
|
|
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
|
|
+}
|
|
+#endif
|
|
+
|
|
+enum sched_domain_level {
|
|
+ SD_LV_NONE = 0,
|
|
+ SD_LV_SIBLING,
|
|
+ SD_LV_MC,
|
|
+ SD_LV_BOOK,
|
|
+ SD_LV_CPU,
|
|
+ SD_LV_NODE,
|
|
+ SD_LV_ALLNODES,
|
|
+ SD_LV_MAX
|
|
+};
|
|
+
|
|
+void __init sched_init_smp(void)
|
|
+{
|
|
+ struct sched_domain *sd;
|
|
+ int cpu, other_cpu;
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ bool smt_threads = false;
|
|
+#endif
|
|
+ cpumask_var_t non_isolated_cpus;
|
|
+ struct rq *rq;
|
|
+
|
|
+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
|
|
+ alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
|
|
+
|
|
+ sched_init_numa();
|
|
+
|
|
+ /*
|
|
+ * There's no userspace yet to cause hotplug operations; hence all the
|
|
+ * cpu masks are stable and all blatant races in the below code cannot
|
|
+ * happen.
|
|
+ */
|
|
+ mutex_lock(&sched_domains_mutex);
|
|
+ init_sched_domains(cpu_active_mask);
|
|
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
|
|
+ if (cpumask_empty(non_isolated_cpus))
|
|
+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
|
|
+ mutex_unlock(&sched_domains_mutex);
|
|
+
|
|
+ /* Move init over to a non-isolated CPU */
|
|
+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
|
|
+ BUG();
|
|
+ free_cpumask_var(non_isolated_cpus);
|
|
+
|
|
+ mutex_lock(&sched_domains_mutex);
|
|
+ local_irq_disable();
|
|
+ lock_all_rqs();
|
|
+ /*
|
|
+ * Set up the relative cache distance of each online cpu from each
|
|
+ * other in a simple array for quick lookup. Locality is determined
|
|
+ * by the closest sched_domain that CPUs are separated by. CPUs with
|
|
+ * shared cache in SMT and MC are treated as local. Separate CPUs
|
|
+ * (within the same package or physically) within the same node are
|
|
+ * treated as not local. CPUs not even in the same domain (different
|
|
+ * nodes) are treated as very distant.
|
|
+ */
|
|
+ for_each_online_cpu(cpu) {
|
|
+ rq = cpu_rq(cpu);
|
|
+
|
|
+ /* First check if this cpu is in the same node */
|
|
+ for_each_domain(cpu, sd) {
|
|
+ if (sd->level > SD_LV_MC)
|
|
+ continue;
|
|
+ /* Set locality to local node if not already found lower */
|
|
+ for_each_cpu(other_cpu, sched_domain_span(sd)) {
|
|
+ if (rq->cpu_locality[other_cpu] > 3)
|
|
+ rq->cpu_locality[other_cpu] = 3;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Each runqueue has its own function in case it doesn't have
|
|
+ * siblings of its own allowing mixed topologies.
|
|
+ */
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+ for_each_cpu(other_cpu, core_cpumask(cpu)) {
|
|
+ if (rq->cpu_locality[other_cpu] > 2)
|
|
+ rq->cpu_locality[other_cpu] = 2;
|
|
+ }
|
|
+ if (cpumask_weight(core_cpumask(cpu)) > 1) {
|
|
+ cpumask_copy(&rq->core_mask, core_cpumask(cpu));
|
|
+ cpumask_clear_cpu(cpu, &rq->core_mask);
|
|
+ rq->cache_idle = cache_cpu_idle;
|
|
+ }
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ if (cpumask_weight(thread_cpumask(cpu)) > 1) {
|
|
+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
|
|
+ cpumask_clear_cpu(cpu, &rq->thread_mask);
|
|
+ for_each_cpu(other_cpu, thread_cpumask(cpu))
|
|
+ rq->cpu_locality[other_cpu] = 1;
|
|
+ rq->siblings_idle = siblings_cpu_idle;
|
|
+ smt_threads = true;
|
|
+ }
|
|
+#endif
|
|
+ }
|
|
+ for_each_possible_cpu(cpu) {
|
|
+ int total_cpus = 1, locality;
|
|
+
|
|
+ rq = cpu_rq(cpu);
|
|
+ for (locality = 1; locality <= 4; locality++) {
|
|
+ for_each_possible_cpu(other_cpu) {
|
|
+ if (rq->cpu_locality[other_cpu] == locality)
|
|
+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+ if (smt_threads) {
|
|
+ check_siblings = &check_smt_siblings;
|
|
+ wake_siblings = &wake_smt_siblings;
|
|
+ smt_schedule = &smt_should_schedule;
|
|
+ }
|
|
+#endif
|
|
+ unlock_all_rqs();
|
|
+ local_irq_enable();
|
|
+ mutex_unlock(&sched_domains_mutex);
|
|
+
|
|
+ for_each_online_cpu(cpu) {
|
|
+ rq = cpu_rq(cpu);
|
|
+
|
|
+ for_each_online_cpu(other_cpu) {
|
|
+ if (other_cpu <= cpu)
|
|
+ continue;
|
|
+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ sched_smp_initialized = true;
|
|
+}
|
|
+#else
|
|
+void __init sched_init_smp(void)
|
|
+{
|
|
+ sched_smp_initialized = true;
|
|
+}
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+int in_sched_functions(unsigned long addr)
|
|
+{
|
|
+ return in_lock_functions(addr) ||
|
|
+ (addr >= (unsigned long)__sched_text_start
|
|
+ && addr < (unsigned long)__sched_text_end);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_CGROUP_SCHED
|
|
+/* task group related information */
|
|
+struct task_group {
|
|
+ struct cgroup_subsys_state css;
|
|
+
|
|
+ struct rcu_head rcu;
|
|
+ struct list_head list;
|
|
+
|
|
+ struct task_group *parent;
|
|
+ struct list_head siblings;
|
|
+ struct list_head children;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Default task group.
|
|
+ * Every task in system belongs to this group at bootup.
|
|
+ */
|
|
+struct task_group root_task_group;
|
|
+LIST_HEAD(task_groups);
|
|
+
|
|
+/* Cacheline aligned slab cache for task_group */
|
|
+static struct kmem_cache *task_group_cache __read_mostly;
|
|
+#endif /* CONFIG_CGROUP_SCHED */
|
|
+
|
|
+#define WAIT_TABLE_BITS 8
|
|
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
|
|
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
|
|
+
|
|
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
|
|
+{
|
|
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
|
|
+ unsigned long val = (unsigned long)word << shift | bit;
|
|
+
|
|
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
|
|
+}
|
|
+EXPORT_SYMBOL(bit_waitqueue);
|
|
+
|
|
+void __init sched_init(void)
|
|
+{
|
|
+#ifdef CONFIG_SMP
|
|
+ int cpu_ids;
|
|
+#endif
|
|
+ int i;
|
|
+ struct rq *rq;
|
|
+
|
|
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
|
|
+ init_waitqueue_head(bit_wait_table + i);
|
|
+
|
|
+ prio_ratios[0] = 128;
|
|
+ for (i = 1 ; i < NICE_WIDTH ; i++)
|
|
+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
|
|
+
|
|
+ atomic_set(&grq.nr_running, 0);
|
|
+ atomic_set(&grq.nr_uninterruptible, 0);
|
|
+ atomic64_set(&grq.nr_switches, 0);
|
|
+ skiplist_node_init(&init_task.node);
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ init_defrootdomain();
|
|
+ cpumask_clear(&grq.cpu_idle_map);
|
|
+#else
|
|
+ uprq = &per_cpu(runqueues, 0);
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_CGROUP_SCHED
|
|
+ task_group_cache = KMEM_CACHE(task_group, 0);
|
|
+
|
|
+ list_add(&root_task_group.list, &task_groups);
|
|
+ INIT_LIST_HEAD(&root_task_group.children);
|
|
+ INIT_LIST_HEAD(&root_task_group.siblings);
|
|
+#endif /* CONFIG_CGROUP_SCHED */
|
|
+ for_each_possible_cpu(i) {
|
|
+ rq = cpu_rq(i);
|
|
+ skiplist_init(&rq->node);
|
|
+ rq->sl = new_skiplist(&rq->node);
|
|
+ raw_spin_lock_init(&rq->lock);
|
|
+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
|
|
+ rq->last_jiffy = jiffies;
|
|
+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
|
|
+ rq->iowait_ns = rq->idle_ns = 0;
|
|
+ rq->dither = 0;
|
|
+ set_rq_task(rq, &init_task);
|
|
+ rq->iso_ticks = 0;
|
|
+ rq->iso_refractory = false;
|
|
+#ifdef CONFIG_SMP
|
|
+ rq->sd = NULL;
|
|
+ rq->rd = NULL;
|
|
+ rq->online = false;
|
|
+ rq->cpu = i;
|
|
+ rq_attach_root(rq, &def_root_domain);
|
|
+#endif
|
|
+ init_rq_hrexpiry(rq);
|
|
+ atomic_set(&rq->nr_iowait, 0);
|
|
+ }
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ cpu_ids = i;
|
|
+ /*
|
|
+ * Set the base locality for cpu cache distance calculation to
|
|
+ * "distant" (3). Make sure the distance from a CPU to itself is 0.
|
|
+ */
|
|
+ for_each_possible_cpu(i) {
|
|
+ int j;
|
|
+
|
|
+ rq = cpu_rq(i);
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ rq->siblings_idle = sole_cpu_idle;
|
|
+#endif
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+ rq->cache_idle = sole_cpu_idle;
|
|
+#endif
|
|
+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
|
|
+ for_each_possible_cpu(j) {
|
|
+ if (i == j)
|
|
+ rq->cpu_locality[j] = 0;
|
|
+ else
|
|
+ rq->cpu_locality[j] = 4;
|
|
+ }
|
|
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
|
|
+ rq->rq_order[0] = rq;
|
|
+ for (j = 1; j < cpu_ids; j++)
|
|
+ rq->rq_order[j] = cpu_rq(j);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * The boot idle thread does lazy MMU switching as well:
|
|
+ */
|
|
+ atomic_inc(&init_mm.mm_count);
|
|
+ enter_lazy_tlb(&init_mm, current);
|
|
+
|
|
+ /*
|
|
+ * Make us the idle thread. Technically, schedule() should not be
|
|
+ * called from this thread, however somewhere below it might be,
|
|
+ * but because we are the idle thread, we just pick up running again
|
|
+ * when this runqueue becomes "idle".
|
|
+ */
|
|
+ init_idle(current, smp_processor_id());
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
|
|
+ /* May be allocated at isolcpus cmdline parse time */
|
|
+ if (cpu_isolated_map == NULL)
|
|
+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
|
|
+ idle_thread_set_boot_cpu();
|
|
+#endif /* SMP */
|
|
+
|
|
+ init_schedstats();
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
|
+static inline int preempt_count_equals(int preempt_offset)
|
|
+{
|
|
+ int nested = preempt_count() + rcu_preempt_depth();
|
|
+
|
|
+ return (nested == preempt_offset);
|
|
+}
|
|
+
|
|
+void __might_sleep(const char *file, int line, int preempt_offset)
|
|
+{
|
|
+ /*
|
|
+ * Blocking primitives will set (and therefore destroy) current->state,
|
|
+ * since we will exit with TASK_RUNNING make sure we enter with it,
|
|
+ * otherwise we will destroy state.
|
|
+ */
|
|
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
|
|
+ "do not call blocking ops when !TASK_RUNNING; "
|
|
+ "state=%lx set at [<%p>] %pS\n",
|
|
+ current->state,
|
|
+ (void *)current->task_state_change,
|
|
+ (void *)current->task_state_change);
|
|
+
|
|
+ ___might_sleep(file, line, preempt_offset);
|
|
+}
|
|
+EXPORT_SYMBOL(__might_sleep);
|
|
+
|
|
+void ___might_sleep(const char *file, int line, int preempt_offset)
|
|
+{
|
|
+ static unsigned long prev_jiffy; /* ratelimiting */
|
|
+ unsigned long preempt_disable_ip;
|
|
+
|
|
+ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
|
|
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
|
|
+ !is_idle_task(current)) ||
|
|
+ system_state != SYSTEM_RUNNING || oops_in_progress)
|
|
+ return;
|
|
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
|
|
+ return;
|
|
+ prev_jiffy = jiffies;
|
|
+
|
|
+ /* Save this before calling printk(), since that will clobber it */
|
|
+ preempt_disable_ip = get_preempt_disable_ip(current);
|
|
+
|
|
+ printk(KERN_ERR
|
|
+ "BUG: sleeping function called from invalid context at %s:%d\n",
|
|
+ file, line);
|
|
+ printk(KERN_ERR
|
|
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
|
|
+ in_atomic(), irqs_disabled(),
|
|
+ current->pid, current->comm);
|
|
+
|
|
+ if (task_stack_end_corrupted(current))
|
|
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
|
|
+
|
|
+ debug_show_held_locks(current);
|
|
+ if (irqs_disabled())
|
|
+ print_irqtrace_events(current);
|
|
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
|
|
+ && !preempt_count_equals(preempt_offset)) {
|
|
+ pr_err("Preemption disabled at:");
|
|
+ print_ip_sym(preempt_disable_ip);
|
|
+ pr_cont("\n");
|
|
+ }
|
|
+ dump_stack();
|
|
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
|
|
+}
|
|
+EXPORT_SYMBOL(___might_sleep);
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_MAGIC_SYSRQ
|
|
+static inline void normalise_rt_tasks(void)
|
|
+{
|
|
+ struct task_struct *g, *p;
|
|
+ unsigned long flags;
|
|
+ struct rq *rq;
|
|
+
|
|
+ read_lock(&tasklist_lock);
|
|
+ for_each_process_thread(g, p) {
|
|
+ /*
|
|
+ * Only normalize user tasks:
|
|
+ */
|
|
+ if (p->flags & PF_KTHREAD)
|
|
+ continue;
|
|
+
|
|
+ if (!rt_task(p) && !iso_task(p))
|
|
+ continue;
|
|
+
|
|
+ rq = task_rq_lock(p, &flags);
|
|
+ __setscheduler(p, rq, SCHED_NORMAL, 0, false);
|
|
+ task_rq_unlock(rq, p, &flags);
|
|
+ }
|
|
+ read_unlock(&tasklist_lock);
|
|
+}
|
|
+
|
|
+void normalize_rt_tasks(void)
|
|
+{
|
|
+ normalise_rt_tasks();
|
|
+}
|
|
+#endif /* CONFIG_MAGIC_SYSRQ */
|
|
+
|
|
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
|
|
+/*
|
|
+ * These functions are only useful for the IA64 MCA handling, or kdb.
|
|
+ *
|
|
+ * They can only be called when the whole system has been
|
|
+ * stopped - every CPU needs to be quiescent, and no scheduling
|
|
+ * activity can take place. Using them for anything else would
|
|
+ * be a serious bug, and as a result, they aren't even visible
|
|
+ * under any other configuration.
|
|
+ */
|
|
+
|
|
+/**
|
|
+ * curr_task - return the current task for a given cpu.
|
|
+ * @cpu: the processor in question.
|
|
+ *
|
|
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
|
|
+ *
|
|
+ * Return: The current task for @cpu.
|
|
+ */
|
|
+struct task_struct *curr_task(int cpu)
|
|
+{
|
|
+ return cpu_curr(cpu);
|
|
+}
|
|
+
|
|
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
|
|
+
|
|
+#ifdef CONFIG_IA64
|
|
+/**
|
|
+ * set_curr_task - set the current task for a given cpu.
|
|
+ * @cpu: the processor in question.
|
|
+ * @p: the task pointer to set.
|
|
+ *
|
|
+ * Description: This function must only be used when non-maskable interrupts
|
|
+ * are serviced on a separate stack. It allows the architecture to switch the
|
|
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPUs synchronised, and interrupts disabled, and
+ * the caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
|
|
+ * re-starting the system.
|
|
+ *
|
|
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
|
|
+ */
|
|
+void ia64_set_curr_task(int cpu, struct task_struct *p)
|
|
+{
|
|
+ cpu_curr(cpu) = p;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+void init_idle_bootup_task(struct task_struct *idle)
|
|
+{}
|
|
+
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
|
|
+{}
|
|
+
|
|
+void proc_sched_set_task(struct task_struct *p)
|
|
+{}
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+#define SCHED_LOAD_SHIFT (10)
|
|
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
|
|
+
|
|
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
|
|
+{
|
|
+ return SCHED_LOAD_SCALE;
|
|
+}
|
|
+
|
|
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
|
|
+{
|
|
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
|
|
+ unsigned long smt_gain = sd->smt_gain;
|
|
+
|
|
+ smt_gain /= weight;
|
|
+
|
|
+ return smt_gain;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_CGROUP_SCHED
|
|
+static void sched_free_group(struct task_group *tg)
|
|
+{
|
|
+ kmem_cache_free(task_group_cache, tg);
|
|
+}
|
|
+
|
|
+/* allocate runqueue etc for a new task group */
|
|
+struct task_group *sched_create_group(struct task_group *parent)
|
|
+{
|
|
+ struct task_group *tg;
|
|
+
|
|
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
|
|
+ if (!tg)
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+
|
|
+ return tg;
|
|
+}
|
|
+
|
|
+void sched_online_group(struct task_group *tg, struct task_group *parent)
|
|
+{
|
|
+}
|
|
+
|
|
+/* rcu callback to free various structures associated with a task group */
|
|
+static void sched_free_group_rcu(struct rcu_head *rhp)
|
|
+{
|
|
+ /* now it should be safe to free those cfs_rqs */
|
|
+ sched_free_group(container_of(rhp, struct task_group, rcu));
|
|
+}
|
|
+
|
|
+void sched_destroy_group(struct task_group *tg)
|
|
+{
|
|
+ /* wait for possible concurrent references to cfs_rqs complete */
|
|
+ call_rcu(&tg->rcu, sched_free_group_rcu);
|
|
+}
|
|
+
|
|
+void sched_offline_group(struct task_group *tg)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
|
|
+{
|
|
+ return css ? container_of(css, struct task_group, css) : NULL;
|
|
+}
|
|
+
|
|
+static struct cgroup_subsys_state *
|
|
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
+{
|
|
+ struct task_group *parent = css_tg(parent_css);
|
|
+ struct task_group *tg;
|
|
+
|
|
+ if (!parent) {
|
|
+ /* This is early initialization for the top cgroup */
|
|
+ return &root_task_group.css;
|
|
+ }
|
|
+
|
|
+ tg = sched_create_group(parent);
|
|
+ if (IS_ERR(tg))
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+ return &tg->css;
|
|
+}
|
|
+
|
|
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
|
|
+{
|
|
+ struct task_group *tg = css_tg(css);
|
|
+
|
|
+ sched_offline_group(tg);
|
|
+}
|
|
+
|
|
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
|
|
+{
|
|
+ struct task_group *tg = css_tg(css);
|
|
+
|
|
+ /*
|
|
+ * Relies on the RCU grace period between css_released() and this.
|
|
+ */
|
|
+ sched_free_group(tg);
|
|
+}
|
|
+
|
|
+static void cpu_cgroup_fork(struct task_struct *task)
|
|
+{
|
|
+}
|
|
+
|
|
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
|
|
+{
|
|
+}
|
|
+
|
|
+static struct cftype cpu_files[] = {
|
|
+ { } /* terminate */
|
|
+};
|
|
+
|
|
+struct cgroup_subsys cpu_cgrp_subsys = {
|
|
+ .css_alloc = cpu_cgroup_css_alloc,
|
|
+ .css_released = cpu_cgroup_css_released,
|
|
+ .css_free = cpu_cgroup_css_free,
|
|
+ .fork = cpu_cgroup_fork,
|
|
+ .can_attach = cpu_cgroup_can_attach,
|
|
+ .attach = cpu_cgroup_attach,
|
|
+ .legacy_cftypes = cpu_files,
|
|
+ .early_init = true,
|
|
+};
|
|
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
new file mode 100644
index 0000000..04ce652
--- /dev/null
+++ b/kernel/sched/MuQSS.h
@@ -0,0 +1,348 @@
+#include <linux/sched.h>
|
|
+#include <linux/cpuidle.h>
|
|
+#include <linux/interrupt.h>
|
|
+#include <linux/skip_list.h>
|
|
+#include <linux/stop_machine.h>
|
|
+#include <linux/u64_stats_sync.h>
|
|
+#include "cpuacct.h"
|
|
+
|
|
+#ifndef MUQSS_SCHED_H
|
|
+#define MUQSS_SCHED_H
|
|
+
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
|
|
+#else
|
|
+#define SCHED_WARN_ON(x) ((void)(x))
|
|
+#endif
|
|
+
|
|
+/* task_struct::on_rq states: */
|
|
+#define TASK_ON_RQ_QUEUED 1
|
|
+#define TASK_ON_RQ_MIGRATING 2
|
|
+
|
|
+/*
|
|
+ * This is the main, per-CPU runqueue data structure.
|
|
+ * This data should only be modified by the local cpu.
|
|
+ */
|
|
+struct rq {
|
|
+ struct task_struct *curr, *idle, *stop;
|
|
+ struct mm_struct *prev_mm;
|
|
+
|
|
+ raw_spinlock_t lock;
|
|
+
|
|
+ /* Stored data about rq->curr to work outside rq lock */
|
|
+ u64 rq_deadline;
|
|
+ int rq_prio;
|
|
+
|
|
+ /* Best queued id for use outside lock */
|
|
+ u64 best_key;
|
|
+
|
|
+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
|
|
+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
|
|
+ u64 niffies; /* Last time this RQ updated rq clock */
|
|
+ u64 last_niffy; /* Last niffies as updated by local clock */
|
|
+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
|
|
+
|
|
+ u64 load_update; /* When we last updated load */
|
|
+ unsigned long load_avg; /* Rolling load average */
|
|
+#ifdef CONFIG_SMT_NICE
|
|
+ struct mm_struct *rq_mm;
|
|
+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */
|
|
+#endif
|
|
+ /* Accurate timekeeping data */
|
|
+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
|
|
+ iowait_ns, idle_ns;
|
|
+ atomic_t nr_iowait;
|
|
+
|
|
+ skiplist_node node;
|
|
+ skiplist *sl;
|
|
+#ifdef CONFIG_SMP
|
|
+ struct task_struct *preempt; /* Preempt triggered on this task */
|
|
+
|
|
+ int cpu; /* cpu of this runqueue */
|
|
+ bool online;
|
|
+
|
|
+ struct root_domain *rd;
|
|
+ struct sched_domain *sd;
|
|
+ int *cpu_locality; /* CPU relative cache distance */
|
|
+ struct rq **rq_order; /* RQs ordered by relative cache distance */
|
|
+
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ cpumask_t thread_mask;
|
|
+ bool (*siblings_idle)(struct rq *rq);
|
|
+ /* See if all smt siblings are idle */
|
|
+#endif /* CONFIG_SCHED_SMT */
|
|
+#ifdef CONFIG_SCHED_MC
|
|
+ cpumask_t core_mask;
|
|
+ bool (*cache_idle)(struct rq *rq);
|
|
+ /* See if all cache siblings are idle */
|
|
+#endif /* CONFIG_SCHED_MC */
|
|
+#endif /* CONFIG_SMP */
|
|
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
|
+ u64 prev_irq_time;
|
|
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
|
|
+#ifdef CONFIG_PARAVIRT
|
|
+ u64 prev_steal_time;
|
|
+#endif /* CONFIG_PARAVIRT */
|
|
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
|
|
+ u64 prev_steal_time_rq;
|
|
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
|
|
+
|
|
+ u64 clock, old_clock, last_tick;
|
|
+ u64 clock_task;
|
|
+ int dither;
|
|
+
|
|
+ int iso_ticks;
|
|
+ bool iso_refractory;
|
|
+
|
|
+#ifdef CONFIG_HIGH_RES_TIMERS
|
|
+ struct hrtimer hrexpiry_timer;
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_SCHEDSTATS
|
|
+
|
|
+ /* latency stats */
|
|
+ struct sched_info rq_sched_info;
|
|
+ unsigned long long rq_cpu_time;
|
|
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
|
|
+
|
|
+ /* sys_sched_yield() stats */
|
|
+ unsigned int yld_count;
|
|
+
|
|
+ /* schedule() stats */
|
|
+ unsigned int sched_switch;
|
|
+ unsigned int sched_count;
|
|
+ unsigned int sched_goidle;
|
|
+
|
|
+ /* try_to_wake_up() stats */
|
|
+ unsigned int ttwu_count;
|
|
+ unsigned int ttwu_local;
|
|
+#endif /* CONFIG_SCHEDSTATS */
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ struct llist_head wake_list;
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_CPU_IDLE
|
|
+ /* Must be inspected within a rcu lock section */
|
|
+ struct cpuidle_state *idle_state;
|
|
+#endif
|
|
+};
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+struct rq *cpu_rq(int cpu);
|
|
+#endif
|
|
+
|
|
+#ifndef CONFIG_SMP
|
|
+extern struct rq *uprq;
|
|
+#define cpu_rq(cpu) (uprq)
|
|
+#define this_rq() (uprq)
|
|
+#define raw_rq() (uprq)
|
|
+#define task_rq(p) (uprq)
|
|
+#define cpu_curr(cpu) ((uprq)->curr)
|
|
+#else /* CONFIG_SMP */
|
|
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
|
+#define this_rq() this_cpu_ptr(&runqueues)
|
|
+#define raw_rq() raw_cpu_ptr(&runqueues)
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED 0x20
+#else
+#define ENQUEUE_MIGRATED 0x00
+#endif
+
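Editor's note: the SAVE/RESTORE pairing above deliberately reuses the same bit values (DEQUEUE_SAVE == ENQUEUE_RESTORE, DEQUEUE_MOVE == ENQUEUE_MOVE), so a caller can build one mask and hand it to both halves of the pair. A minimal sketch of such a caller follows; it is an illustration only, and dequeue_task(), enqueue_task() and task_on_rq_queued() stand in for whatever helpers the scheduler flavour actually provides.

/* Hedged sketch, not part of this patch: change a task attribute while
 * preserving its queued state.  dequeue_task()/enqueue_task()/
 * task_on_rq_queued() are assumed helpers with the usual shapes. */
static void change_prio_sketch(struct rq *rq, struct task_struct *p, int prio)
{
	const int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
	bool queued = task_on_rq_queued(p);

	if (queued)
		dequeue_task(rq, p, queue_flags);
	p->prio = prio;
	if (queued)
		enqueue_task(rq, p, queue_flags);	/* SAVE/MOVE == RESTORE/MOVE */
}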
|
|
+static inline u64 __rq_clock_broken(struct rq *rq)
|
|
+{
|
|
+ return READ_ONCE(rq->clock);
|
|
+}
|
|
+
|
|
+static inline u64 rq_clock(struct rq *rq)
|
|
+{
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+ return rq->clock;
|
|
+}
|
|
+
|
|
+static inline u64 rq_clock_task(struct rq *rq)
|
|
+{
|
|
+ lockdep_assert_held(&rq->lock);
|
|
+ return rq->clock_task;
|
|
+}
|
|
+
|
|
+extern struct mutex sched_domains_mutex;
|
|
+extern struct static_key_false sched_schedstats;
|
|
+
|
|
+#define rcu_dereference_check_sched_domain(p) \
|
|
+ rcu_dereference_check((p), \
|
|
+ lockdep_is_held(&sched_domains_mutex))
|
|
+
|
|
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
|
|
+
|
|
+/*
|
|
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
|
|
+ * See detach_destroy_domains: synchronize_sched for details.
|
|
+ *
|
|
+ * The domain tree of any CPU may only be accessed from within
|
|
+ * preempt-disabled sections.
|
|
+ */
|
|
+#define for_each_domain(cpu, __sd) \
|
|
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
|
|
+
|
|
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
|
|
+void register_sched_domain_sysctl(void);
|
|
+void unregister_sched_domain_sysctl(void);
|
|
+#else
|
|
+static inline void register_sched_domain_sysctl(void)
|
|
+{
|
|
+}
|
|
+static inline void unregister_sched_domain_sysctl(void)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+extern void sched_ttwu_pending(void);
|
|
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
|
|
+#else
|
|
+static inline void sched_ttwu_pending(void) { }
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_CPU_IDLE
|
|
+static inline void idle_set_state(struct rq *rq,
|
|
+ struct cpuidle_state *idle_state)
|
|
+{
|
|
+ rq->idle_state = idle_state;
|
|
+}
|
|
+
|
|
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
|
|
+{
|
|
+ SCHED_WARN_ON(!rcu_read_lock_held());
|
|
+ return rq->idle_state;
|
|
+}
|
|
+#else
|
|
+static inline void idle_set_state(struct rq *rq,
|
|
+ struct cpuidle_state *idle_state)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
|
|
+{
|
|
+ return NULL;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 hardirq_time;
+ u64 softirq_time;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+
+static inline u64 irq_time_read(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->softirq_time + irqtime->hardirq_time;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total;
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
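Editor's note: irq_time_read() above is the lockless reader half of a u64_stats_sync pair; the accounting path must bracket its additions with the matching writer primitives so 32-bit readers never observe a torn sum. A hedged sketch of that writer side (not part of this hunk; mainline's irqtime_account_irq() does essentially this):

/* Hedged sketch only: the writer that pairs with irq_time_read(). */
static inline void irqtime_add_sketch(struct irqtime *irqtime, u64 delta,
				      bool hardirq)
{
	u64_stats_update_begin(&irqtime->sync);
	if (hardirq)
		irqtime->hardirq_time += delta;
	else
		irqtime->softirq_time += delta;
	u64_stats_update_end(&irqtime->sync);
}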
|
|
+#ifdef CONFIG_CPU_FREQ
|
|
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
|
|
+
|
|
+static inline void cpufreq_trigger(u64 time, unsigned int flags)
|
|
+{
|
|
+ struct update_util_data *data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
|
|
+
|
|
+ if (data)
|
|
+ data->func(data, time, flags);
|
|
+}
|
|
+#else
|
|
+static inline void cpufreq_trigger(u64 time, unsigned int flag)
|
|
+{
|
|
+}
|
|
+#endif /* CONFIG_CPU_FREQ */
|
|
+
|
|
+#ifdef arch_scale_freq_capacity
|
|
+#ifndef arch_scale_freq_invariant
|
|
+#define arch_scale_freq_invariant() (true)
|
|
+#endif
|
|
+#else /* arch_scale_freq_capacity */
|
|
+#define arch_scale_freq_invariant() (false)
|
|
+#endif
|
|
+
|
|
+/*
+ * This should only be called when current == rq->idle. Dodgy workaround for
+ * when softirqs are pending and we are in the idle loop. Setting need_resched
+ * on current will kick us out of the idle loop and the softirqs will be
+ * serviced on our next pass through schedule().
+ */
+static inline bool softirq_pending(int cpu)
+{
+ if (likely(!local_softirq_pending()))
+ return false;
+ set_tsk_need_resched(current);
+ return true;
+}
+
|
|
+#ifdef CONFIG_64BIT
|
|
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
|
|
+{
|
|
+ return tsk_seruntime(t);
|
|
+}
|
|
+#else
|
|
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags);
|
|
+void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags);
|
|
+
|
|
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ u64 ns;
|
|
+ struct rq *rq;
|
|
+
|
|
+ rq = task_rq_lock(t, &flags);
|
|
+ ns = tsk_seruntime(t);
|
|
+ task_rq_unlock(rq, t, &flags);
|
|
+
|
|
+ return ns;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#endif /* MUQSS_SCHED_H */
|
|
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
|
|
index dbc5144..22003e8 100644
|
|
--- a/kernel/sched/cpufreq.c
|
|
+++ b/kernel/sched/cpufreq.c
|
|
@@ -9,7 +9,11 @@
|
|
* published by the Free Software Foundation.
|
|
*/
|
|
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#include "MuQSS.h"
|
|
+#else
|
|
#include "sched.h"
|
|
+#endif
|
|
|
|
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
|
|
|
|
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
|
|
index 69e0689..c798cb4 100644
|
|
--- a/kernel/sched/cpufreq_schedutil.c
|
|
+++ b/kernel/sched/cpufreq_schedutil.c
|
|
@@ -15,7 +15,11 @@
|
|
#include <linux/slab.h>
|
|
#include <trace/events/power.h>
|
|
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#include "MuQSS.h"
|
|
+#else
|
|
#include "sched.h"
|
|
+#endif
|
|
|
|
struct sugov_tunables {
|
|
struct gov_attr_set attr_set;
|
|
@@ -146,6 +150,17 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
|
|
return cpufreq_driver_resolve_freq(policy, freq);
|
|
}
|
|
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+static void sugov_get_util(unsigned long *util, unsigned long *max)
|
|
+{
|
|
+ struct rq *rq = this_rq();
|
|
+
|
|
+ *util = rq->load_avg;
|
|
+ if (*util > SCHED_CAPACITY_SCALE)
|
|
+ *util = SCHED_CAPACITY_SCALE;
|
|
+ *max = SCHED_CAPACITY_SCALE;
|
|
+}
|
|
+#else /* CONFIG_SCHED_MUQSS */
|
|
static void sugov_get_util(unsigned long *util, unsigned long *max)
|
|
{
|
|
struct rq *rq = this_rq();
|
|
@@ -156,6 +171,7 @@ static void sugov_get_util(unsigned long *util, unsigned long *max)
|
|
*util = min(rq->cfs.avg.util_avg, cfs_max);
|
|
*max = cfs_max;
|
|
}
|
|
+#endif /* CONFIG_SCHED_MUQSS */
|
|
|
|
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
|
|
unsigned int flags)
|
|
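Editor's note on the MuQSS variant of sugov_get_util() above: the governor feeds the (util, max) pair into get_next_freq(), which in this file scales the policy's maximum frequency by roughly 1.25 * util / max before resolving it to a real OPP with cpufreq_driver_resolve_freq(). A hedged, self-contained illustration of that mapping (the factor follows the 4.9 schedutil code; this is not part of the patch):

/* Illustration only: how a schedutil-style governor turns (util, max) into a
 * target frequency.  Under MuQSS, util is rq->load_avg clamped to
 * SCHED_CAPACITY_SCALE and max is SCHED_CAPACITY_SCALE. */
static unsigned int next_freq_sketch(unsigned int max_freq,
				     unsigned long util, unsigned long max)
{
	unsigned int freq = max_freq + (max_freq >> 2);	/* ~1.25 * max_freq */

	return (unsigned int)((u64)freq * util / max);
}

For example, util = 512 with max = 1024 requests about 62% of the maximum frequency, which the driver then snaps to the nearest supported operating point.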
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
|
|
index 5ebee31..cd26436 100644
|
|
--- a/kernel/sched/cputime.c
|
|
+++ b/kernel/sched/cputime.c
|
|
@@ -4,7 +4,12 @@
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/static_key.h>
|
|
#include <linux/context_tracking.h>
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#include "MuQSS.h"
|
|
+#include "stats.h"
|
|
+#else
|
|
#include "sched.h"
|
|
+#endif
|
|
#ifdef CONFIG_PARAVIRT
|
|
#include <asm/paravirt.h>
|
|
#endif
|
|
@@ -298,26 +303,6 @@ static inline cputime_t account_other_time(cputime_t max)
|
|
return accounted;
|
|
}
|
|
|
|
-#ifdef CONFIG_64BIT
|
|
-static inline u64 read_sum_exec_runtime(struct task_struct *t)
|
|
-{
|
|
- return t->se.sum_exec_runtime;
|
|
-}
|
|
-#else
|
|
-static u64 read_sum_exec_runtime(struct task_struct *t)
|
|
-{
|
|
- u64 ns;
|
|
- struct rq_flags rf;
|
|
- struct rq *rq;
|
|
-
|
|
- rq = task_rq_lock(t, &rf);
|
|
- ns = t->se.sum_exec_runtime;
|
|
- task_rq_unlock(rq, t, &rf);
|
|
-
|
|
- return ns;
|
|
-}
|
|
-#endif
|
|
-
|
|
/*
|
|
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
|
|
* tasks (sum on group iteration) belonging to @tsk's group.
|
|
@@ -694,7 +679,7 @@ out:
|
|
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
|
|
{
|
|
struct task_cputime cputime = {
|
|
- .sum_exec_runtime = p->se.sum_exec_runtime,
|
|
+ .sum_exec_runtime = tsk_seruntime(p),
|
|
};
|
|
|
|
task_cputime(p, &cputime.utime, &cputime.stime);
|
|
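Editor's note: the hunks above swap direct reads of p->se.sum_exec_runtime for tsk_seruntime(p). The macro itself is defined elsewhere in this patch; the sketch below only illustrates the shape such a wrapper takes, and the MuQSS field name is an assumption, not quoted from the patch:

/* Assumed shape of the wrapper used above; illustration only.  MuQSS keeps
 * accumulated runtime in its own task field, mainline keeps it in the
 * scheduling entity, and callers no longer need to care which. */
#ifdef CONFIG_SCHED_MUQSS
#define tsk_seruntime(t)	((t)->sched_time)
#else
#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
#endif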
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
|
|
index 1d8718d..7ec1b39 100644
|
|
--- a/kernel/sched/idle.c
|
|
+++ b/kernel/sched/idle.c
|
|
@@ -14,7 +14,11 @@
|
|
|
|
#include <trace/events/power.h>
|
|
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+#include "MuQSS.h"
|
|
+#else
|
|
#include "sched.h"
|
|
+#endif
|
|
|
|
/* Linker adds these: start and end of __cpuidle functions */
|
|
extern char __cpuidle_text_start[], __cpuidle_text_end[];
|
|
@@ -207,6 +211,8 @@ static void cpu_idle_loop(void)
|
|
int cpu = smp_processor_id();
|
|
|
|
while (1) {
|
|
+ bool pending = false;
|
|
+
|
|
/*
|
|
* If the arch has a polling bit, we maintain an invariant:
|
|
*
|
|
@@ -218,7 +224,10 @@ static void cpu_idle_loop(void)
|
|
|
|
__current_set_polling();
|
|
quiet_vmstat();
|
|
- tick_nohz_idle_enter();
|
|
+ if (unlikely(softirq_pending(cpu)))
|
|
+ pending = true;
|
|
+ else
|
|
+ tick_nohz_idle_enter();
|
|
|
|
while (!need_resched()) {
|
|
check_pgt_cache();
|
|
@@ -258,7 +267,8 @@ static void cpu_idle_loop(void)
|
|
* not have had an IPI to fold the state for us.
|
|
*/
|
|
preempt_set_need_resched();
|
|
- tick_nohz_idle_exit();
|
|
+ if (!pending)
|
|
+ tick_nohz_idle_exit();
|
|
__current_clr_polling();
|
|
|
|
/*
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 055f935..662f62b 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -1815,3 +1815,28 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
|
|
#else /* arch_scale_freq_capacity */
|
|
#define arch_scale_freq_invariant() (false)
|
|
#endif
|
|
+
|
|
+static inline bool softirq_pending(int cpu)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_64BIT
|
|
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
|
|
+{
|
|
+ return t->se.sum_exec_runtime;
|
|
+}
|
|
+#else
|
|
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
|
|
+{
|
|
+ u64 ns;
|
|
+ struct rq_flags rf;
|
|
+ struct rq *rq;
|
|
+
|
|
+ rq = task_rq_lock(t, &rf);
|
|
+ ns = t->se.sum_exec_runtime;
|
|
+ task_rq_unlock(rq, t, &rf);
|
|
+
|
|
+ return ns;
|
|
+}
|
|
+#endif
|
|
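Editor's note: the !CONFIG_64BIT variant of read_sum_exec_runtime() above takes the task's runqueue lock because a 64-bit counter cannot be loaded atomically on 32-bit; an unlocked reader racing with the owning CPU could combine the low and high halves from two different updates. A standalone illustration of that torn-read hazard (plain C, not kernel code):

#include <stdint.h>

/* Illustration only: a 64-bit counter read as two 32-bit halves can be
 * observed "torn" if the writer carries from lo into hi between the loads. */
struct split_counter {
	volatile uint32_t lo;
	volatile uint32_t hi;
};

static uint64_t torn_read(const struct split_counter *c)
{
	uint32_t lo = c->lo;	/* the writer may update between these loads */
	uint32_t hi = c->hi;

	return ((uint64_t)hi << 32) | lo;
}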
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
|
|
index 87e2c9f..ba7b137 100644
|
|
--- a/kernel/sched/stats.c
|
|
+++ b/kernel/sched/stats.c
|
|
@@ -4,7 +4,11 @@
|
|
#include <linux/seq_file.h>
|
|
#include <linux/proc_fs.h>
|
|
|
|
+#ifndef CONFIG_SCHED_MUQSS
|
|
#include "sched.h"
|
|
+#else
|
|
+#include "MuQSS.h"
|
|
+#endif
|
|
|
|
/*
|
|
* bump this up when changing the output format or the meaning of an existing
|
|
diff --git a/kernel/skip_list.c b/kernel/skip_list.c
|
|
new file mode 100644
|
|
index 0000000..d525080
|
|
--- /dev/null
|
|
+++ b/kernel/skip_list.c
|
|
@@ -0,0 +1,148 @@
|
|
+/*
+ Copyright (C) 2011,2016 Con Kolivas.
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l, key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from l based on the
+actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node where the value is stored. The key here is chosen by the scheduler so as to
+sort tasks according to the priority list requirements and is no longer used
+by the scheduler after insertion. The scheduler lookup, however, occurs in
+O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
|
|
+#include <linux/slab.h>
|
|
+#include <linux/skip_list.h>
|
|
+
|
|
+#define MaxNumberOfLevels 8
|
|
+#define MaxLevel (MaxNumberOfLevels - 1)
|
|
+
|
|
+void skiplist_init(skiplist_node *slnode)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ slnode->key = 0xFFFFFFFFFFFFFFFF;
|
|
+ slnode->level = 0;
|
|
+ slnode->value = NULL;
|
|
+ for (i = 0; i < MaxNumberOfLevels; i++)
|
|
+ slnode->next[i] = slnode->prev[i] = slnode;
|
|
+}
|
|
+
|
|
+skiplist *new_skiplist(skiplist_node *slnode)
|
|
+{
|
|
+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
|
|
+
|
|
+ BUG_ON(!l);
|
|
+ l->header = slnode;
|
|
+ return l;
|
|
+}
|
|
+
|
|
+void free_skiplist(skiplist *l)
|
|
+{
|
|
+ skiplist_node *p, *q;
|
|
+
|
|
+ p = l->header;
|
|
+ do {
|
|
+ q = p->next[0];
|
|
+ p->next[0]->prev[0] = q->prev[0];
|
|
+ skiplist_node_init(p);
|
|
+ p = q;
|
|
+ } while (p != l->header);
|
|
+ kfree(l);
|
|
+}
|
|
+
|
|
+void skiplist_node_init(skiplist_node *node)
|
|
+{
|
|
+ memset(node, 0, sizeof(skiplist_node));
|
|
+}
|
|
+
|
|
+static inline unsigned int randomLevel(const long unsigned int randseed)
|
|
+{
|
|
+ return find_first_bit(&randseed, MaxLevel);
|
|
+}
|
|
+
|
|
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
|
|
+{
|
|
+ skiplist_node *update[MaxNumberOfLevels];
|
|
+ skiplist_node *p, *q;
|
|
+ int k = l->level;
|
|
+
|
|
+ p = l->header;
|
|
+ do {
|
|
+ while (q = p->next[k], q->key <= key)
|
|
+ p = q;
|
|
+ update[k] = p;
|
|
+ } while (--k >= 0);
|
|
+
|
|
+ ++l->entries;
|
|
+ k = randomLevel(randseed);
|
|
+ if (k > l->level) {
|
|
+ k = ++l->level;
|
|
+ update[k] = l->header;
|
|
+ }
|
|
+
|
|
+ node->level = k;
|
|
+ node->key = key;
|
|
+ node->value = value;
|
|
+ do {
|
|
+ p = update[k];
|
|
+ node->next[k] = p->next[k];
|
|
+ p->next[k] = node;
|
|
+ node->prev[k] = p;
|
|
+ node->next[k]->prev[k] = node;
|
|
+ } while (--k >= 0);
|
|
+}
|
|
+
|
|
+void skiplist_delete(skiplist *l, skiplist_node *node)
|
|
+{
|
|
+ int k, m = node->level;
|
|
+
|
|
+ for (k = 0; k <= m; k++) {
|
|
+ node->prev[k]->next[k] = node->next[k];
|
|
+ node->next[k]->prev[k] = node->prev[k];
|
|
+ }
|
|
+ skiplist_node_init(node);
|
|
+ if (m == l->level) {
|
|
+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
|
|
+ m--;
|
|
+ l->level = m;
|
|
+ }
|
|
+ l->entries--;
|
|
+}
|
|
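Editor's note: a hedged usage sketch of the API defined above, based only on the functions in this file. The embedding structure and field names are invented for illustration, valueType is assumed to be a pointer (the header's value is initialised to NULL above), and the random seed is whatever monotonic value the caller has handy (MuQSS uses its niffy clock):

/* Illustration only, not part of the patch. */
struct sketch_item {
	skiplist_node node;
	int payload;
};

static void skiplist_usage_sketch(struct sketch_item *item, u64 key, u32 seed)
{
	static skiplist_node head;
	static skiplist *sl;

	if (!sl) {
		skiplist_init(&head);
		sl = new_skiplist(&head);
	}

	skiplist_node_init(&item->node);
	skiplist_insert(sl, &item->node, key, item, seed);

	/* O(1) peek at the lowest key: the first node on level 0. */
	/* struct sketch_item *lowest = head.next[0]->value; */

	skiplist_delete(sl, &item->node);	/* O(levels), no search needed */
}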
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
|
|
index c1095cd..90e66b3 100644
|
|
--- a/kernel/sysctl.c
|
|
+++ b/kernel/sysctl.c
|
|
@@ -125,9 +125,17 @@ static int __maybe_unused one = 1;
|
|
static int __maybe_unused two = 2;
|
|
static int __maybe_unused four = 4;
|
|
static unsigned long one_ul = 1;
|
|
-static int one_hundred = 100;
|
|
-static int one_thousand = 1000;
|
|
-#ifdef CONFIG_PRINTK
|
|
+static int __read_mostly one_hundred = 100;
|
|
+static int __read_mostly one_thousand = 1000;
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+extern int rr_interval;
|
|
+extern int sched_interactive;
|
|
+extern int sched_iso_cpu;
|
|
+extern int sched_yield_type;
|
|
+#endif
|
|
+extern int hrtimer_granularity_us;
|
|
+extern int hrtimeout_min_us;
|
|
+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS)
|
|
static int ten_thousand = 10000;
|
|
#endif
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
@@ -264,7 +272,7 @@ static struct ctl_table sysctl_base_table[] = {
|
|
{ }
|
|
};
|
|
|
|
-#ifdef CONFIG_SCHED_DEBUG
|
|
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS)
|
|
static int min_sched_granularity_ns = 100000; /* 100 usecs */
|
|
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
|
|
static int min_wakeup_granularity_ns; /* 0 usecs */
|
|
@@ -281,6 +289,7 @@ static int max_extfrag_threshold = 1000;
|
|
#endif
|
|
|
|
static struct ctl_table kern_table[] = {
|
|
+#ifndef CONFIG_SCHED_MUQSS
|
|
{
|
|
.procname = "sched_child_runs_first",
|
|
.data = &sysctl_sched_child_runs_first,
|
|
@@ -449,6 +458,7 @@ static struct ctl_table kern_table[] = {
|
|
.extra1 = &one,
|
|
},
|
|
#endif
|
|
+#endif /* !CONFIG_SCHED_MUQSS */
|
|
#ifdef CONFIG_PROVE_LOCKING
|
|
{
|
|
.procname = "prove_locking",
|
|
@@ -1013,6 +1023,62 @@ static struct ctl_table kern_table[] = {
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
#endif
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+ {
|
|
+ .procname = "rr_interval",
|
|
+ .data = &rr_interval,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &one,
|
|
+ .extra2 = &one_thousand,
|
|
+ },
|
|
+ {
|
|
+ .procname = "interactive",
|
|
+ .data = &sched_interactive,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &zero,
|
|
+ .extra2 = &one,
|
|
+ },
|
|
+ {
|
|
+ .procname = "iso_cpu",
|
|
+ .data = &sched_iso_cpu,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &zero,
|
|
+ .extra2 = &one_hundred,
|
|
+ },
|
|
+ {
|
|
+ .procname = "yield_type",
|
|
+ .data = &sched_yield_type,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &zero,
|
|
+ .extra2 = &two,
|
|
+ },
|
|
+#endif
|
|
+ {
|
|
+ .procname = "hrtimer_granularity_us",
|
|
+ .data = &hrtimer_granularity_us,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &one,
|
|
+ .extra2 = &ten_thousand,
|
|
+ },
|
|
+ {
|
|
+ .procname = "hrtimeout_min_us",
|
|
+ .data = &hrtimeout_min_us,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &one,
|
|
+ .extra2 = &ten_thousand,
|
|
+ },
|
|
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
|
|
{
|
|
.procname = "spin_retry",
|
|
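Editor's note: all of the entries added above sit in the top-level kern_table, so they appear as files under /proc/sys/kernel/ and can be adjusted at runtime within the bounds given by extra1/extra2 (rr_interval 1..1000, interactive 0..1, iso_cpu 0..100, yield_type 0..2, the two hrtimer knobs 1..10000). A hedged userspace sketch:

/* Illustration only: raise the MuQSS round-robin interval (milliseconds). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/rr_interval", "w");

	if (!f)
		return 1;	/* not a MuQSS kernel, or insufficient privilege */
	fprintf(f, "%d\n", 8);	/* accepted range is 1..1000 */
	return fclose(f) ? 1 : 0;
}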
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
|
|
index 2c5bc77..2cc8271 100644
|
|
--- a/kernel/time/clockevents.c
|
|
+++ b/kernel/time/clockevents.c
|
|
@@ -198,8 +198,9 @@ int clockevents_tick_resume(struct clock_event_device *dev)
|
|
|
|
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
|
|
|
|
-/* Limit min_delta to a jiffie */
|
|
-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
|
|
+int __read_mostly hrtimer_granularity_us = 100;
|
|
+/* Limit min_delta to 100us */
|
|
+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC)
|
|
|
|
/**
|
|
* clockevents_increase_min_delta - raise minimum delta of a clock event device
|
|
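Editor's note, worked numbers for the change above: the old clamp was one jiffy, i.e. NSEC_PER_SEC / HZ, while the new clamp is hrtimer_granularity_us * NSEC_PER_USEC with a default of 100.

    old MIN_DELTA_LIMIT: 10,000,000 ns at HZ=100, 1,000,000 ns at HZ=1000
    new MIN_DELTA_LIMIT:    100,000 ns by default, tunable via kernel.hrtimer_granularity_us

So a misbehaving clock event device whose minimum delta has to be raised is now limited to 100 microseconds rather than a whole tick, which matters for the sub-tick timeouts the rest of this patch introduces.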
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
|
|
index bb5ec42..309494b 100644
|
|
--- a/kernel/time/hrtimer.c
|
|
+++ b/kernel/time/hrtimer.c
|
|
@@ -1788,3 +1788,117 @@ int __sched schedule_hrtimeout(ktime_t *expires,
|
|
return schedule_hrtimeout_range(expires, 0, mode);
|
|
}
|
|
EXPORT_SYMBOL_GPL(schedule_hrtimeout);
|
|
+
|
|
+/*
+ * As per schedule_hrtimeout but takes a millisecond value and returns how
+ * many milliseconds are left.
+ */
+signed long __sched schedule_msec_hrtimeout(signed long timeout)
|
|
+{
|
|
+ struct hrtimer_sleeper t;
|
|
+ int delta, jiffs;
|
|
+ ktime_t expires;
|
|
+
|
|
+ if (!timeout) {
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ jiffs = msecs_to_jiffies(timeout);
|
|
+ /*
|
|
+ * If regular timer resolution is adequate or hrtimer resolution is not
|
|
+ * (yet) better than Hz, as would occur during startup, use regular
|
|
+ * timers.
|
|
+ */
|
|
+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing)
|
|
+ return schedule_timeout(jiffs);
|
|
+
|
|
+ delta = (timeout % 1000) * NSEC_PER_MSEC;
|
|
+ expires = ktime_set(0, delta);
|
|
+
|
|
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
|
|
+
|
|
+ hrtimer_init_sleeper(&t, current);
|
|
+
|
|
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL);
|
|
+
|
|
+ if (likely(t.task))
|
|
+ schedule();
|
|
+
|
|
+ hrtimer_cancel(&t.timer);
|
|
+ destroy_hrtimer_on_stack(&t.timer);
|
|
+
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+
|
|
+ expires = hrtimer_expires_remaining(&t.timer);
|
|
+ timeout = ktime_to_ms(expires);
|
|
+ return timeout < 0 ? 0 : timeout;
|
|
+}
|
|
+
|
|
+EXPORT_SYMBOL(schedule_msec_hrtimeout);
|
|
+
|
|
+#define USECS_PER_SEC 1000000
|
|
+extern int hrtimer_granularity_us;
|
|
+
|
|
+static inline signed long schedule_usec_hrtimeout(signed long timeout)
|
|
+{
|
|
+ struct hrtimer_sleeper t;
|
|
+ ktime_t expires;
|
|
+ int delta;
|
|
+
|
|
+ if (!timeout) {
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ)
|
|
+ return schedule_timeout(usecs_to_jiffies(timeout));
|
|
+
|
|
+ if (timeout < hrtimer_granularity_us)
|
|
+ timeout = hrtimer_granularity_us;
|
|
+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC;
|
|
+ expires = ktime_set(0, delta);
|
|
+
|
|
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
|
|
+
|
|
+ hrtimer_init_sleeper(&t, current);
|
|
+
|
|
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL);
|
|
+
|
|
+ if (likely(t.task))
|
|
+ schedule();
|
|
+
|
|
+ hrtimer_cancel(&t.timer);
|
|
+ destroy_hrtimer_on_stack(&t.timer);
|
|
+
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+
|
|
+ expires = hrtimer_expires_remaining(&t.timer);
|
|
+ timeout = ktime_to_us(expires);
|
|
+ return timeout < 0 ? 0 : timeout;
|
|
+}
|
|
+
|
|
+int __read_mostly hrtimeout_min_us = 1000;
|
|
+
|
|
+signed long __sched schedule_min_hrtimeout(void)
|
|
+{
|
|
+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us));
|
|
+}
|
|
+
|
|
+EXPORT_SYMBOL(schedule_min_hrtimeout);
|
|
+
|
|
+signed long __sched schedule_msec_hrtimeout_interruptible(signed long timeout)
|
|
+{
|
|
+ __set_current_state(TASK_INTERRUPTIBLE);
|
|
+ return schedule_msec_hrtimeout(timeout);
|
|
+}
|
|
+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible);
|
|
+
|
|
+signed long __sched schedule_msec_hrtimeout_uninterruptible(signed long timeout)
|
|
+{
|
|
+ __set_current_state(TASK_UNINTERRUPTIBLE);
|
|
+ return schedule_msec_hrtimeout(timeout);
|
|
+}
|
|
+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible);
|
|
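Editor's note: the helpers exported above are drop-in replacements for the schedule_timeout(msecs_to_jiffies(...)) pattern, which is exactly how the driver and sound hunks later in this patch use them. A hedged driver-style sketch (hardware hooks are hypothetical):

/* Illustration only: poll hypothetical hardware, sleeping ~5 ms per try.
 * With HZ=100 a jiffy-based wait could stretch to several ticks; the hrtimer
 * helper keeps each sleep close to the requested value. */
static int sketch_wait_for_ready(void (*poke_hw)(void), int (*hw_ready)(void))
{
	int tries = 10;

	while (tries--) {
		poke_hw();
		if (hw_ready())
			return 0;
		schedule_msec_hrtimeout_uninterruptible(5);
	}
	return -ETIMEDOUT;
}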
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
|
|
index 39008d7..784f3a1 100644
|
|
--- a/kernel/time/posix-cpu-timers.c
|
|
+++ b/kernel/time/posix-cpu-timers.c
|
|
@@ -447,7 +447,7 @@ static void cleanup_timers(struct list_head *head)
|
|
*/
|
|
void posix_cpu_timers_exit(struct task_struct *tsk)
|
|
{
|
|
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
|
|
+ add_device_randomness((const void*) &tsk_seruntime(tsk),
|
|
sizeof(unsigned long long));
|
|
cleanup_timers(tsk->cpu_timers);
|
|
|
|
@@ -848,7 +848,7 @@ static void check_thread_timers(struct task_struct *tsk,
|
|
tsk_expires->virt_exp = expires_to_cputime(expires);
|
|
|
|
tsk_expires->sched_exp = check_timers_list(++timers, firing,
|
|
- tsk->se.sum_exec_runtime);
|
|
+ tsk_seruntime(tsk));
|
|
|
|
/*
|
|
* Check for the special case thread timers.
|
|
@@ -859,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk,
|
|
READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
|
|
|
|
if (hard != RLIM_INFINITY &&
|
|
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
|
|
+ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
|
|
/*
|
|
* At the hard limit, we just die.
|
|
* No need to calculate anything else now.
|
|
@@ -867,7 +867,7 @@ static void check_thread_timers(struct task_struct *tsk,
|
|
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
|
|
return;
|
|
}
|
|
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
|
|
+ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
|
|
/*
|
|
* At the soft limit, send a SIGXCPU every second.
|
|
*/
|
|
@@ -1115,7 +1115,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
|
|
struct task_cputime task_sample;
|
|
|
|
task_cputime(tsk, &task_sample.utime, &task_sample.stime);
|
|
- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
|
|
+ task_sample.sum_exec_runtime = tsk_seruntime(tsk);
|
|
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
|
|
return 1;
|
|
}
|
|
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
|
|
index c611c47..9d0d44b 100644
|
|
--- a/kernel/time/timer.c
|
|
+++ b/kernel/time/timer.c
|
|
@@ -42,6 +42,7 @@
|
|
#include <linux/sched/sysctl.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/compat.h>
|
|
+#include <linux/freezer.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
#include <asm/unistd.h>
|
|
@@ -1464,7 +1465,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
|
|
* Check, if the next hrtimer event is before the next timer wheel
|
|
* event:
|
|
*/
|
|
-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
|
|
+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires)
|
|
{
|
|
u64 nextevt = hrtimer_get_next_event();
|
|
|
|
@@ -1482,6 +1483,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
|
|
if (nextevt <= basem)
|
|
return basem;
|
|
|
|
+ if (nextevt < expires && nextevt - basem <= TICK_NSEC)
|
|
+ base->is_idle = false;
|
|
+
|
|
/*
|
|
* Round up to the next jiffie. High resolution timers are
|
|
* off, so the hrtimers are expired in the tick and we need to
|
|
@@ -1545,7 +1549,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
|
|
}
|
|
spin_unlock(&base->lock);
|
|
|
|
- return cmp_next_hrtimer_event(basem, expires);
|
|
+ return cmp_next_hrtimer_event(base, basem, expires);
|
|
}
|
|
|
|
/**
|
|
@@ -1756,6 +1760,19 @@ signed long __sched schedule_timeout(signed long timeout)
|
|
|
|
expire = timeout + jiffies;
|
|
|
|
+#ifdef CONFIG_HIGH_RES_TIMERS
|
|
+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
|
|
+ /*
+ * Special case 1 as being a request for the minimum timeout
+ * and use highres timers to time out after 1ms to work around
+ * the granularity of low Hz tick timers.
+ */
+ if (!schedule_min_hrtimeout())
|
|
+ return 0;
|
|
+ goto out_timeout;
|
|
+ }
|
|
+#endif
|
|
+
|
|
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
|
|
__mod_timer(&timer, expire, false);
|
|
schedule();
|
|
@@ -1763,10 +1780,10 @@ signed long __sched schedule_timeout(signed long timeout)
|
|
|
|
/* Remove the timer from the object tracker */
|
|
destroy_timer_on_stack(&timer);
|
|
-
|
|
+out_timeout:
|
|
timeout = expire - jiffies;
|
|
|
|
- out:
|
|
+out:
|
|
return timeout < 0 ? 0 : timeout;
|
|
}
|
|
EXPORT_SYMBOL(schedule_timeout);
|
|
@@ -1887,7 +1904,19 @@ void __init init_timers(void)
|
|
*/
|
|
void msleep(unsigned int msecs)
|
|
{
|
|
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
|
|
+ int jiffs = msecs_to_jiffies(msecs);
|
|
+ unsigned long timeout;
|
|
+
|
|
+ /*
|
|
+ * Use high resolution timers where the resolution of tick based
|
|
+ * timers is inadequate.
|
|
+ */
|
|
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
|
|
+ while (msecs)
|
|
+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
|
|
+ return;
|
|
+ }
|
|
+ timeout = jiffs + 1;
|
|
|
|
while (timeout)
|
|
timeout = schedule_timeout_uninterruptible(timeout);
|
|
@@ -1901,7 +1930,15 @@ EXPORT_SYMBOL(msleep);
|
|
*/
|
|
unsigned long msleep_interruptible(unsigned int msecs)
|
|
{
|
|
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
|
|
+ int jiffs = msecs_to_jiffies(msecs);
|
|
+ unsigned long timeout;
|
|
+
|
|
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
|
|
+ while (msecs && !signal_pending(current))
|
|
+ msecs = schedule_msec_hrtimeout_interruptible(msecs);
|
|
+ return msecs;
|
|
+ }
|
|
+ timeout = jiffs + 1;
|
|
|
|
while (timeout && !signal_pending(current))
|
|
timeout = schedule_timeout_interruptible(timeout);
|
|
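Editor's note on the msleep()/msleep_interruptible() changes above: the hrtimer path only kicks in for requests under 5 jiffies, when high-resolution timers beat the tick and the system is not freezing. For scale, at HZ=100 a jiffy is 10 ms:

    msecs_to_jiffies(2) at HZ=100  -> 1 jiffy
    old msleep(2): (1 + 1) jiffies -> roughly 10-20 ms of actual sleep
    new msleep(2): hrtimer path    -> about 2 ms of actual sleep

Longer sleeps stay on the cheaper timer wheel, so the behaviour of existing callers with large delays is unchanged.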
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
|
|
index b0f86ea..69ee53a 100644
|
|
--- a/kernel/trace/trace_selftest.c
|
|
+++ b/kernel/trace/trace_selftest.c
|
|
@@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data)
|
|
{
|
|
/* Make this a -deadline thread */
|
|
static const struct sched_attr attr = {
|
|
+#ifdef CONFIG_SCHED_MUQSS
|
|
+ /* No deadline on MuQSS, use RR */
|
|
+ .sched_policy = SCHED_RR,
|
|
+#else
|
|
.sched_policy = SCHED_DEADLINE,
|
|
.sched_runtime = 100000ULL,
|
|
.sched_deadline = 10000000ULL,
|
|
.sched_period = 10000000ULL
|
|
+#endif
|
|
};
|
|
struct wakeup_test_data *x = data;
|
|
|
|
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
|
|
index 8fde443..3bfed5a 100644
|
|
--- a/mm/backing-dev.c
|
|
+++ b/mm/backing-dev.c
|
|
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
|
|
spin_lock_init(&wb->work_lock);
|
|
INIT_LIST_HEAD(&wb->work_list);
|
|
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
|
|
+ wb->dirty_sleep = jiffies;
|
|
|
|
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
|
|
if (!wb->congested)
|
|
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
|
|
index 439cc63..52e2f8e 100644
|
|
--- a/mm/page-writeback.c
|
|
+++ b/mm/page-writeback.c
|
|
@@ -1778,6 +1778,7 @@ pause:
|
|
pause,
|
|
start_time);
|
|
__set_current_state(TASK_KILLABLE);
|
|
+ wb->dirty_sleep = now;
|
|
io_schedule_timeout(pause);
|
|
|
|
current->dirty_paused_when = now + pause;
|
|
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
|
|
index 306b8f0..4f4ab3d 100644
|
|
--- a/net/core/pktgen.c
|
|
+++ b/net/core/pktgen.c
|
|
@@ -1992,7 +1992,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
|
|
mutex_unlock(&pktgen_thread_lock);
|
|
pr_debug("%s: waiting for %s to disappear....\n",
|
|
__func__, ifname);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
|
|
+ schedule_msec_hrtimeout_interruptible((msec_per_try));
|
|
mutex_lock(&pktgen_thread_lock);
|
|
|
|
if (++i >= max_tries) {
|
|
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
|
|
index cafea6d..d374514 100644
|
|
--- a/sound/pci/maestro3.c
|
|
+++ b/sound/pci/maestro3.c
|
|
@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
|
|
outw(0, io + GPIO_DATA);
|
|
outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION);
|
|
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1));
|
|
+ schedule_msec_hrtimeout_uninterruptible((delay1));
|
|
|
|
outw(GPO_PRIMARY_AC97, io + GPIO_DATA);
|
|
udelay(5);
|
|
@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
|
|
outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A);
|
|
outw(~0, io + GPIO_MASK);
|
|
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2));
|
|
+ schedule_msec_hrtimeout_uninterruptible((delay2));
|
|
|
|
if (! snd_m3_try_read_vendor(chip))
|
|
break;
|
|
diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c
|
|
index 0e41808..611cb9f 100644
|
|
--- a/sound/soc/codecs/rt5631.c
|
|
+++ b/sound/soc/codecs/rt5631.c
|
|
@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_codec *codec, int enable)
|
|
hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2);
|
|
snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
|
|
if (enable) {
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_uninterruptible((10));
|
|
/* config one-bit depop parameter */
|
|
rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x307f);
|
|
snd_soc_update_bits(codec, RT5631_HP_OUT_VOL,
|
|
@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_codec *codec, int enable)
|
|
hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2);
|
|
snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
|
|
if (enable) {
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_uninterruptible((10));
|
|
|
|
/* config depop sequence parameter */
|
|
rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x302f);
|
|
diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c
|
|
index 2efc5b4..3e3248c 100644
|
|
--- a/sound/soc/codecs/wm8350.c
|
|
+++ b/sound/soc/codecs/wm8350.c
|
|
@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work)
|
|
out2->ramp == WM8350_RAMP_UP) {
|
|
/* delay is longer over 0dB as increases are larger */
|
|
if (i >= WM8350_OUTn_0dB)
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(2));
|
|
else
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(1));
|
|
} else
|
|
udelay(50); /* doesn't matter if we delay longer */
|
|
@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
|
|
(platform->dis_out4 << 6));
|
|
|
|
/* wait for discharge */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(platform->
|
|
cap_discharge_msecs));
|
|
|
|
@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
|
|
WM8350_VBUFEN);
|
|
|
|
/* wait for vmid */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(platform->
|
|
vmid_charge_msecs));
|
|
|
|
@@ -1190,7 +1190,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
|
|
wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
|
|
|
|
/* wait */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(platform->
|
|
vmid_discharge_msecs));
|
|
|
|
@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
|
|
pm1 | WM8350_OUTPUT_DRAIN_EN);
|
|
|
|
/* wait */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies
|
|
+ schedule_msec_hrtimeout_interruptible(
|
|
(platform->drain_msecs));
|
|
|
|
pm1 &= ~WM8350_BIASEN;
|
|
diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c
|
|
index c77b49a..fc50456 100644
|
|
--- a/sound/soc/codecs/wm8900.c
|
|
+++ b/sound/soc/codecs/wm8900.c
|
|
@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_codec *codec,
|
|
/* Need to let things settle before stopping the clock
|
|
* to ensure that restart works, see "Stopping the
|
|
* master clock" in the datasheet. */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_interruptible((1));
|
|
snd_soc_write(codec, WM8900_REG_POWER2,
|
|
WM8900_REG_POWER2_SYSCLK_ENA);
|
|
break;
|
|
diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c
|
|
index e4301dd..bc7b153 100644
|
|
--- a/sound/soc/codecs/wm9713.c
|
|
+++ b/sound/soc/codecs/wm9713.c
|
|
@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w,
|
|
|
|
/* Gracefully shut down the voice interface. */
|
|
snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0200);
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(1));
|
|
+ schedule_msec_hrtimeout_interruptible((1));
|
|
snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0f00);
|
|
snd_soc_update_bits(codec, AC97_EXTENDED_MID, 0x1000, 0x1000);
|
|
|
|
@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_codec *codec,
|
|
wm9713->pll_in = freq_in;
|
|
|
|
/* wait 10ms AC97 link frames for the link to stabilise */
|
|
- schedule_timeout_interruptible(msecs_to_jiffies(10));
|
|
+ schedule_msec_hrtimeout_interruptible((10));
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
|
|
index 3bbe32e..be285ac 100644
|
|
--- a/sound/soc/soc-dapm.c
|
|
+++ b/sound/soc/soc-dapm.c
|
|
@@ -134,7 +134,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm)
|
|
static void pop_wait(u32 pop_time)
|
|
{
|
|
if (pop_time)
|
|
- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time));
|
|
+ schedule_msec_hrtimeout_uninterruptible((pop_time));
|
|
}
|
|
|
|
static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...)
|
|
diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c
|
|
index fab53f5..fda1ab5 100644
|
|
--- a/sound/usb/line6/pcm.c
|
|
+++ b/sound/usb/line6/pcm.c
|
|
@@ -131,7 +131,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm,
|
|
if (!alive)
|
|
break;
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
- schedule_timeout(1);
|
|
+ schedule_min_hrtimeout();
|
|
} while (--timeout > 0);
|
|
if (alive)
|
|
dev_err(line6pcm->line6->ifcdev,
|
|
--
|
|
2.9.3
|
|
|