Files
fedora-kernel-pf/pf-kernel-4.9.9.patch
2017-02-14 23:41:12 -05:00

26048 lines
793 KiB
Diff

From 3600b89ef36136c3c0874ce7c07f904df2e3d7bb Mon Sep 17 00:00:00 2001
From: Fedora Kernel Team <kernel-team@fedoraproject.org>
Date: Tue, 14 Feb 2017 23:39:31 -0500
Subject: [PATCH] pf-kernel 4.9.9
---
Documentation/block/00-INDEX | 2 +
Documentation/block/bfq-iosched.txt | 530 ++
Documentation/block/queue-sysfs.txt | 13 +
Documentation/scheduler/sched-BFS.txt | 351 +
Documentation/scheduler/sched-MuQSS.txt | 345 +
Documentation/sysctl/kernel.txt | 37 +
Makefile | 4 +-
arch/powerpc/platforms/cell/spufs/sched.c | 5 -
arch/x86/Kconfig | 30 +-
arch/x86/Kconfig.cpu | 224 +-
arch/x86/Makefile | 33 +-
arch/x86/Makefile_32.cpu | 23 +-
arch/x86/include/asm/module.h | 38 +
block/Kconfig | 24 +
block/Kconfig.iosched | 30 +
block/Makefile | 4 +-
block/bfq-cgroup.c | 1213 ++++
block/bfq-ioc.c | 36 +
block/bfq-iosched.c | 5318 +++++++++++++++
block/bfq-sched.c | 1933 ++++++
block/bfq.h | 933 +++
block/blk-core.c | 22 +-
block/blk-mq-sysfs.c | 47 +
block/blk-mq.c | 41 +-
block/blk-mq.h | 3 +
block/blk-settings.c | 16 +
block/blk-stat.c | 234 +
block/blk-stat.h | 37 +
block/blk-sysfs.c | 160 +
block/blk-wbt.c | 704 ++
block/blk-wbt.h | 166 +
block/cfq-iosched.c | 14 +
drivers/block/swim.c | 6 +-
drivers/bluetooth/hci_qca.c | 2 +-
drivers/char/ipmi/ipmi_msghandler.c | 2 +-
drivers/char/ipmi/ipmi_ssif.c | 2 +-
drivers/char/snsc.c | 4 +-
drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 +-
drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +-
drivers/hwmon/fam15h_power.c | 2 +-
drivers/iio/light/tsl2563.c | 6 +-
drivers/media/i2c/msp3400-driver.c | 4 +-
drivers/media/pci/cx18/cx18-gpio.c | 4 +-
drivers/media/pci/ivtv/ivtv-gpio.c | 6 +-
drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +-
drivers/media/pci/ivtv/ivtv-streams.c | 2 +-
drivers/media/radio/radio-mr800.c | 2 +-
drivers/media/radio/radio-tea5777.c | 2 +-
drivers/media/radio/tea575x.c | 2 +-
drivers/mfd/ucb1x00-core.c | 2 +-
drivers/misc/panel.c | 2 +-
drivers/misc/sgi-xp/xpc_channel.c | 2 +-
drivers/net/caif/caif_hsi.c | 2 +-
drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +-
drivers/net/usb/lan78xx.c | 2 +-
drivers/net/usb/usbnet.c | 2 +-
drivers/net/wireless/ath/ath9k/ath9k.h | 27 +-
drivers/net/wireless/ath/ath9k/channel.c | 2 -
drivers/net/wireless/ath/ath9k/debug.c | 14 +-
drivers/net/wireless/ath/ath9k/debug.h | 2 -
drivers/net/wireless/ath/ath9k/debug_sta.c | 4 +-
drivers/net/wireless/ath/ath9k/init.c | 2 +-
drivers/net/wireless/ath/ath9k/main.c | 9 +-
drivers/net/wireless/ath/ath9k/xmit.c | 338 +-
drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 +-
drivers/ntb/test/ntb_perf.c | 2 +-
drivers/parport/ieee1284.c | 2 +-
drivers/parport/ieee1284_ops.c | 2 +-
drivers/platform/x86/intel_ips.c | 8 +-
drivers/rtc/rtc-wm8350.c | 6 +-
drivers/scsi/fnic/fnic_scsi.c | 4 +-
drivers/scsi/lpfc/lpfc_scsi.c | 2 +-
drivers/scsi/scsi.c | 3 +
drivers/scsi/snic/snic_scsi.c | 2 +-
drivers/staging/comedi/drivers/ni_mio_common.c | 2 +-
drivers/staging/lustre/lnet/lnet/lib-eq.c | 2 +-
drivers/staging/rts5208/rtsx.c | 2 +-
drivers/staging/speakup/speakup_acntpc.c | 4 +-
drivers/staging/speakup/speakup_apollo.c | 2 +-
drivers/staging/speakup/speakup_decext.c | 2 +-
drivers/staging/speakup/speakup_decpc.c | 2 +-
drivers/staging/speakup/speakup_dectlk.c | 2 +-
drivers/staging/speakup/speakup_dtlk.c | 4 +-
drivers/staging/speakup/speakup_keypc.c | 4 +-
drivers/staging/speakup/synth.c | 2 +-
drivers/staging/unisys/visorbus/periodic_work.c | 204 +
drivers/staging/unisys/visornic/visornic_main.c | 6 +-
drivers/target/target_core_user.c | 2 +-
drivers/video/fbdev/omap/hwa742.c | 2 +-
drivers/video/fbdev/pxafb.c | 2 +-
fs/afs/vlocation.c | 2 +-
fs/btrfs/extent-tree.c | 2 +-
fs/btrfs/inode-map.c | 2 +-
fs/buffer.c | 2 +-
fs/f2fs/data.c | 2 +-
fs/f2fs/node.c | 2 +-
fs/gfs2/meta_io.c | 3 +-
fs/mpage.c | 2 +-
fs/proc/base.c | 2 +-
fs/xfs/xfs_aops.c | 7 +-
include/linux/backing-dev-defs.h | 2 +
include/linux/blk_types.h | 20 +-
include/linux/blkdev.h | 20 +-
include/linux/freezer.h | 1 +
include/linux/fs.h | 3 +
include/linux/init_task.h | 76 +-
include/linux/ioprio.h | 2 +
include/linux/sched.h | 97 +-
include/linux/sched/prio.h | 12 +
include/linux/skip_list.h | 33 +
include/linux/writeback.h | 10 +
include/trace/events/wbt.h | 153 +
include/uapi/linux/sched.h | 9 +-
init/Kconfig | 25 +-
init/main.c | 3 +-
kernel/Kconfig.hz | 24 +-
kernel/Kconfig.preempt | 7 +-
kernel/Makefile | 2 +-
kernel/delayacct.c | 2 +-
kernel/exit.c | 2 +-
kernel/irq/Kconfig | 14 +
kernel/irq/manage.c | 10 +
kernel/kthread.c | 30 +-
kernel/sched/Makefile | 13 +-
kernel/sched/MuQSS.c | 8033 +++++++++++++++++++++++
kernel/sched/MuQSS.h | 348 +
kernel/sched/cpufreq.c | 4 +
kernel/sched/cpufreq_schedutil.c | 16 +
kernel/sched/cputime.c | 27 +-
kernel/sched/idle.c | 14 +-
kernel/sched/sched.h | 25 +
kernel/sched/stats.c | 4 +
kernel/skip_list.c | 148 +
kernel/sysctl.c | 74 +-
kernel/time/clockevents.c | 5 +-
kernel/time/hrtimer.c | 114 +
kernel/time/posix-cpu-timers.c | 10 +-
kernel/time/timer.c | 49 +-
kernel/trace/trace_selftest.c | 5 +
mm/backing-dev.c | 1 +
mm/page-writeback.c | 1 +
net/core/pktgen.c | 2 +-
sound/pci/maestro3.c | 4 +-
sound/soc/codecs/rt5631.c | 4 +-
sound/soc/codecs/wm8350.c | 12 +-
sound/soc/codecs/wm8900.c | 2 +-
sound/soc/codecs/wm9713.c | 4 +-
sound/soc/soc-dapm.c | 2 +-
sound/usb/line6/pcm.c | 2 +-
149 files changed, 22356 insertions(+), 463 deletions(-)
create mode 100644 Documentation/block/bfq-iosched.txt
create mode 100644 Documentation/scheduler/sched-BFS.txt
create mode 100644 Documentation/scheduler/sched-MuQSS.txt
create mode 100644 block/bfq-cgroup.c
create mode 100644 block/bfq-ioc.c
create mode 100644 block/bfq-iosched.c
create mode 100644 block/bfq-sched.c
create mode 100644 block/bfq.h
create mode 100644 block/blk-stat.c
create mode 100644 block/blk-stat.h
create mode 100644 block/blk-wbt.c
create mode 100644 block/blk-wbt.h
create mode 100644 drivers/staging/unisys/visorbus/periodic_work.c
create mode 100644 include/linux/skip_list.h
create mode 100644 include/trace/events/wbt.h
create mode 100644 kernel/sched/MuQSS.c
create mode 100644 kernel/sched/MuQSS.h
create mode 100644 kernel/skip_list.c
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103a..8d55b4b 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
00-INDEX
- This file
+bfq-iosched.txt
+ - BFQ IO scheduler and its tunables
biodoc.txt
- Notes on the Generic Block Layer Rewrite in Linux 2.5
biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 0000000..13b5248
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,530 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+ low latency for time-sensitive applications, such as audio or video
+ players;
+- BFQ distributes bandwidth, and not just time, among processes or
+ groups (switching back to time distribution when needed to keep
+ throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O.
+
+The table of contents follow. Impatients can just jump to Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunable?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+ databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+do not suffer from almost any glitch due to the background workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+ rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+ packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+ (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+ (process) at a time, and implements this service model by
+ associating every queue with a budget, measured in number of
+ sectors.
+
+ - After a queue is granted access to the device, the budget of the
+ queue is decremented, on each request dispatch, by the size of the
+ request.
+
+ - The in-service queue is expired, i.e., its service is suspended,
+ only if one of the following events occurs: 1) the queue finishes
+ its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+ - The budget timeout prevents processes doing random I/O from
+ holding the device for too long and dramatically reducing
+ throughput.
+
+ - Actually, as in CFQ, a queue associated with a process issuing
+ sync requests may not be expired immediately when it empties. In
+ contrast, BFQ may idle the device for a short time interval,
+ giving the process the chance to go on being served if it issues
+ a new request in time. Device idling typically boosts the
+ throughput on rotational devices, if processes do synchronous
+ and sequential I/O. In addition, under BFQ, device idling is
+ also instrumental in guaranteeing the desired throughput
+ fraction to processes issuing sync requests (see the description
+ of the slice_idle tunable in this document, or [1, 2], for more
+ details).
+
+ - With respect to idling for service guarantees, if several
+ processes are competing for the device at the same time, but
+ all processes (and groups, after the following commit) have
+ the same weight, then BFQ guarantees the expected throughput
+ distribution without ever idling the device. Throughput is
+ thus as high as possible in this common scenario.
+
+ - If low-latency mode is enabled (default configuration), BFQ
+ executes some special heuristics to detect interactive and soft
+ real-time applications (e.g., video or audio players/streamers),
+ and to reduce their latency. The most important action taken to
+ achieve this goal is to give to the queues associated with these
+ applications more than their fair share of the device
+ throughput. For brevity, we call just "weight-raising" the whole
+ sets of actions taken by BFQ to privilege these queues. In
+ particular, BFQ provides a milder form of weight-raising for
+ interactive applications, and a stronger form for soft real-time
+ applications.
+
+ - BFQ automatically deactivates idling for queues born in a burst of
+ queue creations. In fact, these queues are usually associated with
+ the processes of applications and services that benefit mostly
+ from a high throughput. Examples are systemd during boot, or git
+ grep.
+
+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+ performing random I/O that becomes mostly sequential if
+ merged. Differently from CFQ, BFQ achieves this goal with a more
+ reactive mechanism, called Early Queue Merge (EQM). EQM is so
+ responsive in detecting interleaved I/O (cooperating processes),
+ that it enables BFQ to achieve a high throughput, by queue
+ merging, even for queues for which CFQ needs a different
+ mechanism, preemption, to get a high throughput. As such EQM is a
+ unified mechanism to achieve a high throughput with interleaved
+ I/O.
+
+ - Queues are scheduled according to a variant of WF2Q+, named
+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
+ also ready for hierarchical scheduling. However, for a cleaner
+ logical breakdown, the code that enables and completes
+ hierarchical support is provided in the next commit, which focuses
+ exactly on this feature.
+
+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+ perfectly fair, and smooth service. In particular, B-WF2Q+
+ guarantees that each queue receives a fraction of the device
+ throughput proportional to its weight, even if the throughput
+ fluctuates, and regardless of: the device parameters, the current
+ workload and the budgets assigned to the queue.
+
+ - The last, budget-independence, property (although probably
+ counterintuitive in the first place) is definitely beneficial, for
+ the following reasons:
+
+ - First, with any proportional-share scheduler, the maximum
+ deviation with respect to an ideal service is proportional to
+ the maximum budget (slice) assigned to queues. As a consequence,
+ BFQ can keep this deviation tight not only because of the
+ accurate service of B-WF2Q+, but also because BFQ *does not*
+ need to assign a larger budget to a queue to let the queue
+ receive a higher fraction of the device throughput.
+
+ - Second, BFQ is free to choose, for every process (queue), the
+ budget that best fits the needs of the process, or best
+ leverages the I/O pattern of the process. In particular, BFQ
+ updates queue budgets with a simple feedback-loop algorithm that
+ allows a high throughput to be achieved, while still providing
+ tight latency guarantees to time-sensitive applications. When
+ the in-service queue expires, this algorithm computes the next
+ budget of the queue so as to:
+
+ - Let large budgets be eventually assigned to the queues
+ associated with I/O-bound applications performing sequential
+ I/O: in fact, the longer these applications are served once
+ got access to the device, the higher the throughput is.
+
+ - Let small budgets be eventually assigned to the queues
+ associated with time-sensitive applications (which typically
+ perform sporadic and short I/O), because, the smaller the
+ budget assigned to a queue waiting for service is, the sooner
+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+ but all processes and groups have the same weight, then BFQ
+ guarantees the expected throughput distribution without ever idling
+ the device. It uses preemption instead. Throughput is then much
+ higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+ lower-priority queues are not served as long as there are
+ higher-priority queues. Among queues in the same class, the
+ bandwidth is distributed in proportion to the weight of each
+ queue. A very thin extra bandwidth is however guaranteed to
+ the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunable?
+==========================
+
+The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration.
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0. In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+ new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable. The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. In case to favor synchronous requests over asynchronous
+one, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ Scheduler", Proceedings of the First Workshop on Mobile System
+ Technologies (MST-2015), May 2015.
+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+ the 5th Annual International Systems and Storage Conference
+ (SYSTOR '12), June 2012.
+ Slightly extended version:
+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
+ results.pdf
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2a39040..2847219 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
command. A value of '0' means write-same is not supported by this
device.
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
+
Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
new file mode 100644
index 0000000..c028200
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -0,0 +1,351 @@
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and reasoning for
+it. The codebase when the first stable version was released was approximately
+9000 lines less code than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine that when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval is guaranteed to have a bound upper
+limit on how long a task will wait for, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. This value is relevant for the time taken to look up
+tasks during scheduling. This will increase if many tasks with CPU affinity set
+in their policy to limit which CPUs they're allowed to run on if they outnumber
+the number of CPUs. The +1 is because when rescheduling a task, the CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The one
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+ jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
+
+
+Scalability.
+
+The major limitations of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of cache locality beyond the last running CPU. The sched_domains
+information is used to determine the relative virtual "cache distance" that
+other CPUs have from the last CPU a task was running on. CPUs with shared
+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
+as cache local. CPUs without shared caches are treated as not cache local, and
+CPUs on different NUMA nodes are treated as very distant. This "relative cache
+distance" is used by modifying the virtual deadline value when doing lookups.
+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
+behind the doubling of deadlines is as follows. The real cost of migrating a
+task from one CPU to another is entirely dependant on the cache footprint of
+the task, how cache intensive the task is, how long it's been running on that
+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
+how layered the CPU cache is, how fast a context switch is... and so on. In
+other words, it's close to random in the real world where we do more than just
+one sole workload. The only thing we can be sure of is that it's not free. So
+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
+is more important than cache locality, and cache locality only plays a part
+after that. Doubling the effective deadline is based on the premise that the
+"cache local" CPUs will tend to work on the same tasks up to double the number
+of cache local CPUs, and once the workload is beyond that amount, it is likely
+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
+is a value I pulled out of my arse.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling business. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed, without needing to tweak any knobs, features
+or options. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+rr_interval
+
+There is only one "scheduler" tunable, the round robin interval. This can be
+accessed in
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6 on a
+uniprocessor machine, and automatically set to a progressively higher value on
+multiprocessor machines. The reasoning behind increasing the value on more CPUs
+is that the effective latency is decreased by virtue of there being more CPUs on
+BFS (for reasons explained above), and increasing the value allows for less
+cache contention and more throughput. Valid values are from 1 to 1000
+Decreasing the value will decrease latencies at the cost of decreasing
+throughput, while increasing it will improve throughput, but at the cost of
+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
+of the kernel configuration. Thus, the worst case latencies are usually slightly
+higher than this actual value. The default value of 6 is not an arbitrary one.
+It is based on the fact that humans can detect jitter at approximately 7ms, so
+aiming for much lower latencies is pointless under most circumstances. It is
+worth noting this fact when comparing the latency performance of BFS to other
+schedulers. Worst case latencies being higher than 7ms are far worse than
+average latencies not being in the microsecond range.
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo the override specified by the user
+on the command line of setting the policy to SCHED_ISO. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
+
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode or so on without any slowdown of other tasks. To avoid this
+policy from grabbing shared resources and holding them indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+ schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
+
+Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
new file mode 100644
index 0000000..bbd6980
--- /dev/null
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -0,0 +1,345 @@
+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
+
+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
+one 8 level skiplist per runqueue, and fine grained locking for much more
+scalability.
+
+
+Goals.
+
+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
+here on (pronounced mux) is to completely do away with the complex designs of
+the past for the cpu process scheduler and instead implement one that is very
+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
+interactivity and responsiveness without heuristics and tuning knobs that are
+difficult to understand, impossible to model and predict the effect of, and when
+tuned to one workload cause massive detriment to another, while still being
+scalable to many CPUs and processes.
+
+
+Design summary.
+
+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1)
+lookup, earliest effective virtual deadline first tickless design, loosely based
+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
+Each component shall be described in order to understand the significance of,
+and reasoning for it.
+
+
+Design reasoning.
+
+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
+need to scan the entire runqueue looking for the process with the earliest
+deadline and schedule that next, regardless of which CPU it originally came
+from. This made BFS deterministic with respect to latency and provided
+guaranteed latencies dependent on number of processes and CPUs. The single
+runqueue, however, meant that all CPUs would compete for the single lock
+protecting it, which would lead to increasing lock contention as the number of
+CPUs rose and appeared to limit scalability of common workloads beyond 16
+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
+increased overhead proportionate to the number of queued proecesses and led to
+cache thrashing while iterating over the linked list.
+
+MuQSS is an evolution of BFS, designed to maintain the same scheduling
+decision mechanism and be virtually deterministic without relying on the
+constrained design of the single runqueue by splitting out the single runqueue
+to be per-CPU and use skiplists instead of linked lists.
+
+The original reason for going back to a single runqueue design for BFS was that
+once multiple runqueues are introduced, per-CPU or otherwise, there will be
+complex interactions as each runqueue will be responsible for the scheduling
+latency and fairness of the tasks only on its own runqueue, and to achieve
+fairness and low latency across multiple CPUs, any advantage in throughput of
+having CPU local tasks causes other disadvantages. This is due to requiring a
+very complex balancing system to at best achieve some semblance of fairness
+across CPUs and can only maintain relatively low latency for tasks bound to the
+same CPUs, not across them. To increase said fairness and latency across CPUs,
+the advantage of local runqueue locking, which makes for better scalability, is
+lost due to having to grab multiple locks.
+
+MuQSS works around the problems inherent in multiple runqueue designs by
+making its skip lists priority ordered and through novel use of lockless
+examination of each other runqueue it can decide if it should take the earliest
+deadline task from another runqueue for latency reasons, or for CPU balancing
+reasons. It still does not have a balancing system, choosing to allow the
+next task scheduling decision and task wakeup CPU choice to allow balancing to
+happen by virtue of its choices.
+
+
+Design details.
+
+Custom skip list implementation:
+
+To avoid the overhead of building up and tearing down skip list structures,
+the variant used by MuQSS has a number of optimisations making it specific for
+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
+building up and tearing down structures dynamically. This makes each runqueue
+only scale O(log N) up to 256 tasks. However as there is one runqueue per CPU
+it means that it scales O(log N) up to 256 x number of logical CPUs which is
+far beyond the realistic task limits each CPU could handle. By being 8 levels
+it also makes the array exactly one cacheline in size. Additionally, each
+skip list node is bidirectional making insertion and removal amortised O(1),
+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
+first entry in each list at all times with MuQSS, so there is never a need to
+do a search and thus look up is always O(1).
+
+Task insertion:
+
+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
+a custom skip list as described above (based on the original design by William
+Pugh). Insertion is ordered in such a way that there is never a need to do a
+search by ordering tasks according to static priority primarily, and then
+virtual deadline at the time of insertion.
+
+Niffies:
+
+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
+of nanosecond resolution. Niffies are calculated per-runqueue from the high
+resolution TSC timers, and in order to maintain fairness are synchronised
+between CPUs whenever both runqueues are locked concurrently.
+
+Virtual deadline:
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in niffies by this equation:
+
+ niffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (niffies) is
+constantly moving.
+
+Task lookup:
+
+As tasks are already pre-ordered according to anticipated scheduling order in
+the skip lists, lookup for the next suitable task per-runqueue is always a
+matter of simply selecting the first task in the 0th level skip list entry.
+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
+novel examination of every other runqueue in cache locality order, choosing the
+best task across all runqueues. This provides near-determinism of how long any
+task across the entire system may wait before receiving CPU time. The other
+runqueues are first examine lockless and then trylocked to minimise the
+potential lock contention if they are likely to have a suitable better task.
+Each other runqueue lock is only held for as long as it takes to examine the
+entry for suitability. In "interactive" mode, the default setting, MuQSS will
+look for the best deadline task across all CPUs, while in !interactive mode,
+it will only select a better deadline task from another CPU if it is more
+heavily laden than the current one.
+
+Lookup is therefore O(k) where k is number of CPUs.
+
+
+Latency.
+
+Through the use of virtual deadlines to govern the scheduling order of normal
+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
+the rr_interval tunable which is set to 6ms by default. This means that the
+longest a CPU bound task will wait for more CPU is proportional to the number
+of running tasks and in the common case of 0-2 running tasks per CPU, will be
+under the 7ms threshold for human perception of jitter. Additionally, as newly
+woken tasks will have an early deadline from their previous runtime, the very
+tasks that are usually latency sensitive will have the shortest interval for
+activation, usually preempting any existing CPU bound tasks.
+
+Tickless expiry:
+
+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
+rate in Hz, instead depending entirely on the high resolution timers where
+possible for sub-millisecond accuracy on timeouts regarless of the underlying
+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
+such as 100 by default, benefiting from the improved throughput and lower
+power usage it provides. Another advantage of this approach is that in
+combination with the Full No HZ option, which disables ticks on running task
+CPUs instead of just idle CPUs, the tick can be disabled at all times
+regardless of how many tasks are running instead of being limited to just one
+running task. Note that this option is NOT recommended for regular desktop
+users.
+
+
+Scalability and balancing.
+
+Unlike traditional approaches where balancing is a combination of CPU selection
+at task wakeup and intermittent balancing based on a vast array of rules set
+according to architecture, busyness calculations and special case management,
+MuQSS indirectly balances on the fly at task wakeup and next task selection.
+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
+Additionally it selects any idle CPUs, if they are available, at any time over
+busy CPUs according to the following preference:
+
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+
+Mux is therefore SMT, MC and Numa aware without the need for extra
+intermittent balancing to maintain CPUs busy and make the most of cache
+coherency.
+
+
+Features
+
+As the initial prime target audience for MuQSS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
+does _not_ now feature is support for CGROUPS. The average user should neither
+need to know what these are, nor should they need to be using them to have good
+desktop behaviour. However since some applications refuse to work without
+cgroups, one can enable them with MuQSS as a stub and the filesystem will be
+created which will allow the applications to work.
+
+rr_interval:
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6. Valid values
+are from 1 to 1000 Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. It is based on the fact that humans can detect
+jitter at approximately 7ms, so aiming for much lower latencies is pointless
+under most circumstances. It is worth noting this fact when comparing the
+latency performance of MuQSS to other schedulers. Worst case latencies being
+higher than 7ms are far worse than average latencies not being in the
+microsecond range.
+
+interactive:
+
+ /proc/sys/kernel/interactive
+
+The value is a simple boolean of 1 for on and 0 for off and is set to on by
+default. Disabling this will disable the near-determinism of MuQSS when
+selecting the next task by not examining all CPUs for the earliest deadline
+task, or which CPU to wake to, instead prioritising CPU balancing for improved
+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
+instead of across the whole system.
+
+Isochronous scheduling:
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of CPU available per CPU, configurable as a percentage in
+the following "resource handling" tunable (as opposed to a scheduler tunable):
+
+iso_cpu:
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of MuQSS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+
+
+Idleprio scheduling:
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start a
+video encode or so on without any slowdown of other tasks. To avoid this policy
+from grabbing shared resources and holding them indefinitely, if it detects a
+state where the task is waiting on I/O, the machine is about to suspend to ram
+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
+superuser privileges since it is effectively a lower scheduling policy. Tasks
+can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+schedtool -D -e ./mprime
+
+Subtick accounting:
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
+tick frequency (HZ) is lowered. It is possible to create an application which
+uses almost 100% CPU, yet by being descheduled at the right time, records zero
+CPU usage. While the main problem with this is that there are possible security
+implications, it is also difficult to determine how much CPU a task really does
+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
+usage. Thus, the amount of CPU reported as being used by MuQSS will more
+accurately represent how much CPU the task itself is using (as is shown for
+example by the 'time' application), so the reported values may be quite
+different to other schedulers. When comparing throughput of MuQSS to other
+designs, it is important to compare the actual completed work in terms of total
+wall clock time taken and total work done, rather than the reported "cpu usage".
+
+Symmetric MultiThreading (SMT) aware nice:
+
+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
+logical CPU count rises by adding thread units to each CPU core, allowing more
+than one task to be run simultaneously on the same core, the disadvantage of it
+is that the CPU power is shared between the tasks, not summating to the power
+of two CPUs. The practical upshot of this is that two tasks running on
+separate threads of the same core run significantly slower than if they had one
+core each to run on. While smart CPU selection allows each task to have a core
+to itself whenever available (as is done on MuQSS), it cannot offset the
+slowdown that occurs when the cores are all loaded and only a thread is left.
+Most of the time this is harmless as the CPU is effectively overloaded at this
+point and the extra thread is of benefit. However when running a niced task in
+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets
+precisely the same amount of CPU power as the unniced one. MuQSS has an
+optional configuration feature known as SMT-NICE which selectively idles the
+secondary niced thread for a period proportional to the nice difference,
+allowing CPU distribution according to nice level to be maintained, at the
+expense of a small amount of extra overhead. If this is configured in on a
+machine without SMT threads, the overhead is minimal.
+
+
+Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffab8b5..2a82a37 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
- hung_task_timeout_secs
- hung_task_warnings
- kexec_load_disabled
+- iso_cpu
- kptr_restrict
- kstack_depth_to_print [ X86 only ]
- l2cr [ PPC only ]
@@ -73,6 +74,7 @@ show up in /proc/sys/kernel:
- randomize_va_space
- real-root-dev ==> Documentation/initrd.txt
- reboot-cmd [ SPARC only ]
+- rr_interval
- rtsig-max
- rtsig-nr
- sem
@@ -93,6 +95,7 @@ show up in /proc/sys/kernel:
- unknown_nmi_panic
- watchdog
- watchdog_thresh
+- yield_type
- version
==============================================================
@@ -402,6 +405,16 @@ kernel stack.
==============================================================
+iso_cpu: (MuQSS CPU scheduler only).
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
l2cr: (PPC only)
This flag controls the L2 cache of G3 processor boards. If
@@ -818,6 +831,20 @@ rebooting. ???
==============================================================
+rr_interval: (MuQSS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
rtsig-max & rtsig-nr:
The file rtsig-max can be used to tune the maximum number
@@ -1056,3 +1083,13 @@ The softlockup threshold is (2 * watchdog_thresh). Setting this
tunable to zero will disable lockup detection altogether.
==============================================================
+
+yield_type: (MuQSS CPU scheduler only)
+
+This determines what type of yield calls to sched_yield will perform.
+
+ 0: No yield.
+ 1: Yield only to better priority/deadline tasks. (default)
+ 2: Expire timeslice and recalculate deadline.
+
+==============================================================
diff --git a/Makefile b/Makefile
index 8585e4e..f6df014 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 9
-SUBLEVEL = 9
-EXTRAVERSION =
+SUBLEVEL = 0
+EXTRAVERSION = -pf6
NAME = Roaring Lionus
# *DOCUMENTATION*
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 460f5f3..eeb3e32 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;
/*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO 120
-
-/*
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
* tick for every 10 CPU scheduler ticks.
*/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d666ef8..cd1b67c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -926,10 +926,26 @@ config SCHED_SMT
depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
- when dealing with Intel Pentium 4 chips with HyperThreading at a
+ when dealing with Intel P4/Core 2 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
N here.
+config SMT_NICE
+ bool "SMT (Hyperthreading) aware nice priority and policy support"
+ depends on SCHED_MUQSS && SCHED_SMT
+ default y
+ ---help---
+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+ of the use of 'nice' levels and different scheduling policies
+ (e.g. realtime) due to sharing of CPU power between hyperthreads.
+ SMT nice support makes each logical CPU aware of what is running on
+ its hyperthread siblings, maintaining appropriate distribution of
+ CPU according to nice levels and scheduling policies at the expense
+ of slightly increased overhead.
+
+ If unsure say Y here.
+
+
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
@@ -1295,7 +1311,7 @@ config HIGHMEM64G
endchoice
choice
- prompt "Memory split" if EXPERT
+ prompt "Memory split"
default VMSPLIT_3G
depends on X86_32
---help---
@@ -1315,17 +1331,17 @@ choice
option alone!
config VMSPLIT_3G
- bool "3G/1G user/kernel split"
+ bool "Default 896MB lowmem (3G/1G user/kernel split)"
config VMSPLIT_3G_OPT
depends on !X86_PAE
- bool "3G/1G user/kernel split (for full 1G low memory)"
+ bool "1GB lowmem (3G/1G user/kernel split)"
config VMSPLIT_2G
- bool "2G/2G user/kernel split"
+ bool "2GB lowmem (2G/2G user/kernel split)"
config VMSPLIT_2G_OPT
depends on !X86_PAE
- bool "2G/2G user/kernel split (for full 2G low memory)"
+ bool "2GB lowmem (2G/2G user/kernel split)"
config VMSPLIT_1G
- bool "1G/3G user/kernel split"
+ bool "3GB lowmem (1G/3G user/kernel split)"
endchoice
config PAGE_OFFSET
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 3ba5ff2..aecb337 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -115,6 +115,7 @@ config MPENTIUMM
config MPENTIUM4
bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
depends on X86_32
+ select X86_P6_NOP
---help---
Select this for Intel Pentium 4 chips. This includes the
Pentium 4, Pentium D, P4-based Celeron and Xeon, and
@@ -147,9 +148,8 @@ config MPENTIUM4
-Paxville
-Dempsey
-
config MK6
- bool "K6/K6-II/K6-III"
+ bool "AMD K6/K6-II/K6-III"
depends on X86_32
---help---
Select this for an AMD K6-family processor. Enables use of
@@ -157,7 +157,7 @@ config MK6
flags to GCC.
config MK7
- bool "Athlon/Duron/K7"
+ bool "AMD Athlon/Duron/K7"
depends on X86_32
---help---
Select this for an AMD Athlon K7-family processor. Enables use of
@@ -165,12 +165,83 @@ config MK7
flags to GCC.
config MK8
- bool "Opteron/Athlon64/Hammer/K8"
+ bool "AMD Opteron/Athlon64/Hammer/K8"
---help---
Select this for an AMD Opteron or Athlon64 Hammer-family processor.
Enables use of some extended instructions, and passes appropriate
optimization flags to GCC.
+config MK8SSE3
+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+ ---help---
+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MK10
+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+ ---help---
+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MBARCELONA
+ bool "AMD Barcelona"
+ ---help---
+ Select this for AMD Family 10h Barcelona processors.
+
+ Enables -march=barcelona
+
+config MBOBCAT
+ bool "AMD Bobcat"
+ ---help---
+ Select this for AMD Family 14h Bobcat processors.
+
+ Enables -march=btver1
+
+config MJAGUAR
+ bool "AMD Jaguar"
+ ---help---
+ Select this for AMD Family 16h Jaguar processors.
+
+ Enables -march=btver2
+
+config MBULLDOZER
+ bool "AMD Bulldozer"
+ ---help---
+ Select this for AMD Family 15h Bulldozer processors.
+
+ Enables -march=bdver1
+
+config MPILEDRIVER
+ bool "AMD Piledriver"
+ ---help---
+ Select this for AMD Family 15h Piledriver processors.
+
+ Enables -march=bdver2
+
+config MSTEAMROLLER
+ bool "AMD Steamroller"
+ ---help---
+ Select this for AMD Family 15h Steamroller processors.
+
+ Enables -march=bdver3
+
+config MEXCAVATOR
+ bool "AMD Excavator"
+ ---help---
+ Select this for AMD Family 15h Excavator processors.
+
+ Enables -march=bdver4
+
+config MZEN
+ bool "AMD Zen"
+ ---help---
+ Select this for AMD Family 17h Zen processors.
+
+ Enables -march=znver1
+
config MCRUSOE
bool "Crusoe"
depends on X86_32
@@ -252,6 +323,7 @@ config MVIAC7
config MPSC
bool "Intel P4 / older Netburst based Xeon"
+ select X86_P6_NOP
depends on X86_64
---help---
Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
@@ -261,8 +333,19 @@ config MPSC
using the cpu family field
in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
+config MATOM
+ bool "Intel Atom"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for the Intel Atom platform. Intel Atom CPUs have an
+ in-order pipelining architecture and thus can benefit from
+ accordingly optimized code. Use a recent GCC with specific Atom
+ support in order to fully benefit from selecting this option.
+
config MCORE2
- bool "Core 2/newer Xeon"
+ bool "Intel Core 2"
+ select X86_P6_NOP
---help---
Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -270,14 +353,79 @@ config MCORE2
family in /proc/cpuinfo. Newer ones have 6 and older ones 15
(not a typo)
-config MATOM
- bool "Intel Atom"
+ Enables -march=core2
+
+config MNEHALEM
+ bool "Intel Nehalem"
+ select X86_P6_NOP
---help---
- Select this for the Intel Atom platform. Intel Atom CPUs have an
- in-order pipelining architecture and thus can benefit from
- accordingly optimized code. Use a recent GCC with specific Atom
- support in order to fully benefit from selecting this option.
+ Select this for 1st Gen Core processors in the Nehalem family.
+
+ Enables -march=nehalem
+
+config MWESTMERE
+ bool "Intel Westmere"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for the Intel Westmere formerly Nehalem-C family.
+
+ Enables -march=westmere
+
+config MSILVERMONT
+ bool "Intel Silvermont"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for the Intel Silvermont platform.
+
+ Enables -march=silvermont
+
+config MSANDYBRIDGE
+ bool "Intel Sandy Bridge"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+ Enables -march=sandybridge
+
+config MIVYBRIDGE
+ bool "Intel Ivy Bridge"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+ Enables -march=ivybridge
+
+config MHASWELL
+ bool "Intel Haswell"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for 4th Gen Core processors in the Haswell family.
+
+ Enables -march=haswell
+
+config MBROADWELL
+ bool "Intel Broadwell"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for 5th Gen Core processors in the Broadwell family.
+
+ Enables -march=broadwell
+
+config MSKYLAKE
+ bool "Intel Skylake"
+ select X86_P6_NOP
+ ---help---
+
+ Select this for 6th Gen Core processors in the Skylake family.
+
+ Enables -march=skylake
config GENERIC_CPU
bool "Generic-x86-64"
@@ -286,6 +434,19 @@ config GENERIC_CPU
Generic x86-64 CPU.
Run equally well on all x86-64 CPUs.
+config MNATIVE
+ bool "Native optimizations autodetected by GCC"
+ ---help---
+
+ GCC 4.2 and above support -march=native, which automatically detects
+ the optimum settings to use based on your processor. -march=native
+ also detects and applies additional settings beyond -march specific
+ to your CPU, (eg. -msse4). Unless you have a specific reason not to
+ (e.g. distcc cross-compiling), you should probably be using
+ -march=native rather than anything listed below.
+
+ Enables -march=native
+
endchoice
config X86_GENERIC
@@ -310,7 +471,7 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if MELAN || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
@@ -341,45 +502,46 @@ config X86_ALIGNMENT_16
config X86_INTEL_USERCOPY
def_bool y
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE
config X86_USE_PPRO_CHECKSUM
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE
config X86_USE_3DNOW
def_bool y
depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-#
-# P6_NOPs are a relatively minor optimization that require a family >=
-# 6 processor, except that it is broken on certain VIA chips.
-# Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). In addition, it looks like Virtual PC
-# does not understand them.
-#
-# As a result, disallow these if we're not compiling for X86_64 (these
-# NOPs do work on all x86-64 capable chips); the list of processors in
-# the right-hand clause are the cores that benefit from this optimization.
-#
config X86_P6_NOP
- def_bool y
- depends on X86_64
- depends on (MCORE2 || MPENTIUM4 || MPSC)
+ default n
+ bool "Support for P6_NOPs on Intel chips"
+ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE)
+ ---help---
+ P6_NOPs are a relatively minor optimization that require a family >=
+ 6 processor, except that it is broken on certain VIA chips.
+ Furthermore, AMD chips prefer a totally different sequence of NOPs
+ (which work on all CPUs). In addition, it looks like Virtual PC
+ does not understand them.
+
+ As a result, disallow these if we're not compiling for X86_64 (these
+ NOPs do work on all x86-64 capable chips); the list of processors in
+ the right-hand clause are the cores that benefit from this optimization.
+
+ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise.
config X86_TSC
def_bool y
- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64
config X86_CMPXCHG64
def_bool y
- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
+ depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
# this should be set for all -march=.. options where the compiler
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
config X86_MINIMUM_CPU_FAMILY
int
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d44933..344075c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -104,13 +104,40 @@ else
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
+ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8)
+ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
+ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
+ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
+ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
+ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
+ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
+ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3)
+ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4)
+ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1)
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
cflags-$(CONFIG_MCORE2) += \
- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
+ cflags-$(CONFIG_MNEHALEM) += \
+ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem))
+ cflags-$(CONFIG_MWESTMERE) += \
+ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere))
+ cflags-$(CONFIG_MSILVERMONT) += \
+ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont))
+ cflags-$(CONFIG_MSANDYBRIDGE) += \
+ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge))
+ cflags-$(CONFIG_MIVYBRIDGE) += \
+ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge))
+ cflags-$(CONFIG_MHASWELL) += \
+ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell))
+ cflags-$(CONFIG_MBROADWELL) += \
+ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell))
+ cflags-$(CONFIG_MSKYLAKE) += \
+ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake))
+ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \
+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
KBUILD_CFLAGS += $(cflags-y)
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 6647ed4..765a71e 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -23,7 +23,18 @@ cflags-$(CONFIG_MK6) += -march=k6
# Please note, that patches that add -march=athlon-xp and friends are pointless.
# They make zero difference whatsosever to performance at this time.
cflags-$(CONFIG_MK7) += -march=athlon
+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
+cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon)
+cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon)
+cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon)
+cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon)
+cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon)
+cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon)
+cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon)
+cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon)
+cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon)
+cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon)
cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
@@ -32,8 +43,16 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-f
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
cflags-$(CONFIG_MVIAC7) += -march=i686
cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem)
+cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere)
+cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont)
+cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge)
+cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge)
+cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell)
+cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell)
+cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake)
+cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \
+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
# AMD Elan support
cflags-$(CONFIG_MELAN) += -march=i486
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819..2543bde 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -15,6 +15,24 @@
#define MODULE_PROC_FAMILY "586MMX "
#elif defined CONFIG_MCORE2
#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE
+#define MODULE_PROC_FAMILY "NATIVE "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
#elif defined CONFIG_MATOM
#define MODULE_PROC_FAMILY "ATOM "
#elif defined CONFIG_M686
@@ -33,6 +51,26 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
diff --git a/block/Kconfig b/block/Kconfig
index 1d4d624..7c8523e 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -112,6 +112,30 @@ config BLK_CMDLINE_PARSER
See Documentation/block/cmdline-partition.txt for more information.
+config BLK_WBT
+ bool "Enable support for block device writeback throttling"
+ default n
+ ---help---
+ Enabling this option enables the block layer to throttle buffered
+ writeback from the VM, making it more smooth and having less
+ impact on foreground operations.
+
+config BLK_WBT_SQ
+ bool "Single queue writeback throttling"
+ default n
+ depends on BLK_WBT
+ ---help---
+ Enable writeback throttling by default on legacy single queue devices
+
+config BLK_WBT_MQ
+ bool "Multiqueue writeback throttling"
+ default y
+ depends on BLK_WBT
+ ---help---
+ Enable writeback throttling by default on multiqueue devices.
+ Multiqueue currently doesn't have support for IO scheduling,
+ enabling this option is recommended.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..f2cd945 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -39,6 +39,25 @@ config CFQ_GROUP_IOSCHED
---help---
Enable group IO scheduling in CFQ.
+config IOSCHED_BFQ
+ tristate "BFQ I/O scheduler"
+ default n
+ ---help---
+ The BFQ I/O scheduler distributes bandwidth among all
+ processes according to their weights, regardless of the
+ device parameters and with any workload. It also guarantees
+ a low latency to interactive and soft real-time applications.
+ Details in Documentation/block/bfq-iosched.txt
+
+config BFQ_GROUP_IOSCHED
+ bool "BFQ hierarchical scheduling support"
+ depends on IOSCHED_BFQ && BLK_CGROUP
+ default n
+ ---help---
+
+ Enable hierarchical scheduling in BFQ, using the blkio
+ (cgroups-v1) or io (cgroups-v2) controller.
+
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
@@ -52,6 +71,16 @@ choice
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
+ config DEFAULT_BFQ
+ bool "BFQ" if IOSCHED_BFQ=y
+ help
+ Selects BFQ as the default I/O scheduler which will be
+ used by default for all block devices.
+ The BFQ I/O scheduler aims at distributing the bandwidth
+ as desired, independently of the disk parameters and with
+ any workload. It also tries to guarantee low latency to
+ interactive and soft real-time applications.
+
config DEFAULT_NOOP
bool "No-op"
@@ -61,6 +90,7 @@ config DEFAULT_IOSCHED
string
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
+ default "bfq" if DEFAULT_BFQ
default "noop" if DEFAULT_NOOP
endmenu
diff --git a/block/Makefile b/block/Makefile
index 36acdd7..5709f59 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
@@ -18,8 +18,10 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
+obj-$(CONFIG_BLK_WBT) += blk-wbt.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000..106abff
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1213 @@
+/*
+ * BFQ: CGROUPS support.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+ BFQG_stats_waiting = 0,
+ BFQG_stats_idling,
+ BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name) \
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags |= (1 << BFQG_stats_##name); \
+} \
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BFQG_stats_##name); \
+} \
+static int bfqg_stats_##name(struct bfqg_stats *stats) \
+{ \
+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
+} \
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!bfqg_stats_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ blkg_stat_add(&stats->group_wait_time,
+ now - stats->start_group_wait_time);
+ bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+ struct bfq_group *curr_bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_waiting(stats))
+ return;
+ if (bfqg == curr_bfqg)
+ return;
+ stats->start_group_wait_time = sched_clock();
+ bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!bfqg_stats_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ blkg_stat_add(&stats->empty_time,
+ now - stats->start_empty_time);
+ bfqg_stats_clear_empty(stats);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+ blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (blkg_rwstat_total(&stats->queued))
+ return;
+
+ /*
+ * group is already marked empty. This can happen if bfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if (bfqg_stats_empty(stats))
+ return;
+
+ stats->start_empty_time = sched_clock();
+ bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_idling(stats)) {
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, stats->start_idle_time))
+ blkg_stat_add(&stats->idle_time,
+ now - stats->start_idle_time);
+ bfqg_stats_clear_idling(stats);
+ }
+}
+
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ stats->start_idle_time = sched_clock();
+ bfqg_stats_mark_idling(stats);
+}
+
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ blkg_stat_add(&stats->avg_queue_size_sum,
+ blkg_rwstat_total(&stats->queued));
+ blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfqg_stats_update_group_wait_time(stats);
+}
+
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+ return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+
+ return pd_to_bfqg(pd);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+ return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *group_entity = bfqq->entity.parent;
+
+ return group_entity ? container_of(group_entity, struct bfq_group,
+ entity) :
+ bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+ return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+ return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+ struct bfq_queue *bfqq,
+ int op, int op_flags)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
+ bfqg_stats_end_empty_time(&bfqg->stats);
+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
+ int op_flags)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op,
+ int op_flags)
+{
+ blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+ uint64_t start_time, uint64_t io_start_time, int op,
+ int op_flags)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, io_start_time))
+ blkg_rwstat_add(&stats->service_time, op, op_flags,
+ now - io_start_time);
+ if (time_after64(io_start_time, start_time))
+ blkg_rwstat_add(&stats->wait_time, op, op_flags,
+ io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_reset(&stats->merged);
+ blkg_rwstat_reset(&stats->service_time);
+ blkg_rwstat_reset(&stats->wait_time);
+ blkg_stat_reset(&stats->time);
+ blkg_stat_reset(&stats->avg_queue_size_sum);
+ blkg_stat_reset(&stats->avg_queue_size_samples);
+ blkg_stat_reset(&stats->dequeue);
+ blkg_stat_reset(&stats->group_wait_time);
+ blkg_stat_reset(&stats->idle_time);
+ blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+ if (!to || !from)
+ return;
+
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+ blkg_stat_add_aux(&from->time, &from->time);
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_add_aux(&to->avg_queue_size_samples,
+ &from->avg_queue_size_samples);
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+ struct bfq_group *parent;
+
+ if (!bfqg) /* root_group */
+ return;
+
+ parent = bfqg_parent(bfqg);
+
+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+ if (unlikely(!parent))
+ return;
+
+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+ struct bfq_group *bfqg)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ if (bfqq) {
+ bfqq->ioprio = bfqq->new_ioprio;
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ bfqg_get(bfqg);
+ }
+ entity->parent = bfqg->my_entity; /* NULL for root group */
+ entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
+ blkg_rwstat_init(&stats->service_time, gfp) ||
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
+ blkg_rwstat_init(&stats->queued, gfp) ||
+ blkg_stat_init(&stats->time, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ blkg_stat_init(&stats->dequeue, gfp) ||
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
+ blkg_stat_init(&stats->idle_time, gfp) ||
+ blkg_stat_init(&stats->empty_time, gfp)) {
+ bfqg_stats_exit(stats);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+ struct bfq_group_data *bgd;
+
+ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
+ if (!bgd)
+ return NULL;
+ return &bgd->pd;
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+ struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+ kfree(cpd_to_bfqgd(cpd));
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+ struct bfq_group *bfqg;
+
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+ if (!bfqg)
+ return NULL;
+
+ if (bfqg_stats_init(&bfqg->stats, gfp)) {
+ kfree(bfqg);
+ return NULL;
+ }
+
+ return &bfqg->pd;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+ struct blkcg_gq *blkg;
+ struct bfq_group *bfqg;
+ struct bfq_data *bfqd;
+ struct bfq_entity *entity;
+ struct bfq_group_data *d;
+
+ blkg = pd_to_blkg(pd);
+ BUG_ON(!blkg);
+ bfqg = blkg_to_bfqg(blkg);
+ bfqd = blkg->q->elevator->elevator_data;
+ entity = &bfqg->entity;
+ d = blkcg_to_bfqgd(blkg->blkcg);
+
+ entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+ entity->my_sched_data = &bfqg->sched_data;
+ bfqg->my_entity = entity; /*
+ * the root_group's will be set to NULL
+ * in bfq_init_queue()
+ */
+ bfqg->bfqd = bfqd;
+ bfqg->active_entities = 0;
+ bfqg->rq_pos_tree = RB_ROOT;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_exit(&bfqg->stats);
+ return kfree(bfqg);
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+ struct bfq_group *parent)
+{
+ struct bfq_entity *entity;
+
+ BUG_ON(!parent);
+ BUG_ON(!bfqg);
+ BUG_ON(bfqg == parent);
+
+ entity = &bfqg->entity;
+ entity->parent = parent->my_entity;
+ entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct blkcg_gq *blkg;
+
+ blkg = blkg_lookup(blkcg, bfqd->queue);
+ if (likely(blkg))
+ return blkg_to_bfqg(blkg);
+ return NULL;
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct bfq_group *bfqg, *parent;
+ struct bfq_entity *entity;
+
+ assert_spin_locked(bfqd->queue->queue_lock);
+
+ bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+ if (unlikely(!bfqg))
+ return NULL;
+
+ /*
+ * Update chain of bfq_groups as we might be handling a leaf group
+ * which, along with some of its relatives, has not been hooked yet
+ * to the private hierarchy of BFQ.
+ */
+ entity = &bfqg->entity;
+ for_each_entity(entity) {
+ bfqg = container_of(entity, struct bfq_group, entity);
+ BUG_ON(!bfqg);
+ if (bfqg != bfqd->root_group) {
+ parent = bfqg_parent(bfqg);
+ if (!parent)
+ parent = bfqd->root_group;
+ BUG_ON(!parent);
+ bfq_group_set_parent(bfqg, parent);
+ }
+ }
+
+ return bfqg;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason);
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one. Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
+ && entity->on_st &&
+ bfqq != bfqd->in_service_queue);
+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
+
+ /* If bfqq is empty, then bfq_bfqq_expire also invokes
+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+ * from data structures related to current group. Otherwise we
+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+ * we do below.
+ */
+ if (bfqq == bfqd->in_service_queue)
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQ_BFQQ_PREEMPTED);
+
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+ && &bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+
+ if (bfq_bfqq_busy(bfqq))
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ else if (entity->on_st) {
+ BUG_ON(&bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+ }
+ bfqg_put(bfqq_group(bfqq));
+
+ /*
+ * Here we use a reference to bfqg. We don't need a refcounter
+ * as the cgroup reference will not be dropped, so that its
+ * destroy() callback will not be invoked.
+ */
+ entity->parent = bfqg->my_entity;
+ entity->sched_data = &bfqg->sched_data;
+ bfqg_get(bfqg);
+
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+ if (bfq_bfqq_busy(bfqq)) {
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_activate_bfqq(bfqd, bfqq);
+ }
+
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ bfq_schedule_dispatch(bfqd);
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+ && &bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct blkcg *blkcg)
+{
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+ struct bfq_group *bfqg;
+ struct bfq_entity *entity;
+
+ lockdep_assert_held(bfqd->queue->queue_lock);
+
+ bfqg = bfq_find_set_group(bfqd, blkcg);
+
+ if (unlikely(!bfqg))
+ bfqg = bfqd->root_group;
+
+ if (async_bfqq) {
+ entity = &async_bfqq->entity;
+
+ if (entity->sched_data != &bfqg->sched_data) {
+ bic_set_bfqq(bic, NULL, 0);
+ bfq_log_bfqq(bfqd, async_bfqq,
+ "bic_change_group: %p %d",
+ async_bfqq,
+ async_bfqq->ref);
+ bfq_put_queue(async_bfqq);
+ }
+ }
+
+ if (sync_bfqq) {
+ entity = &sync_bfqq->entity;
+ if (entity->sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ }
+
+ return bfqg;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ struct bfq_group *bfqg = NULL;
+ uint64_t serial_nr;
+
+ rcu_read_lock();
+ serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+ /*
+ * Check whether blkcg has changed. The condition may trigger
+ * spuriously on a newly created cic but there's no harm.
+ */
+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+ goto out;
+
+#ifdef CONFIG_BLK_WBT_SQ
+ /*
+ * If we have a non-root cgroup, we can depend on that to
+ * do proper throttling of writes. Turn off wbt for that
+ * case.
+ */
+ if (bio_blkcg(bio) != &blkcg_root) {
+ struct request_queue *q = bfqd->queue;
+
+ if (q->rq_wb)
+ wbt_disable(q->rq_wb);
+ }
+#endif /* CONFIG_BLK_WBT_SQ */
+
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+ bic->blkcg_serial_nr = serial_nr;
+out:
+ rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+ struct bfq_entity *entity = st->first_idle;
+
+ for (; entity ; entity = st->first_idle)
+ __bfq_deactivate_entity(entity, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ BUG_ON(!bfqq);
+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ * entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ struct bfq_service_tree *st)
+{
+ struct rb_root *active = &st->active;
+ struct bfq_entity *entity = NULL;
+
+ if (!RB_EMPTY_ROOT(&st->active))
+ entity = bfq_entity_of(rb_first(active));
+
+ for (; entity ; entity = bfq_entity_of(rb_first(active)))
+ bfq_reparent_leaf_entity(bfqd, entity);
+
+ if (bfqg->sched_data.in_service_entity)
+ bfq_reparent_leaf_entity(bfqd,
+ bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ * and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ */
+static void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+ struct bfq_service_tree *st;
+ struct bfq_group *bfqg;
+ struct bfq_data *bfqd;
+ struct bfq_entity *entity;
+ int i;
+
+ BUG_ON(!pd);
+ bfqg = pd_to_bfqg(pd);
+ BUG_ON(!bfqg);
+ bfqd = bfqg->bfqd;
+ BUG_ON(bfqd && !bfqd->root_group);
+
+ entity = bfqg->my_entity;
+
+ if (!entity) /* root group */
+ return;
+
+ /*
+ * Empty all service_trees belonging to this group before
+ * deactivating the group itself.
+ */
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+ BUG_ON(!bfqg->sched_data.service_tree);
+ st = bfqg->sched_data.service_tree + i;
+ /*
+ * The idle tree may still contain bfq_queues belonging
+ * to exited task because they never migrated to a different
+ * cgroup from the one being destroyed now. No one else
+ * can access them so it's safe to act without any lock.
+ */
+ bfq_flush_idle_tree(st);
+
+ /*
+ * It may happen that some queues are still active
+ * (busy) upon group destruction (if the corresponding
+ * processes have been forced to terminate). We move
+ * all the leaf entities corresponding to these queues
+ * to the root_group.
+ * Also, it may happen that the group has an entity
+ * in service, which is disconnected from the active
+ * tree: it must be moved, too.
+ * There is no need to put the sync queues, as the
+ * scheduler has taken no reference.
+ */
+ bfq_reparent_active_entities(bfqd, bfqg, st);
+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+ }
+ BUG_ON(bfqg->sched_data.next_in_service);
+ BUG_ON(bfqg->sched_data.in_service_entity);
+
+ __bfq_deactivate_entity(entity, false);
+ bfq_put_async_queues(bfqd, bfqg);
+ BUG_ON(entity->tree);
+
+ /*
+ * @blkg is going offline and will be ignored by
+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
+ * that they don't get lost. If IOs complete after this point, the
+ * stats for them will be lost. Oh well...
+ */
+ bfqg_stats_xfer_dead(bfqg);
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+ struct blkcg_gq *blkg;
+
+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+ BUG_ON(!bfqg);
+
+ bfq_end_wr_async_queues(bfqd, bfqg);
+ }
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+ unsigned int val = 0;
+
+ if (bfqgd)
+ val = bfqgd->weight;
+
+ seq_printf(sf, "%u\n", val);
+
+ return 0;
+}
+
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+ struct cftype *cftype,
+ u64 val)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+ struct blkcg_gq *blkg;
+ int ret = -ERANGE;
+
+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+ return ret;
+
+ ret = 0;
+ spin_lock_irq(&blkcg->lock);
+ bfqgd->weight = (unsigned short)val;
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+ if (!bfqg)
+ continue;
+ /*
+ * Setting the prio_changed flag of the entity
+ * to 1 with new_weight == weight would re-set
+ * the value of the weight to its ioprio mapping.
+ * Set the flag only if necessary.
+ */
+ if ((unsigned short)val != bfqg->entity.new_weight) {
+ bfqg->entity.new_weight = (unsigned short)val;
+ /*
+ * Make sure that the above new value has been
+ * stored in bfqg->entity.new_weight before
+ * setting the prio_changed flag. In fact,
+ * this flag may be read asynchronously (in
+ * critical sections protected by a different
+ * lock than that held here), and finding this
+ * flag set may cause the execution of the code
+ * for updating parameters whose value may
+ * depend also on bfqg->entity.new_weight (in
+ * __bfq_entity_update_weight_prio).
+ * This barrier makes sure that the new value
+ * of bfqg->entity.new_weight is correctly
+ * seen in that code.
+ */
+ smp_wmb();
+ bfqg->entity.prio_changed = 1;
+ }
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ return ret;
+}
+
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ u64 weight;
+ /* First unsigned long found in the file is used */
+ int ret = kstrtoull(strim(buf), 0, &weight);
+
+ if (ret)
+ return ret;
+
+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+}
+
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq, off);
+ return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq,
+ off);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, false);
+ return 0;
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+ return 0;
+}
+
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+ false);
+ return 0;
+}
+
+
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+ u64 v = 0;
+
+ if (samples) {
+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+ v = div64_u64(v, samples);
+ }
+ __blkg_prfill_u64(sf, pd, v);
+ return 0;
+}
+
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+ 0, false);
+ return 0;
+}
+
+static struct bfq_group *
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+ int ret;
+
+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+ if (ret)
+ return NULL;
+
+ return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+static struct cftype bfq_blkcg_legacy_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write_u64 = bfq_io_set_weight_legacy,
+ },
+
+ /* statistics, covers only the tasks in the bfqg */
+ {
+ .name = "bfq.time",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.sectors",
+ .seq_show = bfqg_print_stat_sectors,
+ },
+ {
+ .name = "bfq.io_service_bytes",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes,
+ },
+ {
+ .name = "bfq.io_serviced",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios,
+ },
+ {
+ .name = "bfq.io_service_time",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_wait_time",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_merged",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_queued",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat,
+ },
+
+ /* the same statictics which cover the bfqg and its descendants */
+ {
+ .name = "bfq.time_recursive",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat_recursive,
+ },
+ {
+ .name = "bfq.sectors_recursive",
+ .seq_show = bfqg_print_stat_sectors_recursive,
+ },
+ {
+ .name = "bfq.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
+ {
+ .name = "bfq.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
+ {
+ .name = "bfq.io_service_time_recursive",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_wait_time_recursive",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_merged_recursive",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_queued_recursive",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.avg_queue_size",
+ .seq_show = bfqg_print_avg_queue_size,
+ },
+ {
+ .name = "bfq.group_wait_time",
+ .private = offsetof(struct bfq_group, stats.group_wait_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.idle_time",
+ .private = offsetof(struct bfq_group, stats.idle_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.empty_time",
+ .private = offsetof(struct bfq_group, stats.empty_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.dequeue",
+ .private = offsetof(struct bfq_group, stats.dequeue),
+ .seq_show = bfqg_print_stat,
+ },
+ { } /* terminate */
+};
+
+static struct cftype bfq_blkg_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write = bfq_io_set_weight,
+ },
+ {} /* terminate */
+};
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+ struct bfq_queue *bfqq, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
+ uint64_t start_time, uint64_t io_start_time, int op,
+ int op_flags) { }
+static inline void
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+ struct bfq_group *curr_bfqg) { }
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg) {}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+ struct bfq_group *bfqg)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ if (bfqq) {
+ bfqq->ioprio = bfqq->new_ioprio;
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ }
+ entity->sched_data = &bfqg->sched_data;
+}
+
+static struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+ return bfqd->root_group;
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ return bfqd->root_group;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ return bfqq->bfqd->root_group;
+}
+
+static struct bfq_group *
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+ struct bfq_group *bfqg;
+ int i;
+
+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+ if (!bfqg)
+ return NULL;
+
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+ return bfqg;
+}
+#endif
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
new file mode 100644
index 0000000..fb7bb8f
--- /dev/null
+++ b/block/bfq-ioc.c
@@ -0,0 +1,36 @@
+/*
+ * BFQ: I/O context handling.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+ /* bic->icq is the first member, %NULL will convert to %NULL */
+ return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ *
+ * Queue lock must be held.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+ struct io_context *ioc)
+{
+ if (ioc)
+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
+ return NULL;
+}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 0000000..45e534a
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5318 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ *
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. Next paragraphs provide an introduction
+ * on BFQ inner workings. Details on BFQ benefits and usage can be
+ * found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Thanks to
+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high
+ * budgets to I/O-bound processes issuing sequential requests (to
+ * boost the throughput), and yet guarantee a low latency to
+ * interactive and soft real-time applications.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ * Scheduler", Proceedings of the First Workshop on Mobile System
+ * Technologies (MST-2015), May 2015.
+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+ *
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+#include "blk-wbt.h"
+
+/* Expiration time of sync (0) and async (1) requests, in ns. */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = (16 * 1024);
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns. */
+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = (16 * 1024);
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout = (HZ / 8);
+
+static struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES 32
+
+#define BFQQ_SEEK_THR (sector_t)(8 * 100)
+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES 32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT 16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ * SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
+
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples) ((samples) > 80)
+
+/*
+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
+ * set (in which case it could also be a direct WRITE).
+ */
+static int bfq_bio_sync(struct bio *bio)
+{
+ return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+ if (bfqd->queued != 0) {
+ bfq_log(bfqd, "schedule dispatch");
+ kblockd_schedule_work(&bfqd->unplug_work);
+ }
+}
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+ * We choose the request that is closesr to the head right now. Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+ struct request *rq1,
+ struct request *rq2,
+ sector_t last)
+{
+ sector_t s1, s2, d1 = 0, d2 = 0;
+ unsigned long back_max;
+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+
+ if (!rq1 || rq1 == rq2)
+ return rq2;
+ if (!rq2)
+ return rq1;
+
+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+ return rq1;
+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+ return rq2;
+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+ return rq1;
+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+ return rq2;
+
+ s1 = blk_rq_pos(rq1);
+ s2 = blk_rq_pos(rq2);
+
+ /*
+ * By definition, 1KiB is 2 sectors.
+ */
+ back_max = bfqd->bfq_back_max * 2;
+
+ /*
+ * Strict one way elevator _except_ in the case where we allow
+ * short backward seeks which are biased as twice the cost of a
+ * similar forward seek.
+ */
+ if (s1 >= last)
+ d1 = s1 - last;
+ else if (s1 + back_max >= last)
+ d1 = (last - s1) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ1_WRAP;
+
+ if (s2 >= last)
+ d2 = s2 - last;
+ else if (s2 + back_max >= last)
+ d2 = (last - s2) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ2_WRAP;
+
+ /* Found required data */
+
+ /*
+ * By doing switch() on the bit mask "wrap" we avoid having to
+ * check two variables for all permutations: --> faster!
+ */
+ switch (wrap) {
+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+ if (d1 < d2)
+ return rq1;
+ else if (d2 < d1)
+ return rq2;
+
+ if (s1 >= s2)
+ return rq1;
+ else
+ return rq2;
+
+ case BFQ_RQ2_WRAP:
+ return rq1;
+ case BFQ_RQ1_WRAP:
+ return rq2;
+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
+ default:
+ /*
+ * Since both rqs are wrapped,
+ * start with the one that's further behind head
+ * (--> only *one* back seek required),
+ * since back seek takes more time than forward.
+ */
+ if (s1 <= s2)
+ return rq1;
+ else
+ return rq2;
+ }
+}
+
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+ sector_t sector, struct rb_node **ret_parent,
+ struct rb_node ***rb_link)
+{
+ struct rb_node **p, *parent;
+ struct bfq_queue *bfqq = NULL;
+
+ parent = NULL;
+ p = &root->rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+ parent = *p;
+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+ /*
+ * Sort strictly based on sector. Smallest to the left,
+ * largest to the right.
+ */
+ if (sector > blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_right;
+ else if (sector < blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_left;
+ else
+ break;
+ p = n;
+ bfqq = NULL;
+ }
+
+ *ret_parent = parent;
+ if (rb_link)
+ *rb_link = p;
+
+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+ (unsigned long long) sector,
+ bfqq ? bfqq->pid : 0);
+
+ return bfqq;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct rb_node **p, *parent;
+ struct bfq_queue *__bfqq;
+
+ if (bfqq->pos_root) {
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
+ bfqq->pos_root = NULL;
+ }
+
+ if (bfq_class_idle(bfqq))
+ return;
+ if (!bfqq->next_rq)
+ return;
+
+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+ blk_rq_pos(bfqq->next_rq), &parent, &p);
+ if (!__bfqq) {
+ rb_link_node(&bfqq->pos_node, parent, p);
+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+ } else
+ bfqq->pos_root = NULL;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+ /*
+ * For weights to differ, at least one of the trees must contain
+ * at least two nodes.
+ */
+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
+ bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ) ||
+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+ (bfqd->group_weights_tree.rb_node->rb_left ||
+ bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+ );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ * number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+ return !bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /*
+ * Do not insert if the entity is already associated with a
+ * counter, which happens if:
+ * 1) the entity is associated with a queue,
+ * 2) a request arrival has caused the queue to become both
+ * non-weight-raised, and hence change its weight, and
+ * backlogged; in this respect, each of the two events
+ * causes an invocation of this function,
+ * 3) this is the invocation of this function caused by the
+ * second event. This second invocation is actually useless,
+ * and we handle this fact by exiting immediately. More
+ * efficient or clearer solutions might possibly be adopted.
+ */
+ if (entity->weight_counter)
+ return;
+
+ while (*new) {
+ struct bfq_weight_counter *__counter = container_of(*new,
+ struct bfq_weight_counter,
+ weights_node);
+ parent = *new;
+
+ if (entity->weight == __counter->weight) {
+ entity->weight_counter = __counter;
+ goto inc_counter;
+ }
+ if (entity->weight < __counter->weight)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+ GFP_ATOMIC);
+ entity->weight_counter->weight = entity->weight;
+ rb_link_node(&entity->weight_counter->weights_node, parent, new);
+ rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+ entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root)
+{
+ if (!entity->weight_counter)
+ return;
+
+ BUG_ON(RB_EMPTY_ROOT(root));
+ BUG_ON(entity->weight_counter->weight != entity->weight);
+
+ BUG_ON(!entity->weight_counter->num_active);
+ entity->weight_counter->num_active--;
+ if (entity->weight_counter->num_active > 0)
+ goto reset_entity_pointer;
+
+ rb_erase(&entity->weight_counter->weights_node, root);
+ kfree(entity->weight_counter);
+
+reset_entity_pointer:
+ entity->weight_counter = NULL;
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
+ struct request *last)
+{
+ struct request *rq;
+
+ if (bfq_bfqq_fifo_expire(bfqq))
+ return NULL;
+
+ bfq_mark_bfqq_fifo_expire(bfqq);
+
+ rq = rq_entry_fifo(bfqq->fifo.next);
+
+ if (rq == last || ktime_get_ns() < rq->fifo_time)
+ return NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
+ return rq;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct request *last)
+{
+ struct rb_node *rbnext = rb_next(&last->rb_node);
+ struct rb_node *rbprev = rb_prev(&last->rb_node);
+ struct request *next, *prev = NULL;
+
+ BUG_ON(list_empty(&bfqq->fifo));
+
+ /* Follow expired path, else get first next available. */
+ next = bfq_check_fifo(bfqq, last);
+ if (next) {
+ BUG_ON(next == last);
+ return next;
+ }
+
+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+ if (rbprev)
+ prev = rb_entry_rq(rbprev);
+
+ if (rbnext)
+ next = rb_entry_rq(rbnext);
+ else {
+ rbnext = rb_first(&bfqq->sort_list);
+ if (rbnext && rbnext != &last->rb_node)
+ next = rb_entry_rq(rbnext);
+ }
+
+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+ struct bfq_queue *bfqq)
+{
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+ return blk_rq_sectors(rq);
+
+ /*
+ * If there are no weight-raised queues, then amplify service
+ * by just the async charge factor; otherwise amplify service
+ * by twice the async charge factor, to further reduce latency
+ * for weight-raised queues.
+ */
+ if (bfqq->bfqd->wr_busy_queues == 0)
+ return blk_rq_sectors(rq) * bfq_async_charge_factor;
+
+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown). We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+ struct request *next_rq = bfqq->next_rq;
+ unsigned long new_budget;
+
+ if (!next_rq)
+ return;
+
+ if (bfqq == bfqd->in_service_queue)
+ /*
+ * In order not to break guarantees, budgets cannot be
+ * changed after an entity has been selected.
+ */
+ return;
+
+ BUG_ON(entity->tree != &st->active);
+ BUG_ON(entity == entity->sched_data->in_service_entity);
+
+ new_budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ if (entity->budget != new_budget) {
+ entity->budget = new_budget;
+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+ new_budget);
+ bfq_requeue_bfqq(bfqd, bfqq);
+ }
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+ u64 dur;
+
+ if (bfqd->bfq_wr_max_time > 0)
+ return bfqd->bfq_wr_max_time;
+
+ dur = bfqd->RT_prod;
+ do_div(dur, bfqd->peak_rate);
+
+ /*
+ * Limit duration between 3 and 13 seconds. Tests show that
+ * higher values than 13 seconds often yield the opposite of
+ * the desired result, i.e., worsen responsiveness by letting
+ * non-interactive and non-soft-real-time applications
+ * preserve weight raising for a too long time interval.
+ *
+ * On the other end, lower values than 3 seconds make it
+ * difficult for most interactive tasks to complete their jobs
+ * before weight-raising finishes.
+ */
+ if (dur > msecs_to_jiffies(13000))
+ dur = msecs_to_jiffies(13000);
+ else if (dur < msecs_to_jiffies(3000))
+ dur = msecs_to_jiffies(3000);
+
+ return dur;
+}
+
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+ if (bic->saved_idle_window)
+ bfq_mark_bfqq_idle_window(bfqq);
+ else
+ bfq_clear_bfqq_idle_window(bfqq);
+
+ if (bic->saved_IO_bound)
+ bfq_mark_bfqq_IO_bound(bfqq);
+ else
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ bfqq->wr_coeff = bic->saved_wr_coeff;
+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time))) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "resume state: switching off wr (%lu + %lu < %lu)",
+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
+ jiffies);
+
+ bfqq->wr_coeff = 1;
+ }
+ /* make sure weight will be updated, however we got here */
+ bfqq->entity.prio_changed = 1;
+}
+
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+ int process_refs, io_refs;
+
+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+
+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
+ BUG_ON(process_refs < 0);
+ return process_refs;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_queue *item;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+ hlist_del_init(&item->burst_list_node);
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+ bfqd->burst_size = 1;
+ bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /* Increment burst size to take into account also bfqq */
+ bfqd->burst_size++;
+
+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
+
+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
+
+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+ struct bfq_queue *pos, *bfqq_item;
+ struct hlist_node *n;
+
+ /*
+ * Enough queues have been activated shortly after each
+ * other to consider this burst as large.
+ */
+ bfqd->large_burst = true;
+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
+
+ /*
+ * We can now mark all queues in the burst list as
+ * belonging to a large burst.
+ */
+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+ burst_list_node) {
+ bfq_mark_bfqq_in_large_burst(bfqq_item);
+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
+ }
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
+
+ /*
+ * From now on, and until the current burst finishes, any
+ * new queue being activated shortly after the last queue
+ * was inserted in the burst can be immediately marked as
+ * belonging to a large burst. So the burst list is not
+ * needed any more. Remove it.
+ */
+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+ burst_list_node)
+ hlist_del_init(&pos->burst_list_node);
+ } else /*
+ * Burst not yet large: add bfqq to the burst list. Do
+ * not increment the ref counter for bfqq, because bfqq
+ * is removed from the burst list before freeing bfqq
+ * in put_queue.
+ */
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ * list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ * not yet belong to the burst is activated shortly after the last time
+ * at which a new queue entered the burst list, then the function appends
+ * Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ * the large-burst threshold, then
+ *
+ * . all the queues in the burst list are marked as belonging to a
+ * large burst
+ *
+ * . the burst list is deleted; in fact, the burst list already served
+ * its purpose (keeping temporarily track of the queues in a burst,
+ * so as to be able to mark them as belonging to a large burst in the
+ * previous sub-step), and now is not needed any more
+ *
+ * . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ * the device is in large-burst mode and shortly after the last time
+ * at which a queue either entered the burst list or was marked as
+ * belonging to the current large burst, then Q is immediately marked
+ * as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ * later, i.e., not shortly after, than the last time at which a queue
+ * either entered the burst list or was marked as belonging to the
+ * current large burst, then the current burst is deemed as finished and:
+ *
+ * . the large-burst mode is reset if set
+ *
+ * . the burst list is emptied
+ *
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /*
+ * If bfqq is already in the burst list or is part of a large
+ * burst, or finally has just been split, then there is
+ * nothing else to do.
+ */
+ if (!hlist_unhashed(&bfqq->burst_list_node) ||
+ bfq_bfqq_in_large_burst(bfqq) ||
+ time_is_after_eq_jiffies(bfqq->split_time +
+ msecs_to_jiffies(10)))
+ return;
+
+ /*
+ * If bfqq's creation happens late enough, or bfqq belongs to
+ * a different group than the burst group, then the current
+ * burst is finished, and related data structures must be
+ * reset.
+ *
+ * In this respect, consider the special case where bfqq is
+ * the very first queue created after BFQ is selected for this
+ * device. In this case, last_ins_in_burst and
+ * burst_parent_entity are not yet significant when we get
+ * here. But it is easy to verify that, whether or not the
+ * following condition is true, bfqq will end up being
+ * inserted into the burst list. In particular the list will
+ * happen to contain only bfqq. And this is exactly what has
+ * to happen, as bfqq may be the first queue of the first
+ * burst.
+ */
+ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+ bfqd->bfq_burst_interval) ||
+ bfqq->entity.parent != bfqd->burst_parent_entity) {
+ bfqd->large_burst = false;
+ bfq_reset_burst_list(bfqd, bfqq);
+ bfq_log_bfqq(bfqd, bfqq,
+ "handle_burst: late activation or different group");
+ goto end;
+ }
+
+ /*
+ * If we get here, then bfqq is being activated shortly after the
+ * last queue. So, if the current burst is also large, we can mark
+ * bfqq as belonging to this large burst immediately.
+ */
+ if (bfqd->large_burst) {
+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ goto end;
+ }
+
+ /*
+ * If we get here, then a large-burst state has not yet been
+ * reached, but bfqq is being activated shortly after the last
+ * queue. Then we add bfqq to the burst.
+ */
+ bfq_add_to_burst(bfqd, bfqq);
+end:
+ /*
+ * At this point, bfqq either has been added to the current
+ * burst or has caused the current burst to terminate and a
+ * possible new burst to start. In particular, in the second
+ * case, bfqq has become the first queue in the possible new
+ * burst. In both cases last_ins_in_burst needs to be moved
+ * forward.
+ */
+ bfqd->last_ins_in_burst = jiffies;
+
+}
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ return entity->budget - entity->service;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+ if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ return bfq_default_max_budget;
+ else
+ return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+ if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ return bfq_default_max_budget / 32;
+ else
+ return bfqd->bfq_max_budget / 32;
+}
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason);
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and
+ * did not make it to issue a new request before its last request
+ * was served;
+ *
+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ * a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may be however issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, being the service of outstanding requests unpreemptible, a
+ * little fraction of the holes may however be unrecoverable, thereby
+ * causing a little loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool arrived_in_time,
+ bool wr_or_deserves_wr)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+ /*
+ * We do not clear the flag non_blocking_wait_rq here, as
+ * the latter is used in bfq_activate_bfqq to signal
+ * that timestamps need to be back-shifted (and is
+ * cleared right after).
+ */
+
+ /*
+ * In next assignment we rely on that either
+ * entity->service or entity->budget are not updated
+ * on expiration if bfqq is empty (see
+ * __bfq_bfqq_recalc_budget). Thus both quantities
+ * remain unchanged after such an expiration, and the
+ * following statement therefore assigns to
+ * entity->budget the remaining budget on such an
+ * expiration. For clarity, entity->service is not
+ * updated on expiration in any case, and, in normal
+ * operation, is reset only when bfqq is selected for
+ * service (see bfq_get_next_queue).
+ */
+ BUG_ON(bfqq->max_budget < 0);
+ entity->budget = min_t(unsigned long,
+ bfq_bfqq_budget_left(bfqq),
+ bfqq->max_budget);
+
+ BUG_ON(entity->budget < 0);
+ return true;
+ }
+
+ BUG_ON(bfqq->max_budget < 0);
+ entity->budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(bfqq->next_rq, bfqq));
+ BUG_ON(entity->budget < 0);
+
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+ return wr_or_deserves_wr;
+}
+
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ unsigned int old_wr_coeff,
+ bool wr_or_deserves_wr,
+ bool interactive,
+ bool in_burst,
+ bool soft_rt)
+{
+ if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+ /* start a weight-raising period */
+ if (interactive) {
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ } else {
+ bfqq->wr_start_at_switch_to_srt = jiffies;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ BFQ_SOFTRT_WEIGHT_FACTOR;
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ }
+ /*
+ * If needed, further reduce budget to make sure it is
+ * close to bfqq's backlog, so as to reduce the
+ * scheduling-error component due to a too large
+ * budget. Do not care about throughput consequences,
+ * but only about latency. Finally, do not assign a
+ * too small budget either, to avoid increasing
+ * latency by causing too frequent expirations.
+ */
+ bfqq->entity.budget = min_t(unsigned long,
+ bfqq->entity.budget,
+ 2 * bfq_min_budget(bfqd));
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "wrais starting at %lu, rais_max_time %u",
+ jiffies,
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
+ } else if (old_wr_coeff > 1) {
+ if (interactive) { /* update wr coeff and duration */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ } else if (in_burst) {
+ bfqq->wr_coeff = 1;
+ bfq_log_bfqq(bfqd, bfqq,
+ "wrais ending at %lu, rais_max_time %u",
+ jiffies,
+ jiffies_to_msecs(bfqq->
+ wr_cur_max_time));
+ } else if (soft_rt) {
+ /*
+ * The application is now or still meeting the
+ * requirements for being deemed soft rt. We
+ * can then correctly and safely (re)charge
+ * the weight-raising duration for the
+ * application with the weight-raising
+ * duration for soft rt applications.
+ *
+ * In particular, doing this recharge now, i.e.,
+ * before the weight-raising period for the
+ * application finishes, reduces the probability
+ * of the following negative scenario:
+ * 1) the weight of a soft rt application is
+ * raised at startup (as for any newly
+ * created application),
+ * 2) since the application is not interactive,
+ * at a certain time weight-raising is
+ * stopped for the application,
+ * 3) at that time the application happens to
+ * still have pending requests, and hence
+ * is destined to not have a chance to be
+ * deemed soft rt before these requests are
+ * completed (see the comments to the
+ * function bfq_bfqq_softrt_next_start()
+ * for details on soft rt detection),
+ * 4) these pending requests experience a high
+ * latency because the application is not
+ * weight-raised while they are pending.
+ */
+ if (bfqq->wr_cur_max_time !=
+ bfqd->bfq_wr_rt_max_time) {
+ bfqq->wr_start_at_switch_to_srt =
+ bfqq->last_wr_start_finish;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ BFQ_SOFTRT_WEIGHT_FACTOR;
+ bfq_log_bfqq(bfqd, bfqq,
+ "switching to soft_rt wr");
+ } else
+ bfq_log_bfqq(bfqd, bfqq,
+ "moving forward soft_rt wr duration");
+ bfqq->last_wr_start_finish = jiffies;
+ }
+ }
+}
+
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return bfqq->dispatched == 0 &&
+ time_is_before_jiffies(
+ bfqq->budget_timeout +
+ bfqd->bfq_wr_min_idle_time);
+}
+
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ int old_wr_coeff,
+ struct request *rq,
+ bool *interactive)
+{
+ bool soft_rt, in_burst, wr_or_deserves_wr,
+ bfqq_wants_to_preempt,
+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+ /*
+ * See the comments on
+ * bfq_bfqq_update_budg_for_activation for
+ * details on the usage of the next variable.
+ */
+ arrived_in_time = ktime_get_ns() <=
+ RQ_BIC(rq)->ttime.last_end_request +
+ bfqd->bfq_slice_idle * 3;
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "bfq_add_request non-busy: "
+ "jiffies %lu, in_time %d, idle_long %d busyw %d "
+ "wr_coeff %u",
+ jiffies, arrived_in_time,
+ idle_for_long_time,
+ bfq_bfqq_non_blocking_wait_rq(bfqq),
+ old_wr_coeff);
+
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+ BUG_ON(bfqq == bfqd->in_service_queue);
+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+ req_op(rq), rq->cmd_flags);
+
+ /*
+ * bfqq deserves to be weight-raised if:
+ * - it is sync,
+ * - it does not belong to a large burst,
+ * - it has been idle for enough time or is soft real-time,
+ * - is linked to a bfq_io_cq (it is not shared in any sense)
+ */
+ in_burst = bfq_bfqq_in_large_burst(bfqq);
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+ !in_burst &&
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
+ *interactive =
+ !in_burst &&
+ idle_for_long_time;
+ wr_or_deserves_wr = bfqd->low_latency &&
+ (bfqq->wr_coeff > 1 ||
+ (bfq_bfqq_sync(bfqq) &&
+ bfqq->bic && (*interactive || soft_rt)));
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "bfq_add_request: "
+ "in_burst %d, "
+ "soft_rt %d (next %lu), inter %d, bic %p",
+ bfq_bfqq_in_large_burst(bfqq), soft_rt,
+ bfqq->soft_rt_next_start,
+ *interactive,
+ bfqq->bic);
+
+ /*
+ * Using the last flag, update budget and check whether bfqq
+ * may want to preempt the in-service queue.
+ */
+ bfqq_wants_to_preempt =
+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+ arrived_in_time,
+ wr_or_deserves_wr);
+
+ /*
+ * If bfqq happened to be activated in a burst, but has been
+ * idle for much more than an interactive queue, then we
+ * assume that, in the overall I/O initiated in the burst, the
+ * I/O associated with bfqq is finished. So bfqq does not need
+ * to be treated as a queue belonging to a burst
+ * anymore. Accordingly, we reset bfqq's in_large_burst flag
+ * if set, and remove bfqq from the burst list if it's
+ * there. We do not decrement burst_size, because the fact
+ * that bfqq does not need to belong to the burst list any
+ * more does not invalidate the fact that bfqq was created in
+ * a burst.
+ */
+ if (likely(!bfq_bfqq_just_created(bfqq)) &&
+ idle_for_long_time &&
+ time_is_before_jiffies(
+ bfqq->budget_timeout +
+ msecs_to_jiffies(10000))) {
+ hlist_del_init(&bfqq->burst_list_node);
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ }
+
+ bfq_clear_bfqq_just_created(bfqq);
+
+ if (!bfq_bfqq_IO_bound(bfqq)) {
+ if (arrived_in_time) {
+ bfqq->requests_within_timer++;
+ if (bfqq->requests_within_timer >=
+ bfqd->bfq_requests_within_timer)
+ bfq_mark_bfqq_IO_bound(bfqq);
+ } else
+ bfqq->requests_within_timer = 0;
+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
+ bfqq->requests_within_timer);
+ }
+
+ if (bfqd->low_latency) {
+ if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+ /* wraparound */
+ bfqq->split_time =
+ jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+ if (time_is_before_jiffies(bfqq->split_time +
+ bfqd->bfq_wr_min_idle_time)) {
+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+ old_wr_coeff,
+ wr_or_deserves_wr,
+ *interactive,
+ in_burst,
+ soft_rt);
+
+ if (old_wr_coeff != bfqq->wr_coeff)
+ bfqq->entity.prio_changed = 1;
+ }
+ }
+
+ bfqq->last_idle_bklogged = jiffies;
+ bfqq->service_from_backlogged = 0;
+ bfq_clear_bfqq_softrt_update(bfqq);
+
+ bfq_add_bfqq_busy(bfqd, bfqq);
+
+ /*
+ * Expire in-service queue only if preemption may be needed
+ * for guarantees. In this respect, the function
+ * next_queue_may_preempt just checks a simple, necessary
+ * condition, and not a sufficient condition based on
+ * timestamps. In fact, for the latter condition to be
+ * evaluated, timestamps would need first to be updated, and
+ * this operation is quite costly (see the comments on the
+ * function bfq_bfqq_update_budg_for_activation).
+ */
+ if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ next_queue_may_preempt(bfqd)) {
+ struct bfq_queue *in_serv =
+ bfqd->in_service_queue;
+ BUG_ON(in_serv == bfqq);
+
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQ_BFQQ_PREEMPTED);
+ BUG_ON(in_serv->entity.budget < 0);
+ }
+}
+
+static void bfq_add_request(struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct request *next_rq, *prev;
+ unsigned int old_wr_coeff = bfqq->wr_coeff;
+ bool interactive = false;
+
+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
+
+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
+ bfq_log_bfqq(bfqd, bfqq,
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
+ bfqq->wr_coeff,
+ bfqq->entity.weight, bfqq->entity.orig_weight);
+
+ bfqq->queued[rq_is_sync(rq)]++;
+ bfqd->queued++;
+
+ elv_rb_add(&bfqq->sort_list, rq);
+
+ /*
+ * Check if this request is a better next-to-serve candidate.
+ */
+ prev = bfqq->next_rq;
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+ BUG_ON(!next_rq);
+ bfqq->next_rq = next_rq;
+
+ /*
+ * Adjust priority tree position, if next_rq changes.
+ */
+ if (prev != bfqq->next_rq)
+ bfq_pos_tree_add_move(bfqd, bfqq);
+
+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+ rq, &interactive);
+ else {
+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+ time_is_before_jiffies(
+ bfqq->last_wr_start_finish +
+ bfqd->bfq_wr_min_inter_arr_async)) {
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+ bfqd->wr_busy_queues++;
+ bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqd, bfqq,
+ "non-idle wrais starting, "
+ "wr_max_time %u wr_busy %d",
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
+ bfqd->wr_busy_queues);
+ }
+ if (prev != bfqq->next_rq)
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ /*
+ * Assign jiffies to last_wr_start_finish in the following
+ * cases:
+ *
+ * . if bfqq is not going to be weight-raised, because, for
+ * non weight-raised queues, last_wr_start_finish stores the
+ * arrival time of the last request; as of now, this piece
+ * of information is used only for deciding whether to
+ * weight-raise async queues
+ *
+ * . if bfqq is not weight-raised, because, if bfqq is now
+ * switching to weight-raised, then last_wr_start_finish
+ * stores the time when weight-raising starts
+ *
+ * . if bfqq is interactive, because, regardless of whether
+ * bfqq is currently weight-raised, the weight-raising
+ * period must start or restart (this case is considered
+ * separately because it is not detected by the above
+ * conditions, if bfqq is already weight-raised)
+ *
+ * last_wr_start_finish has to be updated also if bfqq is soft
+ * real-time, because the weight-raising period is constantly
+ * restarted on idle-to-busy transitions for these queues, but
+ * this is already done in bfq_bfqq_handle_idle_busy_switch if
+ * needed.
+ */
+ if (bfqd->low_latency &&
+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+ bfqq->last_wr_start_finish = jiffies;
+}
+
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+ struct bio *bio)
+{
+ struct task_struct *tsk = current;
+ struct bfq_io_cq *bic;
+ struct bfq_queue *bfqq;
+
+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
+ if (!bic)
+ return NULL;
+
+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+ if (bfqq)
+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
+
+ return NULL;
+}
+
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+ sector_t sdist = 0;
+
+ if (last_pos) {
+ if (last_pos < blk_rq_pos(rq))
+ sdist = blk_rq_pos(rq) - last_pos;
+ else
+ sdist = last_pos - blk_rq_pos(rq);
+ }
+
+ return sdist;
+}
+
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ bfqd->rq_in_driver++;
+}
+
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ BUG_ON(bfqd->rq_in_driver == 0);
+ bfqd->rq_in_driver--;
+}
+
+static void bfq_remove_request(struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ const int sync = rq_is_sync(rq);
+
+ BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
+ bfqq == bfqd->in_service_queue);
+
+ if (bfqq->next_rq == rq) {
+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ if (rq->queuelist.prev != &rq->queuelist)
+ list_del_init(&rq->queuelist);
+ BUG_ON(bfqq->queued[sync] == 0);
+ bfqq->queued[sync]--;
+ bfqd->queued--;
+ elv_rb_del(&bfqq->sort_list, rq);
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ bfqq->next_rq = NULL;
+
+ BUG_ON(bfqq->entity.budget < 0);
+
+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
+ /* bfqq emptied. In normal operation, when
+ * bfqq is empty, bfqq->entity.service and
+ * bfqq->entity.budget must contain,
+ * respectively, the service received and the
+ * budget used last time bfqq emptied. These
+ * facts do not hold in this case, as at least
+ * this last removal occurred while bfqq is
+ * not in service. To avoid inconsistencies,
+ * reset both bfqq->entity.service and
+ * bfqq->entity.budget.
+ */
+ bfqq->entity.budget = bfqq->entity.service = 0;
+ }
+
+ /*
+ * Remove queue from request-position tree as it is empty.
+ */
+ if (bfqq->pos_root) {
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
+ bfqq->pos_root = NULL;
+ }
+ }
+
+ if (rq->cmd_flags & REQ_META) {
+ BUG_ON(bfqq->meta_pending == 0);
+ bfqq->meta_pending--;
+ }
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
+ rq->cmd_flags);
+}
+
+static int bfq_merge(struct request_queue *q, struct request **req,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct request *__rq;
+
+ __rq = bfq_find_rq_fmerge(bfqd, bio);
+ if (__rq && elv_bio_merge_ok(__rq, bio)) {
+ *req = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static void bfq_merged_request(struct request_queue *q, struct request *req,
+ int type)
+{
+ if (type == ELEVATOR_FRONT_MERGE &&
+ rb_prev(&req->rb_node) &&
+ blk_rq_pos(req) <
+ blk_rq_pos(container_of(rb_prev(&req->rb_node),
+ struct request, rb_node))) {
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct request *prev, *next_rq;
+
+ /* Reposition request in its sort_list */
+ elv_rb_del(&bfqq->sort_list, req);
+ elv_rb_add(&bfqq->sort_list, req);
+ /* Choose next request to be served for bfqq */
+ prev = bfqq->next_rq;
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+ bfqd->last_position);
+ BUG_ON(!next_rq);
+ bfqq->next_rq = next_rq;
+ /*
+ * If next_rq changes, update both the queue's budget to
+ * fit the new request and the queue's position in its
+ * rq_pos_tree.
+ */
+ if (prev != bfqq->next_rq) {
+ bfq_updated_next_req(bfqd, bfqq);
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ }
+ }
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
+ bio->bi_opf);
+}
+#endif
+
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+ /*
+ * If next and rq belong to the same bfq_queue and next is older
+ * than rq, then reposition rq in the fifo (by substituting next
+ * with rq). Otherwise, if next and rq belong to different
+ * bfq_queues, never reposition rq: in fact, we would have to
+ * reposition it with respect to next's position in its own fifo,
+ * which would most certainly be too expensive with respect to
+ * the benefits.
+ */
+ if (bfqq == next_bfqq &&
+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+ next->fifo_time < rq->fifo_time) {
+ list_del_init(&rq->queuelist);
+ list_replace_init(&next->queuelist, &rq->queuelist);
+ rq->fifo_time = next->fifo_time;
+ }
+
+ if (bfqq->next_rq == next)
+ bfqq->next_rq = rq;
+
+ bfq_remove_request(next);
+ bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
+ next->cmd_flags);
+}
+
+/* Must be called with bfqq != NULL */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+ BUG_ON(!bfqq);
+
+ if (bfq_bfqq_busy(bfqq))
+ bfqq->bfqd->wr_busy_queues--;
+ bfqq->wr_coeff = 1;
+ bfqq->wr_cur_max_time = 0;
+ bfqq->last_wr_start_finish = jiffies;
+ /*
+ * Trigger a weight change on the next invocation of
+ * __bfq_entity_update_weight_prio.
+ */
+ bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "end_wr: wrais ending at %lu, rais_max_time %u",
+ bfqq->last_wr_start_finish,
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
+ bfqq->bfqd->wr_busy_queues);
+}
+
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ if (bfqg->async_bfqq[i][j])
+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+ if (bfqg->async_idle_bfqq)
+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+
+ spin_lock_irq(bfqd->queue->queue_lock);
+
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ bfq_end_wr_async(bfqd);
+
+ spin_unlock_irq(bfqd->queue->queue_lock);
+}
+
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+ if (request)
+ return blk_rq_pos(io_struct);
+ else
+ return ((struct bio *)io_struct)->bi_iter.bi_sector;
+}
+
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+ sector_t sector)
+{
+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
+ BFQQ_CLOSE_THR;
+}
+
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ sector_t sector)
+{
+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_node *parent, *node;
+ struct bfq_queue *__bfqq;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ /*
+ * First, if we find a request starting at the end of the last
+ * request, choose it.
+ */
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+ if (__bfqq)
+ return __bfqq;
+
+ /*
+ * If the exact sector wasn't found, the parent of the NULL leaf
+ * will contain the closest sector (rq_pos_tree sorted by
+ * next_request position).
+ */
+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+ return __bfqq;
+
+ if (blk_rq_pos(__bfqq->next_rq) < sector)
+ node = rb_next(&__bfqq->pos_node);
+ else
+ node = rb_prev(&__bfqq->pos_node);
+ if (!node)
+ return NULL;
+
+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+ return __bfqq;
+
+ return NULL;
+}
+
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+ struct bfq_queue *cur_bfqq,
+ sector_t sector)
+{
+ struct bfq_queue *bfqq;
+
+ /*
+ * We shall notice if some of the queues are cooperating,
+ * e.g., working closely on the same area of the device. In
+ * that case, we can group them together and: 1) don't waste
+ * time idling, and 2) serve the union of their requests in
+ * the best possible order for throughput.
+ */
+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
+ if (!bfqq || bfqq == cur_bfqq)
+ return NULL;
+
+ return bfqq;
+}
+
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ int process_refs, new_process_refs;
+ struct bfq_queue *__bfqq;
+
+ /*
+ * If there are no process references on the new_bfqq, then it is
+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+ * may have dropped their last reference (not just their last process
+ * reference).
+ */
+ if (!bfqq_process_refs(new_bfqq))
+ return NULL;
+
+ /* Avoid a circular list and skip interim queue merges. */
+ while ((__bfqq = new_bfqq->new_bfqq)) {
+ if (__bfqq == bfqq)
+ return NULL;
+ new_bfqq = __bfqq;
+ }
+
+ process_refs = bfqq_process_refs(bfqq);
+ new_process_refs = bfqq_process_refs(new_bfqq);
+ /*
+ * If the process for the bfqq has gone away, there is no
+ * sense in merging the queues.
+ */
+ if (process_refs == 0 || new_process_refs == 0)
+ return NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+ new_bfqq->pid);
+
+ /*
+ * Merging is just a redirection: the requests of the process
+ * owning one of the two queues are redirected to the other queue.
+ * The latter queue, in its turn, is set as shared if this is the
+ * first time that the requests of some process are redirected to
+ * it.
+ *
+ * We redirect bfqq to new_bfqq and not the opposite, because we
+ * are in the context of the process owning bfqq, hence we have
+ * the io_cq of this process. So we can immediately configure this
+ * io_cq to redirect the requests of the process to new_bfqq.
+ *
+ * NOTE, even if new_bfqq coincides with the in-service queue, the
+ * io_cq of new_bfqq is not available, because, if the in-service
+ * queue is shared, bfqd->in_service_bic may not point to the
+ * io_cq of the in-service queue.
+ * Redirecting the requests of the process owning bfqq to the
+ * currently in-service queue is in any case the best option, as
+ * we feed the in-service queue with new requests close to the
+ * last request served and, by doing so, hopefully increase the
+ * throughput.
+ */
+ bfqq->new_bfqq = new_bfqq;
+ new_bfqq->ref += process_refs;
+ return new_bfqq;
+}
+
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+ struct bfq_queue *new_bfqq)
+{
+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
+ (bfqq->ioprio_class != new_bfqq->ioprio_class))
+ return false;
+
+ /*
+ * If either of the queues has already been detected as seeky,
+ * then merging it with the other queue is unlikely to lead to
+ * sequential I/O.
+ */
+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+ return false;
+
+ /*
+ * Interleaved I/O is known to be done by (some) applications
+ * only for reads, so it does not make sense to merge async
+ * queues.
+ */
+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
+ return false;
+
+ return true;
+}
+
+/*
+ * If this function returns true, then bfqq cannot be merged. The idea
+ * is that true cooperation happens very early after processes start
+ * to do I/O. Usually, late cooperations are just accidental false
+ * positives. In case bfqq is weight-raised, such false positives
+ * would evidently degrade latency guarantees for bfqq.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+ return bfqq->wr_coeff > 1 &&
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
+ msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues. Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * The OOM queue is not allowed to participate to cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be little relevant, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even being the
+ * weight the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ void *io_struct, bool request)
+{
+ struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
+ if (io_struct && wr_from_too_long(bfqq) &&
+ likely(bfqq != &bfqd->oom_bfqq))
+ bfq_log_bfqq(bfqd, bfqq,
+ "would have looked for coop, but bfq%d wr",
+ bfqq->pid);
+
+ if (!io_struct ||
+ wr_from_too_long(bfqq) ||
+ unlikely(bfqq == &bfqd->oom_bfqq))
+ return NULL;
+
+ /* If there is only one backlogged queue, don't search. */
+ if (bfqd->busy_queues == 1)
+ return NULL;
+
+ in_service_bfqq = bfqd->in_service_queue;
+
+ if (in_service_bfqq && in_service_bfqq != bfqq &&
+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
+ && likely(in_service_bfqq == &bfqd->oom_bfqq))
+ bfq_log_bfqq(bfqd, bfqq,
+ "would have tried merge with in-service-queue, but wr");
+
+ if (!in_service_bfqq || in_service_bfqq == bfqq ||
+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
+ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+ goto check_scheduled;
+
+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ bfqq->entity.parent == in_service_bfqq->entity.parent &&
+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+ if (new_bfqq)
+ return new_bfqq;
+ }
+ /*
+ * Check whether there is a cooperator among currently scheduled
+ * queues. The only thing we need is that the bio/request is not
+ * NULL, as we need it to establish whether a cooperator exists.
+ */
+check_scheduled:
+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+ bfq_io_struct_pos(io_struct, request));
+
+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
+
+ if (new_bfqq && wr_from_too_long(new_bfqq) &&
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
+ bfq_log_bfqq(bfqd, bfqq,
+ "would have merged with bfq%d, but wr",
+ new_bfqq->pid);
+
+ if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
+ return bfq_setup_merge(bfqq, new_bfqq);
+
+ return NULL;
+}
+
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+ struct bfq_io_cq *bic = bfqq->bic;
+
+ /*
+ * If !bfqq->bic, the queue is already shared or its requests
+ * have already been redirected to a shared queue; both idle window
+ * and weight raising state have already been saved. Do nothing.
+ */
+ if (!bic)
+ return;
+
+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bic->saved_wr_coeff = bfqq->wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+}
+
+static void bfq_get_bic_reference(struct bfq_queue *bfqq)
+{
+ /*
+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
+ * is about to begin using a shared bfq_queue.
+ */
+ if (bfqq->bic)
+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
+}
+
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+ (unsigned long) new_bfqq->pid);
+ /* Save weight raising and idle window of the merged queues */
+ bfq_bfqq_save_state(bfqq);
+ bfq_bfqq_save_state(new_bfqq);
+ if (bfq_bfqq_IO_bound(bfqq))
+ bfq_mark_bfqq_IO_bound(new_bfqq);
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ /*
+ * If bfqq is weight-raised, then let new_bfqq inherit
+ * weight-raising. To reduce false positives, neglect the case
+ * where bfqq has just been created, but has not yet made it
+ * to be weight-raised (which may happen because EQM may merge
+ * bfqq even before bfq_add_request is executed for the first
+ * time for bfqq). Handling this case would however be very
+ * easy, thanks to the flag just_created.
+ */
+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+ new_bfqq->wr_coeff = bfqq->wr_coeff;
+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+ new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+ if (bfq_bfqq_busy(new_bfqq))
+ bfqd->wr_busy_queues++;
+ new_bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqd, new_bfqq,
+ "wr start after merge with %d, rais_max_time %u",
+ bfqq->pid,
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
+ }
+
+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+ bfqq->wr_coeff = 1;
+ bfqq->entity.prio_changed = 1;
+ if (bfq_bfqq_busy(bfqq))
+ bfqd->wr_busy_queues--;
+ }
+
+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+ bfqd->wr_busy_queues);
+
+ /*
+ * Grab a reference to the bic, to prevent it from being destroyed
+ * before being possibly touched by a bfq_split_bfqq().
+ */
+ bfq_get_bic_reference(bfqq);
+ bfq_get_bic_reference(new_bfqq);
+ /*
+ * Merge queues (that is, let bic redirect its requests to new_bfqq)
+ */
+ bic_set_bfqq(bic, new_bfqq, 1);
+ bfq_mark_bfqq_coop(new_bfqq);
+ /*
+ * new_bfqq now belongs to at least two bics (it is a shared queue):
+ * set new_bfqq->bic to NULL. bfqq either:
+ * - does not belong to any bic any more, and hence bfqq->bic must
+ * be set to NULL, or
+ * - is a queue whose owning bics have already been redirected to a
+ * different queue, hence the queue is destined to not belong to
+ * any bic soon and bfqq->bic is already NULL (therefore the next
+ * assignment causes no harm).
+ */
+ new_bfqq->bic = NULL;
+ bfqq->bic = NULL;
+ bfq_put_queue(bfqq);
+}
+
+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_io_cq *bic;
+ struct bfq_queue *bfqq, *new_bfqq;
+
+ /*
+ * Disallow merge of a sync bio into an async request.
+ */
+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
+ return false;
+
+ /*
+ * Lookup the bfqq that this bio will be queued with. Allow
+ * merge only if rq is queued there.
+ * Queue lock is held here.
+ */
+ bic = bfq_bic_lookup(bfqd, current->io_context);
+ if (!bic)
+ return false;
+
+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+ /*
+ * We take advantage of this function to perform an early merge
+ * of the queues of possible cooperating processes.
+ */
+ if (bfqq) {
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+ if (new_bfqq) {
+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+ /*
+ * If we get here, the bio will be queued in the
+ * shared queue, i.e., new_bfqq, so use new_bfqq
+ * to decide whether bio and rq can be merged.
+ */
+ bfqq = new_bfqq;
+ }
+ }
+
+ return bfqq == RQ_BFQQ(rq);
+}
+
+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ return RQ_BFQQ(rq) == RQ_BFQQ(next);
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the throughput.
+ * In practice, a time-slice service scheme is used with seeky
+ * processes.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ unsigned int timeout_coeff;
+
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+ timeout_coeff = 1;
+ else
+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+ bfqd->last_budget_start = ktime_get();
+
+ bfqq->budget_timeout = jiffies +
+ bfqd->bfq_timeout * timeout_coeff;
+
+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
+}
+
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ if (bfqq) {
+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+ bfq_mark_bfqq_must_alloc(bfqq);
+ bfq_clear_bfqq_fifo_expire(bfqq);
+
+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+
+ BUG_ON(bfqq == bfqd->in_service_queue);
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+ bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_before_jiffies(bfqq->budget_timeout)) {
+ /*
+ * For soft real-time queues, move the start
+ * of the weight-raising period forward by the
+ * time the queue has not received any
+ * service. Otherwise, a relatively long
+ * service delay is likely to cause the
+ * weight-raising period of the queue to end,
+ * because of the short duration of the
+ * weight-raising period of a soft real-time
+ * queue. It is worth noting that this move
+ * is not so dangerous for the other queues,
+ * because soft real-time queues are not
+ * greedy.
+ *
+ * To not add a further variable, we use the
+ * overloaded field budget_timeout to
+ * determine for how long the queue has not
+ * received service, i.e., how much time has
+ * elapsed since the queue expired. However,
+ * this is a little imprecise, because
+ * budget_timeout is set to jiffies if bfqq
+ * not only expires, but also remains with no
+ * request.
+ */
+ if (time_after(bfqq->budget_timeout,
+ bfqq->last_wr_start_finish))
+ bfqq->last_wr_start_finish +=
+ jiffies - bfqq->budget_timeout;
+ else
+ bfqq->last_wr_start_finish = jiffies;
+
+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
+ pr_crit(
+ "BFQ WARNING:last %lu budget %lu jiffies %lu",
+ bfqq->last_wr_start_finish,
+ bfqq->budget_timeout,
+ jiffies);
+ pr_crit("diff %lu", jiffies -
+ max_t(unsigned long,
+ bfqq->last_wr_start_finish,
+ bfqq->budget_timeout));
+ bfqq->last_wr_start_finish = jiffies;
+ }
+ }
+
+ bfq_set_budget_timeout(bfqd, bfqq);
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_in_service_queue, cur-budget = %d",
+ bfqq->entity.budget);
+ } else
+ bfq_log(bfqd, "set_in_service_queue: NULL");
+
+ bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Get and set a new queue for service.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
+
+ __bfq_set_in_service_queue(bfqd, bfqq);
+ return bfqq;
+}
+
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+ struct bfq_io_cq *bic;
+ u32 sl;
+
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+ /* Processes have exited, don't wait. */
+ bic = bfqd->in_service_bic;
+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
+ return;
+
+ bfq_mark_bfqq_wait_request(bfqq);
+
+ /*
+ * We don't want to idle for seeks, but we do want to allow
+ * fair distribution of slice time for a process doing back-to-back
+ * seeks. So allow a little bit of time for him to submit a new rq.
+ *
+ * To prevent processes with (partly) seeky workloads from
+ * being too ill-treated, grant them a small fraction of the
+ * assigned budget before reducing the waiting time to
+ * BFQ_MIN_TT. This happened to help reduce latency.
+ */
+ sl = bfqd->bfq_slice_idle;
+ /*
+ * Unless the queue is being weight-raised or the scenario is
+ * asymmetric, grant only minimum idle time if the queue
+ * is seeky. A long idling is preserved for a weight-raised
+ * queue, or, more in general, in an asymemtric scenario,
+ * because a long idling is needed for guaranteeing to a queue
+ * its reserved share of the throughput (in particular, it is
+ * needed if the queue has a higher weight than some other
+ * queue).
+ */
+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+ bfq_symmetric_scenario(bfqd))
+ sl = min_t(u32, sl, BFQ_MIN_TT);
+
+ bfqd->last_idling_start = ktime_get();
+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
+ HRTIMER_MODE_REL);
+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+ bfq_log(bfqd, "arm idle: %ld/%ld ms",
+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
+}
+
+/*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This enables BFQ to utilize a full timeslice with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+ int dev_type = blk_queue_nonrot(bfqd->queue);
+
+ if (bfqd->bfq_user_max_budget == 0) {
+ bfqd->bfq_max_budget =
+ bfq_calc_max_budget(bfqd);
+ BUG_ON(bfqd->bfq_max_budget < 0);
+ bfq_log(bfqd, "new max_budget = %d",
+ bfqd->bfq_max_budget);
+ }
+
+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_SLOW;
+ bfqd->RT_prod = R_slow[dev_type] *
+ T_slow[dev_type];
+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->RT_prod = R_fast[dev_type] *
+ T_fast[dev_type];
+ }
+
+ bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
+ dev_type == 0 ? "ROT" : "NONROT",
+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+ bfqd->device_speed == BFQ_BFQD_FAST ?
+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+ BFQ_RATE_SHIFT);
+}
+
+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
+{
+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
+ bfqd->peak_rate_samples = 1;
+ bfqd->sequential_samples = 0;
+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+ blk_rq_sectors(rq);
+ } else /* no new rq dispatched, just reset the number of samples */
+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+ bfq_log(bfqd,
+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched);
+}
+
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+ u32 rate, weight, divisor;
+
+ /*
+ * For the convergence property to hold (see comments on
+ * bfq_update_peak_rate()) and for the assessment to be
+ * reliable, a minimum number of samples must be present, and
+ * a minimum amount of time must have elapsed. If not so, do
+ * not compute new rate. Just reset parameters, to get ready
+ * for a new evaluation attempt.
+ */
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+ bfq_log(bfqd,
+ "update_rate_reset: only resetting, delta_first %lluus samples %d",
+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+ goto reset_computation;
+ }
+
+ /*
+ * If a new request completion has occurred after last
+ * dispatch, then, to approximate the rate at which requests
+ * have been served by the device, it is more precise to
+ * extend the observation interval to the last completion.
+ */
+ bfqd->delta_from_first =
+ max_t(u64, bfqd->delta_from_first,
+ bfqd->last_completion - bfqd->first_dispatch);
+
+ BUG_ON(bfqd->delta_from_first == 0);
+ /*
+ * Rate computed in sects/usec, and not sects/nsec, for
+ * precision issues.
+ */
+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+ bfq_log(bfqd,
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ rate > 20<<BFQ_RATE_SHIFT);
+
+ /*
+ * Peak rate not updated if:
+ * - the percentage of sequential dispatches is below 3/4 of the
+ * total, and rate is below the current estimated peak rate
+ * - rate is unreasonably high (> 20M sectors/sec)
+ */
+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+ rate <= bfqd->peak_rate) ||
+ rate > 20<<BFQ_RATE_SHIFT) {
+ bfq_log(bfqd,
+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ goto reset_computation;
+ } else {
+ bfq_log(bfqd,
+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ }
+
+ /*
+ * We have to update the peak rate, at last! To this purpose,
+ * we use a low-pass filter. We compute the smoothing constant
+ * of the filter as a function of the 'weight' of the new
+ * measured rate.
+ *
+ * As can be seen in next formulas, we define this weight as a
+ * quantity proportional to how sequential the workload is,
+ * and to how long the observation time interval is.
+ *
+ * The weight runs from 0 to 8. The maximum value of the
+ * weight, 8, yields the minimum value for the smoothing
+ * constant. At this minimum value for the smoothing constant,
+ * the measured rate contributes for half of the next value of
+ * the estimated peak rate.
+ *
+ * So, the first step is to compute the weight as a function
+ * of how sequential the workload is. Note that the weight
+ * cannot reach 9, because bfqd->sequential_samples cannot
+ * become equal to bfqd->peak_rate_samples, which, in its
+ * turn, holds true because bfqd->sequential_samples is not
+ * incremented for the first sample.
+ */
+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+ /*
+ * Second step: further refine the weight as a function of the
+ * duration of the observation interval.
+ */
+ weight = min_t(u32, 8,
+ div_u64(weight * bfqd->delta_from_first,
+ BFQ_RATE_REF_INTERVAL));
+
+ /*
+ * Divisor ranging from 10, for minimum weight, to 2, for
+ * maximum weight.
+ */
+ divisor = 10 - weight;
+ BUG_ON(divisor == 0);
+
+ /*
+ * Finally, update peak rate:
+ *
+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+ */
+ bfqd->peak_rate *= divisor-1;
+ bfqd->peak_rate /= divisor;
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+ bfq_log(bfqd,
+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
+ divisor,
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+
+ BUG_ON(bfqd->peak_rate == 0);
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+ bfqd->peak_rate += rate;
+ update_thr_responsiveness_params(bfqd);
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+reset_computation:
+ bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes to be invoked
+ * on every request dispatch.
+ */
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+ u64 now_ns = ktime_get_ns();
+
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+ bfq_log(bfqd,
+ "update_peak_rate: goto reset, samples %d",
+ bfqd->peak_rate_samples) ;
+ bfq_reset_rate_computation(bfqd, rq);
+ goto update_last_values; /* will add one sample */
+ }
+
+ /*
+ * Device idle for very long: the observation interval lasting
+ * up to this dispatch cannot be a valid observation interval
+ * for computing a new peak rate (similarly to the late-
+ * completion event in bfq_completed_request()). Go to
+ * update_rate_and_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - start a new observation interval with this dispatch
+ */
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+ bfqd->rq_in_driver == 0) {
+ bfq_log(bfqd,
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
+ (now_ns - bfqd->last_dispatch)>>10,
+ bfqd->peak_rate_samples) ;
+ goto update_rate_and_reset;
+ }
+
+ /* Update sampling information */
+ bfqd->peak_rate_samples++;
+
+ if ((bfqd->rq_in_driver > 0 ||
+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ bfqd->sequential_samples++;
+
+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+ /* Reset max observed rq size every 32 dispatches */
+ if (likely(bfqd->peak_rate_samples % 32))
+ bfqd->last_rq_max_size =
+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+ else
+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+ bfq_log(bfqd,
+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched,
+ bfqd->delta_from_first>>10);
+
+ /* Target observation interval not yet reached, go on sampling */
+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+ goto update_last_values;
+
+update_rate_and_reset:
+ bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ bfqd->last_dispatch = now_ns;
+
+ bfq_log(bfqd,
+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
+ (now_ns - bfqd->first_dispatch)>>10,
+ (unsigned long long) bfqd->last_position,
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ bfq_log(bfqd,
+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
+}
+
+/*
+ * Move request from internal lists to the dispatch list of the request queue
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+ /*
+ * For consistency, the next instruction should have been executed
+ * after removing the request from the queue and dispatching it.
+ * We execute instead this instruction before bfq_remove_request()
+ * (and hence introduce a temporary inconsistency), for efficiency.
+ * In fact, in a forced_dispatch, this prevents two counters related
+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq
+ * is not in service, and then to be incremented again after
+ * incrementing bfqq->dispatched.
+ */
+ bfqq->dispatched++;
+ bfq_update_peak_rate(q->elevator->elevator_data, rq);
+
+ bfq_remove_request(rq);
+ elv_dispatch_sort(q, rq);
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ BUG_ON(bfqq != bfqd->in_service_queue);
+
+ /*
+ * If this bfqq is shared between multiple processes, check
+ * to make sure that those processes are still issuing I/Os
+ * within the mean seek distance. If not, it may be time to
+ * break the queues apart again.
+ */
+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+ bfq_mark_bfqq_split_coop(bfqq);
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ if (bfqq->dispatched == 0)
+ /*
+ * Overloading budget_timeout field to store
+ * the time at which the queue remains with no
+ * backlog and no outstanding request; used by
+ * the weight-raising mechanism.
+ */
+ bfqq->budget_timeout = jiffies;
+
+ bfq_del_bfqq_busy(bfqd, bfqq, true);
+ } else {
+ bfq_requeue_bfqq(bfqd, bfqq);
+ /*
+ * Resort priority tree of potential close cooperators.
+ */
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ }
+
+ /*
+ * All in-service entities must have been properly deactivated
+ * or requeued before executing the next function, which
+ * resets all in-service entites as no more in service.
+ */
+ __bfq_bfqd_reset_in_service(bfqd);
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ enum bfqq_expiration reason)
+{
+ struct request *next_rq;
+ int budget, min_budget;
+
+ BUG_ON(bfqq != bfqd->in_service_queue);
+
+ min_budget = bfq_min_budget(bfqd);
+
+ if (bfqq->wr_coeff == 1)
+ budget = bfqq->max_budget;
+ else /*
+ * Use a constant, low budget for weight-raised queues,
+ * to help achieve a low latency. Keep it slightly higher
+ * than the minimum possible budget, to cause a little
+ * bit fewer expirations.
+ */
+ budget = 2 * min_budget;
+
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+ budget, bfq_min_budget(bfqd));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
+ switch (reason) {
+ /*
+ * Caveat: in all the following cases we trade latency
+ * for throughput.
+ */
+ case BFQ_BFQQ_TOO_IDLE:
+ /*
+ * This is the only case where we may reduce
+ * the budget: if there is no request of the
+ * process still waiting for completion, then
+ * we assume (tentatively) that the timer has
+ * expired because the batch of requests of
+ * the process could have been served with a
+ * smaller budget. Hence, betting that
+ * process will behave in the same way when it
+ * becomes backlogged again, we reduce its
+ * next budget. As long as we guess right,
+ * this budget cut reduces the latency
+ * experienced by the process.
+ *
+ * However, if there are still outstanding
+ * requests, then the process may have not yet
+ * issued its next request just because it is
+ * still waiting for the completion of some of
+ * the still outstanding ones. So in this
+ * subcase we do not reduce its budget, on the
+ * contrary we increase it to possibly boost
+ * the throughput, as discussed in the
+ * comments to the BUDGET_TIMEOUT case.
+ */
+ if (bfqq->dispatched > 0) /* still outstanding reqs */
+ budget = min(budget * 2, bfqd->bfq_max_budget);
+ else {
+ if (budget > 5 * min_budget)
+ budget -= 4 * min_budget;
+ else
+ budget = min_budget;
+ }
+ break;
+ case BFQ_BFQQ_BUDGET_TIMEOUT:
+ /*
+ * We double the budget here because it gives
+ * the chance to boost the throughput if this
+ * is not a seeky process (and has bumped into
+ * this timeout because of, e.g., ZBR).
+ */
+ budget = min(budget * 2, bfqd->bfq_max_budget);
+ break;
+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
+ /*
+ * The process still has backlog, and did not
+ * let either the budget timeout or the disk
+ * idling timeout expire. Hence it is not
+ * seeky, has a short thinktime and may be
+ * happy with a higher budget too. So
+ * definitely increase the budget of this good
+ * candidate to boost the disk throughput.
+ */
+ budget = min(budget * 4, bfqd->bfq_max_budget);
+ break;
+ case BFQ_BFQQ_NO_MORE_REQUESTS:
+ /*
+ * For queues that expire for this reason, it
+ * is particularly important to keep the
+ * budget close to the actual service they
+ * need. Doing so reduces the timestamp
+ * misalignment problem described in the
+ * comments in the body of
+ * __bfq_activate_entity. In fact, suppose
+ * that a queue systematically expires for
+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
+ * new request in time to enjoy timestamp
+ * back-shifting. The larger the budget of the
+ * queue is with respect to the service the
+ * queue actually requests in each service
+ * slot, the more times the queue can be
+ * reactivated with the same virtual finish
+ * time. It follows that, even if this finish
+ * time is pushed to the system virtual time
+ * to reduce the consequent timestamp
+ * misalignment, the queue unjustly enjoys for
+ * many re-activations a lower finish time
+ * than all newly activated queues.
+ *
+ * The service needed by bfqq is measured
+ * quite precisely by bfqq->entity.service.
+ * Since bfqq does not enjoy device idling,
+ * bfqq->entity.service is equal to the number
+ * of sectors that the process associated with
+ * bfqq requested to read/write before waiting
+ * for request completions, or blocking for
+ * other reasons.
+ */
+ budget = max_t(int, bfqq->entity.service, min_budget);
+ break;
+ default:
+ return;
+ }
+ } else if (!bfq_bfqq_sync(bfqq))
+ /*
+ * Async queues get always the maximum possible
+ * budget, as for them we do not care about latency
+ * (in addition, their ability to dispatch is limited
+ * by the charging factor).
+ */
+ budget = bfqd->bfq_max_budget;
+
+ bfqq->max_budget = budget;
+
+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+ !bfqd->bfq_user_max_budget)
+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+ /*
+ * If there is still backlog, then assign a new budget, making
+ * sure that it is large enough for the next request. Since
+ * the finish time of bfqq must be kept in sync with the
+ * budget, be sure to call __bfq_bfqq_expire() *after* this
+ * update.
+ *
+ * If there is no backlog, then no need to update the budget;
+ * it will be updated on the arrival of a new request.
+ */
+ next_rq = bfqq->next_rq;
+ if (next_rq) {
+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
+ reason == BFQ_BFQQ_NO_MORE_REQUESTS);
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ BUG_ON(!bfq_bfqq_busy(bfqq));
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+ }
+
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+ next_rq ? blk_rq_sectors(next_rq) : 0,
+ bfqq->entity.budget);
+}
+
+/*
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard if ever possible to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served. In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
+ */
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool compensate, enum bfqq_expiration reason,
+ unsigned long *delta_ms)
+{
+ ktime_t delta_ktime;
+ u32 delta_usecs;
+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
+
+ if (!bfq_bfqq_sync(bfqq))
+ return false;
+
+ if (compensate)
+ delta_ktime = bfqd->last_idling_start;
+ else
+ delta_ktime = ktime_get();
+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
+ delta_usecs = ktime_to_us(delta_ktime);
+
+ /* don't trust short/unrealistic values. */
+ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {
+ if (blk_queue_nonrot(bfqd->queue))
+ /*
+ * give same worst-case guarantees as idling
+ * for seeky
+ */
+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+ else /* charge at least one seek */
+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+ bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs);
+
+ return slow;
+ }
+
+ *delta_ms = delta_usecs / USEC_PER_MSEC;
+
+ /*
+ * Use only long (> 20ms) intervals to filter out excessive
+ * spikes in service rate estimation.
+ */
+ if (delta_usecs > 20000) {
+ /*
+ * Caveat for rotational devices: processes doing I/O
+ * in the slower disk zones tend to be slow(er) even
+ * if not seeky. In this respect, the estimated peak
+ * rate is likely to be an average over the disk
+ * surface. Accordingly, to not be too harsh with
+ * unlucky processes, a process is deemed slow only if
+ * its rate has been lower than half of the estimated
+ * peak rate.
+ */
+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
+ bfqq->entity.service, bfqd->bfq_max_budget);
+ }
+
+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+ return slow;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application to be
+ * deemed as soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ * HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ * for a while, then suddenly 'jump' by several units to recover the lost
+ * increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqd, bfqq,
+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
+ bfqq->service_from_backlogged,
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate));
+
+ return max(bfqq->last_idle_bklogged +
+ HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+}
+
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+ return jiffies + MAX_JIFFY_OFFSET;
+}
+
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+ return jiffies - MAX_JIFFY_OFFSET;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
+ *
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
+ */
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason)
+{
+ bool slow;
+ unsigned long delta = 0;
+ struct bfq_entity *entity = &bfqq->entity;
+
+ BUG_ON(bfqq != bfqd->in_service_queue);
+
+ /*
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
+ */
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+
+ /*
+ * Increase service_from_backlogged before next statement,
+ * because the possible next invocation of
+ * bfq_bfqq_charge_time would likely inflate
+ * entity->service. In contrast, service_from_backlogged must
+ * contain real service, to enable the soft real-time
+ * heuristic to correctly compute the bandwidth consumed by
+ * bfqq.
+ */
+ bfqq->service_from_backlogged += entity->service;
+
+ /*
+ * As above explained, charge slow (typically seeky) and
+ * timed-out queues with the time and not the service
+ * received, to favor sequential workloads.
+ *
+ * Processes doing I/O in the slower disk zones will tend to
+ * be slow(er) even if not seeky. Therefore, since the
+ * estimated peak rate is actually an average over the disk
+ * surface, these processes may timeout just for bad luck. To
+ * avoid punishing them, do not charge time to processes that
+ * succeeded in consuming at least 2/3 of their budget. This
+ * allows BFQ to preserve enough elasticity to still perform
+ * bandwidth, and not time, distribution with little unlucky
+ * or quasi-sequential processes.
+ */
+ if (bfqq->wr_coeff == 1 &&
+ (slow ||
+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
+ bfq_bfqq_charge_time(bfqd, bfqq, delta);
+
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+ if (reason == BFQ_BFQQ_TOO_IDLE &&
+ entity->service <= 2 * entity->budget / 10)
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ if (bfqd->low_latency && bfqq->wr_coeff == 1)
+ bfqq->last_wr_start_finish = jiffies;
+
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * If we get here, and there are no outstanding
+ * requests, then the request pattern is isochronous
+ * (see the comments on the function
+ * bfq_bfqq_softrt_next_start()). Thus we can compute
+ * soft_rt_next_start. If, instead, the queue still
+ * has outstanding requests, then we have to wait for
+ * the completion of all the outstanding requests to
+ * discover whether the request pattern is actually
+ * isochronous.
+ */
+ BUG_ON(bfqd->busy_queues < 1);
+ if (bfqq->dispatched == 0) {
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
+ bfqq->soft_rt_next_start);
+ } else {
+ /*
+ * The application is still waiting for the
+ * completion of one or more requests:
+ * prevent it from possibly being incorrectly
+ * deemed as soft real-time by setting its
+ * soft_rt_next_start to infinity. In fact,
+ * without this assignment, the application
+ * would be incorrectly deemed as soft
+ * real-time if:
+ * 1) it issued a new request before the
+ * completion of all its in-flight
+ * requests, and
+ * 2) at that time, its soft_rt_next_start
+ * happened to be in the past.
+ */
+ bfqq->soft_rt_next_start =
+ bfq_greatest_from_now();
+ /*
+ * Schedule an update of soft_rt_next_start to when
+ * the task may be discovered to be isochronous.
+ */
+ bfq_mark_bfqq_softrt_update(bfqq);
+ }
+ }
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",
+ reason, slow, bfqq->dispatched,
+ bfq_bfqq_idle_window(bfqq), entity->weight);
+
+ /*
+ * Increase, decrease or leave budget unchanged according to
+ * reason.
+ */
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+ BUG_ON(bfqq->next_rq == NULL &&
+ bfqq->entity.budget < bfqq->entity.service);
+ __bfq_bfqq_expire(bfqd, bfqq);
+
+ BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
+ !bfq_class_idle(bfqq));
+
+ if (!bfq_bfqq_busy(bfqq) &&
+ reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+ return time_is_before_eq_jiffies(bfqq->budget_timeout);
+}
+
+/*
+ * If we expire a queue that is actively waiting (i.e., with the
+ * device idled) for the arrival of a new request, then we may incur
+ * the timestamp misalignment problem described in the body of the
+ * function __bfq_activate_entity. Hence we return true only if this
+ * condition does not hold, or if the queue is slow enough to deserve
+ * only to be kicked off for preserving a high throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "may_budget_timeout: wait_request %d left %d timeout %d",
+ bfq_bfqq_wait_request(bfqq),
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
+ bfq_bfqq_budget_timeout(bfqq));
+
+ return (!bfq_bfqq_wait_request(bfqq) ||
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
+ &&
+ bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * while introducing the variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ bool idling_boosts_thr, idling_boosts_thr_without_issues,
+ idling_needed_for_service_guarantees,
+ asymmetric_scenario;
+
+ if (bfqd->strict_guarantees)
+ return true;
+
+ /*
+ * The next variable takes into account the cases where idling
+ * boosts the throughput.
+ *
+ * The value of the variable is computed considering, first, that
+ * idling is virtually always beneficial for the throughput if:
+ * (a) the device is not NCQ-capable, or
+ * (b) regardless of the presence of NCQ, the device is rotational
+ * and the request pattern for bfqq is I/O-bound and sequential.
+ *
+ * Secondly, and in contrast to the above item (b), idling an
+ * NCQ-capable flash-based device would not boost the
+ * throughput even with sequential I/O; rather it would lower
+ * the throughput in proportion to how fast the device
+ * is. Accordingly, the next variable is true if any of the
+ * above conditions (a) and (b) is true, and, in particular,
+ * happens to be false if bfqd is an NCQ-capable flash-based
+ * device.
+ */
+ idling_boosts_thr = !bfqd->hw_tag ||
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+ bfq_bfqq_idle_window(bfqq));
+
+ /*
+ * The value of the next variable,
+ * idling_boosts_thr_without_issues, is equal to that of
+ * idling_boosts_thr, unless a special case holds. In this
+ * special case, described below, idling may cause problems to
+ * weight-raised queues.
+ *
+ * When the request pool is saturated (e.g., in the presence
+ * of write hogs), if the processes associated with
+ * non-weight-raised queues ask for requests at a lower rate,
+ * then processes associated with weight-raised queues have a
+ * higher probability to get a request from the pool
+ * immediately (or at least soon) when they need one. Thus
+ * they have a higher probability to actually get a fraction
+ * of the device throughput proportional to their high
+ * weight. This is especially true with NCQ-capable drives,
+ * which enqueue several requests in advance, and further
+ * reorder internally-queued requests.
+ *
+ * For this reason, we force to false the value of
+ * idling_boosts_thr_without_issues if there are weight-raised
+ * busy queues. In this case, and if bfqq is not weight-raised,
+ * this guarantees that the device is not idled for bfqq (if,
+ * instead, bfqq is weight-raised, then idling will be
+ * guaranteed by another variable, see below). Combined with
+ * the timestamping rules of BFQ (see [1] for details), this
+ * behavior causes bfqq, and hence any sync non-weight-raised
+ * queue, to get a lower number of requests served, and thus
+ * to ask for a lower number of requests from the request
+ * pool, before the busy weight-raised queues get served
+ * again. This often mitigates starvation problems in the
+ * presence of heavy write workloads and NCQ, thereby
+ * guaranteeing a higher application and system responsiveness
+ * in these hostile scenarios.
+ */
+ idling_boosts_thr_without_issues = idling_boosts_thr &&
+ bfqd->wr_busy_queues == 0;
+
+ /*
+ * There is then a case where idling must be performed not
+ * for throughput concerns, but to preserve service
+ * guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) all these processes have the same I/O pattern
+ * (either sequential or random).
+ * In fact, in such a scenario, the drive will tend to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ *
+ * We address this issue by controlling, actually, only the
+ * symmetry sub-condition (i), i.e., provided that
+ * sub-condition (i) holds, idling is not performed,
+ * regardless of whether sub-condition (ii) holds. In other
+ * words, only if sub-condition (i) holds, then idling is
+ * allowed, and the device tends to be prevented from queueing
+ * many requests, possibly of several processes. The reason
+ * for not controlling also sub-condition (ii) is that we
+ * exploit preemption to preserve guarantees in case of
+ * symmetric scenarios, even if (ii) does not hold, as
+ * explained in the next two paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness. The motivation for using
+ * preemption instead of idling is that, by not idling,
+ * service guarantees are preserved without minimally
+ * sacrificing throughput. In other words, both a high
+ * throughput and its desired distribution are obtained.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * On the other hand, device idling is performed, and thus
+ * pure sector-domain guarantees are provided, for the
+ * following queues, which are likely to need stronger
+ * throughput guarantees: weight-raised queues, and queues
+ * with a higher weight than other queues. When such queues
+ * are active, sub-condition (i) is false, which triggers
+ * device idling.
+ *
+ * According to the above considerations, the next variable is
+ * true (only) if sub-condition (i) holds. To compute the
+ * value of this variable, we not only use the return value of
+ * the function bfq_symmetric_scenario(), but also check
+ * whether bfqq is being weight-raised, because
+ * bfq_symmetric_scenario() does not take into account also
+ * weight-raised queues (see comments on
+ * bfq_weights_tree_add()).
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+ asymmetric_scenario = bfqq->wr_coeff > 1 ||
+ !bfq_symmetric_scenario(bfqd);
+
+ /*
+ * Finally, there is a case where maximizing throughput is the
+ * best choice even if it may cause unfairness toward
+ * bfqq. Such a case is when bfqq became active in a burst of
+ * queue activations. Queues that became active during a large
+ * burst benefit only from throughput, as discussed in the
+ * comments on bfq_handle_burst. Thus, if bfqq became active
+ * in a burst and not idling the device maximizes throughput,
+ * then the device must no be idled, because not idling the
+ * device provides bfqq and all other queues in the burst with
+ * maximum benefit. Combining this and the above case, we can
+ * now establish when idling is actually needed to preserve
+ * service guarantees.
+ */
+ idling_needed_for_service_guarantees =
+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+ /*
+ * We have now all the components we need to compute the return
+ * value of the function, which is true only if both the following
+ * conditions hold:
+ * 1) bfqq is sync, because idling make sense only for sync queues;
+ * 2) idling either boosts the throughput (without issues), or
+ * is necessary to preserve service guarantees.
+ */
+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
+ bfq_bfqq_sync(bfqq), idling_boosts_thr);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
+ bfqd->wr_busy_queues,
+ idling_boosts_thr_without_issues,
+ bfq_bfqq_IO_bound(bfqq),
+ idling_needed_for_service_guarantees);
+
+ return bfq_bfqq_sync(bfqq) &&
+ (idling_boosts_thr_without_issues ||
+ idling_needed_for_service_guarantees);
+}
+
+/*
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * returns true, then:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a new
+ * request for the queue.
+ * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * why performing device idling is the best choice to boost the throughput
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * returns true.
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+
+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+ bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service. If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+ struct request *next_rq;
+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+
+ bfqq = bfqd->in_service_queue;
+ if (!bfqq)
+ goto new_queue;
+
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
+ !hrtimer_active(&bfqd->idle_slice_timer) &&
+ !bfq_bfqq_must_idle(bfqq))
+ goto expire;
+
+check_queue:
+ /*
+ * This loop is rarely executed more than once. Even when it
+ * happens, it is much more convenient to re-execute this loop
+ * than to return NULL and trigger a new dispatch to get a
+ * request served.
+ */
+ next_rq = bfqq->next_rq;
+ /*
+ * If bfqq has requests queued and it has enough budget left to
+ * serve them, keep the queue, otherwise expire it.
+ */
+ if (next_rq) {
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+ if (bfq_serv_to_charge(next_rq, bfqq) >
+ bfq_bfqq_budget_left(bfqq)) {
+ /*
+ * Expire the queue for budget exhaustion,
+ * which makes sure that the next budget is
+ * enough to serve the next request, even if
+ * it comes from the fifo expired path.
+ */
+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
+ goto expire;
+ } else {
+ /*
+ * The idle timer may be pending because we may
+ * not disable disk idling even when a new request
+ * arrives.
+ */
+ if (bfq_bfqq_wait_request(bfqq)) {
+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
+ /*
+ * If we get here: 1) at least a new request
+ * has arrived but we have not disabled the
+ * timer because the request was too small,
+ * 2) then the block layer has unplugged
+ * the device, causing the dispatch to be
+ * invoked.
+ *
+ * Since the device is unplugged, now the
+ * requests are probably large enough to
+ * provide a reasonable throughput.
+ * So we disable idling.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+ }
+ goto keep_queue;
+ }
+ }
+
+ /*
+ * No requests pending. However, if the in-service queue is idling
+ * for a new request, or has requests waiting for a completion and
+ * may idle after their completion, then keep it anyway.
+ */
+ if (hrtimer_active(&bfqd->idle_slice_timer) ||
+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ bfqq = NULL;
+ goto keep_queue;
+ }
+
+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
+expire:
+ bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+ bfqq = bfq_set_in_service_queue(bfqd);
+ if (bfqq) {
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
+ goto check_queue;
+ }
+keep_queue:
+ if (bfqq)
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
+ else
+ bfq_log(bfqd, "select_queue: no queue returned");
+
+ return bfqq;
+}
+
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
+ bfqq->wr_coeff,
+ bfqq->entity.weight, bfqq->entity.orig_weight);
+
+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
+ entity->orig_weight * bfqq->wr_coeff);
+ if (entity->prio_changed)
+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+ /*
+ * If the queue was activated in a burst, or too much
+ * time has elapsed from the beginning of this
+ * weight-raising period, then end weight raising.
+ */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bfq_bfqq_end_wr(bfqq);
+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time)) {
+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd)))
+ bfq_bfqq_end_wr(bfqq);
+ else {
+ /* switch back to interactive wr */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish =
+ bfqq->wr_start_at_switch_to_srt;
+ BUG_ON(time_is_after_jiffies(
+ bfqq->last_wr_start_finish));
+ bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqd, bfqq,
+ "back to interactive wr");
+ }
+ }
+ }
+ /* Update weight both if it must be raised and if it must be lowered */
+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+ __bfq_entity_update_weight_prio(
+ bfq_entity_service_tree(entity),
+ entity);
+}
+
+/*
+ * Dispatch one request from bfqq, moving it to the request queue
+ * dispatch list.
+ */
+static int bfq_dispatch_request(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ int dispatched = 0;
+ struct request *rq = bfqq->next_rq;
+ unsigned long service_to_charge;
+
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+ BUG_ON(!rq);
+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
+
+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
+
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+ bfq_bfqq_served(bfqq, service_to_charge);
+
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+ bfq_dispatch_insert(bfqd->queue, rq);
+
+ /*
+ * If weight raising has to terminate for bfqq, then next
+ * function causes an immediate update of bfqq's weight,
+ * without waiting for next activation. As a consequence, on
+ * expiration, bfqq will be timestamped as if has never been
+ * weight-raised during this service slot, even if it has
+ * received part or even most of the service as a
+ * weight-raised queue. This inflates bfqq's timestamps, which
+ * is beneficial, as bfqq is then more willing to leave the
+ * device immediately to possible other weight-raised queues.
+ */
+ bfq_update_wr_data(bfqd, bfqq);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "dispatched %u sec req (%llu), budg left %d",
+ blk_rq_sectors(rq),
+ (unsigned long long) blk_rq_pos(rq),
+ bfq_bfqq_budget_left(bfqq));
+
+ dispatched++;
+
+ if (!bfqd->in_service_bic) {
+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
+ bfqd->in_service_bic = RQ_BIC(rq);
+ }
+
+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+ goto expire;
+
+ return dispatched;
+
+expire:
+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
+ return dispatched;
+}
+
+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
+{
+ int dispatched = 0;
+
+ while (bfqq->next_rq) {
+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
+ dispatched++;
+ }
+
+ BUG_ON(!list_empty(&bfqq->fifo));
+ return dispatched;
+}
+
+/*
+ * Drain our current requests.
+ * Used for barriers and when switching io schedulers on-the-fly.
+ */
+static int bfq_forced_dispatch(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq, *n;
+ struct bfq_service_tree *st;
+ int dispatched = 0;
+
+ bfqq = bfqd->in_service_queue;
+ if (bfqq)
+ __bfq_bfqq_expire(bfqd, bfqq);
+
+ /*
+ * Loop through classes, and be careful to leave the scheduler
+ * in a consistent state, as feedback mechanisms and vtime
+ * updates cannot be disabled during the process.
+ */
+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
+ st = bfq_entity_service_tree(&bfqq->entity);
+
+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
+
+ bfqq->max_budget = bfq_max_budget(bfqd);
+ bfq_forget_idle(st);
+ }
+
+ BUG_ON(bfqd->busy_queues != 0);
+
+ return dispatched;
+}
+
+static int bfq_dispatch_requests(struct request_queue *q, int force)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_queue *bfqq;
+
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
+ if (bfqd->busy_queues == 0)
+ return 0;
+
+ if (unlikely(force))
+ return bfq_forced_dispatch(bfqd);
+
+ /*
+ * Force device to serve one request at a time if
+ * strict_guarantees is true. Forcing this service scheme is
+ * currently the ONLY way to guarantee that the request
+ * service order enforced by the scheduler is respected by a
+ * queueing device. Otherwise the device is free even to make
+ * some unlucky request wait for as long as the device
+ * wishes.
+ *
+ * Of course, serving one request at at time may cause loss of
+ * throughput.
+ */
+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ return 0;
+
+ bfqq = bfq_select_queue(bfqd);
+ if (!bfqq)
+ return 0;
+
+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+ BUG_ON(bfq_bfqq_wait_request(bfqq));
+
+ if (!bfq_dispatch_request(bfqd, bfqq))
+ return 0;
+
+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
+ bfq_bfqq_sync(bfqq) ? "sync" : "async");
+
+ BUG_ON(bfqq->next_rq == NULL &&
+ bfqq->entity.budget < bfqq->entity.service);
+ return 1;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits. Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Queue lock must be held here.
+ */
+static void bfq_put_queue(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+ BUG_ON(bfqq->ref <= 0);
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
+ bfqq->ref--;
+ if (bfqq->ref)
+ return;
+
+ BUG_ON(rb_first(&bfqq->sort_list));
+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
+ BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfq_bfqq_busy(bfqq));
+
+ if (bfq_bfqq_sync(bfqq))
+ /*
+ * The fact that this queue is being destroyed does not
+ * invalidate the fact that this queue may have been
+ * activated during the current burst. As a consequence,
+ * although the queue does not exist anymore, and hence
+ * needs to be removed from the burst list if there,
+ * the burst size has not to be decremented.
+ */
+ hlist_del_init(&bfqq->burst_list_node);
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+
+ kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ bfqg_put(bfqg);
+#endif
+}
+
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+ struct bfq_queue *__bfqq, *next;
+
+ /*
+ * If this queue was scheduled to merge with another queue, be
+ * sure to drop the reference taken on that queue (and others in
+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
+ */
+ __bfqq = bfqq->new_bfqq;
+ while (__bfqq) {
+ if (__bfqq == bfqq)
+ break;
+ next = __bfqq->new_bfqq;
+ bfq_put_queue(__bfqq);
+ __bfqq = next;
+ }
+}
+
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ if (bfqq == bfqd->in_service_queue) {
+ __bfq_bfqq_expire(bfqd, bfqq);
+ bfq_schedule_dispatch(bfqd);
+ }
+
+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
+
+ bfq_put_cooperator(bfqq);
+
+ bfq_put_queue(bfqq);
+}
+
+static void bfq_init_icq(struct io_cq *icq)
+{
+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+ struct bfq_io_cq *bic = icq_to_bic(icq);
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+ if (bic_to_bfqq(bic, false)) {
+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
+ bic_set_bfqq(bic, NULL, false);
+ }
+
+ if (bic_to_bfqq(bic, true)) {
+ /*
+ * If the bic is using a shared queue, put the reference
+ * taken on the io_context when the bic started using a
+ * shared bfq_queue.
+ */
+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
+ put_io_context(icq->ioc);
+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
+ bic_set_bfqq(bic, NULL, true);
+ }
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ */
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic)
+{
+ struct task_struct *tsk = current;
+ int ioprio_class;
+
+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+ switch (ioprio_class) {
+ default:
+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
+ "bfq: bad prio class %d\n", ioprio_class);
+ case IOPRIO_CLASS_NONE:
+ /*
+ * No prio set, inherit CPU scheduling settings.
+ */
+ bfqq->new_ioprio = task_nice_ioprio(tsk);
+ bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+ break;
+ case IOPRIO_CLASS_RT:
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+ break;
+ case IOPRIO_CLASS_BE:
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+ break;
+ case IOPRIO_CLASS_IDLE:
+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+ bfqq->new_ioprio = 7;
+ bfq_clear_bfqq_idle_window(bfqq);
+ break;
+ }
+
+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+ bfqq->new_ioprio);
+ BUG();
+ }
+
+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+ bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "set_next_ioprio_data: bic_class %d prio %d class %d",
+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
+}
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ struct bfq_queue *bfqq;
+ unsigned long uninitialized_var(flags);
+ int ioprio = bic->icq.ioc->ioprio;
+
+ /*
+ * This condition may trigger on a newly created bic, be sure to
+ * drop the lock before returning.
+ */
+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
+ return;
+
+ bic->ioprio = ioprio;
+
+ bfqq = bic_to_bfqq(bic, false);
+ if (bfqq) {
+ bfq_put_queue(bfqq);
+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+ bic_set_bfqq(bic, bfqq, false);
+ bfq_log_bfqq(bfqd, bfqq,
+ "check_ioprio_change: bfqq %p %d",
+ bfqq, bfqq->ref);
+ }
+
+ bfqq = bic_to_bfqq(bic, true);
+ if (bfqq)
+ bfq_set_next_ioprio_data(bfqq, bic);
+}
+
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
+ INIT_LIST_HEAD(&bfqq->fifo);
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
+
+ bfqq->ref = 0;
+ bfqq->bfqd = bfqd;
+
+ if (bic)
+ bfq_set_next_ioprio_data(bfqq, bic);
+
+ if (is_sync) {
+ if (!bfq_class_idle(bfqq))
+ bfq_mark_bfqq_idle_window(bfqq);
+ bfq_mark_bfqq_sync(bfqq);
+ bfq_mark_bfqq_just_created(bfqq);
+ } else
+ bfq_clear_bfqq_sync(bfqq);
+ bfq_mark_bfqq_IO_bound(bfqq);
+
+ /* Tentative initial value to trade off between thr and lat */
+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+ bfqq->pid = pid;
+
+ bfqq->wr_coeff = 1;
+ bfqq->last_wr_start_finish = jiffies;
+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+ bfqq->budget_timeout = bfq_smallest_from_now();
+ bfqq->split_time = bfq_smallest_from_now();
+
+ /*
+ * Set to the value for which bfqq will not be deemed as
+ * soft rt when it becomes backlogged.
+ */
+ bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
+ /* first request is almost certainly seeky */
+ bfqq->seek_history = 1;
+}
+
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ int ioprio_class, int ioprio)
+{
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_RT:
+ return &bfqg->async_bfqq[0][ioprio];
+ case IOPRIO_CLASS_NONE:
+ ioprio = IOPRIO_NORM;
+ /* fall through */
+ case IOPRIO_CLASS_BE:
+ return &bfqg->async_bfqq[1][ioprio];
+ case IOPRIO_CLASS_IDLE:
+ return &bfqg->async_idle_bfqq;
+ default:
+ BUG();
+ }
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic)
+{
+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+ struct bfq_queue **async_bfqq = NULL;
+ struct bfq_queue *bfqq;
+ struct bfq_group *bfqg;
+
+ rcu_read_lock();
+
+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+ if (!bfqg) {
+ bfqq = &bfqd->oom_bfqq;
+ goto out;
+ }
+
+ if (!is_sync) {
+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+ ioprio);
+ bfqq = *async_bfqq;
+ if (bfqq)
+ goto out;
+ }
+
+ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
+ bfqd->queue->node);
+
+ if (bfqq) {
+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+ is_sync);
+ bfq_init_entity(&bfqq->entity, bfqg);
+ bfq_log_bfqq(bfqd, bfqq, "allocated");
+ } else {
+ bfqq = &bfqd->oom_bfqq;
+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+ goto out;
+ }
+
+ /*
+ * Pin the queue now that it's allocated, scheduler exit will
+ * prune it.
+ */
+ if (async_bfqq) {
+ bfqq->ref++; /*
+ * Extra group reference, w.r.t. sync
+ * queue. This extra reference is removed
+ * only if bfqq->bfqg disappears, to
+ * guarantee that this queue is not freed
+ * until its group goes away.
+ */
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+ bfqq, bfqq->ref);
+ *async_bfqq = bfqq;
+ }
+
+out:
+ bfqq->ref++;
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+ rcu_read_unlock();
+ return bfqq;
+}
+
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic)
+{
+ struct bfq_ttime *ttime = &bic->ttime;
+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
+
+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
+
+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
+ ttime->ttime_samples);
+}
+
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct request *rq)
+{
+ bfqq->seek_history <<= 1;
+ bfqq->seek_history |=
+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
+ (!blk_queue_nonrot(bfqd->queue) ||
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+}
+
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic)
+{
+ int enable_idle;
+
+ /* Don't idle for async or idle io prio class. */
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+ return;
+
+ /* Idle window just restored, statistics are meaningless. */
+ if (time_is_after_eq_jiffies(bfqq->split_time +
+ bfqd->bfq_wr_min_idle_time))
+ return;
+
+ enable_idle = bfq_bfqq_idle_window(bfqq);
+
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+ bfqd->bfq_slice_idle == 0 ||
+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+ bfqq->wr_coeff == 1))
+ enable_idle = 0;
+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+ bfqq->wr_coeff == 1)
+ enable_idle = 0;
+ else
+ enable_idle = 1;
+ }
+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+ enable_idle);
+
+ if (enable_idle)
+ bfq_mark_bfqq_idle_window(bfqq);
+ else
+ bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq. Check if there's
+ * something we should do about it.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct request *rq)
+{
+ struct bfq_io_cq *bic = RQ_BIC(rq);
+
+ if (rq->cmd_flags & REQ_META)
+ bfqq->meta_pending++;
+
+ bfq_update_io_thinktime(bfqd, bic);
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+ !BFQQ_SEEKY(bfqq))
+ bfq_update_idle_window(bfqd, bfqq, bic);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_enqueued: idle_window=%d (seeky %d)",
+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
+
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+ blk_rq_sectors(rq) < 32;
+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+ /*
+ * There is just this request queued: if the request
+ * is small and the queue is not to be expired, then
+ * just exit.
+ *
+ * In this way, if the device is being idled to wait
+ * for a new request from the in-service queue, we
+ * avoid unplugging the device and committing the
+ * device to serve just a small request. On the
+ * contrary, we wait for the block layer to decide
+ * when to unplug the device: hopefully, new requests
+ * will be merged to this one quickly, then the device
+ * will be unplugged and larger requests will be
+ * dispatched.
+ */
+ if (small_req && !budget_timeout)
+ return;
+
+ /*
+ * A large enough request arrived, or the queue is to
+ * be expired: in both cases disk idling is to be
+ * stopped, so clear wait_request flag and reset
+ * timer.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+
+ /*
+ * The queue is not empty, because a new request just
+ * arrived. Hence we can safely expire the queue, in
+ * case of budget timeout, without risking that the
+ * timestamps of the queue are not updated correctly.
+ * See [1] for more details.
+ */
+ if (budget_timeout)
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQ_BFQQ_BUDGET_TIMEOUT);
+
+ /*
+ * Let the request rip immediately, or let a new queue be
+ * selected if bfqq has just been expired.
+ */
+ __blk_run_queue(bfqd->queue);
+ }
+}
+
+static void bfq_insert_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
+
+ assert_spin_locked(bfqd->queue->queue_lock);
+
+ /*
+ * An unplug may trigger a requeue of a request from the device
+ * driver: make sure we are in process context while trying to
+ * merge two bfq_queues.
+ */
+ if (!in_interrupt()) {
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+ if (new_bfqq) {
+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+ /*
+ * Release the request's reference to the old bfqq
+ * and make sure one is taken to the shared queue.
+ */
+ new_bfqq->allocated[rq_data_dir(rq)]++;
+ bfqq->allocated[rq_data_dir(rq)]--;
+ new_bfqq->ref++;
+ bfq_clear_bfqq_just_created(bfqq);
+ bfq_put_queue(bfqq);
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+ bfqq, new_bfqq);
+ rq->elv.priv[1] = new_bfqq;
+ bfqq = new_bfqq;
+ }
+ }
+
+ bfq_add_request(rq);
+
+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+ list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+ bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+ bfqd->rq_in_driver);
+
+ if (bfqd->hw_tag == 1)
+ return;
+
+ /*
+ * This sample is valid if the number of outstanding requests
+ * is large enough to allow a queueing behavior. Note that the
+ * sum is not exact, as it's not taking into account deactivated
+ * requests.
+ */
+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+ return;
+
+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+ return;
+
+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+ bfqd->max_rq_in_driver = 0;
+ bfqd->hw_tag_samples = 0;
+}
+
+static void bfq_completed_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ u64 now_ns;
+ u32 delta_us;
+
+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
+ blk_rq_sectors(rq));
+
+ assert_spin_locked(bfqd->queue->queue_lock);
+ bfq_update_hw_tag(bfqd);
+
+ BUG_ON(!bfqd->rq_in_driver);
+ BUG_ON(!bfqq->dispatched);
+ bfqd->rq_in_driver--;
+ bfqq->dispatched--;
+ bfqg_stats_update_completion(bfqq_group(bfqq),
+ rq_start_time_ns(rq),
+ rq_io_start_time_ns(rq), req_op(rq),
+ rq->cmd_flags);
+
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+ /*
+ * Set budget_timeout (which we overload to store the
+ * time at which the queue remains with no backlog and
+ * no outstanding request; used by the weight-raising
+ * mechanism).
+ */
+ bfqq->budget_timeout = jiffies;
+
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+ }
+
+ now_ns = ktime_get_ns();
+
+ RQ_BIC(rq)->ttime.last_end_request = now_ns;
+
+ /*
+ * Using us instead of ns, to get a reasonable precision in
+ * computing rate in next check.
+ */
+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
+ (USEC_PER_SEC*
+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
+ >>BFQ_RATE_SHIFT,
+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
+
+ /*
+ * If the request took rather long to complete, and, according
+ * to the maximum request size recorded, this completion latency
+ * implies that the request was certainly served at a very low
+ * rate (less than 1M sectors/sec), then the whole observation
+ * interval that lasts up to this time instant cannot be a
+ * valid time interval for computing a new peak rate. Invoke
+ * bfq_update_rate_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - reset to zero samples, which will trigger a proper
+ * re-initialization of the observation interval on next
+ * dispatch
+ */
+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+ 1UL<<(BFQ_RATE_SHIFT - 10))
+ bfq_update_rate_reset(bfqd, NULL);
+ bfqd->last_completion = now_ns;
+
+ /*
+ * If we are waiting to discover whether the request pattern
+ * of the task associated with the queue is actually
+ * isochronous, and both requisites for this condition to hold
+ * are now satisfied, then compute soft_rt_next_start (see the
+ * comments on the function bfq_bfqq_softrt_next_start()). We
+ * schedule this delayed check when bfqq expires, if it still
+ * has in-flight requests.
+ */
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list))
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+ /*
+ * If this is the in-service queue, check if it needs to be expired,
+ * or if we want to idle in case it has no pending requests.
+ */
+ if (bfqd->in_service_queue == bfqq) {
+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
+ bfq_arm_slice_timer(bfqd);
+ goto out;
+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQ_BFQQ_BUDGET_TIMEOUT);
+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ (bfqq->dispatched == 0 ||
+ !bfq_bfqq_may_idle(bfqq)))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQ_BFQQ_NO_MORE_REQUESTS);
+ }
+
+ if (!bfqd->rq_in_driver)
+ bfq_schedule_dispatch(bfqd);
+
+out:
+ return;
+}
+
+static int __bfq_may_queue(struct bfq_queue *bfqq)
+{
+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
+ bfq_clear_bfqq_must_alloc(bfqq);
+ return ELV_MQUEUE_MUST;
+ }
+
+ return ELV_MQUEUE_MAY;
+}
+
+static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct task_struct *tsk = current;
+ struct bfq_io_cq *bic;
+ struct bfq_queue *bfqq;
+
+ /*
+ * Don't force setup of a queue from here, as a call to may_queue
+ * does not necessarily imply that a request actually will be
+ * queued. So just lookup a possibly existing queue, or return
+ * 'may queue' if that fails.
+ */
+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
+ if (!bic)
+ return ELV_MQUEUE_MAY;
+
+ bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
+ if (bfqq)
+ return __bfq_may_queue(bfqq);
+
+ return ELV_MQUEUE_MAY;
+}
+
+/*
+ * Queue lock held here.
+ */
+static void bfq_put_request(struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+ if (bfqq) {
+ const int rw = rq_data_dir(rq);
+
+ BUG_ON(!bfqq->allocated[rw]);
+ bfqq->allocated[rw]--;
+
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
+ bfqq, bfqq->ref);
+ bfq_put_queue(bfqq);
+ }
+}
+
+/*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to that bfqq.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+ put_io_context(bic->icq.ioc);
+
+ if (bfqq_process_refs(bfqq) == 1) {
+ bfqq->pid = current->pid;
+ bfq_clear_bfqq_coop(bfqq);
+ bfq_clear_bfqq_split_coop(bfqq);
+ return bfqq;
+ }
+
+ bic_set_bfqq(bic, NULL, 1);
+
+ bfq_put_cooperator(bfqq);
+
+ bfq_put_queue(bfqq);
+ return NULL;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ */
+static int bfq_set_request(struct request_queue *q, struct request *rq,
+ struct bio *bio, gfp_t gfp_mask)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+ const int rw = rq_data_dir(rq);
+ const int is_sync = rq_is_sync(rq);
+ struct bfq_queue *bfqq;
+ unsigned long flags;
+ bool split = false;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ bfq_check_ioprio_change(bic, bio);
+
+ if (!bic)
+ goto queue_fail;
+
+ bfq_bic_update_cgroup(bic, bio);
+
+new_queue:
+ bfqq = bic_to_bfqq(bic, is_sync);
+ if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+ if (bfqq)
+ bfq_put_queue(bfqq);
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
+
+ bic_set_bfqq(bic, bfqq, is_sync);
+ if (split && is_sync) {
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_request: was_in_list %d "
+ "was_in_large_burst %d "
+ "large burst in progress %d",
+ bic->was_in_burst_list,
+ bic->saved_in_large_burst,
+ bfqd->large_burst);
+
+ if ((bic->was_in_burst_list && bfqd->large_burst) ||
+ bic->saved_in_large_burst) {
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_request: marking in "
+ "large burst");
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ } else {
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_request: clearing in "
+ "large burst");
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ if (bic->was_in_burst_list)
+ hlist_add_head(&bfqq->burst_list_node,
+ &bfqd->burst_list);
+ }
+ bfqq->split_time = jiffies;
+ }
+ } else {
+ /* If the queue was seeky for too long, break it apart. */
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+ /* Update bic before losing reference to bfqq */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bic->saved_in_large_burst = true;
+
+ bfqq = bfq_split_bfqq(bic, bfqq);
+ split = true;
+ if (!bfqq)
+ goto new_queue;
+ }
+ }
+
+ bfqq->allocated[rw]++;
+ bfqq->ref++;
+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
+
+ rq->elv.priv[0] = bic;
+ rq->elv.priv[1] = bfqq;
+
+ /*
+ * If a bfq_queue has only one process reference, it is owned
+ * by only one bfq_io_cq: we can set the bic field of the
+ * bfq_queue to the address of that structure. Also, if the
+ * queue has just been split, mark a flag so that the
+ * information is available to the other scheduler hooks.
+ */
+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ bfqq->bic = bic;
+ if (split) {
+ /*
+ * If the queue has just been split from a shared
+ * queue, restore the idle window and the possible
+ * weight raising period.
+ */
+ bfq_bfqq_resume_state(bfqq, bic);
+ }
+ }
+
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
+ bfq_handle_burst(bfqd, bfqq);
+
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return 0;
+
+queue_fail:
+ bfq_schedule_dispatch(bfqd);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return 1;
+}
+
+static void bfq_kick_queue(struct work_struct *work)
+{
+ struct bfq_data *bfqd =
+ container_of(work, struct bfq_data, unplug_work);
+ struct request_queue *q = bfqd->queue;
+
+ spin_lock_irq(q->queue_lock);
+ __blk_run_queue(q);
+ spin_unlock_irq(q->queue_lock);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice.
+ */
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
+{
+ struct bfq_data *bfqd = container_of(timer, struct bfq_data,
+ idle_slice_timer);
+ struct bfq_queue *bfqq;
+ unsigned long flags;
+ enum bfqq_expiration reason;
+
+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
+
+ bfqq = bfqd->in_service_queue;
+ /*
+ * Theoretical race here: the in-service queue can be NULL or
+ * different from the queue that was idling if the timer handler
+ * spins on the queue_lock and a new request arrives for the
+ * current queue and there is a full dispatch cycle that changes
+ * the in-service queue. This can hardly happen, but in the worst
+ * case we just expire a queue too early.
+ */
+ if (bfqq) {
+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
+ bfq_clear_bfqq_wait_request(bfqq);
+
+ if (bfq_bfqq_budget_timeout(bfqq))
+ /*
+ * Also here the queue can be safely expired
+ * for budget timeout without wasting
+ * guarantees
+ */
+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+ /*
+ * The queue may not be empty upon timer expiration,
+ * because we may not disable the timer when the
+ * first request of the in-service queue arrives
+ * during disk idling.
+ */
+ reason = BFQ_BFQQ_TOO_IDLE;
+ else
+ goto schedule_dispatch;
+
+ bfq_bfqq_expire(bfqd, bfqq, true, reason);
+ }
+
+schedule_dispatch:
+ bfq_schedule_dispatch(bfqd);
+
+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
+ return HRTIMER_NORESTART;
+}
+
+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
+{
+ hrtimer_cancel(&bfqd->idle_slice_timer);
+ cancel_work_sync(&bfqd->unplug_work);
+}
+
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+ struct bfq_queue **bfqq_ptr)
+{
+ struct bfq_group *root_group = bfqd->root_group;
+ struct bfq_queue *bfqq = *bfqq_ptr;
+
+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+ if (bfqq) {
+ bfq_bfqq_move(bfqd, bfqq, root_group);
+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+ bfqq, bfqq->ref);
+ bfq_put_queue(bfqq);
+ *bfqq_ptr = NULL;
+ }
+}
+
+/*
+ * Release all the bfqg references to its async queues. If we are
+ * deallocating the group these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
+ */
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ struct request_queue *q = bfqd->queue;
+ struct bfq_queue *bfqq, *n;
+
+ bfq_shutdown_timer_wq(bfqd);
+
+ spin_lock_irq(q->queue_lock);
+
+ BUG_ON(bfqd->in_service_queue);
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+
+ spin_unlock_irq(q->queue_lock);
+
+ bfq_shutdown_timer_wq(bfqd);
+
+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+#else
+ bfq_put_async_queues(bfqd, bfqd->root_group);
+ kfree(bfqd->root_group);
+#endif
+
+ kfree(bfqd);
+}
+
+static void bfq_init_root_group(struct bfq_group *root_group,
+ struct bfq_data *bfqd)
+{
+ int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ root_group->entity.parent = NULL;
+ root_group->my_entity = NULL;
+ root_group->bfqd = bfqd;
+#endif
+ root_group->rq_pos_tree = RB_ROOT;
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+ root_group->sched_data.bfq_class_idle_last_service = jiffies;
+}
+
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct bfq_data *bfqd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+ if (!bfqd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = bfqd;
+
+ /*
+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow
+ * will not attempt to free it.
+ */
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+ bfqd->oom_bfqq.ref++;
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+ bfqd->oom_bfqq.entity.new_weight =
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+ /* oom_bfqq does not participate to bursts */
+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+ /*
+ * Trigger weight initialization, according to ioprio, at the
+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+ * class won't be changed any more.
+ */
+ bfqd->oom_bfqq.entity.prio_changed = 1;
+
+ bfqd->queue = q;
+
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
+
+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+ if (!bfqd->root_group)
+ goto out_free;
+ bfq_init_root_group(bfqd->root_group, bfqd);
+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+ bfqd->queue_weights_tree = RB_ROOT;
+ bfqd->group_weights_tree = RB_ROOT;
+
+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
+
+ INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->idle_list);
+ INIT_HLIST_HEAD(&bfqd->burst_list);
+
+ bfqd->hw_tag = -1;
+
+ bfqd->bfq_max_budget = bfq_default_max_budget;
+
+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+ bfqd->bfq_back_max = bfq_back_max;
+ bfqd->bfq_back_penalty = bfq_back_penalty;
+ bfqd->bfq_slice_idle = bfq_slice_idle;
+ bfqd->bfq_timeout = bfq_timeout;
+
+ bfqd->bfq_requests_within_timer = 120;
+
+ bfqd->bfq_large_burst_thresh = 8;
+ bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
+ bfqd->low_latency = true;
+
+ /*
+ * Trade-off between responsiveness and fairness.
+ */
+ bfqd->bfq_wr_coeff = 30;
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+ bfqd->bfq_wr_max_time = 0;
+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+ bfqd->bfq_wr_max_softrt_rate = 7000; /*
+ * Approximate rate required
+ * to playback or record a
+ * high-definition compressed
+ * video.
+ */
+ bfqd->wr_busy_queues = 0;
+
+ /*
+ * Begin by assuming, optimistically, that the device is a
+ * high-speed one, and that its peak rate is equal to 2/3 of
+ * the highest reference rate.
+ */
+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+ T_fast[blk_queue_nonrot(bfqd->queue)];
+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+ bfqd->device_speed = BFQ_BFQD_FAST;
+
+ return 0;
+
+out_free:
+ kfree(bfqd);
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+}
+
+static void bfq_slab_kill(void)
+{
+ kmem_cache_destroy(bfq_pool);
+}
+
+static int __init bfq_slab_setup(void)
+{
+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
+ if (!bfq_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+ return sprintf(page, "%u\n", var);
+}
+
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+ size_t count)
+{
+ unsigned long new_val;
+ int ret = kstrtoul(page, 10, &new_val);
+
+ if (ret == 0)
+ *var = new_val;
+
+ return count;
+}
+
+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+
+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
+ jiffies_to_msecs(bfqd->bfq_wr_max_time) :
+ jiffies_to_msecs(bfq_wr_duration(bfqd)));
+}
+
+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
+{
+ struct bfq_queue *bfqq;
+ struct bfq_data *bfqd = e->elevator_data;
+ ssize_t num_char = 0;
+
+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
+ bfqd->queued);
+
+ spin_lock_irq(bfqd->queue->queue_lock);
+
+ num_char += sprintf(page + num_char, "Active:\n");
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
+ num_char += sprintf(page + num_char,
+ "pid%d: weight %hu, nr_queued %d %d, ",
+ bfqq->pid,
+ bfqq->entity.weight,
+ bfqq->queued[0],
+ bfqq->queued[1]);
+ num_char += sprintf(page + num_char,
+ "dur %d/%u\n",
+ jiffies_to_msecs(
+ jiffies -
+ bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
+ }
+
+ num_char += sprintf(page + num_char, "Idle:\n");
+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
+ num_char += sprintf(page + num_char,
+ "pid%d: weight %hu, dur %d/%u\n",
+ bfqq->pid,
+ bfqq->entity.weight,
+ jiffies_to_msecs(jiffies -
+ bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
+ }
+
+ spin_unlock_irq(bfqd->queue->queue_lock);
+
+ return num_char;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ u64 __data = __VAR; \
+ if (__CONV == 1) \
+ __data = jiffies_to_msecs(__data); \
+ else if (__CONV == 2) \
+ __data = div_u64(__data, NSEC_PER_MSEC); \
+ return bfq_var_show(__data, (page)); \
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
+ 1);
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
+#undef SHOW_FUNCTION
+
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ u64 __data = __VAR; \
+ __data = div_u64(__data, NSEC_PER_USEC); \
+ return bfq_var_show(__data, (page)); \
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t \
+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ unsigned long uninitialized_var(__data); \
+ int ret = bfq_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ if (__CONV == 1) \
+ *(__PTR) = msecs_to_jiffies(__data); \
+ else if (__CONV == 2) \
+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
+ else \
+ *(__PTR) = __data; \
+ return ret; \
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+ INT_MAX, 2);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+ INT_MAX, 2);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+ INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
+ 1);
+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
+ INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
+ INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ unsigned long uninitialized_var(__data); \
+ int ret = bfq_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \
+ return ret; \
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+ UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
+/* do nothing for the moment */
+static ssize_t bfq_weights_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ return count;
+}
+
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data == 0)
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+ else {
+ if (__data > INT_MAX)
+ __data = INT_MAX;
+ bfqd->bfq_max_budget = __data;
+ }
+
+ bfqd->bfq_user_max_budget = __data;
+
+ return ret;
+}
+
+/*
+ * Leaving this name to preserve name compatibility with cfq
+ * parameters, but this timeout is used for both sync and async.
+ */
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data < 1)
+ __data = 1;
+ else if (__data > INT_MAX)
+ __data = INT_MAX;
+
+ bfqd->bfq_timeout = msecs_to_jiffies(__data);
+ if (bfqd->bfq_user_max_budget == 0)
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+ return ret;
+}
+
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data > 1)
+ __data = 1;
+ if (!bfqd->strict_guarantees && __data == 1
+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+ bfqd->strict_guarantees = __data;
+
+ return ret;
+}
+
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data > 1)
+ __data = 1;
+ if (__data == 0 && bfqd->low_latency != 0)
+ bfq_end_wr(bfqd);
+ bfqd->low_latency = __data;
+
+ return ret;
+}
+
+#define BFQ_ATTR(name) \
+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+ BFQ_ATTR(fifo_expire_sync),
+ BFQ_ATTR(fifo_expire_async),
+ BFQ_ATTR(back_seek_max),
+ BFQ_ATTR(back_seek_penalty),
+ BFQ_ATTR(slice_idle),
+ BFQ_ATTR(slice_idle_us),
+ BFQ_ATTR(max_budget),
+ BFQ_ATTR(timeout_sync),
+ BFQ_ATTR(strict_guarantees),
+ BFQ_ATTR(low_latency),
+ BFQ_ATTR(wr_coeff),
+ BFQ_ATTR(wr_max_time),
+ BFQ_ATTR(wr_rt_max_time),
+ BFQ_ATTR(wr_min_idle_time),
+ BFQ_ATTR(wr_min_inter_arr_async),
+ BFQ_ATTR(wr_max_softrt_rate),
+ BFQ_ATTR(weights),
+ __ATTR_NULL
+};
+
+static struct elevator_type iosched_bfq = {
+ .ops = {
+ .elevator_merge_fn = bfq_merge,
+ .elevator_merged_fn = bfq_merged_request,
+ .elevator_merge_req_fn = bfq_merged_requests,
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ .elevator_bio_merged_fn = bfq_bio_merged,
+#endif
+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge,
+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge,
+ .elevator_dispatch_fn = bfq_dispatch_requests,
+ .elevator_add_req_fn = bfq_insert_request,
+ .elevator_activate_req_fn = bfq_activate_request,
+ .elevator_deactivate_req_fn = bfq_deactivate_request,
+ .elevator_completed_req_fn = bfq_completed_request,
+ .elevator_former_req_fn = elv_rb_former_request,
+ .elevator_latter_req_fn = elv_rb_latter_request,
+ .elevator_init_icq_fn = bfq_init_icq,
+ .elevator_exit_icq_fn = bfq_exit_icq,
+ .elevator_set_req_fn = bfq_set_request,
+ .elevator_put_req_fn = bfq_put_request,
+ .elevator_may_queue_fn = bfq_may_queue,
+ .elevator_init_fn = bfq_init_queue,
+ .elevator_exit_fn = bfq_exit_queue,
+ },
+ .icq_size = sizeof(struct bfq_io_cq),
+ .icq_align = __alignof__(struct bfq_io_cq),
+ .elevator_attrs = bfq_attrs,
+ .elevator_name = "bfq",
+ .elevator_owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct blkcg_policy blkcg_policy_bfq = {
+ .dfl_cftypes = bfq_blkg_files,
+ .legacy_cftypes = bfq_blkcg_legacy_files,
+
+ .cpd_alloc_fn = bfq_cpd_alloc,
+ .cpd_init_fn = bfq_cpd_init,
+ .cpd_bind_fn = bfq_cpd_init,
+ .cpd_free_fn = bfq_cpd_free,
+
+ .pd_alloc_fn = bfq_pd_alloc,
+ .pd_init_fn = bfq_pd_init,
+ .pd_offline_fn = bfq_pd_offline,
+ .pd_free_fn = bfq_pd_free,
+ .pd_reset_stats_fn = bfq_pd_reset_stats,
+};
+#endif
+
+static int __init bfq_init(void)
+{
+ int ret;
+ char msg[60] = "BFQ I/O-scheduler: v8r8-rc2";
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ret = blkcg_policy_register(&blkcg_policy_bfq);
+ if (ret)
+ return ret;
+#endif
+
+ ret = -ENOMEM;
+ if (bfq_slab_setup())
+ goto err_pol_unreg;
+
+ /*
+ * Times to load large popular applications for the typical
+ * systems installed on the reference devices (see the
+ * comments before the definitions of the next two
+ * arrays). Actually, we use slightly slower values, as the
+ * estimated peak rate tends to be smaller than the actual
+ * peak rate. The reason for this last fact is that estimates
+ * are computed over much shorter time intervals than the long
+ * intervals typically used for benchmarking. Why? First, to
+ * adapt more quickly to variations. Second, because an I/O
+ * scheduler cannot rely on a peak-rate-evaluation workload to
+ * be run for a long time.
+ */
+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+ /*
+ * Thresholds that determine the switch between speed classes
+ * (see the comments before the definition of the array
+ * device_speed_thresh). These thresholds are biased towards
+ * transitions to the fast class. This is safer than the
+ * opposite bias. In fact, a wrong transition to the slow
+ * class results in short weight-raising periods, because the
+ * speed of the device then tends to be higher that the
+ * reference peak rate. On the opposite end, a wrong
+ * transition to the fast class tends to increase
+ * weight-raising periods, because of the opposite reason.
+ */
+ device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+ device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+ ret = elv_register(&iosched_bfq);
+ if (ret)
+ goto err_pol_unreg;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ strcat(msg, " (with cgroups support)");
+#endif
+ pr_info("%s", msg);
+
+ return 0;
+
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+ return ret;
+}
+
+static void __exit bfq_exit(void)
+{
+ elv_unregister(&iosched_bfq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+ bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
+MODULE_LICENSE("GPL");
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
new file mode 100644
index 0000000..2e9dc59
--- /dev/null
+++ b/block/bfq-sched.c
@@ -0,0 +1,1933 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ */
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+ return (s64)(a - b) > 0;
+}
+
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+ struct rb_node *node = tree->rb_node;
+
+ return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ * requeueing or repositionig triggered the invocation of
+ * this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity.
+ */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+ struct bfq_entity *new_entity)
+{
+ struct bfq_entity *next_in_service = sd->next_in_service;
+ struct bfq_queue *bfqq;
+ bool parent_sched_may_change = false;
+
+ /*
+ * If this update is triggered by the activation, requeueing
+ * or repositiong of an entity that does not coincide with
+ * sd->next_in_service, then a full lookup in the active tree
+ * can be avoided. In fact, it is enough to check whether the
+ * just-modified entity has a higher priority than
+ * sd->next_in_service, or, even if it has the same priority
+ * as sd->next_in_service, is eligible and has a lower virtual
+ * finish time than sd->next_in_service. If this compound
+ * condition holds, then the new entity becomes the new
+ * next_in_service. Otherwise no change is needed.
+ */
+ if (new_entity && new_entity != sd->next_in_service) {
+ /*
+ * Flag used to decide whether to replace
+ * sd->next_in_service with new_entity. Tentatively
+ * set to true, and left as true if
+ * sd->next_in_service is NULL.
+ */
+ bool replace_next = true;
+
+ /*
+ * If there is already a next_in_service candidate
+ * entity, then compare class priorities or timestamps
+ * to decide whether to replace sd->service_tree with
+ * new_entity.
+ */
+ if (next_in_service) {
+ unsigned int new_entity_class_idx =
+ bfq_class_idx(new_entity);
+ struct bfq_service_tree *st =
+ sd->service_tree + new_entity_class_idx;
+
+ /*
+ * For efficiency, evaluate the most likely
+ * sub-condition first.
+ */
+ replace_next =
+ (new_entity_class_idx ==
+ bfq_class_idx(next_in_service)
+ &&
+ !bfq_gt(new_entity->start, st->vtime)
+ &&
+ bfq_gt(next_in_service->finish,
+ new_entity->finish))
+ ||
+ new_entity_class_idx <
+ bfq_class_idx(next_in_service);
+ }
+
+ if (replace_next)
+ next_in_service = new_entity;
+ } else /* invoked because of a deactivation: lookup needed */
+ next_in_service = bfq_lookup_next_entity(sd);
+
+ if (next_in_service) {
+ parent_sched_may_change = !sd->next_in_service ||
+ bfq_update_parent_budget(next_in_service);
+ }
+
+ sd->next_in_service = next_in_service;
+
+ if (!next_in_service)
+ return parent_sched_may_change;
+
+ bfqq = bfq_entity_to_bfqq(next_in_service);
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "update_next_in_service: chosen this queue");
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(next_in_service,
+ struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "update_next_in_service: chosen this entity");
+ }
+#endif
+ return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* both next loops stop at one of the child entities of the root group */
+#define for_each_entity(entity) \
+ for (; entity ; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+/*
+ * Returns true if this budget changes may let next_in_service->parent
+ * become the next_in_service entity for its parent entity.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+ struct bfq_entity *bfqg_entity;
+ struct bfq_group *bfqg;
+ struct bfq_sched_data *group_sd;
+ bool ret = false;
+
+ BUG_ON(!next_in_service);
+
+ group_sd = next_in_service->sched_data;
+
+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
+ /*
+ * bfq_group's my_entity field is not NULL only if the group
+ * is not the root group. We must not touch the root entity
+ * as it must never become an in-service entity.
+ */
+ bfqg_entity = bfqg->my_entity;
+ if (bfqg_entity) {
+ if (bfqg_entity->budget > next_in_service->budget)
+ ret = true;
+ bfqg_entity->budget = next_in_service->budget;
+ }
+
+ return ret;
+}
+
+/*
+ * This function tells whether entity stops being a candidate for next
+ * service, according to the following logic.
+ *
+ * This function is invoked for an entity that is about to be set in
+ * service. If such an entity is a queue, then the entity is no longer
+ * a candidate for next service (i.e, a candidate entity to serve
+ * after the in-service entity is expired). The function then returns
+ * true.
+ *
+ * In contrast, the entity could stil be a candidate for next service
+ * if it is not a queue, and has more than one child. In fact, even if
+ * one of its children is about to be set in service, other children
+ * may still be the next to serve. As a consequence, a non-queue
+ * entity is not a candidate for next-service only if it has only one
+ * child. And only if this condition holds, then the function returns
+ * true for a non-queue entity.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+ struct bfq_group *bfqg;
+
+ if (bfq_entity_to_bfqq(entity))
+ return true;
+
+ bfqg = container_of(entity, struct bfq_group, entity);
+
+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
+ BUG_ON(bfqg->active_entities == 0);
+ if (bfqg->active_entities == 1)
+ return true;
+
+ return false;
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+#define for_each_entity(entity) \
+ for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+ for (parent = NULL; entity ; entity = parent)
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+ return false;
+}
+
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+ return true;
+}
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Shift for timestamp calculations. This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT 22
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = NULL;
+
+ BUG_ON(!entity);
+
+ if (!entity->my_sched_data)
+ bfqq = container_of(entity, struct bfq_queue, entity);
+
+ return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+ do_div(d, weight);
+ return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ unsigned long long start, finish, delta;
+
+ BUG_ON(entity->weight == 0);
+
+ entity->finish = entity->start +
+ bfq_delta(service, entity->weight);
+
+ start = ((entity->start>>10)*1000)>>12;
+ finish = ((entity->finish>>10)*1000)>>12;
+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
+
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: serv %lu, w %d",
+ service, entity->weight);
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: start %llu, finish %llu, delta %llu",
+ start, finish, delta);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ } else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "calc_finish group: serv %lu, w %d",
+ service, entity->weight);
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "calc_finish group: start %llu, finish %llu, delta %llu",
+ start, finish, delta);
+#endif
+ }
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity. This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+ struct bfq_entity *entity = NULL;
+
+ if (node)
+ entity = rb_entry(node, struct bfq_entity, rb_node);
+
+ return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+ BUG_ON(entity->tree != root);
+
+ entity->tree = NULL;
+ rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *next;
+
+ BUG_ON(entity->tree != &st->idle);
+
+ if (entity == st->first_idle) {
+ next = rb_next(&entity->rb_node);
+ st->first_idle = bfq_entity_of(next);
+ }
+
+ if (entity == st->last_idle) {
+ next = rb_prev(&entity->rb_node);
+ st->last_idle = bfq_entity_of(next);
+ }
+
+ bfq_extract(&st->idle, entity);
+
+ if (bfqq)
+ list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+ struct bfq_entity *entry;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ BUG_ON(entity->tree);
+
+ while (*node) {
+ parent = *node;
+ entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+ if (bfq_gt(entry->finish, entity->finish))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&entity->rb_node, parent, node);
+ rb_insert_color(&entity->rb_node, root);
+
+ entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree. The function assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+ struct bfq_entity *child;
+
+ if (node) {
+ child = rb_entry(node, struct bfq_entity, rb_node);
+ if (bfq_gt(entity->min_start, child->min_start))
+ entity->min_start = child->min_start;
+ }
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value. The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->min_start = entity->start;
+ bfq_update_min(entity, node->rb_right);
+ bfq_update_min(entity, node->rb_left);
+
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "update_active_node: new min_start %llu",
+ ((entity->min_start>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ } else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "update_active_node: new min_start %llu",
+ ((entity->min_start>>10)*1000)>>12);
+#endif
+ }
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update. This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root. The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+ struct rb_node *parent;
+
+up:
+ bfq_update_active_node(node);
+
+ parent = rb_parent(node);
+ if (!parent)
+ return;
+
+ if (node == parent->rb_left && parent->rb_right)
+ bfq_update_active_node(parent->rb_right);
+ else if (parent->rb_left)
+ bfq_update_active_node(parent->rb_left);
+
+ node = parent;
+ goto up;
+}
+
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root);
+
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root);
+
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ * group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd = NULL;
+ struct bfq_group *bfqg = NULL;
+ struct bfq_data *bfqd = NULL;
+#endif
+
+ bfq_insert(&st->active, entity);
+
+ if (node->rb_left)
+ node = node->rb_left;
+ else if (node->rb_right)
+ node = node->rb_right;
+
+ bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ sd = entity->sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ BUG_ON(!bfqg);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+ if (bfqq)
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else { /* bfq_group */
+ BUG_ON(!bfqd);
+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+ }
+ if (bfqg != bfqd->root_group) {
+ BUG_ON(!bfqg);
+ BUG_ON(!bfqd);
+ bfqg->active_entities++;
+ }
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
+}
+
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ if (bfqq) {
+ bfqq->ref++;
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+ bfqq, bfqq->ref);
+ }
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch. If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+ struct rb_node *deepest;
+
+ if (!node->rb_right && !node->rb_left)
+ deepest = rb_parent(node);
+ else if (!node->rb_right)
+ deepest = node->rb_left;
+ else if (!node->rb_left)
+ deepest = node->rb_right;
+ else {
+ deepest = rb_next(node);
+ if (deepest->rb_right)
+ deepest = deepest->rb_right;
+ else if (rb_parent(deepest) != node)
+ deepest = rb_parent(deepest);
+ }
+
+ return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd = NULL;
+ struct bfq_group *bfqg = NULL;
+ struct bfq_data *bfqd = NULL;
+#endif
+
+ node = bfq_find_deepest(&entity->rb_node);
+ bfq_extract(&st->active, entity);
+
+ if (node)
+ bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ sd = entity->sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ BUG_ON(!bfqg);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+ if (bfqq)
+ list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else { /* bfq_group */
+ BUG_ON(!bfqd);
+ bfq_weights_tree_remove(bfqd, entity,
+ &bfqd->group_weights_tree);
+ }
+ if (bfqg != bfqd->root_group) {
+ BUG_ON(!bfqg);
+ BUG_ON(!bfqd);
+ BUG_ON(!bfqg->active_entities);
+ bfqg->active_entities--;
+ }
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_entity *first_idle = st->first_idle;
+ struct bfq_entity *last_idle = st->last_idle;
+
+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+ st->first_idle = entity;
+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+ st->last_idle = entity;
+
+ bfq_insert(&st->idle, entity);
+
+ if (bfqq)
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, putting
+ * the device reference to it, if it is a queue. Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_sched_data *sd;
+
+ BUG_ON(!entity->on_st);
+
+ entity->on_st = false;
+ st->wsum -= entity->weight;
+ if (bfqq) {
+ sd = entity->sched_data;
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
+ bfqq, bfqq->ref);
+ bfq_put_queue(bfqq);
+ }
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+static void bfq_put_idle_entity(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ bfq_idle_extract(st, entity);
+ bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+ struct bfq_entity *first_idle = st->first_idle;
+ struct bfq_entity *last_idle = st->last_idle;
+
+ if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+ !bfq_gt(last_idle->finish, st->vtime)) {
+ /*
+ * Forget the whole idle tree, increasing the vtime past
+ * the last finish time of idle entities.
+ */
+ st->vtime = last_idle->finish;
+ }
+
+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+ bfq_put_idle_entity(st, first_idle);
+}
+
+static struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity)
+{
+ struct bfq_service_tree *new_st = old_st;
+
+ if (entity->prio_changed) {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ unsigned int prev_weight, new_weight;
+ struct bfq_data *bfqd = NULL;
+ struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd;
+ struct bfq_group *bfqg;
+#endif
+
+ if (bfqq)
+ bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ sd = entity->my_sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ BUG_ON(!bfqg);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+ BUG_ON(!bfqd);
+ }
+#endif
+
+ BUG_ON(old_st->wsum < entity->weight);
+ old_st->wsum -= entity->weight;
+
+ if (entity->new_weight != entity->orig_weight) {
+ if (entity->new_weight < BFQ_MIN_WEIGHT ||
+ entity->new_weight > BFQ_MAX_WEIGHT) {
+ pr_crit("update_weight_prio: new_weight %d\n",
+ entity->new_weight);
+ if (entity->new_weight < BFQ_MIN_WEIGHT)
+ entity->new_weight = BFQ_MIN_WEIGHT;
+ else
+ entity->new_weight = BFQ_MAX_WEIGHT;
+ }
+ entity->orig_weight = entity->new_weight;
+ if (bfqq)
+ bfqq->ioprio =
+ bfq_weight_to_ioprio(entity->orig_weight);
+ }
+
+ if (bfqq)
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ entity->prio_changed = 0;
+
+ /*
+ * NOTE: here we may be changing the weight too early,
+ * this will cause unfairness. The correct approach
+ * would have required additional complexity to defer
+ * weight changes to the proper time instants (i.e.,
+ * when entity->finish <= old_st->vtime).
+ */
+ new_st = bfq_entity_service_tree(entity);
+
+ prev_weight = entity->weight;
+ new_weight = entity->orig_weight *
+ (bfqq ? bfqq->wr_coeff : 1);
+ /*
+ * If the weight of the entity changes, remove the entity
+ * from its old weight counter (if there is a counter
+ * associated with the entity), and add it to the counter
+ * associated with its new weight.
+ */
+ if (prev_weight != new_weight) {
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "weight changed %d %d(%d %d)",
+ prev_weight, new_weight,
+ entity->orig_weight,
+ bfqq->wr_coeff);
+
+ root = bfqq ? &bfqd->queue_weights_tree :
+ &bfqd->group_weights_tree;
+ bfq_weights_tree_remove(bfqd, entity, root);
+ }
+ entity->weight = new_weight;
+ /*
+ * Add the entity to its weights tree only if it is
+ * not associated with a weight-raised queue.
+ */
+ if (prev_weight != new_weight &&
+ (bfqq ? bfqq->wr_coeff == 1 : 1))
+ /* If we get here, root has been initialized. */
+ bfq_weights_tree_add(bfqd, entity, root);
+
+ new_st->wsum += entity->weight;
+
+ if (new_st != old_st)
+ entity->start = new_st->vtime;
+ }
+
+ return new_st;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+#endif
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ * service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service. By now,
+ * we keep it to better check consistency.
+ */
+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_service_tree *st;
+
+ for_each_entity(entity) {
+ st = bfq_entity_service_tree(entity);
+
+ entity->service += served;
+
+ BUG_ON(st->wsum == 0);
+
+ st->vtime += bfq_delta(served, st->wsum);
+ bfq_forget_idle(st);
+ }
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+#endif
+ st = bfq_entity_service_tree(&bfqq->entity);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
+ served, ((st->vtime>>10)*1000)>>12, st);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ * of the time interval during which bfqq has been in
+ * service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * If a queue does not consume its budget fast enough, then providing
+ * the queue with service fairness may impair throughput, more or less
+ * severely. For this reason, queues that consume their budget slowly
+ * are provided with time fairness instead of service fairness. This
+ * goal is achieved through the BFQ scheduling engine, even if such an
+ * engine works in the service, and not in the time domain. The trick
+ * is charging these queues with an inflated amount of service, equal
+ * to the amount of service that they would have received during their
+ * service slot if they had been fast, i.e., if their requests had
+ * been dispatched at a rate equal to the estimated peak rate.
+ *
+ * It is worth noting that time fairness can cause important
+ * distortions in terms of bandwidth distribution, on devices with
+ * internal queueing. The reason is that I/O requests dispatched
+ * during the service slot of a queue may be served after that service
+ * slot is finished, and may have a total processing time loosely
+ * correlated with the duration of the service slot. This is
+ * especially true for short service slots.
+ */
+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ unsigned long time_ms)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ int tot_serv_to_charge = entity->service;
+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+
+ if (time_ms > 0 && time_ms < timeout_ms)
+ tot_serv_to_charge =
+ (bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+ if (tot_serv_to_charge < entity->service)
+ tot_serv_to_charge = entity->service;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "charge_time: %lu/%u ms, %d/%d/%d sectors",
+ time_ms, timeout_ms, entity->service,
+ tot_serv_to_charge, entity->budget);
+
+ /* Increase budget to avoid inconsistencies */
+ if (tot_serv_to_charge > entity->budget)
+ entity->budget = tot_serv_to_charge;
+
+ bfq_bfqq_served(bfqq,
+ max_t(int, 0, tot_serv_to_charge - entity->service));
+}
+
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+ struct bfq_service_tree *st,
+ bool backshifted)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_sched_data *sd = entity->sched_data;
+
+ st = __bfq_entity_update_weight_prio(st, entity);
+ bfq_calc_finish(entity, entity->budget);
+
+ /*
+ * If some queues enjoy backshifting for a while, then their
+ * (virtual) finish timestamps may happen to become lower and
+ * lower than the system virtual time. In particular, if
+ * these queues often happen to be idle for short time
+ * periods, and during such time periods other queues with
+ * higher timestamps happen to be busy, then the backshifted
+ * timestamps of the former queues can become much lower than
+ * the system virtual time. In fact, to serve the queues with
+ * higher timestamps while the ones with lower timestamps are
+ * idle, the system virtual time may be pushed-up to much
+ * higher values than the finish timestamps of the idle
+ * queues. As a consequence, the finish timestamps of all new
+ * or newly activated queues may end up being much larger than
+ * those of lucky queues with backshifted timestamps. The
+ * latter queues may then monopolize the device for a lot of
+ * time. This would simply break service guarantees.
+ *
+ * To reduce this problem, push up a little bit the
+ * backshifted timestamps of the queue associated with this
+ * entity (only a queue can happen to have the backshifted
+ * flag set): just enough to let the finish timestamp of the
+ * queue be equal to the current value of the system virtual
+ * time. This may introduce a little unfairness among queues
+ * with backshifted timestamps, but it does not break
+ * worst-case fairness guarantees.
+ *
+ * As a special case, if bfqq is weight-raised, push up
+ * timestamps much less, to keep very low the probability that
+ * this push up causes the backshifted finish timestamps of
+ * weight-raised queues to become higher than the backshifted
+ * finish timestamps of non weight-raised queues.
+ */
+ if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+ unsigned long delta = st->vtime - entity->finish;
+
+ if (bfqq)
+ delta /= bfqq->wr_coeff;
+
+ entity->start += delta;
+ entity->finish += delta;
+
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "__activate_entity: new queue finish %llu",
+ ((entity->finish>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ } else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "__activate_entity: new group finish %llu",
+ ((entity->finish>>10)*1000)>>12);
+#endif
+ }
+ }
+
+ bfq_active_insert(st, entity);
+
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "__activate_entity: queue %seligible in st %p",
+ entity->start <= st->vtime ? "" : "non ", st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ } else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "__activate_entity: group %seligible in st %p",
+ entity->start <= st->vtime ? "" : "non ", st);
+#endif
+ }
+ BUG_ON(RB_EMPTY_ROOT(&st->active));
+ BUG_ON(&st->active != &sd->service_tree->active &&
+ &st->active != &(sd->service_tree+1)->active &&
+ &st->active != &(sd->service_tree+2)->active);
+}
+
+/**
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, ater possible extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+ bool non_blocking_wait_rq)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ bool backshifted = false;
+ unsigned long long min_vstart;
+
+ BUG_ON(!sd);
+ BUG_ON(!st);
+
+ /* See comments on bfq_fqq_update_budg_for_activation */
+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+ backshifted = true;
+ min_vstart = entity->finish;
+ } else
+ min_vstart = st->vtime;
+
+ if (entity->tree == &st->idle) {
+ /*
+ * Must be on the idle tree, bfq_idle_extract() will
+ * check for that.
+ */
+ bfq_idle_extract(st, entity);
+ entity->start = bfq_gt(min_vstart, entity->finish) ?
+ min_vstart : entity->finish;
+ } else {
+ /*
+ * The finish time of the entity may be invalid, and
+ * it is in the past for sure, otherwise the queue
+ * would have been on the idle tree.
+ */
+ entity->start = min_vstart;
+ st->wsum += entity->weight;
+ bfq_get_entity(entity);
+
+ BUG_ON(entity->on_st && bfqq);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (entity->on_st && !bfqq) {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group,
+ entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
+ bfqg,
+ "activate bug, class %d in_service %p",
+ bfq_class_idx(entity), sd->in_service_entity);
+ }
+#endif
+ BUG_ON(entity->on_st && !bfqq);
+ entity->on_st = true;
+ }
+
+ bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. On the other hand,
+ * repositioning is needed if the next_inservice_entity for the child
+ * entity has changed. See the comments inside the function for
+ * details.
+ *
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+ BUG_ON(!sd);
+ BUG_ON(!st);
+
+ BUG_ON(entity != sd->in_service_entity &&
+ entity->tree != &st->active);
+
+ if (entity == sd->in_service_entity) {
+ /*
+ * We are requeueing the current in-service entity,
+ * which may have to be done for one of the following
+ * reasons:
+ * - entity represents the in-service queue, and the
+ * in-service queue is being requeued after an
+ * expiration;
+ * - entity represents a group, and its budget has
+ * changed because one of its child entities has
+ * just been either activated or requeued for some
+ * reason; the timestamps of the entity need then to
+ * be updated, and the entity needs to be enqueued
+ * or repositioned accordingly.
+ *
+ * In particular, before requeueing, the start time of
+ * the entity must be moved forward to account for the
+ * service that the entity has received while in
+ * service. This is done by the next instructions. The
+ * finish time will then be updated according to this
+ * new value of the start time, and to the budget of
+ * the entity.
+ */
+ bfq_calc_finish(entity, entity->service);
+ entity->start = entity->finish;
+ BUG_ON(entity->tree && entity->tree != &st->active);
+ /*
+ * In addition, if the entity had more than one child
+ * when set in service, then was not extracted from
+ * the active tree. This implies that the position of
+ * the entity in the active tree may need to be
+ * changed now, because we have just updated the start
+ * time of the entity, and we will update its finish
+ * time in a moment (the requeueing is then, more
+ * precisely, a repositioning in this case). To
+ * implement this repositioning, we: 1) dequeue the
+ * entity here, 2) update the finish time and
+ * requeue the entity according to the new
+ * timestamps below.
+ */
+ if (entity->tree)
+ bfq_active_extract(st, entity);
+ } else { /* The entity is already active, and not in service */
+ /*
+ * In this case, this function gets called only if the
+ * next_in_service entity below this entity has
+ * changed, and this change has caused the budget of
+ * this entity to change, which, finally implies that
+ * the finish time of this entity must be
+ * updated. Such an update may cause the scheduling,
+ * i.e., the position in the active tree, of this
+ * entity to change. We handle this change by: 1)
+ * dequeueing the entity here, 2) updating the finish
+ * time and requeueing the entity according to the new
+ * timestamps below. This is the same approach as the
+ * non-extracted-entity sub-case above.
+ */
+ bfq_active_extract(st, entity);
+ }
+
+ bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+ struct bfq_sched_data *sd,
+ bool non_blocking_wait_rq)
+{
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+ if (sd->in_service_entity == entity || entity->tree == &st->active)
+ /*
+ * in service or already queued on the active tree,
+ * requeue or reposition
+ */
+ __bfq_requeue_entity(entity);
+ else
+ /*
+ * Not in service and not queued on its active tree:
+ * the activity is idle and this is a true activation.
+ */
+ __bfq_activate_entity(entity, non_blocking_wait_rq);
+}
+
+
+/**
+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
+ * and activate, requeue or reposition all ancestors
+ * for which such an update becomes necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ * being expired; thus ALL its ancestors stop being served and must
+ * therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+ bool non_blocking_wait_rq,
+ bool requeue)
+{
+ struct bfq_sched_data *sd;
+
+ for_each_entity(entity) {
+ BUG_ON(!entity);
+ sd = entity->sched_data;
+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) &&
+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
+
+ if (!bfq_update_next_in_service(sd, entity) && !requeue) {
+ BUG_ON(!sd->next_in_service);
+ break;
+ }
+ BUG_ON(!sd->next_in_service);
+ }
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ * idle tree.
+ *
+ * Deactivates an entity, independently from its previous state. Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
+ */
+static bool __bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+ bool was_in_service = entity == sd->in_service_entity;
+
+ if (!entity->on_st) { /* entity never activated, or already inactive */
+ BUG_ON(entity == entity->sched_data->in_service_entity);
+ return false;
+ }
+
+ BUG_ON(was_in_service && entity->tree && entity->tree != &st->active);
+
+ if (was_in_service)
+ bfq_calc_finish(entity, entity->service);
+
+ if (entity->tree == &st->active)
+ bfq_active_extract(st, entity);
+ else if (!was_in_service && entity->tree == &st->idle)
+ bfq_idle_extract(st, entity);
+ else if (entity->tree)
+ BUG();
+
+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
+ bfq_forget_entity(st, entity);
+ else
+ bfq_idle_insert(st, entity);
+
+ return true;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree,
+ bool expiration)
+{
+ struct bfq_sched_data *sd;
+ struct bfq_entity *parent = NULL;
+
+ for_each_entity_safe(entity, parent) {
+ sd = entity->sched_data;
+
+ BUG_ON(sd == NULL); /*
+ * It would mean that this is the
+ * root group.
+ */
+
+ BUG_ON(expiration && entity != sd->in_service_entity);
+
+ BUG_ON(entity != sd->in_service_entity &&
+ entity->tree ==
+ &bfq_entity_service_tree(entity)->active &&
+ !sd->next_in_service);
+
+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
+ /*
+ * Entity is not any tree any more, so, this
+ * deactivation is a no-op, and there is
+ * nothing to change for upper-level entities
+ * (in case of expiration, this can never
+ * happen).
+ */
+ BUG_ON(expiration); /*
+ * entity cannot be already out of
+ * any tree
+ */
+ return;
+ }
+
+ if (sd->next_in_service == entity)
+ /*
+ * entity was the next_in_service entity,
+ * then, since entity has just been
+ * deactivated, a new one must be found.
+ */
+ bfq_update_next_in_service(sd, NULL);
+
+ if (sd->next_in_service) {
+ /*
+ * The parent entity is still backlogged,
+ * because next_in_service is not NULL. So, no
+ * further upwards deactivation must be
+ * performed. Yet, next_in_service has
+ * changed. Then the schedule does need to be
+ * updated upwards.
+ */
+ BUG_ON(sd->next_in_service == entity);
+ break;
+ }
+
+ /*
+ * If we get here, then the parent is no more
+ * backlogged and we need to propagate the
+ * deactivation upwards. Thus let the loop go on.
+ */
+
+ /*
+ * Also let parent be queued into the idle tree on
+ * deactivation, to preserve service guarantees, and
+ * assuming that who invoked this function does not
+ * need parent entities too to be removed completely.
+ */
+ ins_into_idle_tree = true;
+ }
+
+ /*
+ * If the deactivation loop is fully executed, then there are
+ * no more entities to touch and next loop is not executed at
+ * all. Otherwise, requeue remaining entities if they are
+ * about to stop receiving service, or reposition them if this
+ * is not the case.
+ */
+ entity = parent;
+ for_each_entity(entity) {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ /*
+ * Invoke __bfq_requeue_entity on entity, even if
+ * already active, to requeue/reposition it in the
+ * active tree (because sd->next_in_service has
+ * changed)
+ */
+ __bfq_requeue_entity(entity);
+
+ sd = entity->sched_data;
+ BUG_ON(expiration && sd->in_service_entity != entity);
+
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "invoking udpdate_next for this queue");
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(entity,
+ struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "invoking udpdate_next for this entity");
+ }
+#endif
+ if (!bfq_update_next_in_service(sd, entity) &&
+ !expiration)
+ /*
+ * next_in_service unchanged or not causing
+ * any change in entity->parent->sd, and no
+ * requeueing needed for expiration: stop
+ * here.
+ */
+ break;
+ }
+}
+
+/**
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ * if needed, to have at least one entity eligible.
+ * @st: the service tree to act upon.
+ *
+ * Assumes that st is not empty.
+ */
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+{
+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+ if (bfq_gt(root_entity->min_start, st->vtime)) {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
+
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_vtime_jump: new value %llu",
+ root_entity->min_start);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(root_entity, struct bfq_group,
+ entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "calc_vtime_jump: new value %llu",
+ root_entity->min_start);
+ }
+#endif
+ return root_entity->min_start;
+ }
+ return st->vtime;
+}
+
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+ if (new_value > st->vtime) {
+ st->vtime = new_value;
+ bfq_forget_idle(st);
+ }
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ * the smallest finish time
+ * @st: the service tree to select from.
+ * @vtime: the system virtual to use as a reference for eligibility
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+ u64 vtime)
+{
+ struct bfq_entity *entry, *first = NULL;
+ struct rb_node *node = st->active.rb_node;
+
+ while (node) {
+ entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+ if (!bfq_gt(entry->start, vtime))
+ first = entry;
+
+ BUG_ON(bfq_gt(entry->min_start, vtime));
+
+ if (node->rb_left) {
+ entry = rb_entry(node->rb_left,
+ struct bfq_entity, rb_node);
+ if (!bfq_gt(entry->min_start, vtime)) {
+ node = node->rb_left;
+ goto left;
+ }
+ }
+ if (first)
+ break;
+ node = node->rb_right;
+ }
+
+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
+ return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In constrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
+#if 0
+ , bool force
+#endif
+ )
+{
+ struct bfq_entity *entity
+#if 0
+ , *new_next_in_service = NULL
+#endif
+ ;
+ u64 new_vtime;
+ struct bfq_queue *bfqq;
+
+ if (RB_EMPTY_ROOT(&st->active))
+ return NULL;
+
+ /*
+ * Get the value of the system virtual time for which at
+ * least one entity is eligible.
+ */
+ new_vtime = bfq_calc_vtime_jump(st);
+
+ /*
+ * If there is no in-service entity for the sched_data this
+ * active tree belongs to, then push the system virtual time
+ * up to the value that guarantees that at least one entity is
+ * eligible. If, instead, there is an in-service entity, then
+ * do not make any such update, because there is already an
+ * eligible entity, namely the in-service one (even if the
+ * entity is not on st, because it was extracted when set in
+ * service).
+ */
+ if (!in_service)
+ bfq_update_vtime(st, new_vtime);
+
+ entity = bfq_first_active_entity(st, new_vtime);
+ BUG_ON(bfq_gt(entity->start, new_vtime));
+
+ /* Log some information */
+ bfqq = bfq_entity_to_bfqq(entity);
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "__lookup_next: start %llu vtime %llu st %p",
+ ((entity->start>>10)*1000)>>12,
+ ((new_vtime>>10)*1000)>>12, st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "__lookup_next: start %llu vtime %llu st %p",
+ ((entity->start>>10)*1000)>>12,
+ ((new_vtime>>10)*1000)>>12, st);
+ }
+#endif
+
+ BUG_ON(!entity);
+
+ return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need know what is the new next entity after this
+ * change.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+ struct bfq_service_tree *st = sd->service_tree;
+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+ struct bfq_entity *entity = NULL;
+ struct bfq_queue *bfqq;
+ int class_idx = 0;
+
+ BUG_ON(!sd);
+ BUG_ON(!st);
+ /*
+ * Choose from idle class, if needed to guarantee a minimum
+ * bandwidth to this class (and if there is some active entity
+ * in idle class). This should also mitigate
+ * priority-inversion problems in case a low priority task is
+ * holding file system resources.
+ */
+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+ BFQ_CL_IDLE_TIMEOUT)) {
+ if (!RB_EMPTY_ROOT(&idle_class_st->active))
+ class_idx = BFQ_IOPRIO_CLASSES - 1;
+ /* About to be served if backlogged, or not yet backlogged */
+ sd->bfq_class_idle_last_service = jiffies;
+ }
+
+ /*
+ * Find the next entity to serve for the highest-priority
+ * class, unless the idle class needs to be served.
+ */
+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+ entity = __bfq_lookup_next_entity(st + class_idx,
+ sd->in_service_entity);
+
+ if (entity)
+ break;
+ }
+
+ BUG_ON(!entity &&
+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) ||
+ !RB_EMPTY_ROOT(&(st+2)->active)));
+
+ if (!entity)
+ return NULL;
+
+ /* Log some information */
+ bfqq = bfq_entity_to_bfqq(entity);
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d",
+ st + class_idx, class_idx);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "chosen from st %p %d",
+ st + class_idx, class_idx);
+ }
+#endif
+
+ return entity;
+}
+
+static bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+ return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+ struct bfq_entity *entity = NULL;
+ struct bfq_sched_data *sd;
+ struct bfq_queue *bfqq;
+
+ BUG_ON(bfqd->in_service_queue);
+
+ if (bfqd->busy_queues == 0)
+ return NULL;
+
+ /*
+ * Traverse the path from the root to the leaf entity to
+ * serve. Set in service all the entities visited along the
+ * way.
+ */
+ sd = &bfqd->root_group->sched_data;
+ for (; sd ; sd = entity->my_sched_data) {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (entity) {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg(bfqd, bfqg,
+ "get_next_queue: lookup in this group");
+ if (!sd->next_in_service)
+ pr_crit("get_next_queue: lookup in this group");
+ } else {
+ bfq_log_bfqg(bfqd, bfqd->root_group,
+ "get_next_queue: lookup in root group");
+ if (!sd->next_in_service)
+ pr_crit("get_next_queue: lookup in root group");
+ }
+#endif
+
+ BUG_ON(!sd->next_in_service);
+
+ /*
+ * WARNING. We are about to set the in-service entity
+ * to sd->next_in_service, i.e., to the (cached) value
+ * returned by bfq_lookup_next_entity(sd) the last
+ * time it was invoked, i.e., the last time when the
+ * service order in sd changed as a consequence of the
+ * activation or deactivation of an entity. In this
+ * respect, if we execute bfq_lookup_next_entity(sd)
+ * in this very moment, it may, although with low
+ * probability, yield a different entity than that
+ * pointed to by sd->next_in_service. This rare event
+ * happens in case there was no CLASS_IDLE entity to
+ * serve for sd when bfq_lookup_next_entity(sd) was
+ * invoked for the last time, while there is now one
+ * such entity.
+ *
+ * If the above event happens, then the scheduling of
+ * such entity in CLASS_IDLE is postponed until the
+ * service of the sd->next_in_service entity
+ * finishes. In fact, when the latter is expired,
+ * bfq_lookup_next_entity(sd) gets called again,
+ * exactly to update sd->next_in_service.
+ */
+
+ /* Make next_in_service entity become in_service_entity */
+ entity = sd->next_in_service;
+ sd->in_service_entity = entity;
+
+ /*
+ * Reset the accumulator of the amount of service that
+ * the entity is about to receive.
+ */
+ entity->service = 0;
+
+ /*
+ * If entity is no longer a candidate for next
+ * service, then we extract it from its active tree,
+ * for the following reason. To further boost the
+ * throughput in some special case, BFQ needs to know
+ * which is the next candidate entity to serve, while
+ * there is already an entity in service. In this
+ * respect, to make it easy to compute/update the next
+ * candidate entity to serve after the current
+ * candidate has been set in service, there is a case
+ * where it is necessary to extract the current
+ * candidate from its service tree. Such a case is
+ * when the entity just set in service cannot be also
+ * a candidate for next service. Details about when
+ * this conditions holds are reported in the comments
+ * on the function bfq_no_longer_next_in_service()
+ * invoked below.
+ */
+ if (bfq_no_longer_next_in_service(entity))
+ bfq_active_extract(bfq_entity_service_tree(entity),
+ entity);
+
+ /*
+ * For the same reason why we may have just extracted
+ * entity from its active tree, we may need to update
+ * next_in_service for the sched_data of entity too,
+ * regardless of whether entity has been extracted.
+ * In fact, even if entity has not been extracted, a
+ * descendant entity may get extracted. Such an event
+ * would cause a change in next_in_service for the
+ * level of the descendant entity, and thus possibly
+ * back to upper levels.
+ *
+ * We cannot perform the resulting needed update
+ * before the end of this loop, because, to know which
+ * is the correct next-to-serve candidate entity for
+ * each level, we need first to find the leaf entity
+ * to set in service. In fact, only after we know
+ * which is the next-to-serve leaf entity, we can
+ * discover whether the parent entity of the leaf
+ * entity becomes the next-to-serve, and so on.
+ */
+
+ /* Log some information */
+ bfqq = bfq_entity_to_bfqq(entity);
+ if (bfqq)
+ bfq_log_bfqq(bfqd, bfqq,
+ "get_next_queue: this queue, finish %llu",
+ (((entity->finish>>10)*1000)>>10)>>2);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg(bfqd, bfqg,
+ "get_next_queue: this entity, finish %llu",
+ (((entity->finish>>10)*1000)>>10)>>2);
+ }
+#endif
+
+ }
+
+ BUG_ON(!entity);
+ bfqq = bfq_entity_to_bfqq(entity);
+ BUG_ON(!bfqq);
+
+ /*
+ * We can finally update all next-to-serve entities along the
+ * path from the leaf entity just set in service to the root.
+ */
+ for_each_entity(entity) {
+ struct bfq_sched_data *sd = entity->sched_data;
+
+ if(!bfq_update_next_in_service(sd, NULL))
+ break;
+ }
+
+ return bfqq;
+}
+
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+ struct bfq_entity *entity = &bfqd->in_service_queue->entity;
+
+ if (bfqd->in_service_bic) {
+ put_io_context(bfqd->in_service_bic->icq.ioc);
+ bfqd->in_service_bic = NULL;
+ }
+
+ bfq_clear_bfqq_wait_request(bfqd->in_service_queue);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqd->in_service_queue = NULL;
+
+ /*
+ * When this function is called, all in-service entities have
+ * been properly deactivated or requeued, so we can safely
+ * execute the final step: reset in_service_entity along the
+ * path from entity to the root.
+ */
+ for_each_entity(entity)
+ entity->sched_data->in_service_entity = NULL;
+}
+
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool ins_into_idle_tree, bool expiration)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
+}
+
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+ BUG_ON(bfqq == bfqd->in_service_queue);
+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle &&
+ entity->on_st);
+
+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+ false);
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ bfq_activate_requeue_entity(entity, false,
+ bfqq == bfqd->in_service_queue);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
+ */
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool expiration)
+{
+ BUG_ON(!bfq_bfqq_busy(bfqq));
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+ bfq_clear_bfqq_busy(bfqq);
+
+ BUG_ON(bfqd->busy_queues == 0);
+ bfqd->busy_queues--;
+
+ if (!bfqq->dispatched)
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ if (bfqq->wr_coeff > 1)
+ bfqd->wr_busy_queues--;
+
+ bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+ BUG_ON(bfqq->entity.budget < 0);
+
+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+ BUG_ON(bfqq->entity.budget < 0);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ BUG_ON(bfq_bfqq_busy(bfqq));
+ BUG_ON(bfqq == bfqd->in_service_queue);
+
+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+ bfq_activate_bfqq(bfqd, bfqq);
+
+ bfq_mark_bfqq_busy(bfqq);
+ bfqd->busy_queues++;
+
+ if (!bfqq->dispatched)
+ if (bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ if (bfqq->wr_coeff > 1)
+ bfqd->wr_busy_queues++;
+}
diff --git a/block/bfq.h b/block/bfq.h
new file mode 100644
index 0000000..d5c5c56
--- /dev/null
+++ b/block/bfq.h
@@ -0,0 +1,933 @@
+/*
+ * BFQ v8r8-rc2 for 4.9.0: data structures and common functions prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ */
+
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/ioprio.h>
+#include <linux/rbtree.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES 3
+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
+
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
+#define BFQ_WEIGHT_CONVERSION_COEFF 10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO 4
+
+#define BFQ_WEIGHT_LEGACY_DFL 100
+#define BFQ_DEFAULT_GRP_IOPRIO 0
+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are extremely more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * privilege them against the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR 100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree. All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+ /* tree for active entities (i.e., those backlogged) */
+ struct rb_root active;
+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
+ struct rb_root idle;
+
+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */
+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */
+
+ u64 vtime; /* scheduler virtual time */
+ /* scheduler weight sum; active and idle entities contribute to it */
+ unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup. @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+ struct bfq_entity *in_service_entity; /* entity in service */
+ /* head-of-the-line entity in the scheduler (see comments above) */
+ struct bfq_entity *next_in_service;
+ /* array of service trees, one per ioprio_class */
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+ /* last time CLASS_IDLE was served */
+ unsigned long bfq_class_idle_last_service;
+
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ * with a given weight.
+ */
+struct bfq_weight_counter {
+ unsigned int weight; /* weight of the entities this counter refers to */
+ unsigned int num_active; /* nr of active entities with this weight */
+ /*
+ * Weights tree member (see bfq_data's @queue_weights_tree and
+ * @group_weights_tree)
+ */
+ struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy. Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now. Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag. As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ. When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed. All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+ struct rb_node rb_node; /* service_tree member */
+ /* pointer to the weight counter associated with this entity */
+ struct bfq_weight_counter *weight_counter;
+
+ /*
+ * Flag, true if the entity is on a tree (either the active or
+ * the idle one of its service_tree) or is in service.
+ */
+ bool on_st;
+
+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */
+
+ /* tree the entity is enqueued into; %NULL if not on a tree */
+ struct rb_root *tree;
+
+ /*
+ * minimum start time of the (active) subtree rooted at this
+ * entity; used for O(log N) lookups into active trees
+ */
+ u64 min_start;
+
+ /* amount of service received during the last service slot */
+ int service;
+
+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+ int budget;
+
+ unsigned int weight; /* weight of the queue */
+ unsigned int new_weight; /* next weight if a change is in progress */
+
+ /* original weight, used to implement weight boosting */
+ unsigned int orig_weight;
+
+ /* parent entity, for hierarchical scheduling */
+ struct bfq_entity *parent;
+
+ /*
+ * For non-leaf nodes in the hierarchy, the associated
+ * scheduler queue, %NULL on leaf nodes.
+ */
+ struct bfq_sched_data *my_sched_data;
+ /* the scheduler queue this entity belongs to */
+ struct bfq_sched_data *sched_data;
+
+ /* flag, set to request a weight, ioprio or ioprio_class change */
+ int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+ /* reference counter */
+ int ref;
+ /* parent bfq_data */
+ struct bfq_data *bfqd;
+
+ /* current ioprio and ioprio class */
+ unsigned short ioprio, ioprio_class;
+ /* next ioprio and ioprio class if a change is in progress */
+ unsigned short new_ioprio, new_ioprio_class;
+
+ /*
+ * Shared bfq_queue if queue is cooperating with one or more
+ * other queues.
+ */
+ struct bfq_queue *new_bfqq;
+ /* request-position tree member (see bfq_group's @rq_pos_tree) */
+ struct rb_node pos_node;
+ /* request-position tree root (see bfq_group's @rq_pos_tree) */
+ struct rb_root *pos_root;
+
+ /* sorted list of pending requests */
+ struct rb_root sort_list;
+ /* if fifo isn't expired, next request to serve */
+ struct request *next_rq;
+ /* number of sync and async requests queued */
+ int queued[2];
+ /* number of sync and async requests currently allocated */
+ int allocated[2];
+ /* number of pending metadata requests */
+ int meta_pending;
+ /* fifo list of requests in sort_list */
+ struct list_head fifo;
+
+ /* entity representing this queue in the scheduler */
+ struct bfq_entity entity;
+
+ /* maximum budget allowed from the feedback mechanism */
+ int max_budget;
+ /* budget expiration (in jiffies) */
+ unsigned long budget_timeout;
+
+ /* number of requests on the dispatch list or inside driver */
+ int dispatched;
+
+ unsigned int flags; /* status flags.*/
+
+ /* node for active/idle bfqq list inside parent bfqd */
+ struct list_head bfqq_list;
+
+ /* bit vector: a 1 for each seeky requests in history */
+ u32 seek_history;
+
+ /* node for the device's burst list */
+ struct hlist_node burst_list_node;
+
+ /* position of the last request enqueued */
+ sector_t last_request_pos;
+
+ /* Number of consecutive pairs of request completion and
+ * arrival, such that the queue becomes idle after the
+ * completion, but the next request arrives within an idle
+ * time slice; used only if the queue's IO_bound flag has been
+ * cleared.
+ */
+ unsigned int requests_within_timer;
+
+ /* pid of the process owning the queue, used for logging purposes */
+ pid_t pid;
+
+ /*
+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+ * if the queue is shared.
+ */
+ struct bfq_io_cq *bic;
+
+ /* current maximum weight-raising time for this queue */
+ unsigned long wr_cur_max_time;
+ /*
+ * Minimum time instant such that, only if a new request is
+ * enqueued after this time instant in an idle @bfq_queue with
+ * no outstanding requests, then the task associated with the
+ * queue it is deemed as soft real-time (see the comments on
+ * the function bfq_bfqq_softrt_next_start())
+ */
+ unsigned long soft_rt_next_start;
+ /*
+ * Start time of the current weight-raising period if
+ * the @bfq-queue is being weight-raised, otherwise
+ * finish time of the last weight-raising period.
+ */
+ unsigned long last_wr_start_finish;
+ /* factor by which the weight of this queue is multiplied */
+ unsigned int wr_coeff;
+ /*
+ * Time of the last transition of the @bfq_queue from idle to
+ * backlogged.
+ */
+ unsigned long last_idle_bklogged;
+ /*
+ * Cumulative service received from the @bfq_queue since the
+ * last transition from idle to backlogged.
+ */
+ unsigned long service_from_backlogged;
+ /*
+ * Value of wr start time when switching to soft rt
+ */
+ unsigned long wr_start_at_switch_to_srt;
+
+ unsigned long split_time; /* time of last split */
+};
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ */
+struct bfq_ttime {
+ u64 last_end_request; /* completion time of last request */
+
+ u64 ttime_total; /* total process thinktime */
+ unsigned long ttime_samples; /* number of thinktime samples */
+ u64 ttime_mean; /* average process thinktime */
+
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+ /* array of two process queues, the sync and the async */
+ struct bfq_queue *bfqq[2];
+ /* associated @bfq_ttime struct */
+ struct bfq_ttime ttime;
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+
+ /*
+ * Snapshot of the idle window before merging; taken to
+ * remember this value while the queue is merged, so as to be
+ * able to restore it in case of split.
+ */
+ bool saved_idle_window;
+ /*
+ * Same purpose as the previous two fields for the I/O bound
+ * classification of a queue.
+ */
+ bool saved_IO_bound;
+
+ /*
+ * Same purpose as the previous fields for the value of the
+ * field keeping the queue's belonging to a large burst
+ */
+ bool saved_in_large_burst;
+ /*
+ * True if the queue belonged to a burst list before its merge
+ * with another cooperating queue.
+ */
+ bool was_in_burst_list;
+
+ /*
+ * Similar to previous fields: save wr information.
+ */
+ unsigned long saved_wr_coeff;
+ unsigned long saved_last_wr_start_finish;
+ unsigned long saved_wr_start_at_switch_to_srt;
+ unsigned int saved_wr_cur_max_time;
+};
+
+enum bfq_device_speed {
+ BFQ_BFQD_FAST,
+ BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by the @queue lock.
+ */
+struct bfq_data {
+ /* request queue for the device */
+ struct request_queue *queue;
+
+ /* root bfq_group for the device */
+ struct bfq_group *root_group;
+
+ /*
+ * rbtree of weight counters of @bfq_queues, sorted by
+ * weight. Used to keep track of whether all @bfq_queues have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active and not
+ * weight-raised @bfq_queue (see the comments to the functions
+ * bfq_weights_tree_[add|remove] for further details).
+ */
+ struct rb_root queue_weights_tree;
+ /*
+ * rbtree of non-queue @bfq_entity weight counters, sorted by
+ * weight. Used to keep track of whether all @bfq_groups have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active @bfq_group (see
+ * the comments to the functions bfq_weights_tree_[add|remove]
+ * for further details).
+ */
+ struct rb_root group_weights_tree;
+
+ /*
+ * Number of bfq_queues containing requests (including the
+ * queue in service, even if it is idling).
+ */
+ int busy_queues;
+ /* number of weight-raised busy @bfq_queues */
+ int wr_busy_queues;
+ /* number of queued requests */
+ int queued;
+ /* number of requests dispatched and waiting for completion */
+ int rq_in_driver;
+
+ /*
+ * Maximum number of requests in driver in the last
+ * @hw_tag_samples completed requests.
+ */
+ int max_rq_in_driver;
+ /* number of samples used to calculate hw_tag */
+ int hw_tag_samples;
+ /* flag set to one if the driver is showing a queueing behavior */
+ int hw_tag;
+
+ /* number of budgets assigned */
+ int budgets_assigned;
+
+ /*
+ * Timer set when idling (waiting) for the next request from
+ * the queue in service.
+ */
+ struct hrtimer idle_slice_timer;
+ /* delayed work to restart dispatching on the request queue */
+ struct work_struct unplug_work;
+
+ /* bfq_queue in service */
+ struct bfq_queue *in_service_queue;
+ /* bfq_io_cq (bic) associated with the @in_service_queue */
+ struct bfq_io_cq *in_service_bic;
+
+ /* on-disk position of the last served request */
+ sector_t last_position;
+
+ /* time of last request completion (ns) */
+ u64 last_completion;
+
+ /* time of first rq dispatch in current observation interval (ns) */
+ u64 first_dispatch;
+ /* time of last rq dispatch in current observation interval (ns) */
+ u64 last_dispatch;
+
+ /* beginning of the last budget */
+ ktime_t last_budget_start;
+ /* beginning of the last idle slice */
+ ktime_t last_idling_start;
+
+ /* number of samples in current observation interval */
+ int peak_rate_samples;
+ /* num of samples of seq dispatches in current observation interval */
+ u32 sequential_samples;
+ /* total num of sectors transferred in current observation interval */
+ u64 tot_sectors_dispatched;
+ /* max rq size seen during current observation interval (sectors) */
+ u32 last_rq_max_size;
+ /* time elapsed from first dispatch in current observ. interval (us) */
+ u64 delta_from_first;
+ /* current estimate of device peak rate */
+ u32 peak_rate;
+
+ /* maximum budget allotted to a bfq_queue before rescheduling */
+ int bfq_max_budget;
+
+ /* list of all the bfq_queues active on the device */
+ struct list_head active_list;
+ /* list of all the bfq_queues idle on the device */
+ struct list_head idle_list;
+
+ /*
+ * Timeout for async/sync requests; when it fires, requests
+ * are served in fifo order.
+ */
+ u64 bfq_fifo_expire[2];
+ /* weight of backward seeks wrt forward ones */
+ unsigned int bfq_back_penalty;
+ /* maximum allowed backward seek */
+ unsigned int bfq_back_max;
+ /* maximum idling time */
+ u32 bfq_slice_idle;
+
+ /* user-configured max budget value (0 for auto-tuning) */
+ int bfq_user_max_budget;
+ /*
+ * Timeout for bfq_queues to consume their budget; used to
+ * prevent seeky queues from imposing long latencies to
+ * sequential or quasi-sequential ones (this also implies that
+ * seeky queues cannot receive guarantees in the service
+ * domain; after a timeout they are charged for the time they
+ * have been in service, to preserve fairness among them, but
+ * without service-domain guarantees).
+ */
+ unsigned int bfq_timeout;
+
+ /*
+ * Number of consecutive requests that must be issued within
+ * the idle time slice to set again idling to a queue which
+ * was marked as non-I/O-bound (see the definition of the
+ * IO_bound flag for further details).
+ */
+ unsigned int bfq_requests_within_timer;
+
+ /*
+ * Force device idling whenever needed to provide accurate
+ * service guarantees, without caring about throughput
+ * issues. CAVEAT: this may even increase latencies, in case
+ * of useless idling for processes that did stop doing I/O.
+ */
+ bool strict_guarantees;
+
+ /*
+ * Last time at which a queue entered the current burst of
+ * queues being activated shortly after each other; for more
+ * details about this and the following parameters related to
+ * a burst of activations, see the comments on the function
+ * bfq_handle_burst.
+ */
+ unsigned long last_ins_in_burst;
+ /*
+ * Reference time interval used to decide whether a queue has
+ * been activated shortly after @last_ins_in_burst.
+ */
+ unsigned long bfq_burst_interval;
+ /* number of queues in the current burst of queue activations */
+ int burst_size;
+
+ /* common parent entity for the queues in the burst */
+ struct bfq_entity *burst_parent_entity;
+ /* Maximum burst size above which the current queue-activation
+ * burst is deemed as 'large'.
+ */
+ unsigned long bfq_large_burst_thresh;
+ /* true if a large queue-activation burst is in progress */
+ bool large_burst;
+ /*
+ * Head of the burst list (as for the above fields, more
+ * details in the comments on the function bfq_handle_burst).
+ */
+ struct hlist_head burst_list;
+
+ /* if set to true, low-latency heuristics are enabled */
+ bool low_latency;
+ /*
+ * Maximum factor by which the weight of a weight-raised queue
+ * is multiplied.
+ */
+ unsigned int bfq_wr_coeff;
+ /* maximum duration of a weight-raising period (jiffies) */
+ unsigned int bfq_wr_max_time;
+
+ /* Maximum weight-raising duration for soft real-time processes */
+ unsigned int bfq_wr_rt_max_time;
+ /*
+ * Minimum idle period after which weight-raising may be
+ * reactivated for a queue (in jiffies).
+ */
+ unsigned int bfq_wr_min_idle_time;
+ /*
+ * Minimum period between request arrivals after which
+ * weight-raising may be reactivated for an already busy async
+ * queue (in jiffies).
+ */
+ unsigned long bfq_wr_min_inter_arr_async;
+
+ /* Max service-rate for a soft real-time queue, in sectors/sec */
+ unsigned int bfq_wr_max_softrt_rate;
+ /*
+ * Cached value of the product R*T, used for computing the
+ * maximum duration of weight raising automatically.
+ */
+ u64 RT_prod;
+ /* device-speed class for the low-latency heuristic */
+ enum bfq_device_speed device_speed;
+
+ /* fallback dummy bfqq for extreme OOM conditions */
+ struct bfq_queue oom_bfqq;
+};
+
+enum bfqq_state_flags {
+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */
+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */
+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
+ * waiting for a request
+ * without idling the device
+ */
+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
+ BFQ_BFQQ_FLAG_IO_bound, /*
+ * bfqq has timed-out at least once
+ * having consumed at most 2/10 of
+ * its budget
+ */
+ BFQ_BFQQ_FLAG_in_large_burst, /*
+ * bfqq activated in a large burst,
+ * see comments to bfq_handle_burst.
+ */
+ BFQ_BFQQ_FLAG_softrt_update, /*
+ * may need softrt-next-start
+ * update
+ */
+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */
+};
+
+#define BFQ_BFQQ_FNS(name) \
+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
+} \
+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
+} \
+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
+{ \
+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(must_alloc);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Logging facilities. */
+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ assert_spin_locked((bfqd)->queue->queue_lock); \
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+ pr_crit("bfq%d%c %s " fmt "\n", \
+ (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ __pbuf, ##args); \
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
+ pr_crit("%s " fmt "\n", __pbuf, ##args); \
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+ pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ ##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+ pr_crit("bfq " fmt "\n", ##args)
+
+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ assert_spin_locked((bfqd)->queue->queue_lock); \
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
+ (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ __pbuf, ##args); \
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ ##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+ BFQ_BFQQ_TOO_IDLE = 0, /*
+ * queue has been idling for
+ * too long
+ */
+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
+ BFQ_BFQQ_PREEMPTED /* preemption in progress */
+};
+
+
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ /* number of ios merged */
+ struct blkg_rwstat merged;
+ /* total time spent on device in ns, may not be accurate w/ queueing */
+ struct blkg_rwstat service_time;
+ /* total time spent waiting in scheduler queue in ns */
+ struct blkg_rwstat wait_time;
+ /* number of IOs queued up */
+ struct blkg_rwstat queued;
+ /* total disk time and nr sectors dispatched by this group */
+ struct blkg_stat time;
+ /* sum of number of ios queued across all samples */
+ struct blkg_stat avg_queue_size_sum;
+ /* count of samples taken for average */
+ struct blkg_stat avg_queue_size_samples;
+ /* how many times this group has been removed from service tree */
+ struct blkg_stat dequeue;
+ /* total time spent waiting for it to be assigned a timeslice. */
+ struct blkg_stat group_wait_time;
+ /* time spent idling for this blkcg_gq */
+ struct blkg_stat idle_time;
+ /* total time with empty current active q with other requests queued */
+ struct blkg_stat empty_time;
+ /* fields after this shouldn't be cleared on stat reset */
+ uint64_t start_group_wait_time;
+ uint64_t start_idle_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @ps: @blkcg_policy_storage that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+ /* must be the first member */
+ struct blkcg_policy_data pd;
+
+ unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ * both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ * the group, one queue per ioprio value per ioprio_class,
+ * except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ * to avoid too many special cases during group creation/
+ * migration.
+ * @active_entities: number of active entities belonging to the group;
+ * unused for the root group. Used to know whether there
+ * are groups with more than one active @bfq_entity
+ * (see the comments to the function
+ * bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ * determining if two or more queues have interleaving
+ * requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ * o @bfqd is protected by the queue lock, RCU is used to access it
+ * from the readers.
+ * o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
+ struct bfq_entity entity;
+ struct bfq_sched_data sched_data;
+
+ void *bfqd;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct bfq_entity *my_entity;
+
+ int active_entities;
+
+ struct rb_root rq_pos_tree;
+
+ struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group {
+ struct bfq_sched_data sched_data;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct rb_root rq_pos_tree;
+};
+#endif
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ return bfqq ? bfqq->ioprio_class - 1 :
+ BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
+static struct bfq_service_tree *
+bfq_entity_service_tree(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sched_data = entity->sched_data;
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ unsigned int idx = bfq_class_idx(entity);
+
+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
+ BUG_ON(sched_data == NULL);
+
+ if (bfqq)
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "entity_service_tree %p %d",
+ sched_data->service_tree + idx, idx);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ "entity_service_tree %p %d",
+ sched_data->service_tree + idx, idx);
+ }
+#endif
+ return sched_data->service_tree + idx;
+}
+
+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+ return bic->bfqq[is_sync];
+}
+
+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
+ bool is_sync)
+{
+ bic->bfqq[is_sync] = bfqq;
+}
+
+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+ return bic->icq.q->elevator->elevator_data;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *group_entity = bfqq->entity.parent;
+
+ if (!group_entity)
+ group_entity = &bfqq->bfqd->root_group->entity;
+
+ return container_of(group_entity, struct bfq_group, entity);
+}
+
+#else
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+ return bfqq->bfqd->root_group;
+}
+
+#endif
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
+static void bfq_put_queue(struct bfq_queue *bfqq);
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+#endif
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+#endif /* _BFQ_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 14d7c07..1655112 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wbt.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
fail:
blk_free_flush_queue(q->fq);
+ wbt_exit(q->rq_wb);
+ q->rq_wb = NULL;
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ wbt_done(q->rq_wb, &req->issue_stat);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ unsigned int wb_acct;
/*
* low level driver can indicate that it wants pages above a
@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
}
get_rq:
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
@@ -1738,11 +1747,14 @@ get_rq:
*/
req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ __wbt_done(q->rq_wb, wb_acct);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
+ wbt_track(&req->issue_stat, wb_acct);
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
@@ -2475,6 +2487,9 @@ void blk_start_request(struct request *req)
{
blk_dequeue_request(req);
+ blk_stat_set_issue_time(&req->issue_stat);
+ wbt_issue(req->q->rq_wb, &req->issue_stat);
+
/*
* We are now handing the request to the hardware, initialize
* resid_len to full count and add the timeout handler.
@@ -2542,6 +2557,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
trace_block_rq_complete(req->q, req, nr_bytes);
+ blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
+
if (!req->bio)
return false;
@@ -2709,9 +2726,10 @@ void blk_finish_request(struct request *req, int error)
blk_account_io_done(req);
- if (req->end_io)
+ if (req->end_io) {
+ wbt_done(req->q->rq_wb, &req->issue_stat);
req->end_io(req, error);
- else {
+ } else {
if (blk_bidi_rq(req))
__blk_put_request(req->next_rq->q, req->next_rq);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 01fb455..633c79a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return ret;
}
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_init(&ctx->stat[0]);
+ blk_stat_init(&ctx->stat[1]);
+ }
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t count)
+{
+ blk_mq_stat_clear(hctx);
+ return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_stat_init(&stat[0]);
+ blk_stat_init(&stat[1]);
+
+ blk_hctx_stat_get(hctx, stat);
+
+ ret = print_stat(page, &stat[0], "read :");
+ ret += print_stat(page + ret, &stat[1], "write:");
+ return ret;
+}
+
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
@@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
.show = blk_mq_hw_sysfs_poll_show,
.store = blk_mq_hw_sysfs_poll_store,
};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+ .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_mq_hw_sysfs_stat_show,
+ .store = blk_mq_hw_sysfs_stat_store,
+};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
@@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
&blk_mq_hw_sysfs_poll.attr,
+ &blk_mq_hw_sysfs_stat.attr,
NULL,
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 81caceb..9dddd99 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,8 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -300,6 +302,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ wbt_done(q->rq_wb, &rq->issue_stat);
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
@@ -328,6 +332,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
blk_account_io_done(rq);
if (rq->end_io) {
+ wbt_done(rq->q->rq_wb, &rq->issue_stat);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -378,10 +383,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
+static void blk_mq_stat_add(struct request *rq)
+{
+ struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
+
+ blk_stat_add(stat, rq);
+}
+
static void __blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_stat_add(rq);
+
if (!q->softirq_done_fn)
blk_mq_end_request(rq, rq->errors);
else
@@ -425,6 +439,9 @@ void blk_mq_start_request(struct request *rq)
if (unlikely(blk_bidi_rq(rq)))
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ blk_stat_set_issue_time(&rq->issue_stat);
+ wbt_issue(q->rq_wb, &rq->issue_stat);
+
blk_add_timer(rq);
/*
@@ -460,6 +477,7 @@ static void __blk_mq_requeue_request(struct request *rq)
struct request_queue *q = rq->q;
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -1271,6 +1289,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1285,9 +1304,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return BLK_QC_T_NONE;
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1364,6 +1389,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1380,9 +1406,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1721,6 +1753,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
+ blk_stat_init(&__ctx->stat[0]);
+ blk_stat_init(&__ctx->stat[1]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -2051,6 +2085,9 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ wbt_exit(q->rq_wb);
+ q->rq_wb = NULL;
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e5d2524..8cf16cb 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
+#include "blk-stat.h"
+
struct blk_mq_tag_set;
struct blk_mq_ctx {
@@ -18,6 +20,7 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
+ struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae1..b51ad19 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include "blk.h"
+#include "blk-wbt.h"
unsigned long blk_max_low_pfn;
EXPORT_SYMBOL(blk_max_low_pfn);
@@ -832,6 +833,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
/**
+ * blk_set_queue_depth - tell the block layer about the device queue depth
+ * @q: the request queue for the device
+ * @depth: queue depth
+ *
+ */
+void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
+{
+ q->queue_depth = depth;
+ wbt_set_queue_depth(q->rq_wb, depth);
+}
+EXPORT_SYMBOL(blk_set_queue_depth);
+
+/**
* blk_queue_write_cache - configure queue's write cache
* @q: the request queue for the device
* @wc: write back cache on or off
@@ -851,6 +865,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
else
queue_flag_clear(QUEUE_FLAG_FUA, q);
spin_unlock_irq(q->queue_lock);
+
+ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
diff --git a/block/blk-stat.c b/block/blk-stat.c
new file mode 100644
index 0000000..c219f1b
--- /dev/null
+++ b/block/blk-stat.c
@@ -0,0 +1,234 @@
+/*
+ * Block stat tracking code
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/blk-mq.h>
+
+#include "blk-stat.h"
+#include "blk-mq.h"
+
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
+{
+ if (!stat->nr_batch)
+ return;
+ if (!stat->nr_samples)
+ stat->mean = div64_s64(stat->batch, stat->nr_batch);
+ else {
+ stat->mean = div64_s64((stat->mean * stat->nr_samples) +
+ stat->batch,
+ stat->nr_samples + stat->nr_batch);
+ }
+
+ stat->nr_samples += stat->nr_batch;
+ stat->nr_batch = stat->batch = 0;
+}
+
+void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+ if (!src->nr_samples)
+ return;
+
+ blk_stat_flush_batch(src);
+
+ dst->min = min(dst->min, src->min);
+ dst->max = max(dst->max, src->max);
+
+ if (!dst->nr_samples)
+ dst->mean = src->mean;
+ else {
+ dst->mean = div64_s64((src->mean * src->nr_samples) +
+ (dst->mean * dst->nr_samples),
+ dst->nr_samples + src->nr_samples);
+ }
+ dst->nr_samples += src->nr_samples;
+}
+
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ uint64_t latest = 0;
+ int i, j, nr;
+
+ blk_stat_init(&dst[0]);
+ blk_stat_init(&dst[1]);
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ blk_stat_flush_batch(&ctx->stat[0]);
+ blk_stat_flush_batch(&ctx->stat[1]);
+
+ if (!ctx->stat[0].nr_samples &&
+ !ctx->stat[1].nr_samples)
+ continue;
+ if (ctx->stat[0].time > newest)
+ newest = ctx->stat[0].time;
+ if (ctx->stat[1].time > newest)
+ newest = ctx->stat[1].time;
+ }
+ }
+
+ /*
+ * No samples
+ */
+ if (!newest)
+ break;
+
+ if (newest > latest)
+ latest = newest;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (ctx->stat[0].time == newest) {
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
+ nr++;
+ }
+ if (ctx->stat[1].time == newest) {
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
+ nr++;
+ }
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare.
+ */
+ } while (!nr);
+
+ dst[0].time = dst[1].time = latest;
+}
+
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ if (q->mq_ops)
+ blk_mq_stat_get(q, dst);
+ else {
+ blk_stat_flush_batch(&q->rq_stats[0]);
+ blk_stat_flush_batch(&q->rq_stats[1]);
+ memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
+ memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
+ }
+}
+
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i, nr;
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_flush_batch(&ctx->stat[0]);
+ blk_stat_flush_batch(&ctx->stat[1]);
+
+ if (!ctx->stat[0].nr_samples &&
+ !ctx->stat[1].nr_samples)
+ continue;
+
+ if (ctx->stat[0].time > newest)
+ newest = ctx->stat[0].time;
+ if (ctx->stat[1].time > newest)
+ newest = ctx->stat[1].time;
+ }
+
+ if (!newest)
+ break;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ if (ctx->stat[0].time == newest) {
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
+ nr++;
+ }
+ if (ctx->stat[1].time == newest) {
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
+ nr++;
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare, as the window is only updated
+ * occasionally
+ */
+ } while (!nr);
+}
+
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+ stat->time = time_now & BLK_STAT_NSEC_MASK;
+}
+
+void blk_stat_init(struct blk_rq_stat *stat)
+{
+ __blk_stat_init(stat, ktime_to_ns(ktime_get()));
+}
+
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
+{
+ return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+}
+
+bool blk_stat_is_current(struct blk_rq_stat *stat)
+{
+ return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+}
+
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+{
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ if (!__blk_stat_is_current(stat, now))
+ __blk_stat_init(stat, now);
+
+ value = now - blk_stat_time(&rq->issue_stat);
+ if (value > stat->max)
+ stat->max = value;
+ if (value < stat->min)
+ stat->min = value;
+
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
+
+ stat->batch += value;
+ stat->nr_batch++;
+}
+
+void blk_stat_clear(struct request_queue *q)
+{
+ if (q->mq_ops) {
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ int i, j;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ blk_stat_init(&ctx->stat[0]);
+ blk_stat_init(&ctx->stat[1]);
+ }
+ }
+ } else {
+ blk_stat_init(&q->rq_stats[0]);
+ blk_stat_init(&q->rq_stats[1]);
+ }
+}
+
+void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+ stat->time = (stat->time & BLK_STAT_MASK) |
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+}
diff --git a/block/blk-stat.h b/block/blk-stat.h
new file mode 100644
index 0000000..26b1f45
--- /dev/null
+++ b/block/blk-stat.h
@@ -0,0 +1,37 @@
+#ifndef BLK_STAT_H
+#define BLK_STAT_H
+
+/*
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
+ */
+#define BLK_STAT_NSEC 134217728ULL
+#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+
+/*
+ * Upper 3 bits can be used elsewhere
+ */
+#define BLK_STAT_RES_BITS 3
+#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
+#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+
+void blk_stat_add(struct blk_rq_stat *, struct request *);
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
+void blk_stat_clear(struct request_queue *q);
+void blk_stat_init(struct blk_rq_stat *);
+void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+bool blk_stat_is_current(struct blk_rq_stat *);
+void blk_stat_set_issue_time(struct blk_issue_stat *);
+
+static inline u64 __blk_stat_time(u64 time)
+{
+ return time & BLK_STAT_TIME_MASK;
+}
+
+static inline u64 blk_stat_time(struct blk_issue_stat *stat)
+{
+ return __blk_stat_time(stat->time);
+}
+
+#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c..1dfe5cc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -13,6 +13,7 @@
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wbt.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
return count;
}
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+ int err;
+ u64 v;
+
+ err = kstrtou64(page, 10, &v);
+ if (err < 0)
+ return err;
+
+ *var = v;
+ return 0;
+}
+
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
return queue_var_show(q->nr_requests, (page));
@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
return ret;
}
+static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
+}
+
+static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ ssize_t ret;
+ u64 val;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store64(&val, page);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->win_nsec = val * 1000ULL;
+ wbt_update_limits(q->rq_wb);
+ return count;
+}
+
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ ssize_t ret;
+ u64 val;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store64(&val, page);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->min_lat_nsec = val * 1000ULL;
+ wbt_update_limits(q->rq_wb);
+ return count;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -384,6 +450,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_queue_stat_get(q, stat);
+
+ ret = print_stat(page, &stat[0], "read :");
+ ret += print_stat(page + ret, &stat[1], "write:");
+ return ret;
+}
+
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -526,6 +612,23 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
+static struct queue_sysfs_entry queue_stats_entry = {
+ .attr = {.name = "stats", .mode = S_IRUGO },
+ .show = queue_stats_show,
+};
+
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_lat_show,
+ .store = queue_wb_lat_store,
+};
+
+static struct queue_sysfs_entry queue_wb_win_entry = {
+ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_win_show,
+ .store = queue_wb_win_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -553,6 +656,9 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
+ &queue_stats_entry.attr,
+ &queue_wb_lat_entry.attr,
+ &queue_wb_win_entry.attr,
NULL,
};
@@ -667,6 +773,58 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
+{
+ blk_queue_stat_get(data, stat);
+}
+
+static void blk_wb_stat_clear(void *data)
+{
+ blk_stat_clear(data);
+}
+
+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
+{
+ return blk_stat_is_current(stat);
+}
+
+static struct wb_stat_ops wb_stat_ops = {
+ .get = blk_wb_stat_get,
+ .is_current = blk_wb_stat_is_current,
+ .clear = blk_wb_stat_clear,
+};
+
+static void blk_wb_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+#ifndef CONFIG_BLK_WBT_MQ
+ if (q->mq_ops)
+ return;
+#endif
+#ifndef CONFIG_BLK_WBT_SQ
+ if (q->request_fn)
+ return;
+#endif
+
+ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
+
+ /*
+ * If this fails, we don't get throttling
+ */
+ if (IS_ERR_OR_NULL(rwb))
+ return;
+
+ if (blk_queue_nonrot(q))
+ rwb->min_lat_nsec = 2000000ULL;
+ else
+ rwb->min_lat_nsec = 75000000ULL;
+
+ wbt_set_queue_depth(rwb, blk_queue_depth(q));
+ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ q->rq_wb = rwb;
+}
+
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -706,6 +864,8 @@ int blk_register_queue(struct gendisk *disk)
if (q->mq_ops)
blk_mq_register_dev(dev, q);
+ blk_wb_init(q);
+
if (!q->request_fn)
return 0;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
new file mode 100644
index 0000000..82823c7
--- /dev/null
+++ b/block/blk-wbt.c
@@ -0,0 +1,704 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ * scaling step and scale down queue depth by a factor of 2x. The monitoring
+ * window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ * look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ * will temporarily boost write performance, snapping back to a stable
+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ * positive scaling steps where we shrink the monitoring window, a negative
+ * scaling step retains the default step==0 window size.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/swap.h>
+
+#include "blk-wbt.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+ /*
+ * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+ * from here depending on device stats
+ */
+ RWB_DEF_DEPTH = 16,
+
+ /*
+ * 100msec window
+ */
+ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
+
+ /*
+ * Disregard stats, if we don't meet this minimum
+ */
+ RWB_MIN_WRITE_SAMPLES = 3,
+
+ /*
+ * If we have this number of consecutive windows with not enough
+ * information to scale up or down, scale up.
+ */
+ RWB_UNKNOWN_BUMP = 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+ return rwb && rwb->wb_normal != 0;
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+ if (rwb_enabled(rwb)) {
+ const unsigned long cur = jiffies;
+
+ if (cur != *var)
+ *var = cur;
+ }
+}
+
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+ struct bdi_writeback *wb = &rwb->bdi->wb;
+
+ return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+ return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+ int i;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ struct rq_wait *rqw = &rwb->rq_wait[i];
+
+ if (waitqueue_active(&rqw->wait))
+ wake_up_all(&rqw->wait);
+ }
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+{
+ struct rq_wait *rqw;
+ int inflight, limit;
+
+ if (!(wb_acct & WBT_TRACKED))
+ return;
+
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ inflight = atomic_dec_return(&rqw->inflight);
+
+ /*
+ * wbt got disabled with IO in flight. Wake up any potential
+ * waiters, we don't have to do more than that.
+ */
+ if (unlikely(!rwb_enabled(rwb))) {
+ rwb_wake_all(rwb);
+ return;
+ }
+
+ /*
+ * If the device does write back caching, drop further down
+ * before we wake people up.
+ */
+ if (rwb->wc && !wb_recent_wait(rwb))
+ limit = 0;
+ else
+ limit = rwb->wb_normal;
+
+ /*
+ * Don't wake anyone up if we are above the normal limit.
+ */
+ if (inflight && inflight >= limit)
+ return;
+
+ if (waitqueue_active(&rqw->wait)) {
+ int diff = limit - inflight;
+
+ if (!inflight || diff >= rwb->wb_background / 2)
+ wake_up(&rqw->wait);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb)
+ return;
+
+ if (!wbt_is_tracked(stat)) {
+ if (rwb->sync_cookie == stat) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+
+ if (wbt_is_read(stat))
+ wb_timestamp(rwb, &rwb->last_comp);
+ wbt_clear_state(stat);
+ } else {
+ WARN_ON_ONCE(stat == rwb->sync_cookie);
+ __wbt_done(rwb, wbt_stat_to_mask(stat));
+ wbt_clear_state(stat);
+ }
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
+{
+ unsigned int depth;
+ bool ret = false;
+
+ if (!rwb->min_lat_nsec) {
+ rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+ return false;
+ }
+
+ /*
+ * For QD=1 devices, this is a special case. It's important for those
+ * to have one request ready when one completes, so force a depth of
+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+ * since the device can't have more than that in flight. If we're
+ * scaling down, then keep a setting of 1/1/1.
+ */
+ if (rwb->queue_depth == 1) {
+ if (rwb->scale_step > 0)
+ rwb->wb_max = rwb->wb_normal = 1;
+ else {
+ rwb->wb_max = rwb->wb_normal = 2;
+ ret = true;
+ }
+ rwb->wb_background = 1;
+ } else {
+ /*
+ * scale_step == 0 is our default state. If we have suffered
+ * latency spikes, step will be > 0, and we shrink the
+ * allowed write depths. If step is < 0, we're only doing
+ * writes, and we allow a temporarily higher depth to
+ * increase performance.
+ */
+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+ if (rwb->scale_step > 0)
+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+ else if (rwb->scale_step < 0) {
+ unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+ depth = 1 + ((depth - 1) << -rwb->scale_step);
+ if (depth > maxd) {
+ depth = maxd;
+ ret = true;
+ }
+ }
+
+ /*
+ * Set our max/normal/bg queue depths based on how far
+ * we have scaled down (->scale_step).
+ */
+ rwb->wb_max = depth;
+ rwb->wb_normal = (rwb->wb_max + 1) / 2;
+ rwb->wb_background = (rwb->wb_max + 3) / 4;
+ }
+
+ return ret;
+}
+
+static bool inline stat_sample_valid(struct blk_rq_stat *stat)
+{
+ /*
+ * We need at least one read sample, and a minimum of
+ * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
+ * that it's writes impacting us, and not just some sole read on
+ * a device that is in a lower power state.
+ */
+ return stat[0].nr_samples >= 1 &&
+ stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+ u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+ if (!issue || !rwb->sync_cookie)
+ return 0;
+
+ now = ktime_to_ns(ktime_get());
+ return now - issue;
+}
+
+enum {
+ LAT_OK = 1,
+ LAT_UNKNOWN,
+ LAT_UNKNOWN_WRITES,
+ LAT_EXCEEDED,
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+ u64 thislat;
+
+ /*
+ * If our stored sync issue exceeds the window size, or it
+ * exceeds our min target AND we haven't logged any entries,
+ * flag the latency as exceeded. wbt works off completion latencies,
+ * but for a flooded device, a single sync IO can take a long time
+ * to complete after being issued. If this time exceeds our
+ * monitoring window AND we didn't see any other completions in that
+ * window, then count that sync IO as a violation of the latency.
+ */
+ thislat = rwb_sync_issue_lat(rwb);
+ if (thislat > rwb->cur_win_nsec ||
+ (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+ trace_wbt_lat(rwb->bdi, thislat);
+ return LAT_EXCEEDED;
+ }
+
+ /*
+ * No read/write mix, if stat isn't valid
+ */
+ if (!stat_sample_valid(stat)) {
+ /*
+ * If we had writes in this stat window and the window is
+ * current, we're only doing writes. If a task recently
+ * waited or still has writes in flights, consider us doing
+ * just writes as well.
+ */
+ if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+ wb_recent_wait(rwb) || wbt_inflight(rwb))
+ return LAT_UNKNOWN_WRITES;
+ return LAT_UNKNOWN;
+ }
+
+ /*
+ * If the 'min' latency exceeds our target, step down.
+ */
+ if (stat[0].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(rwb->bdi, stat[0].min);
+ trace_wbt_stat(rwb->bdi, stat);
+ return LAT_EXCEEDED;
+ }
+
+ if (rwb->scale_step)
+ trace_wbt_stat(rwb->bdi, stat);
+
+ return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+ struct blk_rq_stat stat[2];
+
+ rwb->stat_ops->get(rwb->ops_data, stat);
+ return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+ rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+ /*
+ * Hit max in previous round, stop here
+ */
+ if (rwb->scaled_max)
+ return;
+
+ rwb->scale_step--;
+ rwb->unknown_cnt = 0;
+ rwb->stat_ops->clear(rwb->ops_data);
+
+ rwb->scaled_max = calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
+
+ rwb_trace_step(rwb, "step up");
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
+{
+ /*
+ * Stop scaling down when we've hit the limit. This also prevents
+ * ->scale_step from going to crazy values, if the device can't
+ * keep up.
+ */
+ if (rwb->wb_max == 1)
+ return;
+
+ if (rwb->scale_step < 0 && hard_throttle)
+ rwb->scale_step = 0;
+ else
+ rwb->scale_step++;
+
+ rwb->scaled_max = false;
+ rwb->unknown_cnt = 0;
+ rwb->stat_ops->clear(rwb->ops_data);
+ calc_wb_limits(rwb);
+ rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+ unsigned long expires;
+
+ if (rwb->scale_step > 0) {
+ /*
+ * We should speed this up, using some variant of a fast
+ * integer inverse square root calculation. Since we only do
+ * this for every window expiration, it's not a huge deal,
+ * though.
+ */
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+ int_sqrt((rwb->scale_step + 1) << 8));
+ } else {
+ /*
+ * For step < 0, we don't want to increase/decrease the
+ * window size.
+ */
+ rwb->cur_win_nsec = rwb->win_nsec;
+ }
+
+ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+ mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+ unsigned int inflight = wbt_inflight(rwb);
+ int status;
+
+ status = latency_exceeded(rwb);
+
+ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
+ /*
+ * If we exceeded the latency target, step down. If we did not,
+ * step one level up. If we don't know enough to say either exceeded
+ * or ok, then don't do anything.
+ */
+ switch (status) {
+ case LAT_EXCEEDED:
+ scale_down(rwb, true);
+ break;
+ case LAT_OK:
+ scale_up(rwb);
+ break;
+ case LAT_UNKNOWN_WRITES:
+ scale_up(rwb);
+ break;
+ case LAT_UNKNOWN:
+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+ break;
+ /*
+ * We get here for two reasons:
+ *
+ * 1) We previously scaled reduced depth, and we currently
+ * don't have a valid read/write sample. For that case,
+ * slowly return to center state (step == 0).
+ * 2) We started a the center step, but don't have a valid
+ * read/write sample, but we do have writes going on.
+ * Allow step to go negative, to increase write perf.
+ */
+ if (rwb->scale_step > 0)
+ scale_up(rwb);
+ else if (rwb->scale_step < 0)
+ scale_down(rwb, false);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Re-arm timer, if we have IO in flight
+ */
+ if (rwb->scale_step || inflight)
+ rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+ rwb->scale_step = 0;
+ rwb->scaled_max = false;
+ calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+ const unsigned long now = jiffies;
+
+ return time_before(now, rwb->last_issue + HZ / 10) ||
+ time_before(now, rwb->last_comp + HZ / 10);
+}
+
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+ unsigned int limit;
+
+ /*
+ * At this point we know it's a buffered write. If this is
+ * kswapd trying to free memory, or REQ_SYNC is set, set, then
+ * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+ * that. If the write is marked as a background write, then use
+ * the idle limit, or go to normal if we haven't had competing
+ * IO for a bit.
+ */
+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ limit = rwb->wb_max;
+ else if ((rw & REQ_BG) || close_io(rwb)) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to half the depth for background writeback.
+ */
+ limit = rwb->wb_background;
+ } else
+ limit = rwb->wb_normal;
+
+ return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+ unsigned long rw)
+{
+ /*
+ * inc it here even if disabled, since we'll dec it at completion.
+ * this only happens if the task was sleeping in __wbt_wait(),
+ * and someone turned it off at the same time.
+ */
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
+ return true;
+ }
+
+ return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+ struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+ DEFINE_WAIT(wait);
+
+ if (may_queue(rwb, rqw, rw))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (may_queue(rwb, rqw, rw))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rqw->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
+{
+ const int op = rw >> BIO_OP_SHIFT;
+
+ /*
+ * If not a WRITE (or a discard), do nothing
+ */
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+ unsigned int ret = 0;
+
+ if (!rwb_enabled(rwb))
+ return 0;
+
+ if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
+ ret = WBT_READ;
+
+ if (!wbt_should_throttle(rwb, rw)) {
+ if (ret & WBT_READ)
+ wb_timestamp(rwb, &rwb->last_issue);
+ return ret;
+ }
+
+ __wbt_wait(rwb, rw, lock);
+
+ if (!timer_pending(&rwb->window_timer))
+ rwb_arm_timer(rwb);
+
+ if (current_is_kswapd())
+ ret |= WBT_KSWAPD;
+
+ return ret | WBT_TRACKED;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+
+ /*
+ * Track sync issue, in case it takes a long time to complete. Allows
+ * us to react quicker, if a sync IO takes a long time to complete.
+ * Note that this is just a hint. 'stat' can go away when the
+ * request completes, so it's important we never dereference it. We
+ * only use the address to compare with, which is why we store the
+ * sync_issue time locally.
+ */
+ if (wbt_is_read(stat) && !rwb->sync_issue) {
+ rwb->sync_cookie = stat;
+ rwb->sync_issue = blk_stat_time(stat);
+ }
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+ if (stat == rwb->sync_cookie) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+ if (rwb) {
+ rwb->queue_depth = depth;
+ wbt_update_limits(rwb);
+ }
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+ if (rwb)
+ rwb->wc = write_cache_on;
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
+ wbt_update_limits(rwb);
+ }
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
+ void *ops_data)
+{
+ struct rq_wb *rwb;
+ int i;
+
+ BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
+
+ if (!ops->get || !ops->is_current || !ops->clear)
+ return ERR_PTR(-EINVAL);
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ atomic_set(&rwb->rq_wait[i].inflight, 0);
+ init_waitqueue_head(&rwb->rq_wait[i].wait);
+ }
+
+ setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+ rwb->wc = 1;
+ rwb->queue_depth = RWB_DEF_DEPTH;
+ rwb->last_comp = rwb->last_issue = jiffies;
+ rwb->bdi = bdi;
+ rwb->win_nsec = RWB_WINDOW_NSEC;
+ rwb->stat_ops = ops;
+ rwb->ops_data = ops_data;
+ wbt_update_limits(rwb);
+ return rwb;
+}
+
+void wbt_exit(struct rq_wb *rwb)
+{
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ kfree(rwb);
+ }
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
new file mode 100644
index 0000000..7af8968
--- /dev/null
+++ b/block/blk-wbt.h
@@ -0,0 +1,166 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+#include "blk-stat.h"
+
+enum wbt_flags {
+ WBT_TRACKED = 1, /* write, tracked for throttling */
+ WBT_READ = 2, /* read */
+ WBT_KSWAPD = 4, /* write, from kswapd */
+
+ WBT_NR_BITS = 3, /* number of bits */
+};
+
+enum {
+ WBT_NUM_RWQ = 2,
+};
+
+static inline void wbt_clear_state(struct blk_issue_stat *stat)
+{
+ stat->time &= BLK_STAT_TIME_MASK;
+}
+
+static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
+{
+ return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+}
+
+static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
+{
+ stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+}
+
+static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
+{
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct blk_issue_stat *stat)
+{
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+}
+
+struct wb_stat_ops {
+ void (*get)(void *, struct blk_rq_stat *);
+ bool (*is_current)(struct blk_rq_stat *);
+ void (*clear)(void *);
+};
+
+struct rq_wait {
+ wait_queue_head_t wait;
+ atomic_t inflight;
+};
+
+struct rq_wb {
+ /*
+ * Settings that govern how we throttle
+ */
+ unsigned int wb_background; /* background writeback */
+ unsigned int wb_normal; /* normal writeback */
+ unsigned int wb_max; /* max throughput writeback */
+ int scale_step;
+ bool scaled_max;
+
+ /*
+ * Number of consecutive periods where we don't have enough
+ * information to make a firm scale up/down decision.
+ */
+ unsigned int unknown_cnt;
+
+ u64 win_nsec; /* default window size */
+ u64 cur_win_nsec; /* current window size */
+
+ struct timer_list window_timer;
+
+ s64 sync_issue;
+ void *sync_cookie;
+
+ unsigned int wc;
+ unsigned int queue_depth;
+
+ unsigned long last_issue; /* last non-throttled issue */
+ unsigned long last_comp; /* last non-throttled comp */
+ unsigned long min_lat_nsec;
+ struct backing_dev_info *bdi;
+ struct rq_wait rq_wait[WBT_NUM_RWQ];
+
+ struct wb_stat_ops *stat_ops;
+ void *ops_data;
+};
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+ return ret;
+}
+
+struct backing_dev_info;
+
+#ifdef CONFIG_BLK_WBT
+
+void __wbt_done(struct rq_wb *, enum wbt_flags);
+void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+enum wbt_flags wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
+void wbt_exit(struct rq_wb *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_disable(struct rq_wb *);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#else
+
+static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
+{
+}
+static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline enum wbt_flags wbt_wait(struct rq_wb *rwb,
+ unsigned int rw, spinlock_t *lock)
+{
+ return 0;
+}
+static inline struct rq_wb *wbt_init(struct backing_dev_info *bdi,
+ struct wb_stat_ops *ops, void *ops_data)
+{
+ return ERR_PTR(-EINVAL);
+}
+static inline void wbt_exit(struct rq_wb *rbw)
+{
+}
+static inline void wbt_update_limits(struct rq_wb *rwb)
+{
+}
+static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_disable(struct rq_wb *rwb)
+{
+}
+static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+}
+static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+{
+}
+
+#endif /* CONFIG_BLK_WBT */
+
+#endif
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3ab6807..1be81bb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -16,6 +16,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
+#include "blk-wbt.h"
/*
* tunables
@@ -3771,9 +3772,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
uint64_t serial_nr;
+ bool nonroot_cg;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
+ nonroot_cg = bio_blkcg(bio) != &blkcg_root;
rcu_read_unlock();
/*
@@ -3784,6 +3787,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
return;
/*
+ * If we have a non-root cgroup, we can depend on that to
+ * do proper throttling of writes. Turn off wbt for that
+ * case.
+ */
+ if (nonroot_cg) {
+ struct request_queue *q = cfqd->queue;
+
+ wbt_disable(q->rq_wb);
+ }
+
+ /*
* Drop reference to queues. New queues will be assigned in new
* group upon arrival of fresh requests.
*/
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b5afd49..7d09955 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -332,7 +332,7 @@ static inline void swim_motor(struct swim __iomem *base,
if (swim_readbit(base, MOTOR_ON))
break;
current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
} else if (action == OFF) {
swim_action(base, MOTOR_OFF);
@@ -351,7 +351,7 @@ static inline void swim_eject(struct swim __iomem *base)
if (!swim_readbit(base, DISK_IN))
break;
current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
swim_select(base, RELAX);
}
@@ -375,7 +375,7 @@ static inline int swim_step(struct swim __iomem *base)
for (wait = 0; wait < HZ; wait++) {
current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(1);
+ schedule_min_hrtimeout();
swim_select(base, RELAX);
if (!swim_readbit(base, STEP))
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 6c867fb..87adc94 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -883,7 +883,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate)
* then host can communicate with new baudrate to controller
*/
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS));
+ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS));
set_current_state(TASK_INTERRUPTIBLE);
return 0;
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index fcdd886..a018d18 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2952,7 +2952,7 @@ static void cleanup_smi_msgs(ipmi_smi_t intf)
/* Current message first, to preserve order */
while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) {
/* Wait for the message to clear out. */
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
/* No need for locks, the interface is down. */
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 5673fff..8bb6345 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -1190,7 +1190,7 @@ static int ssif_remove(struct i2c_client *client)
/* make sure the driver is not looking for flags any more. */
while (ssif_info->ssif_state != SSIF_NORMAL)
- schedule_timeout(1);
+ schedule_min_hrtimeout();
ssif_info->stopping = true;
del_timer_sync(&ssif_info->retry_timer);
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index 10e5632..b7a8197 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -198,7 +198,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos)
add_wait_queue(&sd->sd_rq, &wait);
spin_unlock_irqrestore(&sd->sd_rlock, flags);
- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT));
+ schedule_msec_hrtimeout((SCDRV_TIMEOUT));
remove_wait_queue(&sd->sd_rq, &wait);
if (signal_pending(current)) {
@@ -294,7 +294,7 @@ scdrv_write(struct file *file, const char __user *buf,
add_wait_queue(&sd->sd_wq, &wait);
spin_unlock_irqrestore(&sd->sd_wlock, flags);
- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT));
+ schedule_msec_hrtimeout((SCDRV_TIMEOUT));
remove_wait_queue(&sd->sd_wq, &wait);
if (signal_pending(current)) {
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
index b6a0806..b5b02cf 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv,
DRM_ERROR("SVGA device lockup.\n");
break;
}
- schedule_timeout(1);
+ schedule_min_hrtimeout();
if (interruptible && signal_pending(current)) {
ret = -ERESTARTSYS;
break;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
index 0c7e172..4c1555c 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
@@ -156,7 +156,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv,
break;
}
if (lazy)
- schedule_timeout(1);
+ schedule_min_hrtimeout();
else if ((++count & 0x0F) == 0) {
/**
* FIXME: Use schedule_hr_timeout here for
diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c
index 15aa49d..991e8a7 100644
--- a/drivers/hwmon/fam15h_power.c
+++ b/drivers/hwmon/fam15h_power.c
@@ -238,7 +238,7 @@ static ssize_t acc_show_power(struct device *dev,
prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu];
}
- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period));
+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period));
if (leftover)
return 0;
diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c
index 04598ae..a8c095d 100644
--- a/drivers/iio/light/tsl2563.c
+++ b/drivers/iio/light/tsl2563.c
@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip)
default:
delay = 402;
}
- /*
- * TODO: Make sure that we wait at least required delay but why we
- * have to extend it one tick more?
- */
- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2);
+ schedule_msec_hrtimeout_interruptible(delay + 1);
}
static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc)
diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c
index 503b7c4..d790a90 100644
--- a/drivers/media/i2c/msp3400-driver.c
+++ b/drivers/media/i2c/msp3400-driver.c
@@ -184,7 +184,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr)
break;
v4l_warn(client, "I/O error #%d (read 0x%02x/0x%02x)\n", err,
dev, addr);
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
if (err == 3) {
v4l_warn(client, "resetting chip, sound will go off.\n");
@@ -225,7 +225,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val)
break;
v4l_warn(client, "I/O error #%d (write 0x%02x/0x%02x)\n", err,
dev, addr);
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
if (err == 3) {
v4l_warn(client, "resetting chip, sound will go off.\n");
diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c
index 38dc6b8..3cd3098 100644
--- a/drivers/media/pci/cx18/cx18-gpio.c
+++ b/drivers/media/pci/cx18/cx18-gpio.c
@@ -95,11 +95,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi,
/* Assert */
gpio_update(cx, mask, ~active_lo);
- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs));
+ schedule_msec_hrtimeout_uninterruptible((assert_msecs));
/* Deassert */
gpio_update(cx, mask, ~active_hi);
- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs));
+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs));
}
/*
diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c
index f752f39..23372af 100644
--- a/drivers/media/pci/ivtv/ivtv-gpio.c
+++ b/drivers/media/pci/ivtv/ivtv-gpio.c
@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv)
curout = (curout & ~0xF) | 1;
write_reg(curout, IVTV_REG_GPIO_OUT);
/* We could use something else for smaller time */
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
curout |= 2;
write_reg(curout, IVTV_REG_GPIO_OUT);
curdir &= ~0x80;
@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value)
curout = read_reg(IVTV_REG_GPIO_OUT);
curout &= ~(1 << itv->card->xceive_pin);
write_reg(curout, IVTV_REG_GPIO_OUT);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
curout |= 1 << itv->card->xceive_pin;
write_reg(curout, IVTV_REG_GPIO_OUT);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
return 0;
}
diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c
index 2dc4b20..8e061cf 100644
--- a/drivers/media/pci/ivtv/ivtv-ioctl.c
+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c
@@ -1151,7 +1151,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std)
TASK_UNINTERRUPTIBLE);
if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100)
break;
- schedule_timeout(msecs_to_jiffies(25));
+ schedule_msec_hrtimeout((25));
}
finish_wait(&itv->vsync_waitq, &wait);
mutex_lock(&itv->serialize_lock);
diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c
index d27c6df..e9ffc4e 100644
--- a/drivers/media/pci/ivtv/ivtv-streams.c
+++ b/drivers/media/pci/ivtv/ivtv-streams.c
@@ -834,7 +834,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end)
while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) &&
time_before(jiffies,
then + msecs_to_jiffies(2000))) {
- schedule_timeout(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout((10));
}
/* To convert jiffies to ms, we must multiply by 1000
diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c
index c2927fd..bdee269 100644
--- a/drivers/media/radio/radio-mr800.c
+++ b/drivers/media/radio/radio-mr800.c
@@ -382,7 +382,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv,
retval = -ENODATA;
break;
}
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
+ if (schedule_msec_hrtimeout_interruptible((10))) {
retval = -ERESTARTSYS;
break;
}
diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c
index 83fe7ab..aaae5fa 100644
--- a/drivers/media/radio/radio-tea5777.c
+++ b/drivers/media/radio/radio-tea5777.c
@@ -249,7 +249,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait)
}
if (wait) {
- if (schedule_timeout_interruptible(msecs_to_jiffies(wait)))
+ if (schedule_msec_hrtimeout_interruptible((wait)))
return -ERESTARTSYS;
}
diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c
index 4dc2067..29f4416 100644
--- a/drivers/media/radio/tea575x.c
+++ b/drivers/media/radio/tea575x.c
@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea,
for (;;) {
if (time_after(jiffies, timeout))
break;
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
+ if (schedule_msec_hrtimeout_interruptible((10))) {
/* some signal arrived, stop search */
tea->val &= ~TEA575X_BIT_SEARCH;
snd_tea575x_set_freq(tea);
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index d6fb2e1..7ac951b 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -253,7 +253,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync)
break;
/* yield to other processes */
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
return UCB_ADC_DAT(val);
diff --git a/drivers/misc/panel.c b/drivers/misc/panel.c
index 6030ac5..f0c1a10 100644
--- a/drivers/misc/panel.c
+++ b/drivers/misc/panel.c
@@ -760,7 +760,7 @@ static void long_sleep(int ms)
if (in_interrupt())
mdelay(ms);
else
- schedule_timeout_interruptible(msecs_to_jiffies(ms));
+ schedule_msec_hrtimeout_interruptible((ms));
}
/*
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
index 128d561..38e68e9 100644
--- a/drivers/misc/sgi-xp/xpc_channel.c
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -837,7 +837,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch)
atomic_inc(&ch->n_on_msg_allocate_wq);
prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE);
- ret = schedule_timeout(1);
+ ret = schedule_min_hrtimeout();
finish_wait(&ch->msg_allocate_wq, &wait);
atomic_dec(&ch->n_on_msg_allocate_wq);
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index ddabce7..67fb5ce 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -944,7 +944,7 @@ static void cfhsi_wake_down(struct work_struct *work)
break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
retry--;
}
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
index 838545c..34f8972 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
@@ -250,7 +250,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff)
} else {
/* the PCAN-USB needs time to init */
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT));
+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT));
}
return err;
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index f33460c..93d335d 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2361,7 +2361,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev)
while (!skb_queue_empty(&dev->rxq) &&
!skb_queue_empty(&dev->txq) &&
!skb_queue_empty(&dev->done)) {
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
set_current_state(TASK_UNINTERRUPTIBLE);
netif_dbg(dev, ifdown, dev->net,
"waited for %d urb completions\n", temp);
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index d5071e3..7ca0618 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -769,7 +769,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q)
spin_lock_irqsave(&q->lock, flags);
while (!skb_queue_empty(q)) {
spin_unlock_irqrestore(&q->lock, flags);
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
set_current_state(TASK_UNINTERRUPTIBLE);
spin_lock_irqsave(&q->lock, flags);
}
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index 26fc8ec..378d345 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -91,7 +91,6 @@ int ath_descdma_setup(struct ath_softc *sc, struct ath_descdma *dd,
#define ATH_RXBUF 512
#define ATH_TXBUF 512
#define ATH_TXBUF_RESERVE 5
-#define ATH_MAX_QDEPTH (ATH_TXBUF / 4 - ATH_TXBUF_RESERVE)
#define ATH_TXMAXTRY 13
#define ATH_MAX_SW_RETRIES 30
@@ -145,7 +144,7 @@ int ath_descdma_setup(struct ath_softc *sc, struct ath_descdma *dd,
#define BAW_WITHIN(_start, _bawsz, _seqno) \
((((_seqno) - (_start)) & 4095) < (_bawsz))
-#define ATH_AN_2_TID(_an, _tidno) (&(_an)->tid[(_tidno)])
+#define ATH_AN_2_TID(_an, _tidno) ath_node_to_tid(_an, _tidno)
#define IS_HT_RATE(rate) (rate & 0x80)
#define IS_CCK_RATE(rate) ((rate >= 0x18) && (rate <= 0x1e))
@@ -164,7 +163,6 @@ struct ath_txq {
spinlock_t axq_lock;
u32 axq_depth;
u32 axq_ampdu_depth;
- bool stopped;
bool axq_tx_inprogress;
struct list_head txq_fifo[ATH_TXFIFO_DEPTH];
u8 txq_headidx;
@@ -232,7 +230,6 @@ struct ath_buf {
struct ath_atx_tid {
struct list_head list;
- struct sk_buff_head buf_q;
struct sk_buff_head retry_q;
struct ath_node *an;
struct ath_txq *txq;
@@ -247,13 +244,13 @@ struct ath_atx_tid {
s8 bar_index;
bool active;
bool clear_ps_filter;
+ bool has_queued;
};
struct ath_node {
struct ath_softc *sc;
struct ieee80211_sta *sta; /* station struct we're part of */
struct ieee80211_vif *vif; /* interface with which we're associated */
- struct ath_atx_tid tid[IEEE80211_NUM_TIDS];
u16 maxampdu;
u8 mpdudensity;
@@ -276,7 +273,6 @@ struct ath_tx_control {
struct ath_node *an;
struct ieee80211_sta *sta;
u8 paprd;
- bool force_channel;
};
@@ -293,7 +289,6 @@ struct ath_tx {
struct ath_descdma txdma;
struct ath_txq *txq_map[IEEE80211_NUM_ACS];
struct ath_txq *uapsdq;
- u32 txq_max_pending[IEEE80211_NUM_ACS];
u16 max_aggr_framelen[IEEE80211_NUM_ACS][4][32];
};
@@ -421,6 +416,22 @@ struct ath_offchannel {
int duration;
};
+static inline struct ath_atx_tid *
+ath_node_to_tid(struct ath_node *an, u8 tidno)
+{
+ struct ieee80211_sta *sta = an->sta;
+ struct ieee80211_vif *vif = an->vif;
+ struct ieee80211_txq *txq;
+
+ BUG_ON(!vif);
+ if (sta)
+ txq = sta->txq[tidno % ARRAY_SIZE(sta->txq)];
+ else
+ txq = vif->txq;
+
+ return (struct ath_atx_tid *) txq->drv_priv;
+}
+
#define case_rtn_string(val) case val: return #val
#define ath_for_each_chanctx(_sc, _ctx) \
@@ -575,7 +586,6 @@ void ath_tx_edma_tasklet(struct ath_softc *sc);
int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
u16 tid, u16 *ssn);
void ath_tx_aggr_stop(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid);
-void ath_tx_aggr_resume(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid);
void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an);
void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
@@ -585,6 +595,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
u16 tids, int nframes,
enum ieee80211_frame_release_type reason,
bool more_data);
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue);
/********/
/* VIFs */
diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
index 57e26a6..929dd70 100644
--- a/drivers/net/wireless/ath/ath9k/channel.c
+++ b/drivers/net/wireless/ath/ath9k/channel.c
@@ -1010,7 +1010,6 @@ static void ath_scan_send_probe(struct ath_softc *sc,
goto error;
txctl.txq = sc->tx.txq_map[IEEE80211_AC_VO];
- txctl.force_channel = true;
if (ath_tx_start(sc->hw, skb, &txctl))
goto error;
@@ -1133,7 +1132,6 @@ ath_chanctx_send_vif_ps_frame(struct ath_softc *sc, struct ath_vif *avp,
memset(&txctl, 0, sizeof(txctl));
txctl.txq = sc->tx.txq_map[IEEE80211_AC_VO];
txctl.sta = sta;
- txctl.force_channel = true;
if (ath_tx_start(sc->hw, skb, &txctl)) {
ieee80211_free_txskb(sc->hw, skb);
return false;
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index c56e40f..89a94dd 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -600,7 +600,6 @@ static int read_file_xmit(struct seq_file *file, void *data)
PR("MPDUs XRetried: ", xretries);
PR("Aggregates: ", a_aggr);
PR("AMPDUs Queued HW:", a_queued_hw);
- PR("AMPDUs Queued SW:", a_queued_sw);
PR("AMPDUs Completed:", a_completed);
PR("AMPDUs Retried: ", a_retries);
PR("AMPDUs XRetried: ", a_xretries);
@@ -629,8 +628,7 @@ static void print_queue(struct ath_softc *sc, struct ath_txq *txq,
seq_printf(file, "%s: %d ", "qnum", txq->axq_qnum);
seq_printf(file, "%s: %2d ", "qdepth", txq->axq_depth);
seq_printf(file, "%s: %2d ", "ampdu-depth", txq->axq_ampdu_depth);
- seq_printf(file, "%s: %3d ", "pending", txq->pending_frames);
- seq_printf(file, "%s: %d\n", "stopped", txq->stopped);
+ seq_printf(file, "%s: %3d\n", "pending", txq->pending_frames);
ath_txq_unlock(sc, txq);
}
@@ -1208,7 +1206,6 @@ static const char ath9k_gstrings_stats[][ETH_GSTRING_LEN] = {
AMKSTR(d_tx_mpdu_xretries),
AMKSTR(d_tx_aggregates),
AMKSTR(d_tx_ampdus_queued_hw),
- AMKSTR(d_tx_ampdus_queued_sw),
AMKSTR(d_tx_ampdus_completed),
AMKSTR(d_tx_ampdu_retries),
AMKSTR(d_tx_ampdu_xretries),
@@ -1288,7 +1285,6 @@ void ath9k_get_et_stats(struct ieee80211_hw *hw,
AWDATA(xretries);
AWDATA(a_aggr);
AWDATA(a_queued_hw);
- AWDATA(a_queued_sw);
AWDATA(a_completed);
AWDATA(a_retries);
AWDATA(a_xretries);
@@ -1346,14 +1342,6 @@ int ath9k_init_debug(struct ath_hw *ah)
read_file_xmit);
debugfs_create_devm_seqfile(sc->dev, "queues", sc->debug.debugfs_phy,
read_file_queues);
- debugfs_create_u32("qlen_bk", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
- &sc->tx.txq_max_pending[IEEE80211_AC_BK]);
- debugfs_create_u32("qlen_be", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
- &sc->tx.txq_max_pending[IEEE80211_AC_BE]);
- debugfs_create_u32("qlen_vi", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
- &sc->tx.txq_max_pending[IEEE80211_AC_VI]);
- debugfs_create_u32("qlen_vo", S_IRUSR | S_IWUSR, sc->debug.debugfs_phy,
- &sc->tx.txq_max_pending[IEEE80211_AC_VO]);
debugfs_create_devm_seqfile(sc->dev, "misc", sc->debug.debugfs_phy,
read_file_misc);
debugfs_create_devm_seqfile(sc->dev, "reset", sc->debug.debugfs_phy,
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
index cd68c5f..a078cdd 100644
--- a/drivers/net/wireless/ath/ath9k/debug.h
+++ b/drivers/net/wireless/ath/ath9k/debug.h
@@ -147,7 +147,6 @@ struct ath_interrupt_stats {
* @completed: Total MPDUs (non-aggr) completed
* @a_aggr: Total no. of aggregates queued
* @a_queued_hw: Total AMPDUs queued to hardware
- * @a_queued_sw: Total AMPDUs queued to software queues
* @a_completed: Total AMPDUs completed
* @a_retries: No. of AMPDUs retried (SW)
* @a_xretries: No. of AMPDUs dropped due to xretries
@@ -174,7 +173,6 @@ struct ath_tx_stats {
u32 xretries;
u32 a_aggr;
u32 a_queued_hw;
- u32 a_queued_sw;
u32 a_completed;
u32 a_retries;
u32 a_xretries;
diff --git a/drivers/net/wireless/ath/ath9k/debug_sta.c b/drivers/net/wireless/ath/ath9k/debug_sta.c
index b66cfa9..2a3a3c4 100644
--- a/drivers/net/wireless/ath/ath9k/debug_sta.c
+++ b/drivers/net/wireless/ath/ath9k/debug_sta.c
@@ -52,8 +52,8 @@ static ssize_t read_file_node_aggr(struct file *file, char __user *user_buf,
"TID", "SEQ_START", "SEQ_NEXT", "BAW_SIZE",
"BAW_HEAD", "BAW_TAIL", "BAR_IDX", "SCHED", "PAUSED");
- for (tidno = 0, tid = &an->tid[tidno];
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
+ tid = ath_node_to_tid(an, tidno);
txq = tid->txq;
ath_txq_lock(sc, txq);
if (tid->active) {
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index cfa3fe8..96bba17 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -358,7 +358,6 @@ static int ath9k_init_queues(struct ath_softc *sc)
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
sc->tx.txq_map[i] = ath_txq_setup(sc, ATH9K_TX_QUEUE_DATA, i);
sc->tx.txq_map[i]->mac80211_qnum = i;
- sc->tx.txq_max_pending[i] = ATH_MAX_QDEPTH;
}
return 0;
}
@@ -877,6 +876,7 @@ static void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw)
hw->max_rate_tries = 10;
hw->sta_data_size = sizeof(struct ath_node);
hw->vif_data_size = sizeof(struct ath_vif);
+ hw->txq_data_size = sizeof(struct ath_atx_tid);
hw->extra_tx_headroom = 4;
hw->wiphy->available_antennas_rx = BIT(ah->caps.max_rxchains) - 1;
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index e9f32b5..59e3bd0 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1902,9 +1902,11 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
bool flush = false;
int ret = 0;
struct ieee80211_sta *sta = params->sta;
+ struct ath_node *an = (struct ath_node *)sta->drv_priv;
enum ieee80211_ampdu_mlme_action action = params->action;
u16 tid = params->tid;
u16 *ssn = &params->ssn;
+ struct ath_atx_tid *atid;
mutex_lock(&sc->mutex);
@@ -1937,9 +1939,9 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
ath9k_ps_restore(sc);
break;
case IEEE80211_AMPDU_TX_OPERATIONAL:
- ath9k_ps_wakeup(sc);
- ath_tx_aggr_resume(sc, sta, tid);
- ath9k_ps_restore(sc);
+ atid = ath_node_to_tid(an, tid);
+ atid->baw_size = IEEE80211_MIN_AMPDU_BUF <<
+ sta->ht_cap.ampdu_factor;
break;
default:
ath_err(ath9k_hw_common(sc->sc_ah), "Unknown AMPDU action\n");
@@ -2701,4 +2703,5 @@ struct ieee80211_ops ath9k_ops = {
.sw_scan_start = ath9k_sw_scan_start,
.sw_scan_complete = ath9k_sw_scan_complete,
.get_txpower = ath9k_get_txpower,
+ .wake_tx_queue = ath9k_wake_tx_queue,
};
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index e47286b..4e2f3ac 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -67,6 +67,8 @@ static struct ath_buf *ath_tx_setup_buffer(struct ath_softc *sc,
struct ath_txq *txq,
struct ath_atx_tid *tid,
struct sk_buff *skb);
+static int ath_tx_prepare(struct ieee80211_hw *hw, struct sk_buff *skb,
+ struct ath_tx_control *txctl);
enum {
MCS_HT20,
@@ -137,6 +139,26 @@ static void ath_tx_queue_tid(struct ath_softc *sc, struct ath_txq *txq,
list_add_tail(&tid->list, list);
}
+void ath9k_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *queue)
+{
+ struct ath_softc *sc = hw->priv;
+ struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+ struct ath_atx_tid *tid = (struct ath_atx_tid *) queue->drv_priv;
+ struct ath_txq *txq = tid->txq;
+
+ ath_dbg(common, QUEUE, "Waking TX queue: %pM (%d)\n",
+ queue->sta ? queue->sta->addr : queue->vif->addr,
+ tid->tidno);
+
+ ath_txq_lock(sc, txq);
+
+ tid->has_queued = true;
+ ath_tx_queue_tid(sc, txq, tid);
+ ath_txq_schedule(sc, txq);
+
+ ath_txq_unlock(sc, txq);
+}
+
static struct ath_frame_info *get_frame_info(struct sk_buff *skb)
{
struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
@@ -164,7 +186,6 @@ static void ath_set_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta,
static void ath_txq_skb_done(struct ath_softc *sc, struct ath_txq *txq,
struct sk_buff *skb)
{
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ath_frame_info *fi = get_frame_info(skb);
int q = fi->txq;
@@ -175,14 +196,6 @@ static void ath_txq_skb_done(struct ath_softc *sc, struct ath_txq *txq,
if (WARN_ON(--txq->pending_frames < 0))
txq->pending_frames = 0;
- if (txq->stopped &&
- txq->pending_frames < sc->tx.txq_max_pending[q]) {
- if (ath9k_is_chanctx_enabled())
- ieee80211_wake_queue(sc->hw, info->hw_queue);
- else
- ieee80211_wake_queue(sc->hw, q);
- txq->stopped = false;
- }
}
static struct ath_atx_tid *
@@ -192,9 +205,48 @@ ath_get_skb_tid(struct ath_softc *sc, struct ath_node *an, struct sk_buff *skb)
return ATH_AN_2_TID(an, tidno);
}
+static struct sk_buff *
+ath_tid_pull(struct ath_atx_tid *tid)
+{
+ struct ieee80211_txq *txq = container_of((void*)tid, struct ieee80211_txq, drv_priv);
+ struct ath_softc *sc = tid->an->sc;
+ struct ieee80211_hw *hw = sc->hw;
+ struct ath_tx_control txctl = {
+ .txq = tid->txq,
+ .sta = tid->an->sta,
+ };
+ struct sk_buff *skb;
+ struct ath_frame_info *fi;
+ int q;
+
+ if (!tid->has_queued)
+ return NULL;
+
+ skb = ieee80211_tx_dequeue(hw, txq);
+ if (!skb) {
+ tid->has_queued = false;
+ return NULL;
+ }
+
+ if (ath_tx_prepare(hw, skb, &txctl)) {
+ ieee80211_free_txskb(hw, skb);
+ return NULL;
+ }
+
+ q = skb_get_queue_mapping(skb);
+ if (tid->txq == sc->tx.txq_map[q]) {
+ fi = get_frame_info(skb);
+ fi->txq = q;
+ ++tid->txq->pending_frames;
+ }
+
+ return skb;
+ }
+
+
static bool ath_tid_has_buffered(struct ath_atx_tid *tid)
{
- return !skb_queue_empty(&tid->buf_q) || !skb_queue_empty(&tid->retry_q);
+ return !skb_queue_empty(&tid->retry_q) || tid->has_queued;
}
static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
@@ -203,46 +255,11 @@ static struct sk_buff *ath_tid_dequeue(struct ath_atx_tid *tid)
skb = __skb_dequeue(&tid->retry_q);
if (!skb)
- skb = __skb_dequeue(&tid->buf_q);
+ skb = ath_tid_pull(tid);
return skb;
}
-/*
- * ath_tx_tid_change_state:
- * - clears a-mpdu flag of previous session
- * - force sequence number allocation to fix next BlockAck Window
- */
-static void
-ath_tx_tid_change_state(struct ath_softc *sc, struct ath_atx_tid *tid)
-{
- struct ath_txq *txq = tid->txq;
- struct ieee80211_tx_info *tx_info;
- struct sk_buff *skb, *tskb;
- struct ath_buf *bf;
- struct ath_frame_info *fi;
-
- skb_queue_walk_safe(&tid->buf_q, skb, tskb) {
- fi = get_frame_info(skb);
- bf = fi->bf;
-
- tx_info = IEEE80211_SKB_CB(skb);
- tx_info->flags &= ~IEEE80211_TX_CTL_AMPDU;
-
- if (bf)
- continue;
-
- bf = ath_tx_setup_buffer(sc, txq, tid, skb);
- if (!bf) {
- __skb_unlink(skb, &tid->buf_q);
- ath_txq_skb_done(sc, txq, skb);
- ieee80211_free_txskb(sc->hw, skb);
- continue;
- }
- }
-
-}
-
static void ath_tx_flush_tid(struct ath_softc *sc, struct ath_atx_tid *tid)
{
struct ath_txq *txq = tid->txq;
@@ -883,20 +900,16 @@ static int ath_compute_num_delims(struct ath_softc *sc, struct ath_atx_tid *tid,
static struct ath_buf *
ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
- struct ath_atx_tid *tid, struct sk_buff_head **q)
+ struct ath_atx_tid *tid)
{
struct ieee80211_tx_info *tx_info;
struct ath_frame_info *fi;
- struct sk_buff *skb;
+ struct sk_buff *skb, *first_skb = NULL;
struct ath_buf *bf;
u16 seqno;
while (1) {
- *q = &tid->retry_q;
- if (skb_queue_empty(*q))
- *q = &tid->buf_q;
-
- skb = skb_peek(*q);
+ skb = ath_tid_dequeue(tid);
if (!skb)
break;
@@ -908,7 +921,6 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
bf->bf_state.stale = false;
if (!bf) {
- __skb_unlink(skb, *q);
ath_txq_skb_done(sc, txq, skb);
ieee80211_free_txskb(sc->hw, skb);
continue;
@@ -937,8 +949,20 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
seqno = bf->bf_state.seqno;
/* do not step over block-ack window */
- if (!BAW_WITHIN(tid->seq_start, tid->baw_size, seqno))
+ if (!BAW_WITHIN(tid->seq_start, tid->baw_size, seqno)) {
+ __skb_queue_tail(&tid->retry_q, skb);
+
+ /* If there are other skbs in the retry q, they are
+ * probably within the BAW, so loop immediately to get
+ * one of them. Otherwise the queue can get stuck. */
+ if (!skb_queue_is_first(&tid->retry_q, skb) &&
+ !WARN_ON(skb == first_skb)) {
+ if(!first_skb) /* infinite loop prevention */
+ first_skb = skb;
+ continue;
+ }
break;
+ }
if (tid->bar_index > ATH_BA_INDEX(tid->seq_start, seqno)) {
struct ath_tx_status ts = {};
@@ -946,7 +970,6 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
INIT_LIST_HEAD(&bf_head);
list_add(&bf->list, &bf_head);
- __skb_unlink(skb, *q);
ath_tx_update_baw(sc, tid, seqno);
ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, &ts, 0);
continue;
@@ -958,11 +981,10 @@ ath_tx_get_tid_subframe(struct ath_softc *sc, struct ath_txq *txq,
return NULL;
}
-static bool
+static int
ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
struct ath_atx_tid *tid, struct list_head *bf_q,
- struct ath_buf *bf_first, struct sk_buff_head *tid_q,
- int *aggr_len)
+ struct ath_buf *bf_first)
{
#define PADBYTES(_len) ((4 - ((_len) % 4)) % 4)
struct ath_buf *bf = bf_first, *bf_prev = NULL;
@@ -972,12 +994,13 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
struct ieee80211_tx_info *tx_info;
struct ath_frame_info *fi;
struct sk_buff *skb;
- bool closed = false;
+
bf = bf_first;
aggr_limit = ath_lookup_rate(sc, bf, tid);
- do {
+ while (bf)
+ {
skb = bf->bf_mpdu;
fi = get_frame_info(skb);
@@ -986,12 +1009,12 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
if (nframes) {
if (aggr_limit < al + bpad + al_delta ||
ath_lookup_legacy(bf) || nframes >= h_baw)
- break;
+ goto stop;
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
if ((tx_info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) ||
!(tx_info->flags & IEEE80211_TX_CTL_AMPDU))
- break;
+ goto stop;
}
/* add padding for previous frame to aggregation length */
@@ -1013,20 +1036,18 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
ath_tx_addto_baw(sc, tid, bf);
bf->bf_state.ndelim = ndelim;
- __skb_unlink(skb, tid_q);
list_add_tail(&bf->list, bf_q);
if (bf_prev)
bf_prev->bf_next = bf;
bf_prev = bf;
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
- if (!bf) {
- closed = true;
- break;
- }
- } while (ath_tid_has_buffered(tid));
-
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
+ }
+ goto finish;
+stop:
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
+finish:
bf = bf_first;
bf->bf_lastbf = bf_prev;
@@ -1037,9 +1058,7 @@ ath_tx_form_aggr(struct ath_softc *sc, struct ath_txq *txq,
TX_STAT_INC(txq->axq_qnum, a_aggr);
}
- *aggr_len = al;
-
- return closed;
+ return al;
#undef PADBYTES
}
@@ -1416,18 +1435,15 @@ static void ath_tx_fill_desc(struct ath_softc *sc, struct ath_buf *bf,
static void
ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
struct ath_atx_tid *tid, struct list_head *bf_q,
- struct ath_buf *bf_first, struct sk_buff_head *tid_q)
+ struct ath_buf *bf_first)
{
struct ath_buf *bf = bf_first, *bf_prev = NULL;
- struct sk_buff *skb;
int nframes = 0;
do {
struct ieee80211_tx_info *tx_info;
- skb = bf->bf_mpdu;
nframes++;
- __skb_unlink(skb, tid_q);
list_add_tail(&bf->list, bf_q);
if (bf_prev)
bf_prev->bf_next = bf;
@@ -1436,13 +1452,15 @@ ath_tx_form_burst(struct ath_softc *sc, struct ath_txq *txq,
if (nframes >= 2)
break;
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
if (!bf)
break;
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
- if (tx_info->flags & IEEE80211_TX_CTL_AMPDU)
+ if (tx_info->flags & IEEE80211_TX_CTL_AMPDU) {
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
break;
+ }
ath_set_rates(tid->an->vif, tid->an->sta, bf);
} while (1);
@@ -1453,34 +1471,33 @@ static bool ath_tx_sched_aggr(struct ath_softc *sc, struct ath_txq *txq,
{
struct ath_buf *bf;
struct ieee80211_tx_info *tx_info;
- struct sk_buff_head *tid_q;
struct list_head bf_q;
int aggr_len = 0;
- bool aggr, last = true;
+ bool aggr;
if (!ath_tid_has_buffered(tid))
return false;
INIT_LIST_HEAD(&bf_q);
- bf = ath_tx_get_tid_subframe(sc, txq, tid, &tid_q);
+ bf = ath_tx_get_tid_subframe(sc, txq, tid);
if (!bf)
return false;
tx_info = IEEE80211_SKB_CB(bf->bf_mpdu);
aggr = !!(tx_info->flags & IEEE80211_TX_CTL_AMPDU);
if ((aggr && txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH) ||
- (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
+ (!aggr && txq->axq_depth >= ATH_NON_AGGR_MIN_QDEPTH)) {
+ __skb_queue_tail(&tid->retry_q, bf->bf_mpdu);
*stop = true;
return false;
}
ath_set_rates(tid->an->vif, tid->an->sta, bf);
if (aggr)
- last = ath_tx_form_aggr(sc, txq, tid, &bf_q, bf,
- tid_q, &aggr_len);
+ aggr_len = ath_tx_form_aggr(sc, txq, tid, &bf_q, bf);
else
- ath_tx_form_burst(sc, txq, tid, &bf_q, bf, tid_q);
+ ath_tx_form_burst(sc, txq, tid, &bf_q, bf);
if (list_empty(&bf_q))
return false;
@@ -1523,9 +1540,6 @@ int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta,
an->mpdudensity = density;
}
- /* force sequence number allocation for pending frames */
- ath_tx_tid_change_state(sc, txtid);
-
txtid->active = true;
*ssn = txtid->seq_start = txtid->seq_next;
txtid->bar_index = -1;
@@ -1550,7 +1564,6 @@ void ath_tx_aggr_stop(struct ath_softc *sc, struct ieee80211_sta *sta, u16 tid)
ath_txq_lock(sc, txq);
txtid->active = false;
ath_tx_flush_tid(sc, txtid);
- ath_tx_tid_change_state(sc, txtid);
ath_txq_unlock_complete(sc, txq);
}
@@ -1560,14 +1573,12 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
struct ath_common *common = ath9k_hw_common(sc->sc_ah);
struct ath_atx_tid *tid;
struct ath_txq *txq;
- bool buffered;
int tidno;
ath_dbg(common, XMIT, "%s called\n", __func__);
- for (tidno = 0, tid = &an->tid[tidno];
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
-
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
+ tid = ath_node_to_tid(an, tidno);
txq = tid->txq;
ath_txq_lock(sc, txq);
@@ -1577,13 +1588,12 @@ void ath_tx_aggr_sleep(struct ieee80211_sta *sta, struct ath_softc *sc,
continue;
}
- buffered = ath_tid_has_buffered(tid);
+ if (!skb_queue_empty(&tid->retry_q))
+ ieee80211_sta_set_buffered(sta, tid->tidno, true);
list_del_init(&tid->list);
ath_txq_unlock(sc, txq);
-
- ieee80211_sta_set_buffered(sta, tidno, buffered);
}
}
@@ -1596,49 +1606,20 @@ void ath_tx_aggr_wakeup(struct ath_softc *sc, struct ath_node *an)
ath_dbg(common, XMIT, "%s called\n", __func__);
- for (tidno = 0, tid = &an->tid[tidno];
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
-
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
+ tid = ath_node_to_tid(an, tidno);
txq = tid->txq;
ath_txq_lock(sc, txq);
tid->clear_ps_filter = true;
-
if (ath_tid_has_buffered(tid)) {
ath_tx_queue_tid(sc, txq, tid);
ath_txq_schedule(sc, txq);
}
-
ath_txq_unlock_complete(sc, txq);
}
}
-void ath_tx_aggr_resume(struct ath_softc *sc, struct ieee80211_sta *sta,
- u16 tidno)
-{
- struct ath_common *common = ath9k_hw_common(sc->sc_ah);
- struct ath_atx_tid *tid;
- struct ath_node *an;
- struct ath_txq *txq;
-
- ath_dbg(common, XMIT, "%s called\n", __func__);
-
- an = (struct ath_node *)sta->drv_priv;
- tid = ATH_AN_2_TID(an, tidno);
- txq = tid->txq;
-
- ath_txq_lock(sc, txq);
-
- tid->baw_size = IEEE80211_MIN_AMPDU_BUF << sta->ht_cap.ampdu_factor;
-
- if (ath_tid_has_buffered(tid)) {
- ath_tx_queue_tid(sc, txq, tid);
- ath_txq_schedule(sc, txq);
- }
-
- ath_txq_unlock_complete(sc, txq);
-}
-
void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
struct ieee80211_sta *sta,
u16 tids, int nframes,
@@ -1651,7 +1632,6 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
struct ieee80211_tx_info *info;
struct list_head bf_q;
struct ath_buf *bf_tail = NULL, *bf;
- struct sk_buff_head *tid_q;
int sent = 0;
int i;
@@ -1666,11 +1646,10 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
ath_txq_lock(sc, tid->txq);
while (nframes > 0) {
- bf = ath_tx_get_tid_subframe(sc, sc->tx.uapsdq, tid, &tid_q);
+ bf = ath_tx_get_tid_subframe(sc, sc->tx.uapsdq, tid);
if (!bf)
break;
- __skb_unlink(bf->bf_mpdu, tid_q);
list_add_tail(&bf->list, &bf_q);
ath_set_rates(tid->an->vif, tid->an->sta, bf);
if (bf_isampdu(bf)) {
@@ -1685,7 +1664,7 @@ void ath9k_release_buffered_frames(struct ieee80211_hw *hw,
sent++;
TX_STAT_INC(txq->axq_qnum, a_queued_hw);
- if (an->sta && !ath_tid_has_buffered(tid))
+ if (an->sta && skb_queue_empty(&tid->retry_q))
ieee80211_sta_set_buffered(an->sta, i, false);
}
ath_txq_unlock_complete(sc, tid->txq);
@@ -1914,13 +1893,7 @@ bool ath_drain_all_txq(struct ath_softc *sc)
if (!ATH_TXQ_SETUP(sc, i))
continue;
- /*
- * The caller will resume queues with ieee80211_wake_queues.
- * Mark the queue as not stopped to prevent ath_tx_complete
- * from waking the queue too early.
- */
txq = &sc->tx.txq[i];
- txq->stopped = false;
ath_draintxq(sc, txq);
}
@@ -2319,16 +2292,14 @@ int ath_tx_start(struct ieee80211_hw *hw, struct sk_buff *skb,
struct ath_softc *sc = hw->priv;
struct ath_txq *txq = txctl->txq;
struct ath_atx_tid *tid = NULL;
+ struct ath_node *an = NULL;
struct ath_buf *bf;
- bool queue, skip_uapsd = false, ps_resp;
+ bool ps_resp;
int q, ret;
if (vif)
avp = (void *)vif->drv_priv;
- if (info->flags & IEEE80211_TX_CTL_TX_OFFCHAN)
- txctl->force_channel = true;
-
ps_resp = !!(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE);
ret = ath_tx_prepare(hw, skb, txctl);
@@ -2343,63 +2314,18 @@ int ath_tx_start(struct ieee80211_hw *hw, struct sk_buff *skb,
q = skb_get_queue_mapping(skb);
- ath_txq_lock(sc, txq);
- if (txq == sc->tx.txq_map[q]) {
- fi->txq = q;
- if (++txq->pending_frames > sc->tx.txq_max_pending[q] &&
- !txq->stopped) {
- if (ath9k_is_chanctx_enabled())
- ieee80211_stop_queue(sc->hw, info->hw_queue);
- else
- ieee80211_stop_queue(sc->hw, q);
- txq->stopped = true;
- }
- }
-
- queue = ieee80211_is_data_present(hdr->frame_control);
-
- /* If chanctx, queue all null frames while NOA could be there */
- if (ath9k_is_chanctx_enabled() &&
- ieee80211_is_nullfunc(hdr->frame_control) &&
- !txctl->force_channel)
- queue = true;
-
- /* Force queueing of all frames that belong to a virtual interface on
- * a different channel context, to ensure that they are sent on the
- * correct channel.
- */
- if (((avp && avp->chanctx != sc->cur_chan) ||
- sc->cur_chan->stopped) && !txctl->force_channel) {
- if (!txctl->an)
- txctl->an = &avp->mcast_node;
- queue = true;
- skip_uapsd = true;
- }
-
- if (txctl->an && queue)
- tid = ath_get_skb_tid(sc, txctl->an, skb);
-
- if (!skip_uapsd && ps_resp) {
- ath_txq_unlock(sc, txq);
+ if (ps_resp)
txq = sc->tx.uapsdq;
- ath_txq_lock(sc, txq);
- } else if (txctl->an && queue) {
- WARN_ON(tid->txq != txctl->txq);
- if (info->flags & IEEE80211_TX_CTL_CLEAR_PS_FILT)
- tid->clear_ps_filter = true;
-
- /*
- * Add this frame to software queue for scheduling later
- * for aggregation.
- */
- TX_STAT_INC(txq->axq_qnum, a_queued_sw);
- __skb_queue_tail(&tid->buf_q, skb);
- if (!txctl->an->sleeping)
- ath_tx_queue_tid(sc, txq, tid);
+ if (txctl->sta) {
+ an = (struct ath_node *) sta->drv_priv;
+ tid = ath_get_skb_tid(sc, an, skb);
+ }
- ath_txq_schedule(sc, txq);
- goto out;
+ ath_txq_lock(sc, txq);
+ if (txq == sc->tx.txq_map[q]) {
+ fi->txq = q;
+ ++txq->pending_frames;
}
bf = ath_tx_setup_buffer(sc, txq, tid, skb);
@@ -2892,9 +2818,8 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
struct ath_atx_tid *tid;
int tidno, acno;
- for (tidno = 0, tid = &an->tid[tidno];
- tidno < IEEE80211_NUM_TIDS;
- tidno++, tid++) {
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
+ tid = ath_node_to_tid(an, tidno);
tid->an = an;
tid->tidno = tidno;
tid->seq_start = tid->seq_next = 0;
@@ -2902,11 +2827,14 @@ void ath_tx_node_init(struct ath_softc *sc, struct ath_node *an)
tid->baw_head = tid->baw_tail = 0;
tid->active = false;
tid->clear_ps_filter = true;
- __skb_queue_head_init(&tid->buf_q);
+ tid->has_queued = false;
__skb_queue_head_init(&tid->retry_q);
INIT_LIST_HEAD(&tid->list);
acno = TID_TO_WME_AC(tidno);
tid->txq = sc->tx.txq_map[acno];
+
+ if (!an->sta)
+ break; /* just one multicast ath_atx_tid */
}
}
@@ -2916,9 +2844,8 @@ void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an)
struct ath_txq *txq;
int tidno;
- for (tidno = 0, tid = &an->tid[tidno];
- tidno < IEEE80211_NUM_TIDS; tidno++, tid++) {
-
+ for (tidno = 0; tidno < IEEE80211_NUM_TIDS; tidno++) {
+ tid = ath_node_to_tid(an, tidno);
txq = tid->txq;
ath_txq_lock(sc, txq);
@@ -2930,6 +2857,9 @@ void ath_tx_node_cleanup(struct ath_softc *sc, struct ath_node *an)
tid->active = false;
ath_txq_unlock(sc, txq);
+
+ if (!an->sta)
+ break; /* just one multicast ath_atx_tid */
}
}
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
index bfa542c..c6759c5 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv,
* doesn't seem to have as many firmware restart cycles...
*
* As a test, we're sticking in a 1/100s delay here */
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
return 0;
@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv)
IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n");
i = 5000;
do {
- schedule_timeout_uninterruptible(msecs_to_jiffies(40));
+ schedule_msec_hrtimeout_uninterruptible((40));
/* Todo... wait for sync command ... */
read_register(priv->net_dev, IPW_REG_INTA, &inta);
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index e75d4fd..7103ac1 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -306,7 +306,7 @@ static int perf_move_data(struct pthr_ctx *pctx, char __iomem *dst, char *src,
if (unlikely((jiffies - last_sleep) > 5 * HZ)) {
last_sleep = jiffies;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
if (unlikely(kthread_should_stop()))
diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
index f9fd4b3..00ad2f3 100644
--- a/drivers/parport/ieee1284.c
+++ b/drivers/parport/ieee1284.c
@@ -215,7 +215,7 @@ int parport_wait_peripheral(struct parport *port,
/* parport_wait_event didn't time out, but the
* peripheral wasn't actually ready either.
* Wait for another 10ms. */
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
}
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index 2e21af4..da5f240 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -536,7 +536,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port,
/* Yield the port for a while. */
if (count && dev->port->irq != PARPORT_IRQ_NONE) {
parport_release (dev);
- schedule_timeout_interruptible(msecs_to_jiffies(40));
+ schedule_msec_hrtimeout_interruptible((40));
parport_claim_or_block (dev);
}
else
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c
index 55663b3..0363fed 100644
--- a/drivers/platform/x86/intel_ips.c
+++ b/drivers/platform/x86/intel_ips.c
@@ -812,7 +812,7 @@ static int ips_adjust(void *data)
ips_gpu_lower(ips);
sleep:
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD));
} while (!kthread_should_stop());
dev_dbg(&ips->dev->dev, "ips-adjust thread stopped\n");
@@ -991,7 +991,7 @@ static int ips_monitor(void *data)
seqno_timestamp = get_jiffies_64();
old_cpu_power = thm_readl(THM_CEC);
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
/* Collect an initial average */
for (i = 0; i < IPS_SAMPLE_COUNT; i++) {
@@ -1018,7 +1018,7 @@ static int ips_monitor(void *data)
mchp_samples[i] = mchp;
}
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
if (kthread_should_stop())
break;
}
@@ -1045,7 +1045,7 @@ static int ips_monitor(void *data)
* us to reduce the sample frequency if the CPU and GPU are idle.
*/
old_cpu_power = thm_readl(THM_CEC);
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
last_sample_period = IPS_SAMPLE_PERIOD;
setup_deferrable_timer_on_stack(&timer, monitor_timeout,
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index fa247de..f1a28d8 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -121,7 +121,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm)
/* Wait until confirmation of stopping */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (--retries && !(rtc_ctrl & WM8350_RTC_STS));
if (!retries) {
@@ -204,7 +204,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350)
/* Wait until confirmation of stopping */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS));
if (!(rtc_ctrl & WM8350_RTC_ALMSTS))
@@ -227,7 +227,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350)
/* Wait until confirmation */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS);
if (rtc_ctrl & WM8350_RTC_ALMSTS)
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index d9fd2f8..ea44afd 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic)
/* wait for io cmpl */
while (atomic_read(&fnic->in_flight))
- schedule_timeout(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout((1));
spin_lock_irqsave(&fnic->wq_copy_lock[0], flags);
@@ -2193,7 +2193,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
}
}
- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov));
+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov));
/* walk again to check, if IOs are still pending in fw */
if (fnic_is_abts_pending(fnic, lr_sc))
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index d197aa1..8e2fd7b 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -5105,7 +5105,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id,
tgt_id, lun_id, context);
later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies;
while (time_after(later, jiffies) && cnt) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(20));
+ schedule_msec_hrtimeout_uninterruptible((20));
cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context);
}
if (cnt) {
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6ad..75455d4 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
wmb();
}
+ if (sdev->request_queue)
+ blk_set_queue_depth(sdev->request_queue, depth);
+
return sdev->queue_depth;
}
EXPORT_SYMBOL(scsi_change_queue_depth);
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index abada16..0bf30dc 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -2356,7 +2356,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc)
/* Wait for all the IOs that are entered in Qcmd */
while (atomic_read(&snic->ios_inflight))
- schedule_timeout(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout((1));
ret = snic_issue_hba_reset(snic, sc);
if (ret) {
diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c
index 1c967c3..a23726e 100644
--- a/drivers/staging/comedi/drivers/ni_mio_common.c
+++ b/drivers/staging/comedi/drivers/ni_mio_common.c
@@ -4610,7 +4610,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev)
if ((status & NI67XX_CAL_STATUS_BUSY) == 0)
break;
set_current_state(TASK_INTERRUPTIBLE);
- if (schedule_timeout(1))
+ if (schedule_min_hrtimeout())
return -EIO;
}
if (i == timeout) {
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
index d05c6cc..3f62b6f 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-eq.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -328,7 +328,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock)
schedule();
} else {
now = jiffies;
- schedule_timeout(msecs_to_jiffies(tms));
+ schedule_msec_hrtimeout((tms));
tms -= jiffies_to_msecs(jiffies - now);
if (tms < 0) /* no more wait but may have new event */
tms = 0;
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
index 5d65a5c..729ab6d 100644
--- a/drivers/staging/rts5208/rtsx.c
+++ b/drivers/staging/rts5208/rtsx.c
@@ -533,7 +533,7 @@ static int rtsx_polling_thread(void *__dev)
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL));
+ schedule_msec_hrtimeout((POLLING_INTERVAL));
/* lock the device pointers */
mutex_lock(&(dev->dev_mutex));
diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c
index efb791b..fd02fb2 100644
--- a/drivers/staging/speakup/speakup_acntpc.c
+++ b/drivers/staging/speakup/speakup_acntpc.c
@@ -204,7 +204,7 @@ static void do_catch_up(struct spk_synth *synth)
full_time_val = full_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth)
jiffy_delta_val = jiffy_delta->u.n.value;
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
jiff_max = jiffies+jiffy_delta_val;
}
}
diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c
index 3cbc8a7..3c17854 100644
--- a/drivers/staging/speakup/speakup_apollo.c
+++ b/drivers/staging/speakup/speakup_apollo.c
@@ -172,7 +172,7 @@ static void do_catch_up(struct spk_synth *synth)
outb(UART_MCR_DTR, speakup_info.port_tts + UART_MCR);
outb(UART_MCR_DTR | UART_MCR_RTS,
speakup_info.port_tts + UART_MCR);
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c
index 1a5cf3d..fa2b4e1 100644
--- a/drivers/staging/speakup/speakup_decext.c
+++ b/drivers/staging/speakup/speakup_decext.c
@@ -186,7 +186,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (synth_full() || !spk_serial_out(ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c
index d6479bd..f7554bf 100644
--- a/drivers/staging/speakup/speakup_decpc.c
+++ b/drivers/staging/speakup/speakup_decpc.c
@@ -403,7 +403,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (dt_sendchar(ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c
index 7646567..639192e 100644
--- a/drivers/staging/speakup/speakup_dectlk.c
+++ b/drivers/staging/speakup/speakup_dectlk.c
@@ -251,7 +251,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (synth_full_val || !spk_serial_out(ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c
index 38aa401..1640519 100644
--- a/drivers/staging/speakup/speakup_dtlk.c
+++ b/drivers/staging/speakup/speakup_dtlk.c
@@ -217,7 +217,7 @@ static void do_catch_up(struct spk_synth *synth)
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -233,7 +233,7 @@ static void do_catch_up(struct spk_synth *synth)
delay_time_val = delay_time->u.n.value;
jiffy_delta_val = jiffy_delta->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
jiff_max = jiffies + jiffy_delta_val;
}
}
diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c
index 5e2170b..30b5df7 100644
--- a/drivers/staging/speakup/speakup_keypc.c
+++ b/drivers/staging/speakup/speakup_keypc.c
@@ -206,7 +206,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags);
full_time_val = full_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -239,7 +239,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags);
jiffy_delta_val = jiffy_delta->u.n.value;
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
jiff_max = jiffies+jiffy_delta_val;
}
}
diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
index 54b2f39..70eebfa 100644
--- a/drivers/staging/speakup/synth.c
+++ b/drivers/staging/speakup/synth.c
@@ -119,7 +119,7 @@ void spk_do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = synth->procspeech;
if (!spk_serial_out(ch)) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
diff --git a/drivers/staging/unisys/visorbus/periodic_work.c b/drivers/staging/unisys/visorbus/periodic_work.c
new file mode 100644
index 0000000..b930287
--- /dev/null
+++ b/drivers/staging/unisys/visorbus/periodic_work.c
@@ -0,0 +1,204 @@
+/* periodic_work.c
+ *
+ * Copyright (C) 2010 - 2015 UNISYS CORPORATION
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+/*
+ * Helper functions to schedule periodic work in Linux kernel mode.
+ */
+#include <linux/sched.h>
+
+#include "periodic_work.h"
+
+#define MYDRVNAME "periodic_work"
+
+struct periodic_work {
+ rwlock_t lock;
+ struct delayed_work work;
+ void (*workfunc)(void *);
+ void *workfuncarg;
+ bool is_scheduled;
+ bool want_to_stop;
+ ulong jiffy_interval;
+ struct workqueue_struct *workqueue;
+ const char *devnam;
+};
+
+static void periodic_work_func(struct work_struct *work)
+{
+ struct periodic_work *pw;
+
+ pw = container_of(work, struct periodic_work, work.work);
+ (*pw->workfunc)(pw->workfuncarg);
+}
+
+struct periodic_work
+*visor_periodic_work_create(ulong jiffy_interval,
+ struct workqueue_struct *workqueue,
+ void (*workfunc)(void *),
+ void *workfuncarg,
+ const char *devnam)
+{
+ struct periodic_work *pw;
+
+ pw = kzalloc(sizeof(*pw), GFP_KERNEL | __GFP_NORETRY);
+ if (!pw)
+ return NULL;
+
+ rwlock_init(&pw->lock);
+ pw->jiffy_interval = jiffy_interval;
+ pw->workqueue = workqueue;
+ pw->workfunc = workfunc;
+ pw->workfuncarg = workfuncarg;
+ pw->devnam = devnam;
+ return pw;
+}
+EXPORT_SYMBOL_GPL(visor_periodic_work_create);
+
+void visor_periodic_work_destroy(struct periodic_work *pw)
+{
+ kfree(pw);
+}
+EXPORT_SYMBOL_GPL(visor_periodic_work_destroy);
+
+/** Call this from your periodic work worker function to schedule the next
+ * call.
+ * If this function returns false, there was a failure and the
+ * periodic work is no longer scheduled
+ */
+bool visor_periodic_work_nextperiod(struct periodic_work *pw)
+{
+ bool rc = false;
+
+ write_lock(&pw->lock);
+ if (pw->want_to_stop) {
+ pw->is_scheduled = false;
+ pw->want_to_stop = false;
+ rc = true; /* yes, true; see visor_periodic_work_stop() */
+ goto unlock;
+ } else if (!queue_delayed_work(pw->workqueue, &pw->work,
+ pw->jiffy_interval)) {
+ pw->is_scheduled = false;
+ rc = false;
+ goto unlock;
+ }
+ rc = true;
+unlock:
+ write_unlock(&pw->lock);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(visor_periodic_work_nextperiod);
+
+/** This function returns true iff new periodic work was actually started.
+ * If this function returns false, then no work was started
+ * (either because it was already started, or because of a failure).
+ */
+bool visor_periodic_work_start(struct periodic_work *pw)
+{
+ bool rc = false;
+
+ write_lock(&pw->lock);
+ if (pw->is_scheduled) {
+ rc = false;
+ goto unlock;
+ }
+ if (pw->want_to_stop) {
+ rc = false;
+ goto unlock;
+ }
+ INIT_DELAYED_WORK(&pw->work, &periodic_work_func);
+ if (!queue_delayed_work(pw->workqueue, &pw->work,
+ pw->jiffy_interval)) {
+ rc = false;
+ goto unlock;
+ }
+ pw->is_scheduled = true;
+ rc = true;
+unlock:
+ write_unlock(&pw->lock);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(visor_periodic_work_start);
+
+/** This function returns true iff your call actually stopped the periodic
+ * work.
+ *
+ * -- PAY ATTENTION... this is important --
+ *
+ * NO NO #1
+ *
+ * Do NOT call this function from some function that is running on the
+ * same workqueue as the work you are trying to stop might be running
+ * on! If you violate this rule, visor_periodic_work_stop() MIGHT work,
+ * but it also MIGHT get hung up in an infinite loop saying
+ * "waiting for delayed work...". This will happen if the delayed work
+ * you are trying to cancel has been put in the workqueue list, but can't
+ * run yet because we are running that same workqueue thread right now.
+ *
+ * Bottom line: If you need to call visor_periodic_work_stop() from a
+ * workitem, be sure the workitem is on a DIFFERENT workqueue than the
+ * workitem that you are trying to cancel.
+ *
+ * If I could figure out some way to check for this "no no" condition in
+ * the code, I would. It would have saved me the trouble of writing this
+ * long comment. And also, don't think this is some "theoretical" race
+ * condition. It is REAL, as I have spent the day chasing it.
+ *
+ * NO NO #2
+ *
+ * Take close note of the locks that you own when you call this function.
+ * You must NOT own any locks that are needed by the periodic work
+ * function that is currently installed. If you DO, a deadlock may result,
+ * because stopping the periodic work often involves waiting for the last
+ * iteration of the periodic work function to complete. Again, if you hit
+ * this deadlock, you will get hung up in an infinite loop saying
+ * "waiting for delayed work...".
+ */
+bool visor_periodic_work_stop(struct periodic_work *pw)
+{
+ bool stopped_something = false;
+
+ write_lock(&pw->lock);
+ stopped_something = pw->is_scheduled && (!pw->want_to_stop);
+ while (pw->is_scheduled) {
+ pw->want_to_stop = true;
+ if (cancel_delayed_work(&pw->work)) {
+ /* We get here if the delayed work was pending as
+ * delayed work, but was NOT run.
+ */
+ WARN_ON(!pw->is_scheduled);
+ pw->is_scheduled = false;
+ } else {
+ /* If we get here, either the delayed work:
+ * - was run, OR,
+ * - is running RIGHT NOW on another processor, OR,
+ * - wasn't even scheduled (there is a miniscule
+ * timing window where this could be the case)
+ * flush_workqueue() would make sure it is finished
+ * executing, but that still isn't very useful, which
+ * explains the loop...
+ */
+ }
+ if (pw->is_scheduled) {
+ write_unlock(&pw->lock);
+ schedule_msec_hrtimeout_interruptible((10));
+ write_lock(&pw->lock);
+ } else {
+ pw->want_to_stop = false;
+ }
+ }
+ write_unlock(&pw->lock);
+ return stopped_something;
+}
+EXPORT_SYMBOL_GPL(visor_periodic_work_stop);
diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c
index 1367007..910f158 100644
--- a/drivers/staging/unisys/visornic/visornic_main.c
+++ b/drivers/staging/unisys/visornic/visornic_main.c
@@ -468,7 +468,7 @@ visornic_disable_with_timeout(struct net_device *netdev, const int timeout)
}
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- wait += schedule_timeout(msecs_to_jiffies(10));
+ wait += schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
}
@@ -479,7 +479,7 @@ visornic_disable_with_timeout(struct net_device *netdev, const int timeout)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- schedule_timeout(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
if (atomic_read(&devdata->usage))
break;
@@ -611,7 +611,7 @@ visornic_enable_with_timeout(struct net_device *netdev, const int timeout)
}
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- wait += schedule_timeout(msecs_to_jiffies(10));
+ wait += schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
}
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 70c143a..2d3cbc9 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -450,7 +450,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
pr_debug("sleeping for ring space\n");
spin_unlock_irq(&udev->cmdr_lock);
- ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT));
+ ret = schedule_msec_hrtimeout((TCMU_TIME_OUT));
finish_wait(&udev->wait_cmdr, &__wait);
if (!ret) {
pr_warn("tcmu: command timed out\n");
diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c
index a4ee65b..cf38bcb 100644
--- a/drivers/video/fbdev/omap/hwa742.c
+++ b/drivers/video/fbdev/omap/hwa742.c
@@ -926,7 +926,7 @@ static void hwa742_resume(void)
if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7))
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(5));
+ schedule_msec_hrtimeout((5));
}
hwa742_set_update_mode(hwa742.update_mode_before_suspend);
}
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index ef73f14..7b5483b 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg)
mutex_unlock(&fbi->ctrlr_lock);
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(30));
+ schedule_msec_hrtimeout((30));
}
pr_debug("%s(): task ending\n", __func__);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 45a8639..855d08e 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -129,7 +129,7 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
if (vl->upd_busy_cnt > 1) {
/* second+ BUSY - sleep a little bit */
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
continue;
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5909ae8..c010e57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5953,7 +5953,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (flush != BTRFS_RESERVE_NO_FLUSH &&
btrfs_transaction_in_commit(root->fs_info))
- schedule_timeout(1);
+ schedule_min_hrtimeout();
if (delalloc_lock)
mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d27014b..cd87786 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -89,7 +89,7 @@ again:
btrfs_release_path(path);
root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
goto again;
} else
continue;
diff --git a/fs/buffer.c b/fs/buffer.c
index b205a62..abb13b0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = wbc_to_write_flags(wbc);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 14db4b7..2cc48bf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1251,7 +1251,7 @@ static int f2fs_write_data_page(struct page *page,
.sbi = sbi,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 01177ec..772a123 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page,
.sbi = sbi,
.type = NODE,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 373639a..3223ccf 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_flags = REQ_META | REQ_PRIO |
- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
BUG_ON(!PageLocked(page));
BUG_ON(!page_has_buffers(page));
diff --git a/fs/mpage.c b/fs/mpage.c
index d2413af..d6f1afe 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ca651ac..ee394c8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -488,7 +488,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
- (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)tsk_seruntime(task),
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 06763f5..33a9e0b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,8 +495,8 @@ xfs_submit_ioend(
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
+
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
@@ -567,8 +567,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27..dc5f76d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
+ unsigned long dirty_sleep; /* last wait */
+
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cd395ec..cb5d746 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -162,6 +162,7 @@ enum rq_flag_bits {
__REQ_INTEGRITY, /* I/O includes block integrity payload */
__REQ_FUA, /* forced unit access */
__REQ_PREFLUSH, /* request for cache flush */
+ __REQ_BG, /* background activity */
/* bio only flags */
__REQ_RAHEAD, /* read ahead, can fail anytime */
@@ -205,7 +206,7 @@ enum rq_flag_bits {
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_COMMON_MASK \
(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
- REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
+ REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG)
#define REQ_CLONE_MASK REQ_COMMON_MASK
/* This mask is used for both bio and request merge checking */
@@ -230,6 +231,7 @@ enum rq_flag_bits {
#define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
+#define REQ_BG (1ULL << __REQ_BG)
#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
#define REQ_PM (1ULL << __REQ_PM)
@@ -271,4 +273,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}
+struct blk_issue_stat {
+ u64 time;
+};
+
+#define BLK_RQ_STAT_BATCH 64
+
+struct blk_rq_stat {
+ s64 mean;
+ u64 min;
+ u64 max;
+ s32 nr_samples;
+ s32 nr_batch;
+ u64 batch;
+ s64 time;
+};
+
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f6a8161..4adcbff 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -37,6 +37,7 @@ struct bsg_job;
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
@@ -45,7 +46,7 @@ struct pr_ops;
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
*/
-#define BLKCG_MAX_POLS 2
+#define BLKCG_MAX_POLS 3
typedef void (rq_end_io_fn)(struct request *, int);
@@ -151,6 +152,7 @@ struct request {
struct gendisk *rq_disk;
struct hd_struct *part;
unsigned long start_time;
+ struct blk_issue_stat issue_stat;
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
unsigned long long start_time_ns;
@@ -302,6 +304,8 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
@@ -327,6 +331,8 @@ struct request_queue {
struct blk_mq_ctx __percpu *queue_ctx;
unsigned int nr_queues;
+ unsigned int queue_depth;
+
/* hw dispatch queues */
struct blk_mq_hw_ctx **queue_hw_ctx;
unsigned int nr_hw_queues;
@@ -412,6 +418,9 @@ struct request_queue {
unsigned int nr_sorted;
unsigned int in_flight[2];
+
+ struct blk_rq_stat rq_stats[2];
+
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
@@ -683,6 +692,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
return false;
}
+static inline unsigned int blk_queue_depth(struct request_queue *q)
+{
+ if (q->queue_depth)
+ return q->queue_depth;
+
+ return q->nr_requests;
+}
+
/*
* q->prep_rq_fn return values
*/
@@ -999,6 +1016,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
extern void blk_set_default_limits(struct queue_limits *lim);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index dd03e83..2fda682 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -296,6 +296,7 @@ static inline void set_freezable(void) {}
#define wait_event_freezekillable_unsafe(wq, condition) \
wait_event_killable(wq, condition)
+#define pm_freezing (false)
#endif /* !CONFIG_FREEZER */
#endif /* FREEZER_H_INCLUDED */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dc0478c..ea1b019 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
* WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
* by a cache flush and data is guaranteed to be on
* non-volatile media on completion.
+ * WRITE_BG Background write. This is for background activity like
+ * the periodic flush and background threshold writeback
*
*/
#define RW_MASK REQ_OP_WRITE
@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
+#define WRITE_BG (REQ_NOIDLE | REQ_BG)
/*
* Attribute flags. These should be or-ed together to figure out what
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 325f649..ac7a68f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -159,8 +159,6 @@ extern struct task_group root_task_group;
# define INIT_VTIME(tsk)
#endif
-#define INIT_TASK_COMM "swapper"
-
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
@@ -197,6 +195,78 @@ extern struct task_group root_task_group;
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
+#ifdef CONFIG_SCHED_MUQSS
+#define INIT_TASK_COMM "MuQSS"
+#define INIT_TASK(tsk) \
+{ \
+ INIT_TASK_TI(tsk) \
+ .state = 0, \
+ .stack = init_stack, \
+ .usage = ATOMIC_INIT(2), \
+ .flags = PF_KTHREAD, \
+ .prio = NORMAL_PRIO, \
+ .static_prio = MAX_PRIO-20, \
+ .normal_prio = NORMAL_PRIO, \
+ .deadline = 0, \
+ .policy = SCHED_NORMAL, \
+ .cpus_allowed = CPU_MASK_ALL, \
+ .mm = NULL, \
+ .active_mm = &init_mm, \
+ .restart_block = { \
+ .fn = do_no_restart_syscall, \
+ }, \
+ .time_slice = 1000000, \
+ .tasks = LIST_HEAD_INIT(tsk.tasks), \
+ INIT_PUSHABLE_TASKS(tsk) \
+ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
+ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
+ .real_parent = &tsk, \
+ .parent = &tsk, \
+ .children = LIST_HEAD_INIT(tsk.children), \
+ .sibling = LIST_HEAD_INIT(tsk.sibling), \
+ .group_leader = &tsk, \
+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
+ RCU_POINTER_INITIALIZER(cred, &init_cred), \
+ .comm = INIT_TASK_COMM, \
+ .thread = INIT_THREAD, \
+ .fs = &init_fs, \
+ .files = &init_files, \
+ .signal = &init_signals, \
+ .sighand = &init_sighand, \
+ .nsproxy = &init_nsproxy, \
+ .pending = { \
+ .list = LIST_HEAD_INIT(tsk.pending.list), \
+ .signal = {{0}}}, \
+ .blocked = {{0}}, \
+ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
+ .journal_info = NULL, \
+ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
+ .timer_slack_ns = 50000, /* 50 usec default slack */ \
+ .pids = { \
+ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
+ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
+ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
+ }, \
+ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \
+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
+ INIT_IDS \
+ INIT_PERF_EVENTS(tsk) \
+ INIT_TRACE_IRQFLAGS \
+ INIT_LOCKDEP \
+ INIT_FTRACE_GRAPH \
+ INIT_TRACE_RECURSION \
+ INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_TASK_RCU_TASKS(tsk) \
+ INIT_CPUSET_SEQ(tsk) \
+ INIT_RT_MUTEXES(tsk) \
+ INIT_PREV_CPUTIME(tsk) \
+ INIT_VTIME(tsk) \
+ INIT_NUMA_BALANCING(tsk) \
+ INIT_KASAN(tsk) \
+}
+#else /* CONFIG_SCHED_MUQSS */
+#define INIT_TASK_COMM "swapper"
#define INIT_TASK(tsk) \
{ \
INIT_TASK_TI(tsk) \
@@ -272,7 +342,7 @@ extern struct task_group root_task_group;
INIT_NUMA_BALANCING(tsk) \
INIT_KASAN(tsk) \
}
-
+#endif /* CONFIG_SCHED_MUQSS */
#define INIT_CPU_TIMERS(cpu_timers) \
{ \
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 8c12390..ebe98b9 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -51,6 +51,8 @@ enum {
*/
static inline int task_nice_ioprio(struct task_struct *task)
{
+ if (iso_task(task))
+ return 0;
return (task_nice(task) + 20) / 5;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75d9a57..40a31ab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -59,6 +59,7 @@ struct sched_param {
#include <linux/gfp.h>
#include <linux/magic.h>
#include <linux/cgroup-defs.h>
+#include <linux/skip_list.h>
#include <asm/processor.h>
@@ -176,7 +177,7 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
extern void calc_global_load(unsigned long ticks);
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
extern void cpu_load_update_nohz_start(void);
extern void cpu_load_update_nohz_stop(void);
#else
@@ -340,8 +341,6 @@ extern void init_idle_bootup_task(struct task_struct *idle);
extern cpumask_var_t cpu_isolated_map;
-extern int runqueue_is_locked(int cpu);
-
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
@@ -438,6 +437,34 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
extern signed long schedule_timeout_idle(signed long timeout);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+extern signed long schedule_msec_hrtimeout(signed long timeout);
+extern signed long schedule_min_hrtimeout(void);
+extern signed long schedule_msec_hrtimeout_interruptible(signed long timeout);
+extern signed long schedule_msec_hrtimeout_uninterruptible(signed long timeout);
+#else
+static inline signed long schedule_msec_hrtimeout(signed long timeout)
+{
+ return schedule_timeout(msecs_to_jiffies(timeout));
+}
+
+static inline signed long schedule_min_hrtimeout(void)
+{
+ return schedule_timeout(1);
+}
+
+static inline signed long schedule_msec_hrtimeout_interruptible(signed long timeout)
+{
+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout));
+}
+
+static inline signed long schedule_msec_hrtimeout_uninterruptible(signed long timeout)
+{
+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout));
+}
+#endif
+
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
@@ -1486,9 +1513,11 @@ struct task_struct {
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS)
+ int on_cpu;
+#endif
#ifdef CONFIG_SMP
struct llist_node wake_entry;
- int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
unsigned int cpu; /* current CPU */
#endif
@@ -1499,12 +1528,26 @@ struct task_struct {
int wake_cpu;
#endif
int on_rq;
-
int prio, static_prio, normal_prio;
unsigned int rt_priority;
+#ifdef CONFIG_SCHED_MUQSS
+ int time_slice;
+ u64 deadline;
+ skiplist_node node; /* Skip list node */
+ u64 last_ran;
+ u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMT_NICE
+ int smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ bool zerobound; /* Bound to CPU0 for hotplug */
+#endif
+ unsigned long rt_timeout;
+#else /* CONFIG_SCHED_MUQSS */
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -1628,6 +1671,10 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
cputime_t utime, stime, utimescaled, stimescaled;
+#ifdef CONFIG_SCHED_MUQSS
+ /* Unbanked cpu time */
+ unsigned long utime_ns, stime_ns;
+#endif
cputime_t gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1984,6 +2031,40 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
}
#endif
+#ifdef CONFIG_SCHED_MUQSS
+#define tsk_seruntime(t) ((t)->sched_time)
+#define tsk_rttimeout(t) ((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+ return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t) ((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+ p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+ printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_MUQSS */
+
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
@@ -2437,7 +2518,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
}
#endif
-#ifdef CONFIG_NO_HZ_COMMON
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
void calc_load_enter_idle(void);
void calc_load_exit_idle(void);
#else
@@ -2538,7 +2619,7 @@ extern unsigned long long
task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS)
extern void sched_exec(void);
#else
#define sched_exec() {}
@@ -3503,7 +3584,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return 0;
}
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu(struct task_struct *p, int cpu)
{
}
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d9cf5a5..94d397e 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -19,8 +19,20 @@
*/
#define MAX_USER_RT_PRIO 100
+
+#ifdef CONFIG_SCHED_MUQSS
+/* Note different MAX_RT_PRIO */
+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1)
+
+#define ISO_PRIO (MAX_RT_PRIO)
+#define NORMAL_PRIO (MAX_RT_PRIO + 1)
+#define IDLE_PRIO (MAX_RT_PRIO + 2)
+#define PRIO_LIMIT ((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_MUQSS */
#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#endif /* CONFIG_SCHED_MUQSS */
+
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h
new file mode 100644
index 0000000..d4be84b
--- /dev/null
+++ b/include/linux/skip_list.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_SKIP_LISTS_H
+#define _LINUX_SKIP_LISTS_H
+typedef u64 keyType;
+typedef void *valueType;
+
+typedef struct nodeStructure skiplist_node;
+
+struct nodeStructure {
+ int level; /* Levels in this structure */
+ keyType key;
+ valueType value;
+ skiplist_node *next[8];
+ skiplist_node *prev[8];
+};
+
+typedef struct listStructure {
+ int entries;
+ int level; /* Maximum level of the list
+ (1 more than the number of levels in the list) */
+ skiplist_node *header; /* pointer to header */
+} skiplist;
+
+void skiplist_init(skiplist_node *slnode);
+skiplist *new_skiplist(skiplist_node *slnode);
+void free_skiplist(skiplist *l);
+void skiplist_node_init(skiplist_node *node);
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed);
+void skiplist_delete(skiplist *l, skiplist_node *node);
+
+static inline bool skiplist_node_empty(skiplist_node *node) {
+ return (!node->next[0]);
+}
+#endif /* _LINUX_SKIP_LISTS_H */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 797100e..f12f0b3 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -100,6 +100,16 @@ struct writeback_control {
#endif
};
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
+{
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ return WRITE_SYNC;
+ else if (wbc->for_kupdate || wbc->for_background)
+ return WRITE_BG;
+
+ return 0;
+}
+
/*
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
* and are measured against each other in. There always is one global
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index 0000000..3c518e4
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,153 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt
+
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WBT_H
+
+#include <linux/tracepoint.h>
+#include "../../../block/blk-wbt.h"
+
+/**
+ * wbt_stat - trace stats for blk_wb
+ * @stat: array of read/write stats
+ */
+TRACE_EVENT(wbt_stat,
+
+ TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
+
+ TP_ARGS(bdi, stat),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(s64, rmean)
+ __field(u64, rmin)
+ __field(u64, rmax)
+ __field(s64, rnr_samples)
+ __field(s64, rtime)
+ __field(s64, wmean)
+ __field(u64, wmin)
+ __field(u64, wmax)
+ __field(s64, wnr_samples)
+ __field(s64, wtime)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ __entry->rmean = stat[0].mean;
+ __entry->rmin = stat[0].min;
+ __entry->rmax = stat[0].max;
+ __entry->rnr_samples = stat[0].nr_samples;
+ __entry->wmean = stat[1].mean;
+ __entry->wmin = stat[1].min;
+ __entry->wmax = stat[1].max;
+ __entry->wnr_samples = stat[1].nr_samples;
+ ),
+
+ TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
+ "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
+ __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
+ __entry->rnr_samples, __entry->wmean, __entry->wmin,
+ __entry->wmax, __entry->wnr_samples)
+);
+
+/**
+ * wbt_lat - trace latency event
+ * @lat: latency trigger
+ */
+TRACE_EVENT(wbt_lat,
+
+ TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
+
+ TP_ARGS(bdi, lat),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned long, lat)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ __entry->lat = div_u64(lat, 1000);
+ ),
+
+ TP_printk("%s: latency %lluus\n", __entry->name,
+ (unsigned long long) __entry->lat)
+);
+
+/**
+ * wbt_step - trace wb event step
+ * @msg: context message
+ * @step: the current scale step count
+ * @window: the current monitoring window
+ * @bg: the current background queue limit
+ * @normal: the current normal writeback limit
+ * @max: the current max throughput writeback limit
+ */
+TRACE_EVENT(wbt_step,
+
+ TP_PROTO(struct backing_dev_info *bdi, const char *msg,
+ int step, unsigned long window, unsigned int bg,
+ unsigned int normal, unsigned int max),
+
+ TP_ARGS(bdi, msg, step, window, bg, normal, max),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(const char *, msg)
+ __field(int, step)
+ __field(unsigned long, window)
+ __field(unsigned int, bg)
+ __field(unsigned int, normal)
+ __field(unsigned int, max)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ __entry->msg = msg;
+ __entry->step = step;
+ __entry->window = div_u64(window, 1000);
+ __entry->bg = bg;
+ __entry->normal = normal;
+ __entry->max = max;
+ ),
+
+ TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
+ __entry->name, __entry->msg, __entry->step, __entry->window,
+ __entry->bg, __entry->normal, __entry->max)
+);
+
+/**
+ * wbt_timer - trace wb timer event
+ * @status: timer state status
+ * @step: the current scale step count
+ * @inflight: tracked writes inflight
+ */
+TRACE_EVENT(wbt_timer,
+
+ TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
+ int step, unsigned int inflight),
+
+ TP_ARGS(bdi, status, step, inflight),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned int, status)
+ __field(int, step)
+ __field(unsigned int, inflight)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ __entry->status = status;
+ __entry->step = step;
+ __entry->inflight = inflight;
+ ),
+
+ TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
+ __entry->status, __entry->step, __entry->inflight)
+);
+
+#endif /* _TRACE_WBT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe01..950a481 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,9 +36,16 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on MuQSS only */
#define SCHED_IDLE 5
+#ifdef CONFIG_SCHED_MUQSS
+#define SCHED_ISO 4
+#define SCHED_IDLEPRIO SCHED_IDLE
+#define SCHED_MAX (SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
+#else /* CONFIG_SCHED_MUQSS */
#define SCHED_DEADLINE 6
+#endif /* CONFIG_SCHED_MUQSS */
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
diff --git a/init/Kconfig b/init/Kconfig
index 8963254..c69df07 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -38,6 +38,18 @@ config THREAD_INFO_IN_TASK
menu "General setup"
+config SCHED_MUQSS
+ bool "MuQSS cpu scheduler"
+ select HIGH_RES_TIMERS
+ ---help---
+ The Multiple Queue Skiplist Scheduler for excellent interactivity and
+ responsiveness on the desktop and highly scalable deterministic
+ low latency on any hardware.
+
+ Say Y here.
+ default y
+
+
config BROKEN
bool
@@ -550,7 +562,7 @@ config CONTEXT_TRACKING
config CONTEXT_TRACKING_FORCE
bool "Force context tracking"
depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
+ default y if !NO_HZ_FULL && !SCHED_MUQSS
help
The major pre-requirement for full dynticks to work is to
support the context tracking subsystem. But there are also
@@ -940,6 +952,7 @@ config NUMA_BALANCING
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
depends on SMP && NUMA && MIGRATION
+ depends on !SCHED_MUQSS
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
@@ -1042,9 +1055,13 @@ menuconfig CGROUP_SCHED
help
This feature lets CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups. It uses cgroups to group
- tasks.
+ tasks. In combination with MuQSS this is purely a STUB to create the
+ files associated with the CPU controller cgroup but most of the
+ controls do nothing. This is useful for working in environments and
+ with applications that will only work if this control group is
+ present.
-if CGROUP_SCHED
+if CGROUP_SCHED && !SCHED_MUQSS
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on CGROUP_SCHED
@@ -1140,6 +1157,7 @@ config CGROUP_DEVICE
config CGROUP_CPUACCT
bool "Simple CPU accounting controller"
+ depends on !SCHED_MUQSS
help
Provides a simple controller for monitoring the
total CPU consumed by the tasks in a cgroup.
@@ -1238,6 +1256,7 @@ endif # NAMESPACES
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
+ depends on !SCHED_MUQSS
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
diff --git a/init/main.c b/init/main.c
index 2858be7..5ec4c43 100644
--- a/init/main.c
+++ b/init/main.c
@@ -793,7 +793,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
return ret;
}
-
extern initcall_t __initcall_start[];
extern initcall_t __initcall0_start[];
extern initcall_t __initcall1_start[];
@@ -952,6 +951,8 @@ static int __ref kernel_init(void *unused)
rcu_end_inkernel_boot();
+ print_scheduler_version();
+
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a8..f97168d 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,8 @@
choice
prompt "Timer frequency"
- default HZ_250
+ default HZ_100_MUQSS if SCHED_MUQSS
+ default HZ_250 if !SCHED_MUQSS
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -16,15 +17,30 @@ choice
per second.
+ config HZ_100_MUQSS
+ bool "100 HZ" if SCHED_MUQSS
+ help
+ 100 Hz is a suitable choice in combination with MuQSS which does
+ not rely on ticks for rescheduling interrupts, and is not Hz limited
+ for timeouts and sleeps from both the kernel and userspace.
+ This allows us to benefit from the lower overhead and higher
+ throughput of fewer timer ticks.
+
config HZ_100
- bool "100 HZ"
+ bool "100 HZ" if !SCHED_MUQSS
help
100 Hz is a typical choice for servers, SMP and NUMA systems
with lots of processors that may show reduced performance if
too many timer interrupts are occurring.
+ config HZ_250_MUQSS
+ bool "250 HZ" if SCHED_MUQSS
+ help
+ 250 Hz is the default choice for the mainline scheduler but not
+ advantageous in combination with MuQSS.
+
config HZ_250
- bool "250 HZ"
+ bool "250 HZ" if !SCHED_MUQSS
help
250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
@@ -49,7 +65,9 @@ endchoice
config HZ
int
+ default 100 if HZ_100_MUQSS
default 100 if HZ_100
+ default 250 if HZ_250_MUQSS
default 250 if HZ_250
default 300 if HZ_300
default 1000 if HZ_1000
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c974..1dc79ec 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,7 +1,7 @@
choice
prompt "Preemption Model"
- default PREEMPT_NONE
+ default PREEMPT
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
@@ -17,7 +17,7 @@ config PREEMPT_NONE
latencies.
config PREEMPT_VOLUNTARY
- bool "Voluntary Kernel Preemption (Desktop)"
+ bool "Voluntary Kernel Preemption (Nothing)"
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY
applications to run more 'smoothly' even when the system is
under load.
- Select this if you are building a kernel for a desktop system.
+ Select this for no system in particular (choose Preemptible
+ instead on a desktop if you know what's good for you).
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
diff --git a/kernel/Makefile b/kernel/Makefile
index e0c2268..cffa2c7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o skip_list.o
obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a..a80d56d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
- t3 = tsk->se.sum_exec_runtime;
+ t3 = tsk_seruntime(tsk);
d->cpu_count += t1;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3076f30..d749749 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -134,7 +134,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+ sig->sum_sched_runtime += tsk_seruntime(tsk);
sig->nr_threads--;
__unhash_process(tsk, group_dead);
write_sequnlock(&sig->stats_lock);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3bbfd6a..351bf16 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -95,6 +95,20 @@ config IRQ_DOMAIN_DEBUG
config IRQ_FORCED_THREADING
bool
+config FORCE_IRQ_THREADING
+ bool "Make IRQ threading compulsory"
+ depends on IRQ_FORCED_THREADING
+ default y
+ ---help---
+
+ Make IRQ threading mandatory for any IRQ handlers that support it
+ instead of being optional and requiring the threadirqs kernel
+ parameter. Instead they can be optionally disabled with the
+ nothreadirqs kernel parameter.
+
+ Enable if you are building for a desktop or low latency system,
+ otherwise say N.
+
config SPARSE_IRQ
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
---help---
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6b66959..6b3fb17 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -22,7 +22,17 @@
#include "internals.h"
#ifdef CONFIG_IRQ_FORCED_THREADING
+#ifdef CONFIG_FORCE_IRQ_THREADING
+__read_mostly bool force_irqthreads = true;
+#else
__read_mostly bool force_irqthreads;
+#endif
+static int __init setup_noforced_irqthreads(char *arg)
+{
+ force_irqthreads = false;
+ return 0;
+}
+early_param("nothreadirqs", setup_noforced_irqthreads);
static int __init setup_forced_irqthreads(char *arg)
{
diff --git a/kernel/kthread.c b/kernel/kthread.c
index be2cc1f..665e3bd 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -381,6 +381,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
}
EXPORT_SYMBOL(kthread_bind);
+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP)
+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+
+/*
+ * new_kthread_bind is a special variant of __kthread_bind_mask.
+ * For new threads to work on muqss we want to call do_set_cpus_allowed
+ * without the task_cpu being set and the task rescheduled until they're
+ * rescheduled on their own so we call __do_set_cpus_allowed directly which
+ * only changes the cpumask. This is particularly important for smpboot threads
+ * to work.
+ */
+static void new_kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)))
+ return;
+
+ /* It's safe because the task is inactive. */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ __do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+#else
+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu)
+#endif
+
/**
* kthread_create_on_cpu - Create a cpu bound kthread
* @threadfn: the function to run until signal_pending(current).
@@ -402,7 +430,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
cpu);
if (IS_ERR(p))
return p;
- kthread_bind(p, cpu);
+ new_kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b83..77bdf98 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -15,13 +15,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
+ifdef CONFIG_SCHED_MUQSS
+obj-y += MuQSS.o clock.o
+else
+obj-y += core.o loadavg.o clock.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-y += wait.o swait.o completion.o idle.o cputime.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
new file mode 100644
index 0000000..7617ae4
--- /dev/null
+++ b/kernel/sched/MuQSS.c
@@ -0,0 +1,8033 @@
+/*
+ * kernel/sched/MuQSS.c, was kernel/sched.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ * hybrid priority-list and round-robin design with
+ * an array-switch method of distributing timeslices
+ * and per-CPU runqueues. Cleanups and useful suggestions
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
+ * 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * a whole lot of those previous things.
+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS
+ * scheduler by Con Kolivas.
+ */
+
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/sched/prio.h>
+#include <linux/tick.h>
+#include <linux/skip_list.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#include "MuQSS.h"
+
+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p) rt_prio((p)->prio)
+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
+ (policy) == SCHED_RR)
+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+
+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
+
+#define is_iso_policy(policy) ((policy) == SCHED_ISO)
+#define iso_task(p) unlikely(is_iso_policy((p)->policy))
+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
+
+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
+
+#define ISO_PERIOD (5 * HZ)
+
+#define STOP_PRIO (MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate multiples of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ))
+#define JIFFY_NS (1073741824 / HZ)
+#define JIFFY_US (1048576 / HZ)
+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
+#define HALF_JIFFY_NS (1073741824 / HZ / 2)
+#define HALF_JIFFY_US (1048576 / HZ / 2)
+#define MS_TO_NS(TIME) ((TIME) << 20)
+#define MS_TO_US(TIME) ((TIME) << 10)
+#define NS_TO_MS(TIME) ((TIME) >> 20)
+#define NS_TO_US(TIME) ((TIME) >> 10)
+#define US_TO_NS(TIME) ((TIME) << 10)
+
+#define RESCHED_US (100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+ printk(KERN_INFO "MuQSS CPU scheduler v0.150 by Con Kolivas.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms.
+ * Tunable via /proc interface.
+ */
+int rr_interval __read_mostly = 6;
+
+/*
+ * Tunable to choose whether to prioritise latency or throughput, simple
+ * binary yes or no
+ */
+int sched_interactive __read_mostly = 1;
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run five seconds as real time tasks. This is the total over
+ * all online cpus.
+ */
+int sched_iso_cpu __read_mostly = 70;
+
+/*
+ * sched_yield_type - Choose what sort of yield sched_yield will perform.
+ * 0: No yield.
+ * 1: Yield only to better priority/deadline tasks. (default)
+ * 2: Expire timeslice and recalculate deadline.
+ */
+int sched_yield_type __read_mostly = 1;
+
+/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+static int prio_ratios[NICE_WIDTH] __read_mostly;
+
+/*
+ * The quota handed out to tasks of all priority levels when refilling their
+ * time_slice.
+ */
+static inline int timeslice(void)
+{
+ return MS_TO_US(rr_interval);
+}
+
+static bool sched_smp_initialized __read_mostly;
+
+/*
+ * The global runqueue data that all CPUs work off. Contains either atomic
+ * variables and a cpu bitmap set atomically.
+ */
+struct global_rq {
+#ifdef CONFIG_SMP
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+ atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+ atomic64_t nr_switches ____cacheline_aligned_in_smp;
+ cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+#else
+ atomic_t nr_running ____cacheline_aligned;
+ atomic_t nr_uninterruptible ____cacheline_aligned;
+ atomic64_t nr_switches ____cacheline_aligned;
+#endif
+};
+
+#ifdef CONFIG_SMP
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif
+
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu)
+{
+ return &per_cpu(runqueues, (cpu));
+}
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+DEFINE_MUTEX(sched_domains_mutex);
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+#else
+struct rq *uprq;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+static inline int cpu_of(struct rq *rq)
+{
+ return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static inline int cpu_of(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+/*
+ * All common locking functions performed on rq->lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating niffies.
+ * Looking up task_rq must be done under rq->lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+ if (unlikely(delta < 0))
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+/*
+ * Niffies are a globally increasing nanosecond counter. They're only used by
+ * update_load_avg and time_slice_expired, however deadlines are based on them
+ * across CPUs. Update them whenever we will call one of those functions, and
+ * synchronise them across CPUs whenever we hold both runqueue locks.
+ */
+static inline void update_clocks(struct rq *rq)
+{
+ s64 ndiff, minndiff;
+ long jdiff;
+
+ update_rq_clock(rq);
+ ndiff = rq->clock - rq->old_clock;
+ rq->old_clock = rq->clock;
+ jdiff = jiffies - rq->last_jiffy;
+
+ /* Subtract any niffies added by balancing with other rqs */
+ ndiff -= rq->niffies - rq->last_niffy;
+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
+ if (minndiff < 0)
+ minndiff = 0;
+ ndiff = max(ndiff, minndiff);
+ rq->niffies += ndiff;
+ rq->last_niffy = rq->niffies;
+ if (jdiff) {
+ rq->last_jiffy += jdiff;
+ rq->last_jiffy_niffies = rq->niffies;
+ }
+}
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
+
+static inline void rq_lock(struct rq *rq)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+}
+
+static inline int rq_trylock(struct rq *rq)
+ __acquires(rq->lock)
+{
+ return raw_spin_trylock(&rq->lock);
+}
+
+static inline void rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline struct rq *this_rq_lock(void)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ raw_spin_lock(&rq->lock);
+
+ return rq;
+}
+
+/*
+ * Any time we have two runqueues locked we use that as an opportunity to
+ * synchronise niffies to the highest value as idle ticks may have artificially
+ * kept niffies low on one CPU and the truth can only be later.
+ */
+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
+{
+ if (rq1->niffies > rq2->niffies)
+ rq2->niffies = rq1->niffies;
+ else
+ rq1->niffies = rq2->niffies;
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+
+/* For when we know rq1 != rq2 */
+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else
+ __double_rq_lock(rq1, rq2);
+ synchronise_niffies(rq1, rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+/* Must be sure rq1 != rq2 and irqs are disabled */
+static inline void lock_second_rq(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (unlikely(!raw_spin_trylock(&rq2->lock))) {
+ raw_spin_unlock(&rq1->lock);
+ __double_rq_lock(rq1, rq2);
+ }
+ synchronise_niffies(rq1, rq2);
+}
+
+static inline void lock_all_rqs(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_lock(&rq->lock);
+ }
+}
+
+static inline void unlock_all_rqs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_unlock(&rq->lock);
+ }
+ preempt_enable();
+}
+
+/* Specially nest trylock an rq */
+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
+{
+ if (unlikely(!do_raw_spin_trylock(&rq->lock)))
+ return false;
+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
+ synchronise_niffies(this_rq, rq);
+ return true;
+}
+
+/* Unlock a specially nested trylocked rq */
+static inline void unlock_rq(struct rq *rq)
+{
+ spin_release(&rq->lock.dep_map, 1, _RET_IP_);
+ do_raw_spin_unlock(&rq->lock);
+}
+
+static inline void rq_lock_irq(struct rq *rq)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+}
+
+static inline void rq_unlock_irq(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, *flags);
+}
+
+static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irqrestore(&rq->lock, *flags);
+}
+
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ while (42) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+ }
+ return rq;
+}
+
+void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ while (42) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(&rq->lock);
+ }
+ return rq;
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+{
+ rq_unlock(rq);
+}
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_q().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+ next->on_cpu = 1;
+}
+
+static inline void smp_sched_reschedule(int cpu)
+{
+ if (likely(cpu_online(cpu)))
+ smp_send_reschedule(cpu);
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+ int cpu;
+#ifdef CONFIG_LOCKDEP
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+#endif
+ if (test_tsk_need_resched(p))
+ return;
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(p))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+/*
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+ return !skiplist_node_empty(&p->node);
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+static inline void resched_if_idle(struct rq *rq);
+
+/* Dodgy workaround till we figure out where the softirqs are going */
+static inline void do_pending_softirq(struct rq *rq, struct task_struct *next)
+{
+ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt()))
+ do_softirq_own_stack();
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+#ifdef CONFIG_SMP
+ /*
+ * If prev was marked as migrating to another CPU in return_task, drop
+ * the local runqueue lock but leave interrupts disabled and grab the
+ * remote lock we're migrating it to before enabling them.
+ */
+ if (unlikely(task_on_rq_migrating(prev))) {
+ sched_info_dequeued(rq, prev);
+ /*
+ * We move the ownership of prev to the new cpu now. ttwu can't
+ * activate prev to the wrong cpu since it has to grab this
+ * runqueue in ttwu_remote.
+ */
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ prev->cpu = prev->wake_cpu;
+#else
+ task_thread_info(prev)->cpu = prev->wake_cpu;
+#endif
+ raw_spin_unlock(&rq->lock);
+
+ raw_spin_lock(&prev->pi_lock);
+ rq = __task_rq_lock(prev);
+ /* Check that someone else hasn't already queued prev */
+ if (likely(!task_queued(prev))) {
+ enqueue_task(rq, prev, 0);
+ prev->on_rq = TASK_ON_RQ_QUEUED;
+ /* Wake up the CPU if it's not already running */
+ resched_if_idle(rq);
+ }
+ raw_spin_unlock(&prev->pi_lock);
+ }
+#endif
+ rq_unlock(rq);
+
+ do_pending_softirq(rq, current);
+
+ local_irq_enable();
+}
+
+static inline bool deadline_before(u64 deadline, u64 time)
+{
+ return (deadline < time);
+}
+
+/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
+
+static inline int rq_load(struct rq *rq)
+{
+ return rq->sl->entries + !rq_idle(rq);
+}
+
+static inline bool rq_local(struct rq *rq);
+
+/*
+ * Update the load average for feeding into cpu frequency governors. Use a
+ * rough estimate of a rolling average with ~ time constant of 32ms.
+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144
+ * Make sure a call to update_clocks has been made before calling this to get
+ * an updated rq->niffies.
+ */
+static void update_load_avg(struct rq *rq, unsigned int flags)
+{
+ unsigned long us_interval;
+ long load, curload;
+
+ if (unlikely(rq->niffies <= rq->load_update))
+ return;
+
+ us_interval = NS_TO_US(rq->niffies - rq->load_update);
+ curload = rq_load(rq);
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+ /* If this CPU has all the load, make it ramp up quickly */
+ if (curload > load && curload >= atomic_read(&grq.nr_running))
+ load = curload;
+ rq->load_avg = load;
+
+ rq->load_update = rq->niffies;
+ if (likely(rq_local(rq)))
+ cpufreq_trigger(rq->niffies, flags);
+}
+
+/*
+ * Removing from the runqueue. Enter with rq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 8.
+ */
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ skiplist_delete(rq->sl, &p->node);
+ rq->best_key = rq->node.next[0]->key;
+ update_clocks(rq);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(task_rq(p), p);
+ update_load_avg(rq, flags);
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+ return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
+/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+static bool idleprio_suitable(struct task_struct *p)
+{
+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
+}
+
+/*
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
+ * that the iso_refractory flag is not set.
+ */
+static inline bool isoprio_suitable(struct rq *rq)
+{
+ return !rq->iso_refractory;
+}
+
+/*
+ * Adding to the runqueue. Enter with rq locked.
+ */
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ unsigned int randseed, cflags = 0;
+ u64 sl_id;
+
+ if (!rt_task(p)) {
+ /* Check it hasn't gotten rt from PI */
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
+ (iso_task(p) && isoprio_suitable(rq)))
+ p->prio = p->normal_prio;
+ else
+ p->prio = NORMAL_PRIO;
+ }
+ /*
+ * The sl_id key passed to the skiplist generates a sorted list.
+ * Realtime and sched iso tasks run FIFO so they only need be sorted
+ * according to priority. The skiplist will put tasks of the same
+ * key inserted later in FIFO order. Tasks of sched normal, batch
+ * and idleprio are sorted according to their deadlines. Idleprio
+ * tasks are offset by an impossibly large deadline value ensuring
+ * they get sorted into last positions, but still according to their
+ * own deadlines. This creates a "landscape" of skiplists running
+ * from priority 0 realtime in first place to the lowest priority
+ * idleprio tasks last. Skiplist insertion is an O(log n) process.
+ */
+ if (p->prio <= ISO_PRIO) {
+ sl_id = p->prio;
+ cflags = SCHED_CPUFREQ_RT;
+ } else {
+ sl_id = p->deadline;
+ if (idleprio_task(p)) {
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0xF000000000000000;
+ else
+ sl_id += longest_deadline_diff();
+ }
+ }
+ /*
+ * Some architectures don't have better than microsecond resolution
+ * so mask out ~microseconds as the random seed for skiplist insertion.
+ */
+ update_clocks(rq);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+ rq->best_key = rq->node.next[0]->key;
+ if (p->in_iowait)
+ cflags |= SCHED_CPUFREQ_IOWAIT;
+ update_load_avg(rq, cflags);
+}
+
+/*
+ * Returns the relative length of deadline all compared to the shortest
+ * deadline which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+ return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+ return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
+{
+ if (rq_idle(rq))
+ resched_task(rq->curr);
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+ return (rq->cpu == smp_processor_id());
+}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues is done lockless as the occasional wrong
+ * value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+ int other_cpu, best_bias = 0;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq = cpu_rq(other_cpu);
+
+ if (rq_idle(rq))
+ continue;
+ if (unlikely(!rq->online))
+ continue;
+ if (!rq->rq_mm)
+ continue;
+ if (likely(rq->rq_smt_bias > best_bias))
+ best_bias = rq->rq_smt_bias;
+ }
+ return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+ if (rt_task(p))
+ return 1 << 30;
+ else if (task_running_iso(p))
+ return 1 << 29;
+ else if (task_running_idle(p))
+ return 0;
+ return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+ return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
+{
+ int best_bias, task_bias;
+
+ /* Kernel threads always run */
+ if (unlikely(!p->mm))
+ return true;
+ if (rt_task(p))
+ return true;
+ if (!idleprio_suitable(p))
+ return true;
+ best_bias = best_smt_bias(this_rq);
+ /* The smt siblings are all idle or running IDLEPRIO */
+ if (best_bias < 1)
+ return true;
+ task_bias = task_prio_bias(p);
+ if (task_bias < 1)
+ return false;
+ if (task_bias >= best_bias)
+ return true;
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
+ if (best_bias % 4 == 1)
+ return true;
+ /* Sorry, you lose */
+ return false;
+}
+#else /* CONFIG_SMT_NICE */
+#define smt_schedule(p, this_rq) (true)
+#endif /* CONFIG_SMT_NICE */
+
+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
+{
+ set_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy. The
+ * bits are set atomically but read locklessly as occasional false positive /
+ * negative is harmless.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+ if (likely(cpu_online(cpu)))
+ atomic_set_cpu(cpu, &grq.cpu_idle_map);
+}
+
+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
+{
+ clear_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+ atomic_clear_cpu(cpu, &grq.cpu_idle_map);
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+ return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
+}
+
+/*
+ * Resched current on rq. We don't know if rq is local to this CPU nor if it
+ * is locked so we do not use an intermediate variable for the task to avoid
+ * having it dereferenced.
+ */
+static void resched_curr(struct rq *rq)
+{
+ int cpu;
+
+ if (test_tsk_need_resched(rq->curr))
+ return;
+
+ rq->preempt = rq->curr;
+ cpu = rq->cpu;
+
+ /* We're doing this without holding the rq lock if it's not task_rq */
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(rq->curr);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(rq->curr))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+#define CPUIDLE_DIFF_THREAD (1)
+#define CPUIDLE_DIFF_CORE (2)
+#define CPUIDLE_CACHE_BUSY (4)
+#define CPUIDLE_DIFF_CPU (8)
+#define CPUIDLE_THREAD_BUSY (16)
+#define CPUIDLE_DIFF_NODE (32)
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
+ CPUIDLE_DIFF_THREAD;
+ int cpu_tmp;
+
+ if (cpumask_test_cpu(best_cpu, tmpmask))
+ goto out;
+
+ for_each_cpu(cpu_tmp, tmpmask) {
+ int ranking, locality;
+ struct rq *tmp_rq;
+
+ ranking = 0;
+ tmp_rq = cpu_rq(cpu_tmp);
+
+ locality = rq->cpu_locality[cpu_tmp];
+#ifdef CONFIG_NUMA
+ if (locality > 3)
+ ranking |= CPUIDLE_DIFF_NODE;
+ else
+#endif
+ if (locality > 2)
+ ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+ else if (locality == 2)
+ ranking |= CPUIDLE_DIFF_CORE;
+ else if (!(tmp_rq->cache_idle(tmp_rq)))
+ ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ if (locality == 1)
+ ranking |= CPUIDLE_DIFF_THREAD;
+ if (!(tmp_rq->siblings_idle(tmp_rq)))
+ ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+ if (ranking < best_ranking) {
+ best_cpu = cpu_tmp;
+ best_ranking = ranking;
+ }
+ }
+out:
+ return best_cpu;
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+
+ return (this_rq->cpu_locality[that_cpu] < 3);
+}
+
+/* As per resched_curr but only will resched idle task */
+static inline void resched_idle(struct rq *rq)
+{
+ if (test_tsk_need_resched(rq->idle))
+ return;
+
+ rq->preempt = rq->idle;
+
+ set_tsk_need_resched(rq->idle);
+
+ if (rq_local(rq)) {
+ set_preempt_need_resched();
+ return;
+ }
+
+ smp_sched_reschedule(rq->cpu);
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+ cpumask_t tmpmask;
+ struct rq *rq;
+ int best_cpu;
+
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
+ rq = cpu_rq(best_cpu);
+ if (!smt_schedule(p, rq))
+ return NULL;
+ resched_idle(rq);
+ return rq;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p, task_cpu(p));
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+ return rq->rq_order[cpu];
+}
+#else /* CONFIG_SMP */
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+ return uprq->curr == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+static inline void resched_curr(struct rq *rq)
+{
+ resched_task(rq->curr);
+}
+
+static inline void resched_if_idle(struct rq *rq)
+{
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+ return true;
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+ return rq;
+}
+
+static inline bool smt_schedule(struct task_struct *p, struct rq *rq)
+{
+ return true;
+}
+#endif /* CONFIG_SMP */
+
+static inline int normal_prio(struct task_struct *p)
+{
+ if (has_rt_policy(p))
+ return MAX_RT_PRIO - 1 - p->rt_priority;
+ if (idleprio_task(p))
+ return IDLE_PRIO;
+ if (iso_task(p))
+ return ISO_PRIO;
+ return NORMAL_PRIO;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks as it will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with rq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+ resched_if_idle(rq);
+
+ /*
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
+ * milliseconds-range estimation of the amount of time that the task
+ * spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+ (rq->niffies - p->last_ran) >> 20);
+ }
+
+ p->prio = effective_prio(p);
+ if (task_contributes_to_load(p))
+ atomic_dec(&grq.nr_uninterruptible);
+
+ enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ atomic_inc(&grq.nr_running);
+}
+
+/*
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+ if (task_contributes_to_load(p))
+ atomic_inc(&grq.nr_uninterruptible);
+
+ p->on_rq = 0;
+ atomic_dec(&grq.nr_running);
+ sched_info_dequeued(rq, p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ struct rq *rq = task_rq(p);
+ bool queued;
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
+ if (task_cpu(p) == cpu)
+ return;
+ trace_sched_migrate_task(p, cpu);
+ perf_event_task_migrate(p);
+
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfully executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+
+ if (task_running(rq, p)) {
+ /*
+ * We should only be calling this on a running task if we're
+ * holding rq lock.
+ */
+ lockdep_assert_held(&rq->lock);
+
+ /*
+ * We can't change the task_thread_info cpu on a running task
+ * as p will still be protected by the rq lock of the cpu it
+ * is still running on so we set the wake_cpu for it to be
+ * lazily updated once off the cpu.
+ */
+ p->wake_cpu = cpu;
+ return;
+ }
+
+ if ((queued = task_queued(p)))
+ dequeue_task(rq, p, 0);
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ p->cpu = cpu;
+#else
+ task_thread_info(p)->cpu = cpu;
+#endif
+ p->wake_cpu = cpu;
+ if (queued)
+ enqueue_task(cpu_rq(cpu), p, 0);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Move a task off the runqueue and take it to a cpu for it will
+ * become the running task.
+ */
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
+{
+ struct rq *p_rq = task_rq(p);
+
+ dequeue_task(p_rq, p, DEQUEUE_SAVE);
+ if (p_rq != rq) {
+ sched_info_dequeued(p_rq, p);
+ sched_info_queued(rq, p);
+ }
+ set_task_cpu(p, cpu);
+}
+
+/*
+ * Returns a descheduling task to the runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq,
+ int cpu, bool deactivate)
+{
+ if (deactivate)
+ deactivate_task(p, rq);
+ else {
+#ifdef CONFIG_SMP
+ /*
+ * set_task_cpu was called on the running task that doesn't
+ * want to deactivate so it has to be enqueued to a different
+ * CPU and we need its lock. Tag it to be moved with as the
+ * lock is dropped in finish_lock_switch.
+ */
+ if (unlikely(p->wake_cpu != cpu))
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ else
+#endif
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ }
+}
+
+/* Enter with rq lock held. We know p is on the local cpu */
+static inline void __set_tsk_resched(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ int running, queued;
+ unsigned long flags;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since this will return false
+ * if the runqueue has changed and p is actually now
+ * running somewhere else!
+ */
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(p);
+ running = task_running(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_rq_unlock(rq, p, &flags);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_sched_reschedule(cpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+/*
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio, u64 deadline)
+{
+ /* Better static priority RT task or better policy preemption */
+ if (p->prio < prio)
+ return true;
+ if (p->prio > prio)
+ return false;
+ if (p->policy == SCHED_BATCH)
+ return false;
+ /* SCHED_NORMAL and ISO will preempt based on deadline */
+ if (!deadline_before(p->deadline, deadline))
+ return false;
+ return true;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
+ return true;
+ return false;
+}
+#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ int i, this_entries = rq_load(this_rq);
+ cpumask_t tmp;
+
+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p)))
+ return;
+
+ /* IDLEPRIO tasks never preempt anything but idle */
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+
+ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
+
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *rq = this_rq->rq_order[i];
+
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
+ continue;
+
+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries)
+ continue;
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+ resched_curr(rq);
+ return;
+ }
+ }
+}
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check);
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
+ resched_curr(uprq);
+}
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq;
+
+ if (!schedstat_enabled())
+ return;
+
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu)
+ schedstat_inc(rq->ttwu_local);
+ else {
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ for_each_domain(rq->cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd->ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq->ttwu_count);
+}
+
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p)
+{
+ activate_task(p, rq);
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ /*
+ * Sync wakeups (i.e. those types of wakeups where the waker
+ * has indicated that it will leave the CPU in short order)
+ * don't trigger a preemption if there are no idle cpus,
+ * instead waiting for current to deschedule.
+ */
+ if (wake_flags & WF_SYNC)
+ resched_suitable_idle(p);
+ else
+ try_preempt(p, rq);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ lockdep_assert_held(&rq->lock);
+
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ atomic_dec(&grq.nr_uninterruptible);
+#endif
+
+ ttwu_activate(rq, p);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (likely(task_on_rq_queued(p))) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ while (llist) {
+ int wake_flags = 0;
+
+ p = llist_entry(llist, struct task_struct, wake_entry);
+ llist = llist_next(llist);
+
+ ttwu_do_activate(rq, p, wake_flags);
+ }
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void scheduler_ipi(void)
+{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+
+ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_pending();
+ irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ rq_lock_irqsave(rq, &flags);
+ if (likely(is_idle_task(rq->curr)))
+ smp_sched_reschedule(cpu);
+ /* Else cpu is not in idle, do nothing here */
+ rq_unlock_irqrestore(rq, &flags);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static int valid_task_cpu(struct task_struct *p)
+{
+ cpumask_t valid_mask;
+
+ if (p->flags & PF_KTHREAD)
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask);
+ else
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask);
+
+ if (unlikely(!cpumask_weight(&valid_mask))) {
+ /* Hotplug boot threads do this before the CPU is up */
+ WARN_ON(sched_smp_initialized);
+ return cpumask_any(tsk_cpus_allowed(p));
+ }
+ return cpumask_any(&valid_mask);
+}
+
+/*
+ * For a task that's just being woken up we have a valuable balancing
+ * opportunity so choose the nearest cache most lightly loaded runqueue.
+ * Entered with rq locked and returns with the chosen runqueue locked.
+ */
+static inline int select_best_cpu(struct task_struct *p)
+{
+ unsigned int idlest = ~0U;
+ struct rq *rq = NULL;
+ int i;
+
+ if (suitable_idle_cpus(p)) {
+ int cpu = task_cpu(p);
+
+ if (unlikely(needs_other_cpu(p, cpu)))
+ cpu = valid_task_cpu(p);
+ rq = resched_best_idle(p, cpu);
+ if (likely(rq))
+ return rq->cpu;
+ }
+
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *other_rq = task_rq(p)->rq_order[i];
+ int entries;
+
+ if (!other_rq->online)
+ continue;
+ if (needs_other_cpu(p, other_rq->cpu))
+ continue;
+ entries = rq_load(other_rq);
+ if (entries >= idlest)
+ continue;
+ idlest = entries;
+ rq = other_rq;
+ }
+ if (unlikely(!rq))
+ return task_cpu(p);
+ return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static int valid_task_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int select_best_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (!cpus_share_cache(smp_processor_id(), cpu)) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ ttwu_queue_remote(p, cpu, wake_flags);
+ return;
+ }
+#endif
+ rq_lock(rq);
+ ttwu_do_activate(rq, p, wake_flags);
+ rq_unlock(rq);
+}
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running.
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+ unsigned long flags;
+ int cpu, success = 0;
+
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /* state is a volatile long, どうして、分からない */
+ if (!((unsigned int)p->state & state))
+ goto out;
+
+ trace_sched_waking(p);
+
+ success = 1; /* we're going to change ->state */
+ cpu = task_cpu(p);
+
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * [S] ->on_cpu = 1; [L] ->on_rq
+ * UNLOCK rq->lock
+ * RMB
+ * LOCK rq->lock
+ * [S] ->on_rq = 0; [L] ->on_cpu
+ *
+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+ * from the consecutive calls to schedule(); the first switching to our
+ * task, the second putting it to sleep.
+ */
+ smp_rmb();
+
+ /*
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_lock_switch().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
+ cpu = select_best_cpu(p);
+ if (task_cpu(p) != cpu)
+ set_task_cpu(p, cpu);
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu, wake_flags);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
+out:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that rq is locked and, @p is not the current task.
+ * rq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
+ if (!(p->state & TASK_NORMAL))
+ goto out;
+
+ trace_sched_waking(p);
+
+ if (!task_on_rq_queued(p))
+ ttwu_activate(rq, p);
+
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+static void time_slice_expired(struct task_struct *p, struct rq *rq);
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
+{
+ unsigned long flags;
+ int cpu = get_cpu();
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+ /*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
+ * The process state is set to the same value of the process executing
+ * do_fork() code. That is running. This guarantees that nobody will
+ * actually run it, and a signal or other external event cannot wake
+ * it up and insert it on the runqueue either.
+ */
+
+ /* Should be reset in fork.c but done here for ease of MuQSS patching */
+ p->on_cpu =
+ p->on_rq =
+ p->utime =
+ p->stime =
+ p->utimescaled =
+ p->stimescaled =
+ p->sched_time =
+ p->stime_ns =
+ p->utime_ns = 0;
+ skiplist_node_init(&p->node);
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+ p->policy = SCHED_NORMAL;
+ p->normal_prio = normal_prio(p);
+ }
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ p->normal_prio = p->static_prio;
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
+ /*
+ * Silence PROVE_RCU.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ set_task_cpu(p, cpu);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#ifdef CONFIG_SCHED_INFO
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+ init_task_preempt_count(p);
+
+ put_cpu();
+ return 0;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
+
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
+ if (!strcmp(str, "enable")) {
+ __sched_schedstats = true;
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ __sched_schedstats = false;
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
+
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
+
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
+{
+ update_clocks(rq);
+ /* This isn't really a context switch but accounting is the same */
+ update_cpu_clock_switch(rq, p);
+ p->last_ran = rq->niffies;
+}
+
+static inline int hrexpiry_enabled(struct rq *rq)
+{
+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrexpiry_timer);
+}
+
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+static inline void hrexpiry_clear(struct rq *rq)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (hrtimer_active(&rq->hrexpiry_timer))
+ hrtimer_cancel(&rq->hrexpiry_timer);
+}
+
+/*
+ * High-resolution time_slice expiry.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
+ struct task_struct *p;
+
+ /* This can happen during CPU hotplug / resume */
+ if (unlikely(cpu_of(rq) != smp_processor_id()))
+ goto out;
+
+ /*
+ * We're doing this without the runqueue lock but this should always
+ * be run on the local CPU. Time slice should run out in __schedule
+ * but we set it to zero here in case niffies is slightly less.
+ */
+ p = rq->curr;
+ p->time_slice = 0;
+ __set_tsk_resched(p);
+out:
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Called to set the hrexpiry timer state.
+ *
+ * called with irqs disabled from the local CPU only
+ */
+static void hrexpiry_start(struct rq *rq, u64 delay)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+
+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void init_rq_hrexpiry(struct rq *rq)
+{
+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrexpiry_timer.function = hrexpiry;
+}
+
+static inline int rq_dither(struct rq *rq)
+{
+ if (!hrexpiry_enabled(rq))
+ return HALF_JIFFY_US;
+ return 0;
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+ struct task_struct *parent, *rq_curr;
+ struct rq *rq, *new_rq;
+ unsigned long flags;
+
+ parent = p->parent;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+ /* Task_rq can't change yet on a new task */
+ new_rq = rq = task_rq(p);
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+ set_task_cpu(p, valid_task_cpu(p));
+ new_rq = task_rq(p);
+ }
+
+ double_rq_lock(rq, new_rq);
+ rq_curr = rq->curr;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+ p->prio = rq_curr->normal_prio;
+
+ trace_sched_wakeup_new(p);
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness. If it's negative, it won't
+ * matter since that's the same as being 0. rq->rq_deadline is only
+ * modified within schedule() so it is always equal to
+ * current->deadline.
+ */
+ account_task_cpu(rq, rq_curr);
+ p->last_ran = rq_curr->last_ran;
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
+ rq_curr->time_slice /= 2;
+ if (rq_curr->time_slice < RESCHED_US) {
+ /*
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
+ */
+ __set_tsk_resched(rq_curr);
+ time_slice_expired(p, new_rq);
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p, task_cpu(p));
+ else if (unlikely(rq != new_rq))
+ try_preempt(p, new_rq);
+ } else {
+ p->time_slice = rq_curr->time_slice;
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(rq_curr);
+ } else {
+ /*
+ * Adjust the hrexpiry since rq_curr will keep
+ * running and its timeslice has been shortened.
+ */
+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice));
+ try_preempt(p, new_rq);
+ }
+ }
+ } else {
+ time_slice_expired(p, new_rq);
+ try_preempt(p, new_rq);
+ }
+ activate_task(p, new_rq);
+ double_rq_unlock(rq, new_rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+ static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ if (!static_key_false(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is *not* safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
+static void
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ sched_info_switch(rq, prev, next);
+ perf_event_task_sched_out(prev, next);
+ fire_sched_out_preempt_notifiers(prev, next);
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static void finish_task_switch(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could rave with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
+
+ fire_sched_in_preempt_notifiers(current);
+ if (mm)
+ mmdrop(mm);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
+
+ put_task_struct(prev);
+ }
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+{
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
+ finish_task_switch(prev);
+ preempt_enable();
+
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct mm_struct *mm, *oldmm;
+
+ prepare_task_switch(rq, prev, next);
+
+ mm = next->mm;
+ oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ if (!mm) {
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next);
+ } else
+ switch_mm_irqs_off(oldmm, mm, next);
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ rq->prev_mm = oldmm;
+ }
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+
+ /* Here we just switch the register state and the stack. */
+ switch_to(prev, next, prev);
+ barrier();
+
+ finish_task_switch(prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ return atomic_read(&grq.nr_running);
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+ return atomic_read(&grq.nr_uninterruptible);
+}
+
+/*
+ * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
+ */
+bool single_task_running(void)
+{
+ struct rq *rq = cpu_rq(smp_processor_id());
+
+ if (rq_load(rq) == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+ return (unsigned long long)atomic64_read(&grq.nr_switches);
+}
+
+unsigned long nr_iowait(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+ return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ struct rq *this = cpu_rq(cpu);
+ return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+ return nr_running() + nr_uninterruptible();
+}
+
+/*
+ * I/O wait is the number of running or queued tasks with their ->rq pointer
+ * set to this cpu as being the CPU they're more likely to run on.
+ */
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *rq = this_rq();
+
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq_load(rq);
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+ active = nr_active() * FIXED_1;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ s64 steal = paravirt_steal_clock(cpu_of(rq));
+
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+ rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
+#endif
+
+/*
+ * On each tick, add the number of nanoseconds to the unbanked variables and
+ * once one tick's worth has accumulated, account it allowing for accurate
+ * sub-tick accounting and totals.
+ */
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ if (atomic_read(&rq->nr_iowait) > 0) {
+ rq->iowait_ns += ns;
+ if (rq->iowait_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->iowait_ns);
+ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks;
+ rq->iowait_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->idle_ns += ns;
+ if (rq->idle_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->idle_ns);
+ cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks;
+ rq->idle_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(idle);
+}
+
+static void pc_system_time(struct rq *rq, struct task_struct *p,
+ int hardirq_offset, unsigned long ns)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ p->stime_ns += ns;
+ if (p->stime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->stime_ns);
+ p->stime_ns %= JIFFY_NS;
+ p->stime += (__force u64)cputime_one_jiffy * ticks;
+ p->stimescaled += one_jiffy_scaled * ticks;
+ account_group_system_time(p, cputime_one_jiffy * ticks);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (hardirq_count() - hardirq_offset) {
+ rq->irq_ns += ns;
+ if (rq->irq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->irq_ns);
+ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->irq_ns %= JIFFY_NS;
+ }
+ } else if (in_serving_softirq()) {
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->softirq_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->system_ns += ns;
+ if (rq->system_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->system_ns);
+ cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks;
+ rq->system_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ p->utime_ns += ns;
+ if (p->utime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->utime_ns);
+ p->utime_ns %= JIFFY_NS;
+ p->utime += (__force u64)cputime_one_jiffy * ticks;
+ p->utimescaled += one_jiffy_scaled * ticks;
+ account_group_user_time(p, cputime_one_jiffy * ticks);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ */
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->softirq_ns %= JIFFY_NS;
+ }
+ }
+
+ if (task_nice(p) > 0 || idleprio_task(p)) {
+ rq->nice_ns += ns;
+ if (rq->nice_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->nice_ns);
+ cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks;
+ rq->nice_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->user_ns += ns;
+ if (rq->user_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->user_ns);
+ cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks;
+ rq->user_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+/*
+ * This is called on clock ticks.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+{
+ s64 account_ns = rq->niffies - p->last_ran;
+ struct task_struct *idle = rq->idle;
+
+ /* Accurate tick timekeeping */
+ if (user_mode(get_irq_regs()))
+ pc_user_time(rq, p, account_ns);
+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
+ } else
+ pc_idle_time(rq, idle, account_ns);
+
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
+
+ p->last_ran = rq->niffies;
+}
+
+/*
+ * This is called on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+{
+ s64 account_ns = rq->niffies - p->last_ran;
+ struct task_struct *idle = rq->idle;
+
+ /* Accurate subtick timekeeping */
+ if (p != idle)
+ pc_user_time(rq, p, account_ns);
+ else
+ pc_idle_time(rq, idle, account_ns);
+
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock(p) held.
+ */
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (p == rq->curr && task_on_rq_queued(p)) {
+ update_clocks(rq);
+ ns = rq->niffies - p->last_ran;
+ }
+
+ return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately the current's pending runtime that have not been
+ * accounted yet.
+ *
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
+ */
+ if (!p->on_cpu || !task_on_rq_queued(p))
+ return tsk_seruntime(p);
+#endif
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->sched_time + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, p, &flags);
+
+ return ns;
+}
+
+/*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
+ * data is modified only by the local runqueue during scheduler_tick with
+ * interrupts disabled.
+ */
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
+ * slow division.
+ */
+static inline void iso_tick(struct rq *rq)
+{
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+ rq->iso_ticks += 100;
+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
+ rq->iso_refractory = true;
+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
+ rq->iso_ticks = ISO_PERIOD * 100;
+ }
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq, int ticks)
+{
+ if (rq->iso_ticks > 0 || rq->iso_refractory) {
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
+ rq->iso_refractory = false;
+ if (unlikely(rq->iso_ticks < 0))
+ rq->iso_ticks = 0;
+ }
+ }
+}
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ /*
+ * If a SCHED_ISO task is running we increment the iso_ticks. In
+ * order to prevent SCHED_ISO tasks from causing starvation in the
+ * presence of true RT tasks we account those as iso_ticks as well.
+ */
+ if (rt_task(p) || task_running_iso(p))
+ iso_tick(rq);
+ else
+ no_iso_tick(rq, 1);
+
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (p->policy == SCHED_FIFO)
+ return;
+
+ if (iso_task(p)) {
+ if (task_running_iso(p)) {
+ if (rq->iso_refractory) {
+ /*
+ * SCHED_ISO task is running as RT and limit
+ * has been hit. Force it to reschedule as
+ * SCHED_NORMAL by zeroing its time_slice
+ */
+ p->time_slice = 0;
+ }
+ } else if (!rq->iso_refractory) {
+ /* Can now run again ISO. Reschedule to pick up prio */
+ goto out_resched;
+ }
+ }
+
+ /*
+ * Tasks that were scheduled in the first half of a tick are not
+ * allowed to run into the 2nd half of the next tick if they will
+ * run out of time slice in the interim. Otherwise, if they have
+ * less than RESCHED_US μs of time slice left they will be rescheduled.
+ * Dither is used as a backup for when hrexpiry is disabled or high res
+ * timers not configured in.
+ */
+ if (p->time_slice - rq->dither >= RESCHED_US)
+ return;
+out_resched:
+ rq_lock(rq);
+ __set_tsk_resched(p);
+ rq_unlock(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * We can stop the timer tick any time highres timers are active since
+ * we rely entirely on highres timeouts for task expiry rescheduling.
+ */
+static void sched_stop_tick(struct rq *rq, int cpu)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (!tick_nohz_full_enabled())
+ return;
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running.
+ *
+ * This makes sure that uptime continues to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = READ_ONCE(jiffies);
+
+ next = rq->last_jiffy + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#else
+static inline void sched_stop_tick(struct rq *rq, int cpu)
+{
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+}
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+ int cpu __maybe_unused = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ sched_clock_tick();
+ update_clocks(rq);
+ update_load_avg(rq, 0);
+ update_cpu_clock_tick(rq, rq->curr);
+ if (!rq_idle(rq))
+ task_running_tick(rq);
+ else
+ no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy);
+ rq->last_scheduler_tick = rq->last_jiffy;
+ rq->last_tick = rq->clock;
+ perf_event_task_tick();
+ sched_stop_tick(rq, cpu);
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ preempt_latency_start(val);
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ preempt_latency_stop(val);
+ __preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
+#endif
+
+/*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline. Make sure update_clocks has been called recently to update
+ * rq->niffies.
+ */
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
+{
+ p->time_slice = timeslice();
+ p->deadline = rq->niffies + task_deadline_diff(p);
+#ifdef CONFIG_SMT_NICE
+ if (!p->mm)
+ p->smt_bias = 0;
+ else if (rt_task(p))
+ p->smt_bias = 1 << 30;
+ else if (task_running_iso(p))
+ p->smt_bias = 1 << 29;
+ else if (idleprio_task(p)) {
+ if (task_running_idle(p))
+ p->smt_bias = 0;
+ else
+ p->smt_bias = 1;
+ } else if (--p->smt_bias < 1)
+ p->smt_bias = MAX_PRIO - p->static_prio;
+#endif
+}
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged be not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled, but get a new later deadline to have little effect on
+ * SCHED_NORMAL tasks.
+
+ */
+static inline void check_deadline(struct task_struct *p, struct rq *rq)
+{
+ if (p->time_slice < RESCHED_US || batch_task(p))
+ time_slice_expired(p, rq);
+}
+
+/*
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1)
+ * being bound to the number of processors.
+ *
+ * Runqueues are selectively locked based on their unlocked data and then
+ * unlocked if not needed. At most 3 locks will be held at any time and are
+ * released as soon as they're no longer needed. All balancing between CPUs
+ * is thus done here in an extremely simple first come best fit manner.
+ *
+ * This iterates over runqueues in cache locality order. In interactive mode
+ * it iterates over all CPUs and finds the task with the best key/deadline.
+ * In non-interactive mode it will only take a task if it's from the current
+ * runqueue or a runqueue with more tasks than the current one with a better
+ * key/deadline.
+ */
+#ifdef CONFIG_SMP
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+ struct rq *locked = NULL, *chosen = NULL;
+ struct task_struct *edt = idle;
+ int i, best_entries = 0;
+ u64 best_key = ~0ULL;
+
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *other_rq = rq_order(rq, i);
+ int entries = other_rq->sl->entries;
+ skiplist_node *next;
+
+ /*
+ * Check for queued entres lockless first. The local runqueue
+ * is locked so entries will always be accurate.
+ */
+ if (!sched_interactive) {
+ /*
+ * Don't reschedule balance across nodes unless the CPU
+ * is idle.
+ */
+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3)
+ break;
+ if (entries <= best_entries)
+ continue;
+ } else if (!entries)
+ continue;
+
+ /* if (i) implies other_rq != rq */
+ if (i) {
+ /* Check for best id queued lockless first */
+ if (other_rq->best_key >= best_key)
+ continue;
+
+ if (unlikely(!trylock_rq(rq, other_rq)))
+ continue;
+
+ /* Need to reevaluate entries after locking */
+ entries = other_rq->sl->entries;
+ if (unlikely(!entries)) {
+ unlock_rq(other_rq);
+ continue;
+ }
+ }
+
+ next = &other_rq->node;
+ /*
+ * In interactive mode we check beyond the best entry on other
+ * runqueues if we can't get the best for smt or affinity
+ * reasons.
+ */
+ while ((next = next->next[0]) != &other_rq->node) {
+ struct task_struct *p;
+ u64 key = next->key;
+
+ /* Reevaluate key after locking */
+ if (key >= best_key)
+ break;
+
+ p = next->value;
+ if (!smt_schedule(p, rq)) {
+ if (i && !sched_interactive)
+ break;
+ continue;
+ }
+
+ /* Make sure affinity is ok */
+ if (i) {
+ if (needs_other_cpu(p, cpu)) {
+ if (sched_interactive)
+ continue;
+ break;
+ }
+ /* From this point on p is the best so far */
+ if (locked)
+ unlock_rq(locked);
+ chosen = locked = other_rq;
+ }
+ best_entries = entries;
+ best_key = key;
+ edt = p;
+ break;
+ }
+ if (i && other_rq != chosen)
+ unlock_rq(other_rq);
+ }
+
+ if (likely(edt != idle))
+ take_task(rq, cpu, edt);
+
+ if (locked)
+ unlock_rq(locked);
+
+ return edt;
+}
+#else /* CONFIG_SMP */
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+ struct task_struct *edt;
+
+ if (unlikely(!rq->sl->entries))
+ return idle;
+ edt = rq->node.next[0]->value;
+ take_task(rq, cpu, edt);
+ return edt;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
+ if (oops_in_progress)
+ return;
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(preempt_disable_ip);
+ pr_cont("\n");
+ }
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
+#endif
+
+ if (unlikely(in_atomic_preempt_off())) {
+ __schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
+ rcu_sleep_check();
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq()->sched_count);
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+ if (p == rq->idle || p->policy == SCHED_FIFO)
+ hrexpiry_clear(rq);
+ else
+ hrexpiry_start(rq, US_TO_NS(p->time_slice));
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+ rq->dither = 0;
+ else
+ rq->dither = rq_dither(rq);
+
+ rq->rq_deadline = p->deadline;
+ rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+ rq->rq_mm = p->mm;
+ rq->rq_smt_bias = p->smt_bias;
+#endif
+}
+
+#ifdef CONFIG_SMT_NICE
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
+
+/* Iterate over smt siblings when we've scheduled a process on cpu and decide
+ * whether they should continue running or be descheduled. */
+static void check_smt_siblings(struct rq *this_rq)
+{
+ int other_cpu;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct task_struct *p;
+ struct rq *rq;
+
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ continue;
+ p = rq->curr;
+ if (!smt_schedule(p, this_rq))
+ resched_curr(rq);
+ }
+}
+
+static void wake_smt_siblings(struct rq *this_rq)
+{
+ int other_cpu;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq;
+
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ resched_idle(rq);
+ }
+}
+#else
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
+ *
+ * WARNING: must be called with preemption disabled!
+ */
+static void __sched notrace __schedule(bool preempt)
+{
+ struct task_struct *prev, *next, *idle;
+ unsigned long *switch_count;
+ bool deactivate = false;
+ struct rq *rq;
+ u64 niffies;
+ int cpu;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ prev = rq->curr;
+ idle = rq->idle;
+
+ schedule_debug(prev);
+
+ local_irq_disable();
+ rcu_note_context_switch();
+
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
+ rq_lock(rq);
+#ifdef CONFIG_SMP
+ if (rq->preempt) {
+ /*
+ * Make sure resched_curr hasn't triggered a preemption
+ * locklessly on a task that has since scheduled away. Spurious
+ * wakeup of idle is okay though.
+ */
+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) {
+ rq->preempt = NULL;
+ clear_preempt_need_resched();
+ rq_unlock_irq(rq);
+ return;
+ }
+ rq->preempt = NULL;
+ }
+#endif
+
+ switch_count = &prev->nivcsw;
+ if (!preempt && prev->state) {
+ if (unlikely(signal_pending_state(prev->state, prev))) {
+ prev->state = TASK_RUNNING;
+ } else {
+ deactivate = true;
+ prev->on_rq = 0;
+
+ /*
+ * If a worker is going to sleep, notify and
+ * ask workqueue whether it wants to wake up a
+ * task to maintain concurrency. If so, wake
+ * up the task.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup);
+ }
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ /*
+ * Store the niffy value here for use by the next task's last_ran
+ * below to avoid losing niffies due to update_clocks being called
+ * again after this point.
+ */
+ update_clocks(rq);
+ niffies = rq->niffies;
+ update_cpu_clock_switch(rq, prev);
+
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+
+ if (idle != prev) {
+ check_deadline(prev, rq);
+ return_task(prev, rq, cpu, deactivate);
+ }
+
+ next = earliest_deadline_task(rq, cpu, idle);
+ if (likely(next->prio != PRIO_LIMIT))
+ clear_cpuidle_map(cpu);
+ else {
+ set_cpuidle_map(cpu);
+ update_load_avg(rq, 0);
+ }
+
+ set_rq_task(rq, next);
+ next->last_ran = niffies;
+
+ if (likely(prev != next)) {
+ /*
+ * Don't reschedule an idle task or deactivated tasks
+ */
+ if (prev != idle && !deactivate)
+ resched_suitable_idle(prev);
+ if (next != idle)
+ check_siblings(rq);
+ else
+ wake_siblings(rq);
+ atomic64_inc(&grq.nr_switches);
+ rq->curr = next;
+ ++*switch_count;
+
+ trace_sched_switch(preempt, prev, next);
+ context_switch(rq, prev, next); /* unlocks the rq */
+ } else {
+ check_siblings(rq);
+ rq_unlock(rq);
+ do_pending_softirq(rq, next);
+ local_irq_enable();
+ }
+}
+
+void __noreturn do_task_dead(void)
+{
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&current->pi_lock);
+
+ /* causes final put_task_struct in finish_task_switch(). */
+ __set_current_state(TASK_DEAD);
+ current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+ __schedule(false);
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state || tsk_is_pi_blocked(tsk) ||
+ preempt_count() ||
+ signal_pending_state(tsk->state, tsk))
+ return;
+
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ do {
+ preempt_disable();
+ __schedule(false);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
+ */
+ enum ctx_state prev_state = exception_enter();
+ schedule();
+ exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ __schedule(true);
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ } while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(!preemptible()))
+ return;
+
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+/**
+ * preempt_schedule_notrace - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule(true);
+ exception_exit(prev_ctx);
+
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+ enum ctx_state prev_state;
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
+
+ do {
+ preempt_disable();
+ local_irq_enable();
+ __schedule(true);
+ local_irq_disable();
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+
+ exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ struct rq *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = __task_rq_lock(p);
+
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
+ trace_sched_pi_setprio(p, prio);
+ oldprio = p->prio;
+ p->prio = prio;
+ if (task_running(rq, p)){
+ if (prio > oldprio)
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (prio < oldprio)
+ try_preempt(p, rq);
+ }
+out_unlock:
+ __task_rq_unlock(rq);
+}
+
+#endif
+
+/*
+ * Adjust the deadline for when the priority is to change, before it's
+ * changed.
+ */
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
+{
+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int new_static, old_static;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ new_static = NICE_TO_PRIO(nice);
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * not SCHED_NORMAL/SCHED_BATCH:
+ */
+ if (has_rt_policy(p)) {
+ p->static_prio = new_static;
+ goto out_unlock;
+ }
+
+ adjust_deadline(p, new_static);
+ old_static = p->static_prio;
+ p->static_prio = new_static;
+ p->prio = effective_prio(p);
+
+ if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (new_static < old_static)
+ try_preempt(p, rq);
+ } else if (task_running(rq, p)) {
+ set_rq_task(rq, p);
+ if (old_static < new_static)
+ resched_task(p);
+ }
+out_unlock:
+ task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
+ */
+int task_prio(const struct task_struct *p)
+{
+ int delta, prio = p->prio - MAX_RT_PRIO;
+
+ /* rt tasks and iso tasks */
+ if (prio <= 0)
+ goto out;
+
+ /* Convert to ms to avoid overflows */
+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies);
+ delta = delta * 40 / ms_longest_deadline_diff();
+ if (delta > 0 && delta <= 80)
+ prio += delta;
+ if (idleprio_task(p))
+ prio += 40;
+out:
+ return prio;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
+ int prio, bool keep_boost)
+{
+ int oldrtprio, oldprio;
+
+ p->policy = policy;
+ oldrtprio = p->rt_priority;
+ p->rt_priority = prio;
+ p->normal_prio = normal_prio(p);
+ oldprio = p->prio;
+ /*
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
+ */
+ if (keep_boost) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
+ } else
+ p->prio = p->normal_prio;
+
+ if (task_running(rq, p)) {
+ set_rq_task(rq, p);
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
+ try_preempt(p, rq);
+ }
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+ rcu_read_unlock();
+ return match;
+}
+
+static int
+__sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool user, bool pi)
+{
+ struct sched_param zero_param = { .sched_priority = 0 };
+ unsigned long flags, rlim_rtprio = 0;
+ int retval, oldpolicy = -1;
+ int reset_on_fork;
+ struct rq *rq;
+
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
+
+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+ unsigned long lflags;
+
+ if (!lock_task_sighand(p, &lflags))
+ return -ESRCH;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+ unlock_task_sighand(p, &lflags);
+ if (rlim_rtprio)
+ goto recheck;
+ /*
+ * If the caller requested an RT policy without having the
+ * necessary rights, we downgrade the policy to SCHED_ISO.
+ * We also set the parameter to zero to pass the checks.
+ */
+ policy = SCHED_ISO;
+ param = &zero_param;
+ }
+recheck:
+ /* double check policy once rq lock held */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (!SCHED_RANGE(policy))
+ return -EINVAL;
+ }
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+ * SCHED_BATCH is 0.
+ */
+ if (param->sched_priority < 0 ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+ return -EINVAL;
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ } else {
+ switch (p->policy) {
+ /*
+ * Can only downgrade policies but not back to
+ * SCHED_NORMAL
+ */
+ case SCHED_ISO:
+ if (policy == SCHED_ISO)
+ goto out;
+ if (policy != SCHED_NORMAL)
+ return -EPERM;
+ break;
+ case SCHED_BATCH:
+ if (policy == SCHED_BATCH)
+ goto out;
+ if (policy != SCHED_IDLEPRIO)
+ return -EPERM;
+ break;
+ case SCHED_IDLEPRIO:
+ if (policy == SCHED_IDLEPRIO)
+ goto out;
+ return -EPERM;
+ default:
+ break;
+ }
+ }
+
+ /* can't change other user's priorities */
+ if (!check_same_owner(p))
+ return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
+ }
+
+ if (user) {
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the runqueue lock must be
+ * held.
+ */
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ task_rq_unlock(rq, p, &flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
+ /* recheck policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &flags);
+ goto recheck;
+ }
+ p->sched_reset_on_fork = reset_on_fork;
+
+ __setscheduler(p, rq, policy, param->sched_priority, pi);
+ task_rq_unlock(rq, p, &flags);
+
+ if (pi)
+ rt_mutex_adjust_pi(p);
+out:
+ return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, true, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ const struct sched_param param = { .sched_priority = attr->sched_priority };
+ int policy = attr->sched_policy;
+
+ return __sched_setscheduler(p, policy, &param, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+ /* sched/core.c uses zero here but we already know ret is zero */
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ *
+ * Return: 0 on success. An error code otherwise.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param __user *param)
+{
+ /* negative values for policy are not valid */
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy;
+ }
+ rcu_read_unlock();
+
+out_nounlock:
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ if (has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ /* sched/core.c uses zero here but we already know ret is zero */
+ return ret;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (rt_task(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ cpumask_var_t cpus_allowed, new_mask;
+ struct task_struct *p;
+ int retval;
+
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ /* Prevent p going away */
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+ retval = -EPERM;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+ }
+out_unlock:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
+ put_task_struct(p);
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ cpumask_t *new_mask)
+{
+ if (len < sizeof(cpumask_t)) {
+ memset(new_mask, 0, sizeof(cpumask_t));
+ } else if (len > sizeof(cpumask_t)) {
+ len = sizeof(cpumask_t);
+ }
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+ struct task_struct *p;
+ unsigned long flags;
+ int retval;
+
+ get_online_cpus();
+ rcu_read_lock();
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+ rcu_read_unlock();
+ put_online_cpus();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. If it still has the earliest deadline
+ * it will be scheduled again as the next task.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ struct task_struct *p;
+ struct rq *rq;
+
+ if (!sched_yield_type)
+ goto out;
+ p = current;
+ rq = this_rq_lock();
+ if (sched_yield_type > 1)
+ time_slice_expired(p, rq);
+ schedstat_inc(rq->yld_count);
+
+ /*
+ * Since we are going to call schedule() anyway, there's
+ * no need to preempt or enable interrupts:
+ */
+ __release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ do_raw_spin_unlock(&rq->lock);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+out:
+ return 0;
+}
+
+#ifndef CONFIG_PREEMPT
+int __sched _cond_resched(void)
+{
+ if (should_resched(0)) {
+ preempt_schedule_common();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+#endif
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+ BUG_ON(!in_softirq());
+
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
+ local_bh_enable();
+ preempt_schedule_common();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *rq_p;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ int yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (task_running(p_rq, p) || p->state) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
+ double_rq_lock(rq, p_rq);
+ if (unlikely(task_rq(p) != p_rq)) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ yielded = 1;
+ schedstat_inc(rq->yld_count);
+ rq_p = rq->curr;
+ if (p->deadline > rq_p->deadline)
+ p->deadline = rq_p->deadline;
+ p->time_slice += rq_p->time_slice;
+ if (p->time_slice > timeslice())
+ p->time_slice = timeslice();
+ time_slice_expired(rq_p, rq);
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ double_rq_unlock(rq, p_rq);
+out_irq:
+ local_irq_restore(flags);
+
+ if (yielded > 0)
+ schedule();
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+
+long __sched io_schedule_timeout(long timeout)
+{
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
+ long ret;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ delayacct_blkio_start();
+ rq = raw_rq();
+ atomic_inc(&rq->nr_iowait);
+ ret = schedule_timeout(timeout);
+ current->in_iowait = old_iowait;
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+
+ return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ unsigned long flags;
+ struct timespec t;
+ struct rq *rq;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ rq = task_rq_lock(p, &flags);
+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
+ task_rq_unlock(rq, p, &flags);
+
+ rcu_read_unlock();
+ t = ns_to_timespec(time_slice);
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ int ppid;
+ unsigned long state = p->state;
+
+ if (!try_get_task_stack(p))
+ return;
+ if (state)
+ state = __ffs(state) + 1;
+ printk(KERN_INFO "%-15.15s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ free = stack_not_used(p);
+#endif
+ ppid = 0;
+ rcu_read_lock();
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), ppid,
+ (unsigned long)task_thread_info(p)->flags);
+
+ print_worker_info(KERN_INFO, p);
+ show_stack(p, NULL);
+ put_task_stack(p);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#else
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#endif
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ if (!state_filter || (p->state & state_filter))
+ sched_show_task(p);
+ }
+
+ rcu_read_unlock();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (!state_filter)
+ debug_show_all_locks();
+}
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
+
+#ifdef CONFIG_SMP
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&p->pi_lock);
+
+ cpumask_copy(tsk_cpus_allowed(p), new_mask);
+
+ if (task_queued(p)) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ }
+}
+
+/*
+ * Calling do_set_cpus_allowed from outside the scheduler code may make the
+ * task not be able to run on its current CPU so we resched it here.
+ */
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ __do_set_cpus_allowed(p, new_mask);
+ if (needs_other_cpu(p, task_cpu(p))) {
+ struct rq *rq;
+
+ set_task_cpu(p, valid_task_cpu(p));
+ rq = __task_rq_lock(p);
+ resched_task(p);
+ __task_rq_unlock(rq);
+ }
+}
+
+/*
+ * For internal scheduler calls to do_set_cpus_allowed which will resched
+ * themselves if needed.
+ */
+static void _do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ __do_set_cpus_allowed(p, new_mask);
+ /* __set_cpus_allowed_ptr will handle the reschedule in this variant */
+ if (needs_other_cpu(p, task_cpu(p)))
+ set_task_cpu(p, valid_task_cpu(p));
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
+ idle->last_ran = rq->niffies;
+ time_slice_expired(idle, rq);
+ idle->state = TASK_RUNNING;
+ /* Setting prio to illegal value shouldn't matter when never queued */
+ idle->prio = PRIO_LIMIT;
+
+ kasan_unpoison_task_stack(idle);
+
+#ifdef CONFIG_SMP
+ /*
+ * It's possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialisation.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMT_NICE
+ idle->smt_bias = 0;
+#endif
+#endif
+ set_rq_task(rq, idle);
+
+ /* Silence PROVE_RCU */
+ rcu_read_lock();
+ set_task_cpu(idle, cpu);
+ rcu_read_unlock();
+
+ rq->curr = rq->idle = idle;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+ init_idle_preempt_count(idle, cpu);
+
+ ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
+#ifdef CONFIG_SMP
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+ const struct cpumask __maybe_unused *trial)
+{
+ return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rq_lock_irqsave(rq, &flags);
+ resched_task(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd)
+ if (sd && (sd->flags & flag))
+ break;
+
+ return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: variable holding the value of the power_savings_sd
+ * for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+ for (sd = lowest_flag_domain(cpu, flag); \
+ (sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int i, cpu = smp_processor_id();
+ struct sched_domain *sd;
+
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ return cpu;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ cpu = i;
+ cpu = i;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!is_housekeeping_cpu(cpu))
+ cpu = housekeeping_any_cpu();
+unlock:
+ rcu_read_unlock();
+ return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ if (cpu == smp_processor_id())
+ return;
+
+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ tick_nohz_full_kick_cpu(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool queued = false, running_wrong = false, kthread;
+ struct cpumask old_mask;
+ unsigned long flags;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ kthread = !!(p->flags & PF_KTHREAD);
+ if (kthread) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ cpumask_copy(&old_mask, tsk_cpus_allowed(p));
+ if (cpumask_equal(&old_mask, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ queued = task_queued(p);
+
+ _do_set_cpus_allowed(p, new_mask);
+
+ if (kthread) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-cpu threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ tsk_nr_cpus_allowed(p) != 1);
+ }
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ if (task_running(rq, p)) {
+ /* Task is running on the wrong cpu now, reschedule it. */
+ if (rq == this_rq()) {
+ set_tsk_need_resched(p);
+ running_wrong = kthread;
+ } else
+ resched_task(p);
+ } else {
+ int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ struct rq *dest_rq = cpu_rq(dest_cpu);
+
+ /* Switch rq locks here */
+ lock_second_rq(rq, dest_rq);
+ set_task_cpu(p, dest_cpu);
+ rq_unlock(rq);
+
+ rq = dest_rq;
+ }
+out:
+ if (queued && !cpumask_subset(new_mask, &old_mask))
+ try_preempt(p, rq);
+ if (running_wrong)
+ preempt_disable();
+ task_rq_unlock(rq, p, &flags);
+
+ if (running_wrong) {
+ __schedule(true);
+ preempt_enable();
+ }
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Run through task list and find tasks affined to the dead cpu, then remove
+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
+ * cpu 0 and src_cpu's runqueue locks.
+ */
+static void bind_zero(int src_cpu)
+{
+ struct task_struct *p, *t;
+ int bound = 0;
+
+ if (src_cpu == 0)
+ return;
+
+ do_each_thread(t, p) {
+ if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
+ bool local = (task_cpu(p) == src_cpu);
+
+ /* task_running is the cpu stopper thread */
+ if (local && task_running(task_rq(p), p))
+ continue;
+ atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p));
+ atomic_set_cpu(0, tsk_cpus_allowed(p));
+ p->zerobound = true;
+ bound++;
+ if (local)
+ set_task_cpu(p, 0);
+ }
+ } while_each_thread(t, p);
+
+ if (bound) {
+ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
+ bound, src_cpu);
+ }
+}
+
+/* Find processes with the zerobound flag and reenable their affinity for the
+ * CPU coming alive. */
+static void unbind_zero(int src_cpu)
+{
+ int unbound = 0, zerobound = 0;
+ struct task_struct *p, *t;
+
+ if (src_cpu == 0)
+ return;
+
+ do_each_thread(t, p) {
+ if (!p->mm)
+ p->zerobound = false;
+ if (p->zerobound) {
+ unbound++;
+ cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
+ /* Once every CPU affinity has been re-enabled, remove
+ * the zerobound flag */
+ if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
+ p->zerobound = false;
+ zerobound++;
+ }
+ }
+ } while_each_thread(t, p);
+
+ if (unbound) {
+ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
+ unbound, src_cpu);
+ }
+ if (zerobound) {
+ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
+ zerobound);
+ }
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm) {
+ switch_mm_irqs_off(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
+ mmdrop(mm);
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void unbind_zero(int src_cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+ struct sched_param start_param = { .sched_priority = 0 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling policy so that
+ * it can die in pieces.
+ */
+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
+ }
+}
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+#define CPU_LOAD_IDX_MAX 5
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_possible_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_possible_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+ rq->online = true;
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ int cpu = cpu_of(rq);
+
+ cpumask_clear_cpu(cpu, rq->rd->online);
+ rq->online = false;
+ clear_cpuidle_map(cpu);
+ }
+}
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_enabled = 1;
+
+ return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_enabled)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ rq_lock_irqsave(rq, &flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we dont want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ rq_unlock_irqrestore(rq, &flags);
+
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_online;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_rto_mask;
+ return 0;
+
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void destroy_sched_domain(struct sched_domain *sd)
+{
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+ kfree(sd->shared);
+ kfree(sd);
+}
+
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ while (sd) {
+ struct sched_domain *parent = sd->parent;
+ destroy_sched_domain(sd);
+ sd = parent;
+ }
+}
+
+static void destroy_sched_domains(struct sched_domain *sd)
+{
+ if (sd)
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp);
+}
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+ int ret;
+
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ ret = cpulist_parse(str, cpu_isolated_map);
+ if (ret) {
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return 0;
+ }
+ return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+ struct sched_domain ** __percpu sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* turn off idle balance on this domain */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* turn on idle balance on this domain */
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+ const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain
+ * structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map,
+ struct sched_domain *child, int cpu)
+{
+ struct sd_data *sdd = &tl->data;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ int sd_id, sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+ .child = child,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
+ };
+
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ /*
+ * For all levels sharing cache; connect a sched_domain_shared
+ * instance.
+ */
+ if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+ }
+
+ sd->private = sdd;
+
+ return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ if (WARN_ON_ONCE(sched_smp_initialized))
+ return;
+
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for_each_node(k) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int node = cpu_to_node(cpu);
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+#else
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_domain_shared *sds;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sds)
+ kfree(*per_cpu_ptr(sdd->sds, j));
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ free_percpu(sdd->sds);
+ sdd->sds = NULL;
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for cpus specified by the cpu_map. */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP)
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d.sd, i);
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
+
+static cpumask_var_t *doms_cur; /* current sched domains */
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attribues of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ int i;
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* fast path */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+
+ /* Let architecture update cpu core mappings. */
+ new_topology = arch_update_cpu_topology();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* no match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (doms_new == NULL) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ WARN_ON_ONCE(dattr_new);
+ }
+
+ /* Build new domains */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* no match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+ kfree(dattr_cur); /* kfree(NULL) is safe */
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static void cpuset_cpu_active(void)
+{
+ if (cpuhp_tasks_frozen) {
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ return;
+ }
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+ }
+
+ cpuset_update_active_cpus(true);
+}
+
+static int cpuset_cpu_inactive(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen) {
+ cpuset_update_active_cpus(false);
+ } else {
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ }
+ return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ set_cpu_active(cpu, true);
+
+ if (sched_smp_initialized) {
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
+ }
+
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all cpus have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ rq_lock_irqsave(rq, &flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ unbind_zero(cpu);
+ rq_unlock_irqrestore(rq, &flags);
+
+ return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+ int ret;
+
+ set_cpu_active(cpu, false);
+ /*
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+ * users of this state to go away such that all new such users will
+ * observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
+ if (!sched_smp_initialized)
+ return 0;
+
+ ret = cpuset_cpu_inactive(cpu);
+ if (ret) {
+ set_cpu_active(cpu, true);
+ return ret;
+ }
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
+}
+
+int sched_cpu_starting(unsigned int __maybe_unused cpu)
+{
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ double_rq_lock(rq, cpu_rq(0));
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ bind_zero(cpu);
+ double_rq_unlock(rq, cpu_rq(0));
+ sched_start_tick(rq, cpu);
+ hrexpiry_clear(rq);
+ local_irq_restore(flags);
+
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(struct rq *rq)
+{
+ return rq_idle(rq);
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static const cpumask_t *thread_cpumask(int cpu)
+{
+ return topology_sibling_cpumask(cpu);
+}
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(struct rq *rq)
+{
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+static const cpumask_t *core_cpumask(int cpu)
+{
+ return topology_core_cpumask(cpu);
+}
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(struct rq *rq)
+{
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+ SD_LV_NONE = 0,
+ SD_LV_SIBLING,
+ SD_LV_MC,
+ SD_LV_BOOK,
+ SD_LV_CPU,
+ SD_LV_NODE,
+ SD_LV_ALLNODES,
+ SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+ struct sched_domain *sd;
+ int cpu, other_cpu;
+#ifdef CONFIG_SCHED_SMT
+ bool smt_threads = false;
+#endif
+ cpumask_var_t non_isolated_cpus;
+ struct rq *rq;
+
+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+ alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
+ mutex_lock(&sched_domains_mutex);
+ init_sched_domains(cpu_active_mask);
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+ if (cpumask_empty(non_isolated_cpus))
+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+ mutex_unlock(&sched_domains_mutex);
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ BUG();
+ free_cpumask_var(non_isolated_cpus);
+
+ mutex_lock(&sched_domains_mutex);
+ local_irq_disable();
+ lock_all_rqs();
+ /*
+ * Set up the relative cache distance of each online cpu from each
+ * other in a simple array for quick lookup. Locality is determined
+ * by the closest sched_domain that CPUs are separated by. CPUs with
+ * shared cache in SMT and MC are treated as local. Separate CPUs
+ * (within the same package or physically) within the same node are
+ * treated as not local. CPUs not even in the same domain (different
+ * nodes) are treated as very distant.
+ */
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+
+ /* First check if this cpu is in the same node */
+ for_each_domain(cpu, sd) {
+ if (sd->level > SD_LV_MC)
+ continue;
+ /* Set locality to local node if not already found lower */
+ for_each_cpu(other_cpu, sched_domain_span(sd)) {
+ if (rq->cpu_locality[other_cpu] > 3)
+ rq->cpu_locality[other_cpu] = 3;
+ }
+ }
+
+ /*
+ * Each runqueue has its own function in case it doesn't have
+ * siblings of its own allowing mixed topologies.
+ */
+#ifdef CONFIG_SCHED_MC
+ for_each_cpu(other_cpu, core_cpumask(cpu)) {
+ if (rq->cpu_locality[other_cpu] > 2)
+ rq->cpu_locality[other_cpu] = 2;
+ }
+ if (cpumask_weight(core_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->core_mask, core_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->core_mask);
+ rq->cache_idle = cache_cpu_idle;
+ }
+#endif
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(thread_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->thread_mask);
+ for_each_cpu(other_cpu, thread_cpumask(cpu))
+ rq->cpu_locality[other_cpu] = 1;
+ rq->siblings_idle = siblings_cpu_idle;
+ smt_threads = true;
+ }
+#endif
+ }
+ for_each_possible_cpu(cpu) {
+ int total_cpus = 1, locality;
+
+ rq = cpu_rq(cpu);
+ for (locality = 1; locality <= 4; locality++) {
+ for_each_possible_cpu(other_cpu) {
+ if (rq->cpu_locality[other_cpu] == locality)
+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+ }
+ }
+ }
+#ifdef CONFIG_SMT_NICE
+ if (smt_threads) {
+ check_siblings = &check_smt_siblings;
+ wake_siblings = &wake_smt_siblings;
+ smt_schedule = &smt_should_schedule;
+ }
+#endif
+ unlock_all_rqs();
+ local_irq_enable();
+ mutex_unlock(&sched_domains_mutex);
+
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+
+ for_each_online_cpu(other_cpu) {
+ if (other_cpu <= cpu)
+ continue;
+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+ }
+ }
+
+ sched_smp_initialized = true;
+}
+#else
+void __init sched_init_smp(void)
+{
+ sched_smp_initialized = true;
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+};
+
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+#endif /* CONFIG_CGROUP_SCHED */
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+void __init sched_init(void)
+{
+#ifdef CONFIG_SMP
+ int cpu_ids;
+#endif
+ int i;
+ struct rq *rq;
+
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+
+ prio_ratios[0] = 128;
+ for (i = 1 ; i < NICE_WIDTH ; i++)
+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+ atomic_set(&grq.nr_running, 0);
+ atomic_set(&grq.nr_uninterruptible, 0);
+ atomic64_set(&grq.nr_switches, 0);
+ skiplist_node_init(&init_task.node);
+
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+ cpumask_clear(&grq.cpu_idle_map);
+#else
+ uprq = &per_cpu(runqueues, 0);
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+#endif /* CONFIG_CGROUP_SCHED */
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ skiplist_init(&rq->node);
+ rq->sl = new_skiplist(&rq->node);
+ raw_spin_lock_init(&rq->lock);
+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
+ rq->last_jiffy = jiffies;
+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
+ rq->iowait_ns = rq->idle_ns = 0;
+ rq->dither = 0;
+ set_rq_task(rq, &init_task);
+ rq->iso_ticks = 0;
+ rq->iso_refractory = false;
+#ifdef CONFIG_SMP
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->online = false;
+ rq->cpu = i;
+ rq_attach_root(rq, &def_root_domain);
+#endif
+ init_rq_hrexpiry(rq);
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+#ifdef CONFIG_SMP
+ cpu_ids = i;
+ /*
+ * Set the base locality for cpu cache distance calculation to
+ * "distant" (3). Make sure the distance from a CPU to itself is 0.
+ */
+ for_each_possible_cpu(i) {
+ int j;
+
+ rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+ rq->siblings_idle = sole_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+ rq->cache_idle = sole_cpu_idle;
+#endif
+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
+ for_each_possible_cpu(j) {
+ if (i == j)
+ rq->cpu_locality[j] = 0;
+ else
+ rq->cpu_locality[j] = 4;
+ }
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->rq_order[0] = rq;
+ for (j = 1; j < cpu_ids; j++)
+ rq->rq_order[j] = cpu_rq(j);
+ }
+#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+ * called from this thread, however somewhere below it might be,
+ * but because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+ /* May be allocated at isolcpus cmdline parse time */
+ if (cpu_isolated_map == NULL)
+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
+#endif /* SMP */
+
+ init_schedstats();
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy; /* ratelimiting */
+ unsigned long preempt_disable_ip;
+
+ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ /* Save this before calling printk(), since that will clobber it */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(preempt_disable_ip);
+ pr_cont("\n");
+ }
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static inline void normalise_rt_tasks(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ struct rq *rq;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /*
+ * Only normalize user tasks:
+ */
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ if (!rt_task(p) && !iso_task(p))
+ continue;
+
+ rq = task_rq_lock(p, &flags);
+ __setscheduler(p, rq, SCHED_NORMAL, 0, false);
+ task_rq_unlock(rq, p, &flags);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+void normalize_rt_tasks(void)
+{
+ normalise_rt_tasks();
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPU's synchronised, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void ia64_set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+void init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT (10)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+
+ return smt_gain;
+}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static void sched_free_group(struct task_group *tg)
+{
+ kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ return tg;
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ /* now it should be safe to free those cfs_rqs */
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+ return &tg->css;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+
+static struct cftype cpu_files[] = {
+ { } /* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .legacy_cftypes = cpu_files,
+ .early_init = true,
+};
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
new file mode 100644
index 0000000..04ce652
--- /dev/null
+++ b/kernel/sched/MuQSS.h
@@ -0,0 +1,348 @@
+#include <linux/sched.h>
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/skip_list.h>
+#include <linux/stop_machine.h>
+#include <linux/u64_stats_sync.h>
+#include "cpuacct.h"
+
+#ifndef MUQSS_SCHED_H
+#define MUQSS_SCHED_H
+
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+#else
+#define SCHED_WARN_ON(x) ((void)(x))
+#endif
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+ struct task_struct *curr, *idle, *stop;
+ struct mm_struct *prev_mm;
+
+ raw_spinlock_t lock;
+
+ /* Stored data about rq->curr to work outside rq lock */
+ u64 rq_deadline;
+ int rq_prio;
+
+ /* Best queued id for use outside lock */
+ u64 best_key;
+
+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+ u64 niffies; /* Last time this RQ updated rq clock */
+ u64 last_niffy; /* Last niffies as updated by local clock */
+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
+
+ u64 load_update; /* When we last updated load */
+ unsigned long load_avg; /* Rolling load average */
+#ifdef CONFIG_SMT_NICE
+ struct mm_struct *rq_mm;
+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+ /* Accurate timekeeping data */
+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+ iowait_ns, idle_ns;
+ atomic_t nr_iowait;
+
+ skiplist_node node;
+ skiplist *sl;
+#ifdef CONFIG_SMP
+ struct task_struct *preempt; /* Preempt triggered on this task */
+
+ int cpu; /* cpu of this runqueue */
+ bool online;
+
+ struct root_domain *rd;
+ struct sched_domain *sd;
+ int *cpu_locality; /* CPU relative cache distance */
+ struct rq **rq_order; /* RQs ordered by relative cache distance */
+
+#ifdef CONFIG_SCHED_SMT
+ cpumask_t thread_mask;
+ bool (*siblings_idle)(struct rq *rq);
+ /* See if all smt siblings are idle */
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+ cpumask_t core_mask;
+ bool (*cache_idle)(struct rq *rq);
+ /* See if all cache siblings are idle */
+#endif /* CONFIG_SCHED_MC */
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
+
+ u64 clock, old_clock, last_tick;
+ u64 clock_task;
+ int dither;
+
+ int iso_ticks;
+ bool iso_refractory;
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ struct hrtimer hrexpiry_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SMP
+ struct llist_head wake_list;
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a rcu lock section */
+ struct cpuidle_state *idle_state;
+#endif
+};
+
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu);
+#endif
+
+#ifndef CONFIG_SMP
+extern struct rq *uprq;
+#define cpu_rq(cpu) (uprq)
+#define this_rq() (uprq)
+#define raw_rq() (uprq)
+#define task_rq(p) (uprq)
+#define cpu_curr(cpu) ((uprq)->curr)
+#else /* CONFIG_SMP */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq() this_cpu_ptr(&runqueues)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+#endif /* CONFIG_SMP */
+
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED 0x20
+#else
+#define ENQUEUE_MIGRATED 0x00
+#endif
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ return rq->clock_task;
+}
+
+extern struct mutex sched_domains_mutex;
+extern struct static_key_false sched_schedstats;
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#else
+static inline void sched_ttwu_pending(void) { }
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ SCHED_WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 hardirq_time;
+ u64 softirq_time;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+
+static inline u64 irq_time_read(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->softirq_time + irqtime->hardirq_time;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total;
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+static inline void cpufreq_trigger(u64 time, unsigned int flags)
+{
+ struct update_util_data *data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+
+ if (data)
+ data->func(data, time, flags);
+}
+#else
+static inline void cpufreq_trigger(u64 time, unsigned int flag)
+{
+}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
+
+/*
+ * This should only be called when current == rq->idle. Dodgy workaround for
+ * when softirqs are pending and we are in the idle loop. Setting current to
+ * resched will kick us out of the idle loop and the softirqs will be serviced
+ * on our next pass through schedule().
+ */
+static inline bool softirq_pending(int cpu)
+{
+ if (likely(!local_softirq_pending()))
+ return false;
+ set_tsk_need_resched(current);
+ return true;
+}
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return tsk_seruntime(t);
+}
+#else
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags);
+void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ unsigned long flags;
+ u64 ns;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &flags);
+ ns = tsk_seruntime(t);
+ task_rq_unlock(rq, t, &flags);
+
+ return ns;
+}
+#endif
+
+#endif /* MUQSS_SCHED_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc5144..22003e8 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -9,7 +9,11 @@
* published by the Free Software Foundation.
*/
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
#include "sched.h"
+#endif
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e0689..c798cb4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -15,7 +15,11 @@
#include <linux/slab.h>
#include <trace/events/power.h>
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
#include "sched.h"
+#endif
struct sugov_tunables {
struct gov_attr_set attr_set;
@@ -146,6 +150,17 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
return cpufreq_driver_resolve_freq(policy, freq);
}
+#ifdef CONFIG_SCHED_MUQSS
+static void sugov_get_util(unsigned long *util, unsigned long *max)
+{
+ struct rq *rq = this_rq();
+
+ *util = rq->load_avg;
+ if (*util > SCHED_CAPACITY_SCALE)
+ *util = SCHED_CAPACITY_SCALE;
+ *max = SCHED_CAPACITY_SCALE;
+}
+#else /* CONFIG_SCHED_MUQSS */
static void sugov_get_util(unsigned long *util, unsigned long *max)
{
struct rq *rq = this_rq();
@@ -156,6 +171,7 @@ static void sugov_get_util(unsigned long *util, unsigned long *max)
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
}
+#endif /* CONFIG_SCHED_MUQSS */
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5ebee31..cd26436 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,7 +4,12 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#include "stats.h"
+#else
#include "sched.h"
+#endif
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -298,26 +303,6 @@ static inline cputime_t account_other_time(cputime_t max)
return accounted;
}
-#ifdef CONFIG_64BIT
-static inline u64 read_sum_exec_runtime(struct task_struct *t)
-{
- return t->se.sum_exec_runtime;
-}
-#else
-static u64 read_sum_exec_runtime(struct task_struct *t)
-{
- u64 ns;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(t, &rf);
- ns = t->se.sum_exec_runtime;
- task_rq_unlock(rq, t, &rf);
-
- return ns;
-}
-#endif
-
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
@@ -694,7 +679,7 @@ out:
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime = {
- .sum_exec_runtime = p->se.sum_exec_runtime,
+ .sum_exec_runtime = tsk_seruntime(p),
};
task_cputime(p, &cputime.utime, &cputime.stime);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d..7ec1b39 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -14,7 +14,11 @@
#include <trace/events/power.h>
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
#include "sched.h"
+#endif
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -207,6 +211,8 @@ static void cpu_idle_loop(void)
int cpu = smp_processor_id();
while (1) {
+ bool pending = false;
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -218,7 +224,10 @@ static void cpu_idle_loop(void)
__current_set_polling();
quiet_vmstat();
- tick_nohz_idle_enter();
+ if (unlikely(softirq_pending(cpu)))
+ pending = true;
+ else
+ tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
@@ -258,7 +267,8 @@ static void cpu_idle_loop(void)
* not have had an IPI to fold the state for us.
*/
preempt_set_need_resched();
- tick_nohz_idle_exit();
+ if (!pending)
+ tick_nohz_idle_exit();
__current_clr_polling();
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..662f62b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1815,3 +1815,28 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
+
+static inline bool softirq_pending(int cpu)
+{
+ return false;
+}
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f..ba7b137 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,7 +4,11 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#ifndef CONFIG_SCHED_MUQSS
#include "sched.h"
+#else
+#include "MuQSS.h"
+#endif
/*
* bump this up when changing the output format or the meaning of an existing
diff --git a/kernel/skip_list.c b/kernel/skip_list.c
new file mode 100644
index 0000000..d525080
--- /dev/null
+++ b/kernel/skip_list.c
@@ -0,0 +1,148 @@
+/*
+ Copyright (C) 2011,2016 Con Kolivas.
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l,key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from the l based on the
+actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node the value is stored. The key here is chosen by the scheduler so as to
+sort tasks according to the priority list requirements and is no longer used
+by the scheduler after insertion. The scheduler lookup, however, occurs in
+O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+
+#define MaxNumberOfLevels 8
+#define MaxLevel (MaxNumberOfLevels - 1)
+
+void skiplist_init(skiplist_node *slnode)
+{
+ int i;
+
+ slnode->key = 0xFFFFFFFFFFFFFFFF;
+ slnode->level = 0;
+ slnode->value = NULL;
+ for (i = 0; i < MaxNumberOfLevels; i++)
+ slnode->next[i] = slnode->prev[i] = slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
+
+ BUG_ON(!l);
+ l->header = slnode;
+ return l;
+}
+
+void free_skiplist(skiplist *l)
+{
+ skiplist_node *p, *q;
+
+ p = l->header;
+ do {
+ q = p->next[0];
+ p->next[0]->prev[0] = q->prev[0];
+ skiplist_node_init(p);
+ p = q;
+ } while (p != l->header);
+ kfree(l);
+}
+
+void skiplist_node_init(skiplist_node *node)
+{
+ memset(node, 0, sizeof(skiplist_node));
+}
+
+static inline unsigned int randomLevel(const long unsigned int randseed)
+{
+ return find_first_bit(&randseed, MaxLevel);
+}
+
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
+{
+ skiplist_node *update[MaxNumberOfLevels];
+ skiplist_node *p, *q;
+ int k = l->level;
+
+ p = l->header;
+ do {
+ while (q = p->next[k], q->key <= key)
+ p = q;
+ update[k] = p;
+ } while (--k >= 0);
+
+ ++l->entries;
+ k = randomLevel(randseed);
+ if (k > l->level) {
+ k = ++l->level;
+ update[k] = l->header;
+ }
+
+ node->level = k;
+ node->key = key;
+ node->value = value;
+ do {
+ p = update[k];
+ node->next[k] = p->next[k];
+ p->next[k] = node;
+ node->prev[k] = p;
+ node->next[k]->prev[k] = node;
+ } while (--k >= 0);
+}
+
+void skiplist_delete(skiplist *l, skiplist_node *node)
+{
+ int k, m = node->level;
+
+ for (k = 0; k <= m; k++) {
+ node->prev[k]->next[k] = node->next[k];
+ node->next[k]->prev[k] = node->prev[k];
+ }
+ skiplist_node_init(node);
+ if (m == l->level) {
+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
+ m--;
+ l->level = m;
+ }
+ l->entries--;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c1095cd..90e66b3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,9 +125,17 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
-static int one_hundred = 100;
-static int one_thousand = 1000;
-#ifdef CONFIG_PRINTK
+static int __read_mostly one_hundred = 100;
+static int __read_mostly one_thousand = 1000;
+#ifdef CONFIG_SCHED_MUQSS
+extern int rr_interval;
+extern int sched_interactive;
+extern int sched_iso_cpu;
+extern int sched_yield_type;
+#endif
+extern int hrtimer_granularity_us;
+extern int hrtimeout_min_us;
+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS)
static int ten_thousand = 10000;
#endif
#ifdef CONFIG_PERF_EVENTS
@@ -264,7 +272,7 @@ static struct ctl_table sysctl_base_table[] = {
{ }
};
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS)
static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
@@ -281,6 +289,7 @@ static int max_extfrag_threshold = 1000;
#endif
static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_MUQSS
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -449,6 +458,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#endif /* !CONFIG_SCHED_MUQSS */
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1013,6 +1023,62 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+#ifdef CONFIG_SCHED_MUQSS
+ {
+ .procname = "rr_interval",
+ .data = &rr_interval,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
+ .procname = "interactive",
+ .data = &sched_interactive,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "iso_cpu",
+ .data = &sched_iso_cpu,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "yield_type",
+ .data = &sched_yield_type,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#endif
+ {
+ .procname = "hrtimer_granularity_us",
+ .data = &hrtimer_granularity_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &ten_thousand,
+ },
+ {
+ .procname = "hrtimeout_min_us",
+ .data = &hrtimeout_min_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &ten_thousand,
+ },
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
{
.procname = "spin_retry",
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2c5bc77..2cc8271 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -198,8 +198,9 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
-/* Limit min_delta to a jiffie */
-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+int __read_mostly hrtimer_granularity_us = 100;
+/* Limit min_delta to 100us */
+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC)
/**
* clockevents_increase_min_delta - raise minimum delta of a clock event device
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bb5ec42..309494b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1788,3 +1788,117 @@ int __sched schedule_hrtimeout(ktime_t *expires,
return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);
+
+/*
+ * As per schedule_hrtimeout but taskes a millisecond value and returns how
+ * many milliseconds are left.
+ */
+signed long __sched schedule_msec_hrtimeout(signed long timeout)
+{
+ struct hrtimer_sleeper t;
+ int delta, jiffs;
+ ktime_t expires;
+
+ if (!timeout) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ jiffs = msecs_to_jiffies(timeout);
+ /*
+ * If regular timer resolution is adequate or hrtimer resolution is not
+ * (yet) better than Hz, as would occur during startup, use regular
+ * timers.
+ */
+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing)
+ return schedule_timeout(jiffs);
+
+ delta = (timeout % 1000) * NSEC_PER_MSEC;
+ expires = ktime_set(0, delta);
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ expires = hrtimer_expires_remaining(&t.timer);
+ timeout = ktime_to_ms(expires);
+ return timeout < 0 ? 0 : timeout;
+}
+
+EXPORT_SYMBOL(schedule_msec_hrtimeout);
+
+#define USECS_PER_SEC 1000000
+extern int hrtimer_granularity_us;
+
+static inline signed long schedule_usec_hrtimeout(signed long timeout)
+{
+ struct hrtimer_sleeper t;
+ ktime_t expires;
+ int delta;
+
+ if (!timeout) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ)
+ return schedule_timeout(usecs_to_jiffies(timeout));
+
+ if (timeout < hrtimer_granularity_us)
+ timeout = hrtimer_granularity_us;
+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC;
+ expires = ktime_set(0, delta);
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ expires = hrtimer_expires_remaining(&t.timer);
+ timeout = ktime_to_us(expires);
+ return timeout < 0 ? 0 : timeout;
+}
+
+int __read_mostly hrtimeout_min_us = 1000;
+
+signed long __sched schedule_min_hrtimeout(void)
+{
+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us));
+}
+
+EXPORT_SYMBOL(schedule_min_hrtimeout);
+
+signed long __sched schedule_msec_hrtimeout_interruptible(signed long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_msec_hrtimeout(timeout);
+}
+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible);
+
+signed long __sched schedule_msec_hrtimeout_uninterruptible(signed long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_msec_hrtimeout(timeout);
+}
+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d7..784f3a1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -447,7 +447,7 @@ static void cleanup_timers(struct list_head *head)
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ add_device_randomness((const void*) &tsk_seruntime(tsk),
sizeof(unsigned long long));
cleanup_timers(tsk->cpu_timers);
@@ -848,7 +848,7 @@ static void check_thread_timers(struct task_struct *tsk,
tsk_expires->virt_exp = expires_to_cputime(expires);
tsk_expires->sched_exp = check_timers_list(++timers, firing,
- tsk->se.sum_exec_runtime);
+ tsk_seruntime(tsk));
/*
* Check for the special case thread timers.
@@ -859,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk,
READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
@@ -867,7 +867,7 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -1115,7 +1115,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
struct task_cputime task_sample;
task_cputime(tsk, &task_sample.utime, &task_sample.stime);
- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
+ task_sample.sum_exec_runtime = tsk_seruntime(tsk);
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
return 1;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c611c47..9d0d44b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -42,6 +42,7 @@
#include <linux/sched/sysctl.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -1464,7 +1465,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires)
{
u64 nextevt = hrtimer_get_next_event();
@@ -1482,6 +1483,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
if (nextevt <= basem)
return basem;
+ if (nextevt < expires && nextevt - basem <= TICK_NSEC)
+ base->is_idle = false;
+
/*
* Round up to the next jiffie. High resolution timers are
* off, so the hrtimers are expired in the tick and we need to
@@ -1545,7 +1549,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
}
spin_unlock(&base->lock);
- return cmp_next_hrtimer_event(basem, expires);
+ return cmp_next_hrtimer_event(base, basem, expires);
}
/**
@@ -1756,6 +1760,19 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
+#ifdef CONFIG_HIGH_RES_TIMERS
+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ /*
+ * Special case 1 as being a request for the minimum timeout
+ * and use highres timers to timeout after 1ms to workaround
+ * the granularity of low Hz tick timers.
+ */
+ if (!schedule_min_hrtimeout())
+ return 0;
+ goto out_timeout;
+ }
+#endif
+
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
__mod_timer(&timer, expire, false);
schedule();
@@ -1763,10 +1780,10 @@ signed long __sched schedule_timeout(signed long timeout)
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer);
-
+out_timeout:
timeout = expire - jiffies;
- out:
+out:
return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
@@ -1887,7 +1904,19 @@ void __init init_timers(void)
*/
void msleep(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ int jiffs = msecs_to_jiffies(msecs);
+ unsigned long timeout;
+
+ /*
+ * Use high resolution timers where the resolution of tick based
+ * timers is inadequate.
+ */
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
+ while (msecs)
+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
+ return;
+ }
+ timeout = jiffs + 1;
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
@@ -1901,7 +1930,15 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ int jiffs = msecs_to_jiffies(msecs);
+ unsigned long timeout;
+
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
+ while (msecs && !signal_pending(current))
+ msecs = schedule_msec_hrtimeout_interruptible(msecs);
+ return msecs;
+ }
+ timeout = jiffs + 1;
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea..69ee53a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_MUQSS
+ /* No deadline on MuQSS, use RR */
+ .sched_policy = SCHED_RR,
+#else
.sched_policy = SCHED_DEADLINE,
.sched_runtime = 100000ULL,
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
+#endif
};
struct wakeup_test_data *x = data;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443..3bfed5a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
if (!wb->congested)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63..52e2f8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1778,6 +1778,7 @@ pause:
pause,
start_time);
__set_current_state(TASK_KILLABLE);
+ wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 306b8f0..4f4ab3d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1992,7 +1992,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
mutex_unlock(&pktgen_thread_lock);
pr_debug("%s: waiting for %s to disappear....\n",
__func__, ifname);
- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+ schedule_msec_hrtimeout_interruptible((msec_per_try));
mutex_lock(&pktgen_thread_lock);
if (++i >= max_tries) {
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
index cafea6d..d374514 100644
--- a/sound/pci/maestro3.c
+++ b/sound/pci/maestro3.c
@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
outw(0, io + GPIO_DATA);
outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION);
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1));
+ schedule_msec_hrtimeout_uninterruptible((delay1));
outw(GPO_PRIMARY_AC97, io + GPIO_DATA);
udelay(5);
@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A);
outw(~0, io + GPIO_MASK);
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2));
+ schedule_msec_hrtimeout_uninterruptible((delay2));
if (! snd_m3_try_read_vendor(chip))
break;
diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c
index 0e41808..611cb9f 100644
--- a/sound/soc/codecs/rt5631.c
+++ b/sound/soc/codecs/rt5631.c
@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_codec *codec, int enable)
hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2);
snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
if (enable) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
/* config one-bit depop parameter */
rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x307f);
snd_soc_update_bits(codec, RT5631_HP_OUT_VOL,
@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_codec *codec, int enable)
hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2);
snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
if (enable) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
/* config depop sequence parameter */
rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x302f);
diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c
index 2efc5b4..3e3248c 100644
--- a/sound/soc/codecs/wm8350.c
+++ b/sound/soc/codecs/wm8350.c
@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work)
out2->ramp == WM8350_RAMP_UP) {
/* delay is longer over 0dB as increases are larger */
if (i >= WM8350_OUTn_0dB)
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(2));
else
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(1));
} else
udelay(50); /* doesn't matter if we delay longer */
@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
(platform->dis_out4 << 6));
/* wait for discharge */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
cap_discharge_msecs));
@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
WM8350_VBUFEN);
/* wait for vmid */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
vmid_charge_msecs));
@@ -1190,7 +1190,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
/* wait */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
vmid_discharge_msecs));
@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec,
pm1 | WM8350_OUTPUT_DRAIN_EN);
/* wait */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->drain_msecs));
pm1 &= ~WM8350_BIASEN;
diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c
index c77b49a..fc50456 100644
--- a/sound/soc/codecs/wm8900.c
+++ b/sound/soc/codecs/wm8900.c
@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_codec *codec,
/* Need to let things settle before stopping the clock
* to ensure that restart works, see "Stopping the
* master clock" in the datasheet. */
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
snd_soc_write(codec, WM8900_REG_POWER2,
WM8900_REG_POWER2_SYSCLK_ENA);
break;
diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c
index e4301dd..bc7b153 100644
--- a/sound/soc/codecs/wm9713.c
+++ b/sound/soc/codecs/wm9713.c
@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w,
/* Gracefully shut down the voice interface. */
snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0200);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0f00);
snd_soc_update_bits(codec, AC97_EXTENDED_MID, 0x1000, 0x1000);
@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_codec *codec,
wm9713->pll_in = freq_in;
/* wait 10ms AC97 link frames for the link to stabilise */
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
return 0;
}
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 3bbe32e..be285ac 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -134,7 +134,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm)
static void pop_wait(u32 pop_time)
{
if (pop_time)
- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time));
+ schedule_msec_hrtimeout_uninterruptible((pop_time));
}
static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...)
diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c
index fab53f5..fda1ab5 100644
--- a/sound/usb/line6/pcm.c
+++ b/sound/usb/line6/pcm.c
@@ -131,7 +131,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm,
if (!alive)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
} while (--timeout > 0);
if (alive)
dev_err(line6pcm->line6->ifcdev,
--
2.9.3