Mirror of https://github.com/Dasharo/linux.git (synced 2026-03-06 15:25:10 -08:00)
Merge tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:

 - disk events cleanup (Christoph)
 - gendisk and request queue allocation simplifications (Christoph)
 - bdev_disk_changed cleanups (Christoph)
 - IO priority improvements (Bart)
 - Chained bio completion trace fix (Edward)
 - blk-wbt fixes (Jan)
 - blk-wbt enable/disable fix (Zhang)
 - Scheduler dispatch improvements (Jan, Ming)
 - Shared tagset scheduler improvements (John)
 - BFQ updates (Paolo, Luca, Pietro)
 - BFQ lock inversion fix (Jan)
 - Documentation improvements (Kir)
 - CLONE_IO block cgroup fix (Tejun)
 - Removal of ancient and deprecated block dump feature (zhangyi)
 - Discard merge fix (Ming)
 - Misc fixes or followup fixes (Colin, Damien, Dan, Long, Max, Thomas, Yang)

* tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block: (129 commits)
  block: fix discard request merge
  block/mq-deadline: Remove a WARN_ON_ONCE() call
  blk-mq: update hctx->dispatch_busy in case of real scheduler
  blk: Fix lock inversion between ioc lock and bfqd lock
  bfq: Remove merged request already in bfq_requests_merged()
  block: pass a gendisk to bdev_disk_changed
  block: move bdev_disk_changed
  block: add the events* attributes to disk_attrs
  block: move the disk events code to a separate file
  block: fix trace completion for chained bio
  block/partitions/msdos: Fix typo inidicator -> indicator
  block, bfq: reset waker pointer with shared queues
  block, bfq: check waker only for queues with no in-flight I/O
  block, bfq: avoid delayed merge of async queues
  block, bfq: boost throughput by extending queue-merging times
  block, bfq: consider also creation time in delayed stable merge
  block, bfq: fix delayed stable merge check
  block, bfq: let also stably merged queues enjoy weight raising
  blk-wbt: make sure throttle is enabled properly
  blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled()
  ...
@@ -17,36 +17,37 @@ level logical devices like device mapper.
 
 HOWTO
 =====
 
 Throttling/Upper Limit policy
 -----------------------------
-- Enable Block IO controller::
+Enable Block IO controller::
 
 	CONFIG_BLK_CGROUP=y
 
-- Enable throttling in block layer::
+Enable throttling in block layer::
 
 	CONFIG_BLK_DEV_THROTTLING=y
 
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
 
 	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
 
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor> <bytes_per_second>"::
+Specify a bandwidth rate on particular device for root group. The format
+for policy is "<major>:<minor> <bytes_per_second>"::
 
 	echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
 
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
+This will put a limit of 1MB/second on reads happening for root group
+on device having major/minor number 8:16.
 
-- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+Run dd to read a file and see if rate is throttled to 1MB/s or not::
 
 	# dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
 	1024+0 records in
 	1024+0 records out
 	4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
 
-  Limits for writes can be put using blkio.throttle.write_bps_device file.
+Limits for writes can be put using blkio.throttle.write_bps_device file.
 
 Hierarchical Cgroups
 ====================
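For reference, a minimal userspace sketch of applying the same throttle rule
programmatically. This is only an illustration of the interface shown above;
the mount point /sys/fs/cgroup/blkio and the device numbers 8:16 are
assumptions that depend on the local setup::

	/* throttle.c - hedged example: set a 1MB/s read limit on device 8:16 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Rule format: "<major>:<minor> <bytes_per_second>" */
		const char rule[] = "8:16 1048576";
		int fd = open("/sys/fs/cgroup/blkio/blkio.throttle.read_bps_device",
			      O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, rule, strlen(rule)) != (ssize_t)strlen(rule)) {
			perror("write");
			close(fd);
			return 1;
		}
		return close(fd) ? 1 : 0;
	}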
@@ -79,85 +80,89 @@ following::
 
 Various user visible config options
 ===================================
-CONFIG_BLK_CGROUP
-	- Block IO controller.
-
-CONFIG_BFQ_CGROUP_DEBUG
-	- Debug help. Right now some additional stats file show up in cgroup
+CONFIG_BLK_CGROUP
+	Block IO controller.
+
+CONFIG_BFQ_CGROUP_DEBUG
+	Debug help. Right now some additional stats file show up in cgroup
 	if this option is enabled.
 
-CONFIG_BLK_DEV_THROTTLING
-	- Enable block device throttling support in block layer.
+CONFIG_BLK_DEV_THROTTLING
+	Enable block device throttling support in block layer.
 
 Details of cgroup files
 =======================
 
 Proportional weight policy files
 --------------------------------
-- blkio.weight
-	- Specifies per cgroup weight. This is default weight of the group
-	  on all the devices until and unless overridden by per device rule.
-	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-	- One can specify per cgroup per device rules using this interface.
-	  These rules override the default value of group weight as specified
-	  by blkio.weight.
+blkio.bfq.weight
+	Specifies per cgroup weight. This is default weight of the group
+	on all the devices until and unless overridden by per device rule
+	(see `blkio.bfq.weight_device` below).
+
+	Currently allowed range of weights is from 1 to 1000. For more details,
+	see Documentation/block/bfq-iosched.rst.
+
+blkio.bfq.weight_device
+	Specifies per cgroup per device weights, overriding the default group
+	weight. For more details, see Documentation/block/bfq-iosched.rst.
 
 	Following is the format::
 
-	  # echo dev_maj:dev_minor weight > blkio.weight_device
+	  # echo dev_maj:dev_minor weight > blkio.bfq.weight_device
 
 	Configure weight=300 on /dev/sdb (8:16) in this cgroup::
 
-	  # echo 8:16 300 > blkio.weight_device
-	  # cat blkio.weight_device
+	  # echo 8:16 300 > blkio.bfq.weight_device
+	  # cat blkio.bfq.weight_device
 	  dev     weight
 	  8:16    300
 
 	Configure weight=500 on /dev/sda (8:0) in this cgroup::
 
-	  # echo 8:0 500 > blkio.weight_device
-	  # cat blkio.weight_device
+	  # echo 8:0 500 > blkio.bfq.weight_device
+	  # cat blkio.bfq.weight_device
 	  dev     weight
 	  8:0     500
 	  8:16    300
 
 	Remove specific weight for /dev/sda in this cgroup::
 
-	  # echo 8:0 0 > blkio.weight_device
-	  # cat blkio.weight_device
+	  # echo 8:0 0 > blkio.bfq.weight_device
+	  # cat blkio.bfq.weight_device
 	  dev     weight
 	  8:16    300
 
-- blkio.time
-	- disk time allocated to cgroup per device in milliseconds. First
+blkio.time
+	Disk time allocated to cgroup per device in milliseconds. First
 	two fields specify the major and minor number of the device and
 	third field specifies the disk time allocated to group in
 	milliseconds.
 
-- blkio.sectors
-	- number of sectors transferred to/from disk by the group. First
+blkio.sectors
+	Number of sectors transferred to/from disk by the group. First
 	two fields specify the major and minor number of the device and
 	third field specifies the number of sectors transferred by the
 	group to/from the device.
 
-- blkio.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
+blkio.io_service_bytes
+	Number of bytes transferred to/from the disk by the group. These
 	are further divided by the type of operation - read or write, sync
 	or async. First two fields specify the major and minor number of the
 	device, third field specifies the operation type and the fourth field
 	specifies the number of bytes.
 
-- blkio.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
+blkio.io_serviced
+	Number of IOs (bio) issued to the disk by the group. These
 	are further divided by the type of operation - read or write, sync
 	or async. First two fields specify the major and minor number of the
 	device, third field specifies the operation type and the fourth field
 	specifies the number of IOs.
 
-- blkio.io_service_time
-	- Total amount of time between request dispatch and request completion
+blkio.io_service_time
+	Total amount of time between request dispatch and request completion
 	for the IOs done by this cgroup. This is in nanoseconds to make it
 	meaningful for flash devices too. For devices with queue depth of 1,
 	this time represents the actual service time. When queue_depth > 1,
@@ -170,8 +175,8 @@ Proportional weight policy files
 	specifies the operation type and the fourth field specifies the
 	io_service_time in ns.
 
-- blkio.io_wait_time
-	- Total amount of time the IOs for this cgroup spent waiting in the
+blkio.io_wait_time
+	Total amount of time the IOs for this cgroup spent waiting in the
 	scheduler queues for service. This can be greater than the total time
 	elapsed since it is cumulative io_wait_time for all IOs. It is not a
 	measure of total time the cgroup spent waiting but rather a measure of
@@ -185,24 +190,24 @@ Proportional weight policy files
 	minor number of the device, third field specifies the operation type
 	and the fourth field specifies the io_wait_time in ns.
 
-- blkio.io_merged
-	- Total number of bios/requests merged into requests belonging to this
+blkio.io_merged
+	Total number of bios/requests merged into requests belonging to this
 	cgroup. This is further divided by the type of operation - read or
 	write, sync or async.
 
-- blkio.io_queued
-	- Total number of requests queued up at any given instant for this
+blkio.io_queued
+	Total number of requests queued up at any given instant for this
 	cgroup. This is further divided by the type of operation - read or
 	write, sync or async.
 
-- blkio.avg_queue_size
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+blkio.avg_queue_size
+	Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
 	The average queue size for this cgroup over the entire time of this
 	cgroup's existence. Queue size samples are taken each time one of the
 	queues of this cgroup gets a timeslice.
 
-- blkio.group_wait_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+blkio.group_wait_time
+	Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
 	This is the amount of time the cgroup had to wait since it became busy
 	(i.e., went from 0 to 1 request queued) to get a timeslice for one of
 	its queues. This is different from the io_wait_time which is the
@@ -212,8 +217,8 @@ Proportional weight policy files
 	will only report the group_wait_time accumulated till the last time it
 	got a timeslice and will not include the current delta.
 
-- blkio.empty_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+blkio.empty_time
+	Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
 	This is the amount of time a cgroup spends without any pending
 	requests when not being served, i.e., it does not include any time
 	spent idling for one of the queues of the cgroup. This is in
@@ -221,8 +226,8 @@ Proportional weight policy files
 	the stat will only report the empty_time accumulated till the last
 	time it had a pending request and will not include the current delta.
 
-- blkio.idle_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+blkio.idle_time
+	Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
 	This is the amount of time spent by the IO scheduler idling for a
 	given cgroup in anticipation of a better request than the existing ones
 	from other queues/cgroups. This is in nanoseconds. If this is read
@@ -230,60 +235,60 @@ Proportional weight policy files
 	idle_time accumulated till the last idle period and will not include
 	the current delta.
 
-- blkio.dequeue
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
+blkio.dequeue
+	Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
 	gives the statistics about how many times a group was dequeued
 	from service tree of the device. First two fields specify the major
 	and minor number of the device and third field specifies the number
 	of times a group was dequeued from a particular device.
 
-- blkio.*_recursive
-	- Recursive version of various stats. These files show the
+blkio.*_recursive
+	Recursive version of various stats. These files show the
 	same information as their non-recursive counterparts but
 	include stats from all the descendant cgroups.
 
 Throttling/Upper limit policy files
 -----------------------------------
-- blkio.throttle.read_bps_device
-	- Specifies upper limit on READ rate from the device. IO rate is
+blkio.throttle.read_bps_device
+	Specifies upper limit on READ rate from the device. IO rate is
 	specified in bytes per second. Rules are per device. Following is
 	the format::
 
	  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
 
-- blkio.throttle.write_bps_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
+blkio.throttle.write_bps_device
+	Specifies upper limit on WRITE rate to the device. IO rate is
 	specified in bytes per second. Rules are per device. Following is
 	the format::
 
	  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
 
-- blkio.throttle.read_iops_device
-	- Specifies upper limit on READ rate from the device. IO rate is
+blkio.throttle.read_iops_device
+	Specifies upper limit on READ rate from the device. IO rate is
 	specified in IO per second. Rules are per device. Following is
 	the format::
 
	  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
 
-- blkio.throttle.write_iops_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
+blkio.throttle.write_iops_device
+	Specifies upper limit on WRITE rate to the device. IO rate is
 	specified in io per second. Rules are per device. Following is
 	the format::
 
	  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
 
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
 
-- blkio.throttle.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
+blkio.throttle.io_serviced
+	Number of IOs (bio) issued to the disk by the group. These
 	are further divided by the type of operation - read or write, sync
 	or async. First two fields specify the major and minor number of the
 	device, third field specifies the operation type and the fourth field
 	specifies the number of IOs.
 
-- blkio.throttle.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
+blkio.throttle.io_service_bytes
+	Number of bytes transferred to/from the disk by the group. These
 	are further divided by the type of operation - read or write, sync
 	or async. First two fields specify the major and minor number of the
 	device, third field specifies the operation type and the fourth field
@@ -291,6 +296,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
 
 Common files among various policies
 -----------------------------------
-- blkio.reset_stats
-	- Writing an int to this file will result in resetting all the stats
+blkio.reset_stats
+	Writing an int to this file will result in resetting all the stats
 	for that cgroup.
@@ -56,6 +56,7 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
        5-3-3. IO Latency
          5-3-3-1. How IO Latency Throttling Works
          5-3-3-2. IO Latency Interface Files
+       5-3-4. IO Priority
      5-4. PID
        5-4-1. PID Interface Files
      5-5. Cpuset
@@ -1866,6 +1867,60 @@ IO Latency Interface Files
 	  duration of time between evaluation events. Windows only elapse
 	  with IO activity. Idle periods extend the most recent window.
 
+IO Priority
+~~~~~~~~~~~
+
+A single attribute controls the behavior of the I/O priority cgroup policy,
+namely the blkio.prio.class attribute. The following values are accepted for
+that attribute:
+
+no-change
+	Do not modify the I/O priority class.
+
+none-to-rt
+	For requests that do not have an I/O priority class (NONE),
+	change the I/O priority class into RT. Do not modify
+	the I/O priority class of other requests.
+
+restrict-to-be
+	For requests that do not have an I/O priority class or that have I/O
+	priority class RT, change it into BE. Do not modify the I/O priority
+	class of requests that have priority class IDLE.
+
+idle
+	Change the I/O priority class of all requests into IDLE, the lowest
+	I/O priority class.
+
+The following numerical values are associated with the I/O priority policies:
+
++-------------+---+
+| no-change   | 0 |
++-------------+---+
+| none-to-rt  | 1 |
++-------------+---+
+| rt-to-be    | 2 |
++-------------+---+
+| all-to-idle | 3 |
++-------------+---+
+
+The numerical value that corresponds to each I/O priority class is as follows:
+
++-------------------------------+---+
+| IOPRIO_CLASS_NONE             | 0 |
++-------------------------------+---+
+| IOPRIO_CLASS_RT (real-time)   | 1 |
++-------------------------------+---+
+| IOPRIO_CLASS_BE (best effort) | 2 |
++-------------------------------+---+
+| IOPRIO_CLASS_IDLE             | 3 |
++-------------------------------+---+
+
+The algorithm to set the I/O priority class for a request is as follows (a
+worked sketch of this max() rule follows below):
+
+- Translate the I/O priority class policy into a number.
+- Change the request I/O priority class into the maximum of the I/O priority
+  class policy number and the numerical I/O priority class.
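To make the clamping rule concrete, here is a minimal standalone C sketch of
that max() step. It is a hedged illustration only: the enum names are
illustrative, not kernel identifiers; the numerical values are the ones from
the two tables above::

	#include <stdio.h>

	/* Numerical policy values, per the first table above. */
	enum policy { NO_CHANGE = 0, NONE_TO_RT = 1, RT_TO_BE = 2, ALL_TO_IDLE = 3 };
	/* Numerical I/O priority classes, per the second table above. */
	enum ioclass { CLASS_NONE = 0, CLASS_RT = 1, CLASS_BE = 2, CLASS_IDLE = 3 };

	/* Effective class = max(policy number, request's class number). */
	static int effective_class(enum policy p, enum ioclass c)
	{
		return (int)p > (int)c ? (int)p : (int)c;
	}

	int main(void)
	{
		/* restrict-to-be (2) lifts NONE (0) and RT (1) to BE (2)... */
		printf("%d %d\n", effective_class(RT_TO_BE, CLASS_NONE),
		       effective_class(RT_TO_BE, CLASS_RT));      /* prints: 2 2 */
		/* ...but leaves IDLE (3) untouched, as the text above says. */
		printf("%d\n", effective_class(RT_TO_BE, CLASS_IDLE)); /* prints: 3 */
		return 0;
	}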
 
 PID
 ---
@@ -101,17 +101,6 @@ this results in concentration of disk activity in a small time interval which
 occurs only once every 10 minutes, or whenever the disk is forced to spin up by
 a cache miss. The disk can then be spun down in the periods of inactivity.
 
-If you want to find out which process caused the disk to spin up, you can
-gather information by setting the flag /proc/sys/vm/block_dump. When this flag
-is set, Linux reports all disk read and write operations that take place, and
-all block dirtyings done to files. This makes it possible to debug why a disk
-needs to spin up, and to increase battery life even more. The output of
-block_dump is written to the kernel output, and it can be retrieved using
-"dmesg". When you use block_dump and your kernel logging level also includes
-kernel debugging messages, you probably want to turn off klogd, otherwise
-the output of block_dump will be logged, causing disk activity that is not
-normally there.
-
 Configuration
 -------------
@@ -25,7 +25,6 @@ files can be found in mm/swap.c.
 Currently, these files are in /proc/sys/vm:
 
 - admin_reserve_kbytes
-- block_dump
 - compact_memory
 - compaction_proactiveness
 - compact_unevictable_allowed
@@ -106,13 +105,6 @@ On x86_64 this is about 128MB.
 Changing this takes effect whenever an application requests memory.
 
 
-block_dump
-==========
-
-block_dump enables block I/O debugging when set to a nonzero value. More
-information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst.
-
-
 compact_memory
 ==============
@@ -553,20 +553,36 @@ throughput sustainable with bfq, because updating the blkio.bfq.*
 stats is rather costly, especially for some of the stats enabled by
 CONFIG_BFQ_CGROUP_DEBUG.
 
-Parameters to set
------------------
+Parameters
+----------
 
-For each group, there is only the following parameter to set.
+For each group, the following parameters can be set:
 
-    weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-    group inside its parent. Available values: 1..1000 (default 100). The
-    linear mapping between ioprio and weights, described at the beginning
-    of the tunable section, is still valid, but all weights higher than
-    IOPRIO_BE_NR*10 are mapped to ioprio 0.
-
-Recall that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
+  weight
+	This specifies the default weight for the cgroup inside its parent.
+	Available values: 1..1000 (default: 100).
+
+	For cgroup v1, it is set by writing the value to `blkio.bfq.weight`.
+
+	For cgroup v2, it is set by writing the value to `io.bfq.weight`
+	(with an optional prefix of `default` and a space).
+
+	The linear mapping between ioprio and weights, described at the
+	beginning of the tunable section, is still valid, but all weights
+	higher than IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+	Recall that, if low-latency is set, then BFQ automatically raises the
+	weight of the queues associated with interactive and soft real-time
+	applications. Unset this tunable if you need/want to control weights.
+
+  weight_device
+	This specifies a per-device weight for the cgroup. The syntax is
+	`minor:major weight`. A weight of `0` may be used to reset to the
+	default weight.
+
+	For cgroup v1, it is set by writing the value to
+	`blkio.bfq.weight_device`.
+
+	For cgroup v2, the file name is `io.bfq.weight`.
 
 
 [1]
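As a hedged illustration of the v2 interface just described, a minimal C
sketch that writes a default BFQ weight for one cgroup. The cgroup path is
an assumption about the local mount layout, and the `default` prefix is the
optional form mentioned above::

	/* set-bfq-weight.c - write a default BFQ weight for one cgroup */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* "default <weight>": the optional prefix described above. */
		const char buf[] = "default 300";
		int fd = open("/sys/fs/cgroup/mygroup/io.bfq.weight", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, buf, strlen(buf)) < 0)
			perror("write");
		return close(fd) ? 1 : 0;
	}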
@@ -480,7 +480,7 @@ prototypes::
 locking rules:
 
 ======================= ===================
-ops			bd_mutex
+ops			open_mutex
 ======================= ===================
 open:			yes
 release:		yes
@@ -55,7 +55,6 @@ struct nfhd_device {
 	int id;
 	u32 blocks, bsize;
 	int bshift;
-	struct request_queue *queue;
 	struct gendisk *disk;
 };
@@ -119,32 +118,24 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 	dev->bsize = bsize;
 	dev->bshift = ffs(bsize) - 10;
 
-	dev->queue = blk_alloc_queue(NUMA_NO_NODE);
-	if (dev->queue == NULL)
-		goto free_dev;
-
-	blk_queue_logical_block_size(dev->queue, bsize);
-
-	dev->disk = alloc_disk(16);
+	dev->disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (!dev->disk)
-		goto free_queue;
+		goto free_dev;
 
 	dev->disk->major = major_num;
 	dev->disk->first_minor = dev_id * 16;
+	dev->disk->minors = 16;
 	dev->disk->fops = &nfhd_ops;
 	dev->disk->private_data = dev;
 	sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
 	set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
-	dev->disk->queue = dev->queue;
-
+	blk_queue_logical_block_size(dev->disk->queue, bsize);
 	add_disk(dev->disk);
 
 	list_add_tail(&dev->list, &nfhd_list);
 
 	return 0;
 
-free_queue:
-	blk_cleanup_queue(dev->queue);
 free_dev:
 	kfree(dev);
 out:
@@ -186,8 +177,7 @@ static void __exit nfhd_exit(void)
 	list_for_each_entry_safe(dev, next, &nfhd_list, list) {
 		list_del(&dev->list);
 		del_gendisk(dev->disk);
-		put_disk(dev->disk);
-		blk_cleanup_queue(dev->queue);
+		blk_cleanup_disk(dev->disk);
 		kfree(dev);
 	}
 	unregister_blkdev(major_num, "nfhd");
@@ -27,7 +27,6 @@
 struct simdisk {
 	const char *filename;
 	spinlock_t lock;
-	struct request_queue *queue;
 	struct gendisk *gd;
 	struct proc_dir_entry *procfile;
 	int users;
@@ -266,21 +265,13 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 	spin_lock_init(&dev->lock);
 	dev->users = 0;
 
-	dev->queue = blk_alloc_queue(NUMA_NO_NODE);
-	if (dev->queue == NULL) {
-		pr_err("blk_alloc_queue failed\n");
-		goto out_alloc_queue;
-	}
-
-	dev->gd = alloc_disk(SIMDISK_MINORS);
-	if (dev->gd == NULL) {
-		pr_err("alloc_disk failed\n");
-		goto out_alloc_disk;
-	}
+	dev->gd = blk_alloc_disk(NUMA_NO_NODE);
+	if (!dev->gd)
+		return -ENOMEM;
 	dev->gd->major = simdisk_major;
 	dev->gd->first_minor = which;
+	dev->gd->minors = SIMDISK_MINORS;
 	dev->gd->fops = &simdisk_ops;
-	dev->gd->queue = dev->queue;
 	dev->gd->private_data = dev;
 	snprintf(dev->gd->disk_name, 32, "simdisk%d", which);
 	set_capacity(dev->gd, 0);
@@ -288,12 +279,6 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 
 	dev->procfile = proc_create_data(tmp, 0644, procdir, &simdisk_proc_ops, dev);
 	return 0;
-
-out_alloc_disk:
-	blk_cleanup_queue(dev->queue);
-	dev->queue = NULL;
-out_alloc_queue:
-	return -ENOMEM;
 }
 
 static int __init simdisk_init(void)
@@ -343,10 +328,10 @@ static void simdisk_teardown(struct simdisk *dev, int which,
 	char tmp[2] = { '0' + which, 0 };
 
 	simdisk_detach(dev);
-	if (dev->gd)
+	if (dev->gd) {
 		del_gendisk(dev->gd);
-	if (dev->queue)
-		blk_cleanup_queue(dev->queue);
+		blk_cleanup_disk(dev->gd);
+	}
 	remove_proc_entry(tmp, procdir);
 }
@@ -133,6 +133,13 @@ config BLK_WBT
 	  dynamically on an algorithm loosely based on CoDel, factoring in
 	  the realtime performance of the disk.
 
+config BLK_WBT_MQ
+	bool "Enable writeback throttling by default"
+	default y
+	depends on BLK_WBT
+	help
+	  Enable writeback throttling by default for request-based block devices.
+
 config BLK_CGROUP_IOLATENCY
 	bool "Enable support for latency based cgroup IO protection"
 	depends on BLK_CGROUP=y
@@ -155,12 +162,14 @@ config BLK_CGROUP_IOCOST
 	distributes IO capacity between different groups based on
 	their share of the overall weight distribution.
 
-config BLK_WBT_MQ
-	bool "Multiqueue writeback throttling"
-	default y
-	depends on BLK_WBT
+config BLK_CGROUP_IOPRIO
+	bool "Cgroup I/O controller for assigning an I/O priority class"
+	depends on BLK_CGROUP
 	help
-	  Enable writeback throttling by default on multiqueue devices.
+	  Enable the .prio interface for assigning an I/O priority class to
+	  requests. The I/O priority class affects the order in which an I/O
+	  scheduler and block devices process requests. Only some I/O schedulers
+	  and some block devices support I/O priorities.
 
 config BLK_DEBUG_FS
 	bool "Block layer debugging information in debugfs"
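With this option enabled, the controller is driven entirely through the
prio.class attribute documented in the cgroup section above. A hedged
userspace sketch follows; the v1 mount point and the cgroup name "mygroup"
are assumptions about the local setup::

	/* set-prio-class.c - select the restrict-to-be policy for one cgroup */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* One of: no-change, none-to-rt, restrict-to-be, idle. */
		const char policy[] = "restrict-to-be";
		int fd = open("/sys/fs/cgroup/blkio/mygroup/blkio.prio.class",
			      O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, policy, strlen(policy)) < 0)
			perror("write");
		return close(fd) ? 1 : 0;
	}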
@@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE
 	help
 	  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_DEADLINE_CGROUP
+	tristate
+	default y
+	depends on MQ_IOSCHED_DEADLINE
+	depends on BLK_CGROUP
+
 config MQ_IOSCHED_KYBER
 	tristate "Kyber I/O scheduler"
 	default y
@@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-exec.o blk-merge.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
-			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
+			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
+			disk-events.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_ioctl.o
@@ -17,9 +18,12 @@ obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_CGROUP_RWSTAT)	+= blk-cgroup-rwstat.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOPRIO)	+= blk-ioprio.o
 obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_BLK_CGROUP_IOCOST)	+= blk-iocost.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
+mq-deadline-y += mq-deadline-main.o
+mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP) += mq-deadline-cgroup.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
 obj-$(CONFIG_IOSCHED_BFQ)	+= bfq.o
@@ -364,6 +364,16 @@ static int ref_wr_duration[2];
  */
 static const unsigned long max_service_from_wr = 120000;
 
+/*
+ * Maximum time between the creation of two queues, for stable merge
+ * to be activated (in ms)
+ */
+static const unsigned long bfq_activation_stable_merging = 600;
+/*
+ * Minimum time to be waited before evaluating delayed stable merge (in ms)
+ */
+static const unsigned long bfq_late_stable_merging = 600;
+
 #define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
 #define RQ_BFQQ(rq)		((rq)->elv.priv[1])
@@ -1729,10 +1739,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 			bfqq->entity.new_weight == 40;
 	*interactive = !in_burst && idle_for_long_time &&
 			bfqq->entity.new_weight == 40;
+	/*
+	 * Merged bfq_queues are kept out of weight-raising
+	 * (low-latency) mechanisms. The reason is that these queues
+	 * are usually created for non-interactive and
+	 * non-soft-real-time tasks. Yet this is not the case for
+	 * stably-merged queues. These queues are merged just because
+	 * they are created shortly after each other. So they may
+	 * easily serve the I/O of an interactive or soft-real time
+	 * application, if the application happens to spawn multiple
+	 * processes. So let also stably-merged queues enjoy weight
+	 * raising.
+	 */
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
 		 (bfq_bfqq_sync(bfqq) &&
-		  bfqq->bic && (*interactive || soft_rt)));
+		  (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
+		  (*interactive || soft_rt)));
 
 	/*
 	 * Using the last flag, update budget and check whether bfqq
@@ -1962,14 +1985,18 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
  * Turning back to the detection of a waker queue, a queue Q is deemed
  * as a waker queue for bfqq if, for three consecutive times, bfqq
  * happens to become non empty right after a request of Q has been
- * completed. In particular, on the first time, Q is tentatively set
- * as a candidate waker queue, while on the third consecutive time
- * that Q is detected, the field waker_bfqq is set to Q, to confirm
- * that Q is a waker queue for bfqq. These detection steps are
- * performed only if bfqq has a long think time, so as to make it more
- * likely that bfqq's I/O is actually being blocked by a
- * synchronization. This last filter, plus the above three-times
- * requirement, make false positives less likely.
+ * completed. In this respect, even if bfqq is empty, we do not check
+ * for a waker if it still has some in-flight I/O. In fact, in this
+ * case bfqq is actually still being served by the drive, and may
+ * receive new I/O on the completion of some of the in-flight
+ * requests. In particular, on the first time, Q is tentatively set as
+ * a candidate waker queue, while on the third consecutive time that Q
+ * is detected, the field waker_bfqq is set to Q, to confirm that Q is
+ * a waker queue for bfqq. These detection steps are performed only if
+ * bfqq has a long think time, so as to make it more likely that
+ * bfqq's I/O is actually being blocked by a synchronization. This
+ * last filter, plus the above three-times requirement, make false
+ * positives less likely.
  *
  * NOTE
  *
@@ -1995,6 +2022,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	if (!bfqd->last_completed_rq_bfqq ||
 	    bfqd->last_completed_rq_bfqq == bfqq ||
 	    bfq_bfqq_has_short_ttime(bfqq) ||
+	    bfqq->dispatched > 0 ||
 	    now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
 	    bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
 		return;
@@ -2317,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
 
 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 
-	spin_unlock_irq(&bfqd->lock);
 	if (free)
 		blk_mq_free_request(free);
+	spin_unlock_irq(&bfqd->lock);
 
 	return ret;
 }
@@ -2405,7 +2433,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 		*next_bfqq = bfq_init_rq(next);
 
 	if (!bfqq)
-		return;
+		goto remove;
 
 	/*
 	 * If next and rq belong to the same bfq_queue and next is older
@@ -2428,6 +2456,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 		bfqq->next_rq = rq;
 
 	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+remove:
+	/* Merged request may be in the IO scheduler. Remove it. */
+	if (!RB_EMPTY_NODE(&next->rb_node)) {
+		bfq_remove_request(next->q, next);
+		if (next_bfqq)
+			bfqg_stats_update_io_remove(bfqq_group(next_bfqq),
+						    next->cmd_flags);
+	}
 }
 
 /* Must be called with bfqq != NULL */
@@ -2695,10 +2731,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 * costly and complicated.
 	 */
 	if (unlikely(!bfqd->nonrot_with_queueing)) {
-		if (bic->stable_merge_bfqq &&
+		/*
+		 * Make sure also that bfqq is sync, because
+		 * bic->stable_merge_bfqq may point to some queue (for
+		 * stable merging) also if bic is associated with a
+		 * sync queue, but this bfqq is async
+		 */
+		if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
 		    !bfq_bfqq_just_created(bfqq) &&
-		    time_is_after_jiffies(bfqq->split_time +
-					  msecs_to_jiffies(200))) {
+		    time_is_before_jiffies(bfqq->split_time +
					   msecs_to_jiffies(bfq_late_stable_merging)) &&
+		    time_is_before_jiffies(bfqq->creation_time +
					   msecs_to_jiffies(bfq_late_stable_merging))) {
 			struct bfq_queue *stable_merge_bfqq =
 				bic->stable_merge_bfqq;
 			int proc_ref = min(bfqq_process_refs(bfqq),
@@ -5479,7 +5523,7 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
 	 */
 	if (!last_bfqq_created ||
 	    time_before(last_bfqq_created->creation_time +
-			bfqd->bfq_burst_interval,
+			msecs_to_jiffies(bfq_activation_stable_merging),
 			bfqq->creation_time) ||
 	    bfqq->entity.parent != last_bfqq_created->entity.parent ||
 	    bfqq->ioprio != last_bfqq_created->ioprio ||
@@ -5925,14 +5969,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	unsigned int cmd_flags;
+	LIST_HEAD(free);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
 		bfqg_stats_update_legacy_io(q, rq);
 #endif
 	spin_lock_irq(&bfqd->lock);
-	if (blk_mq_sched_try_insert_merge(q, rq)) {
+	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
 		spin_unlock_irq(&bfqd->lock);
+		blk_mq_free_requests(&free);
 		return;
 	}
@@ -6129,11 +6175,13 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	 * of other queues. But a false waker will unjustly steal
 	 * bandwidth to its supposedly woken queue. So considering
 	 * also shared queues in the waking mechanism may cause more
-	 * control troubles than throughput benefits. Then do not set
-	 * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
+	 * control troubles than throughput benefits. Then reset
+	 * last_completed_rq_bfqq if bfqq is a shared queue.
 	 */
 	if (!bfq_bfqq_coop(bfqq))
 		bfqd->last_completed_rq_bfqq = bfqq;
+	else
+		bfqd->last_completed_rq_bfqq = NULL;
 
 	/*
 	 * If we are waiting to discover whether the request pattern
@@ -6376,6 +6424,7 @@ static void bfq_finish_requeue_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
+	unsigned long flags;
 
 	/*
 	 * rq either is not associated with any icq, or is an already
@@ -6393,39 +6442,15 @@ static void bfq_finish_requeue_request(struct request *rq)
 					     rq->io_start_time_ns,
 					     rq->cmd_flags);
 
+	spin_lock_irqsave(&bfqd->lock, flags);
 	if (likely(rq->rq_flags & RQF_STARTED)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&bfqd->lock, flags);
-
 		if (rq == bfqd->waited_rq)
 			bfq_update_inject_limit(bfqd, bfqq);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_finish_requeue_request_body(bfqq);
-
-		spin_unlock_irqrestore(&bfqd->lock, flags);
-	} else {
-		/*
-		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it (this should
-		 * never happen in case of requeue). And we cannot
-		 * defer such a check and removal, to avoid
-		 * inconsistencies in the time interval from the end
-		 * of this function to the start of the deferred work.
-		 * This situation seems to occur only in process
-		 * context, as a consequence of a merge. In the
-		 * current version of the code, this implies that the
-		 * lock is held.
-		 */
-
-		if (!RB_EMPTY_NODE(&rq->rb_node)) {
-			bfq_remove_request(rq->q, rq);
-			bfqg_stats_update_io_remove(bfqq_group(bfqq),
-						    rq->cmd_flags);
-		}
-		bfq_finish_requeue_request_body(bfqq);
 	}
+	bfq_finish_requeue_request_body(bfqq);
+	spin_unlock_irqrestore(&bfqd->lock, flags);
 
 	/*
 	 * Reset private fields. In case of a requeue, this allows
block/bio.c (13 changed lines)
@@ -1375,8 +1375,7 @@ static inline bool bio_remaining_done(struct bio *bio)
  *
  *   bio_endio() can be called several times on a bio that has been chained
  *   using bio_chain().  The ->bi_end_io() function will only be called the
- *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
- *   generated if BIO_TRACE_COMPLETION is set.
+ *   last time.
  **/
 void bio_endio(struct bio *bio)
 {
@@ -1389,6 +1388,11 @@ again:
 	if (bio->bi_bdev)
 		rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
 
+	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
+		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+	}
+
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
@@ -1402,11 +1406,6 @@ again:
 		goto again;
 	}
 
-	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-		trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
-		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-	}
-
 	blk_throtl_bio_endio(bio);
 	/* release cgroup info */
 	bio_uninit(bio);
@@ -31,6 +31,7 @@
 #include <linux/tracehook.h>
 #include <linux/psi.h>
 #include "blk.h"
+#include "blk-ioprio.h"
 
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -1183,15 +1184,18 @@ int blkcg_init_queue(struct request_queue *q)
 	if (preloaded)
 		radix_tree_preload_end();
 
+	ret = blk_iolatency_init(q);
+	if (ret)
+		goto err_destroy_all;
+
+	ret = blk_ioprio_init(q);
+	if (ret)
+		goto err_destroy_all;
+
 	ret = blk_throtl_init(q);
 	if (ret)
 		goto err_destroy_all;
 
-	ret = blk_iolatency_init(q);
-	if (ret) {
-		blk_throtl_exit(q);
-		goto err_destroy_all;
-	}
 	return 0;
 
 err_destroy_all:
@@ -1217,32 +1221,6 @@ void blkcg_exit_queue(struct request_queue *q)
 	blk_throtl_exit(q);
 }
 
-/*
- * We cannot support shared io contexts, as we have no mean to support
- * two tasks with the same ioc in two different groups without major rework
- * of the main cic data structures.  For now we allow a task to change
- * its cgroup only if it's the only owner of its ioc.
- */
-static int blkcg_can_attach(struct cgroup_taskset *tset)
-{
-	struct task_struct *task;
-	struct cgroup_subsys_state *dst_css;
-	struct io_context *ioc;
-	int ret = 0;
-
-	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, dst_css, tset) {
-		task_lock(task);
-		ioc = task->io_context;
-		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
-			ret = -EINVAL;
-		task_unlock(task);
-		if (ret)
-			break;
-	}
-	return ret;
-}
-
 static void blkcg_bind(struct cgroup_subsys_state *root_css)
 {
 	int i;
@@ -1275,7 +1253,6 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.css_online = blkcg_css_online,
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
-	.can_attach = blkcg_can_attach,
 	.css_rstat_flush = blkcg_rstat_flush,
 	.bind = blkcg_bind,
 	.dfl_cftypes = blkcg_files,
@@ -599,7 +599,6 @@ fail_q:
 	kmem_cache_free(blk_requestq_cachep, q);
 	return NULL;
 }
-EXPORT_SYMBOL(blk_alloc_queue);
 
 /**
  * blk_get_queue - increment the request_queue refcount
@@ -1086,15 +1085,6 @@ blk_qc_t submit_bio(struct bio *bio)
 			task_io_account_read(bio->bi_iter.bi_size);
 			count_vm_events(PGPGIN, count);
 		}
-
-		if (unlikely(block_dump)) {
-			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-				current->comm, task_pid_nr(current),
-				op_is_write(bio_op(bio)) ? "WRITE" : "READ",
-				(unsigned long long)bio->bi_iter.bi_sector,
-				bio_devname(bio, b), count);
-		}
 	}
 
 	/*
@@ -1394,26 +1384,22 @@ void blk_steal_bios(struct bio_list *list, struct request *rq)
 EXPORT_SYMBOL_GPL(blk_steal_bios);
 
 /**
- * blk_update_request - Special helper function for request stacking drivers
+ * blk_update_request - Complete multiple bytes without completing the request
  * @req:      the request being processed
  * @error:    block status code
- * @nr_bytes: number of bytes to complete @req
+ * @nr_bytes: number of bytes to complete for @req
  *
  * Description:
  *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
  *
- *     This special helper function is only for request stacking drivers
- *     (e.g. request-based dm) so that they can handle partial completion.
- *     Actual device drivers should use blk_mq_end_request instead.
- *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
  *
  * Note:
- *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
- *	blk_rq_bytes() and in blk_update_request().
+ *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ *	except in the consistency check at the end of this function.
  *
  * Return:
  *     %false - this request doesn't have any more data
@@ -219,8 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	unsigned long flags = 0;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
 
-	blk_account_io_flush(flush_rq);
-
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -230,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		return;
 	}
 
+	blk_account_io_flush(flush_rq);
 	/*
 	 * Flush request has to be marked as IDLE when it is really ended
 	 * because its .end_io() is called from timeout code path too for
block/blk-ioprio.c (new file, 262 lines)
@@ -0,0 +1,262 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos policy for assigning an I/O priority class to requests.
 *
 * Using an rq-qos policy for assigning I/O priority class has two advantages
 * over using the ioprio_set() system call:
 *
 * - This policy is cgroup based so it has all the advantages of cgroups.
 * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos
 *   controller affects page cache writeback I/O for filesystems that support
 *   associating a cgroup with writeback I/O. See also
 *   Documentation/admin-guide/cgroup-v2.rst.
 */

#include <linux/blk-cgroup.h>
#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include "blk-ioprio.h"
#include "blk-rq-qos.h"

/**
 * enum prio_policy - I/O priority class policy.
 * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
 * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT.
 * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
 *		IOPRIO_CLASS_BE.
 * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
 *
 * See also <linux/ioprio.h>.
 */
enum prio_policy {
	POLICY_NO_CHANGE	= 0,
	POLICY_NONE_TO_RT	= 1,
	POLICY_RESTRICT_TO_BE	= 2,
	POLICY_ALL_TO_IDLE	= 3,
};

static const char *policy_name[] = {
	[POLICY_NO_CHANGE]	= "no-change",
	[POLICY_NONE_TO_RT]	= "none-to-rt",
	[POLICY_RESTRICT_TO_BE]	= "restrict-to-be",
	[POLICY_ALL_TO_IDLE]	= "idle",
};

static struct blkcg_policy ioprio_policy;

/**
 * struct ioprio_blkg - Per (cgroup, request queue) data.
 * @pd: blkg_policy_data structure.
 */
struct ioprio_blkg {
	struct blkg_policy_data pd;
};

/**
 * struct ioprio_blkcg - Per cgroup data.
 * @cpd: blkcg_policy_data structure.
 * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
 */
struct ioprio_blkcg {
	struct blkcg_policy_data cpd;
	enum prio_policy	 prio_policy;
};

static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
}

static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
	return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
			    struct ioprio_blkcg, cpd);
}

static struct ioprio_blkcg *
ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
{
	return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
}

static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
{
	struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);

	if (!pd)
		return NULL;

	return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
}

static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));

	seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
	return 0;
}

static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
				      size_t nbytes, loff_t off)
{
	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
	int ret;

	if (off != 0)
		return -EIO;
	/* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
	ret = sysfs_match_string(policy_name, buf);
	if (ret < 0)
		return ret;
	blkcg->prio_policy = ret;

	return nbytes;
}

static struct blkg_policy_data *
ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg)
{
	struct ioprio_blkg *ioprio_blkg;

	ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
	if (!ioprio_blkg)
		return NULL;

	return &ioprio_blkg->pd;
}

static void ioprio_free_pd(struct blkg_policy_data *pd)
{
	struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);

	kfree(ioprio_blkg);
}

static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
	struct ioprio_blkcg *blkcg;

	blkcg = kzalloc(sizeof(*blkcg), gfp);
	if (!blkcg)
		return NULL;
	blkcg->prio_policy = POLICY_NO_CHANGE;
	return &blkcg->cpd;
}

static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
{
	struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd);

	kfree(blkcg);
}

#define IOPRIO_ATTRS						\
	{							\
		.name		= "prio.class",			\
		.seq_show	= ioprio_show_prio_policy,	\
		.write		= ioprio_set_prio_policy,	\
	},							\
	{ } /* sentinel */

/* cgroup v2 attributes */
static struct cftype ioprio_files[] = {
	IOPRIO_ATTRS
};

/* cgroup v1 attributes */
static struct cftype ioprio_legacy_files[] = {
	IOPRIO_ATTRS
};

static struct blkcg_policy ioprio_policy = {
	.dfl_cftypes	= ioprio_files,
	.legacy_cftypes = ioprio_legacy_files,

	.cpd_alloc_fn	= ioprio_alloc_cpd,
	.cpd_free_fn	= ioprio_free_cpd,

	.pd_alloc_fn	= ioprio_alloc_pd,
	.pd_free_fn	= ioprio_free_pd,
};

struct blk_ioprio {
	struct rq_qos rqos;
};

static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
			       struct bio *bio)
{
	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);

	/*
	 * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
	 * correspond to a lower priority. Hence, the max_t() below selects
	 * the lower priority of bi_ioprio and the cgroup I/O priority class.
	 * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the
	 * bio I/O priority is not modified. If the bio I/O priority equals
	 * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
	 */
	bio->bi_ioprio = max_t(u16, bio->bi_ioprio,
			       IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
}

static void blkcg_ioprio_exit(struct rq_qos *rqos)
{
	struct blk_ioprio *blkioprio_blkg =
		container_of(rqos, typeof(*blkioprio_blkg), rqos);

	blkcg_deactivate_policy(rqos->q, &ioprio_policy);
	kfree(blkioprio_blkg);
}

static struct rq_qos_ops blkcg_ioprio_ops = {
	.track	= blkcg_ioprio_track,
	.exit	= blkcg_ioprio_exit,
};

int blk_ioprio_init(struct request_queue *q)
{
	struct blk_ioprio *blkioprio_blkg;
	struct rq_qos *rqos;
	int ret;

	blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL);
	if (!blkioprio_blkg)
		return -ENOMEM;

	ret = blkcg_activate_policy(q, &ioprio_policy);
	if (ret) {
		kfree(blkioprio_blkg);
		return ret;
	}

	rqos = &blkioprio_blkg->rqos;
	rqos->id = RQ_QOS_IOPRIO;
	rqos->ops = &blkcg_ioprio_ops;
	rqos->q = q;

	/*
	 * Registering the rq-qos policy after activating the blk-cgroup
	 * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the
	 * rq-qos callbacks.
	 */
	rq_qos_add(q, rqos);

	return 0;
}

static int __init ioprio_init(void)
{
	return blkcg_policy_register(&ioprio_policy);
}

static void __exit ioprio_exit(void)
{
	blkcg_policy_unregister(&ioprio_policy);
}

module_init(ioprio_init);
module_exit(ioprio_exit);
block/blk-ioprio.h (new file, 19 lines)
@@ -0,0 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _BLK_IOPRIO_H_
#define _BLK_IOPRIO_H_

#include <linux/kconfig.h>

struct request_queue;

#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q);
#else
static inline int blk_ioprio_init(struct request_queue *q)
{
	return 0;
}
#endif

#endif /* _BLK_IOPRIO_H_ */
@@ -559,10 +559,14 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
 		unsigned int nr_phys_segs)
 {
-	if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
+	if (blk_integrity_merge_bio(req->q, req, bio) == false)
 		goto no_merge;
 
-	if (blk_integrity_merge_bio(req->q, req, bio) == false)
+	/* discard request merge won't add new segment */
+	if (req_op(req) == REQ_OP_DISCARD)
+		return 1;
+
+	if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
 		goto no_merge;
 
 	/*
@@ -846,18 +850,15 @@ static struct request *attempt_front_merge(struct request_queue *q,
 	return NULL;
 }
 
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
-			  struct request *next)
+/*
+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false
+ * otherwise. The caller is responsible for freeing 'next' if the merge
+ * happened.
+ */
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+			   struct request *next)
 {
-	struct request *free;
-
-	free = attempt_merge(q, rq, next);
-	if (free) {
-		blk_put_request(free);
-		return 1;
-	}
-
-	return 0;
+	return attempt_merge(q, rq, next);
 }
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
@@ -937,6 +937,21 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
 	q->sched_debugfs_dir = NULL;
 }
 
+static const char *rq_qos_id_to_name(enum rq_qos_id id)
+{
+	switch (id) {
+	case RQ_QOS_WBT:
+		return "wbt";
+	case RQ_QOS_LATENCY:
+		return "latency";
+	case RQ_QOS_COST:
+		return "cost";
+	case RQ_QOS_IOPRIO:
+		return "ioprio";
+	}
+	return "unknown";
+}
+
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 	debugfs_remove_recursive(rqos->debugfs_dir);
Some files were not shown because too many files have changed in this diff.