mirror of
https://github.com/ukui/kernel.git
synced 2026-03-09 10:07:04 -07:00
Merge 6.0-rc5 into driver-core-next
We need the driver core and debugfs changes in this branch. Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
@@ -1,2 +1,4 @@
|
||||
Alan Cox <alan@lxorguk.ukuu.org.uk>
|
||||
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||
|
||||
6
.mailmap
6
.mailmap
@@ -98,8 +98,7 @@ Christian Brauner <brauner@kernel.org> <christian.brauner@ubuntu.com>
|
||||
Christian Marangi <ansuelsmth@gmail.com>
|
||||
Christophe Ricard <christophe.ricard@gmail.com>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
|
||||
Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
|
||||
Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
|
||||
Corey Minyard <minyard@acm.org>
|
||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||
@@ -150,6 +149,8 @@ Greg Kroah-Hartman <gregkh@suse.de>
|
||||
Greg Kroah-Hartman <greg@kroah.com>
|
||||
Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
|
||||
Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
|
||||
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
|
||||
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
|
||||
Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
|
||||
Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
|
||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||
@@ -253,6 +254,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||
Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
|
||||
Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
|
||||
Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
|
||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||
Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
|
||||
|
||||
@@ -523,6 +523,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
||||
/sys/devices/system/cpu/vulnerabilities/retbleed
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
.. _readme:
|
||||
|
||||
Linux kernel release 5.x <http://kernel.org/>
|
||||
Linux kernel release 6.x <http://kernel.org/>
|
||||
=============================================
|
||||
|
||||
These are the release notes for Linux version 5. Read them carefully,
|
||||
These are the release notes for Linux version 6. Read them carefully,
|
||||
as they tell you what this is all about, explain how to install the
|
||||
kernel, and what to do if something goes wrong.
|
||||
|
||||
@@ -63,7 +63,7 @@ Installing the kernel source
|
||||
directory where you have permissions (e.g. your home directory) and
|
||||
unpack it::
|
||||
|
||||
xz -cd linux-5.x.tar.xz | tar xvf -
|
||||
xz -cd linux-6.x.tar.xz | tar xvf -
|
||||
|
||||
Replace "X" with the version number of the latest kernel.
|
||||
|
||||
@@ -72,12 +72,12 @@ Installing the kernel source
|
||||
files. They should match the library, and not get messed up by
|
||||
whatever the kernel-du-jour happens to be.
|
||||
|
||||
- You can also upgrade between 5.x releases by patching. Patches are
|
||||
- You can also upgrade between 6.x releases by patching. Patches are
|
||||
distributed in the xz format. To install by patching, get all the
|
||||
newer patch files, enter the top level directory of the kernel source
|
||||
(linux-5.x) and execute::
|
||||
(linux-6.x) and execute::
|
||||
|
||||
xz -cd ../patch-5.x.xz | patch -p1
|
||||
xz -cd ../patch-6.x.xz | patch -p1
|
||||
|
||||
Replace "x" for all versions bigger than the version "x" of your current
|
||||
source tree, **in_order**, and you should be ok. You may want to remove
|
||||
@@ -85,13 +85,13 @@ Installing the kernel source
|
||||
that there are no failed patches (some-file-name# or some-file-name.rej).
|
||||
If there are, either you or I have made a mistake.
|
||||
|
||||
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
|
||||
Unlike patches for the 6.x kernels, patches for the 6.x.y kernels
|
||||
(also known as the -stable kernels) are not incremental but instead apply
|
||||
directly to the base 5.x kernel. For example, if your base kernel is 5.0
|
||||
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
|
||||
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
|
||||
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
|
||||
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
|
||||
directly to the base 6.x kernel. For example, if your base kernel is 6.0
|
||||
and you want to apply the 6.0.3 patch, you must not first apply the 6.0.1
|
||||
and 6.0.2 patches. Similarly, if you are running kernel version 6.0.2 and
|
||||
want to jump to 6.0.3, you must first reverse the 6.0.2 patch (that is,
|
||||
patch -R) **before** applying the 6.0.3 patch. You can read more on this in
|
||||
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
||||
|
||||
Alternatively, the script patch-kernel can be used to automate this
|
||||
@@ -114,7 +114,7 @@ Installing the kernel source
|
||||
Software requirements
|
||||
---------------------
|
||||
|
||||
Compiling and running the 5.x kernels requires up-to-date
|
||||
Compiling and running the 6.x kernels requires up-to-date
|
||||
versions of various software packages. Consult
|
||||
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
||||
required and how to get updates for these packages. Beware that using
|
||||
@@ -132,12 +132,12 @@ Build directory for the kernel
|
||||
place for the output files (including .config).
|
||||
Example::
|
||||
|
||||
kernel source code: /usr/src/linux-5.x
|
||||
kernel source code: /usr/src/linux-6.x
|
||||
build directory: /home/name/build/kernel
|
||||
|
||||
To configure and build the kernel, use::
|
||||
|
||||
cd /usr/src/linux-5.x
|
||||
cd /usr/src/linux-6.x
|
||||
make O=/home/name/build/kernel menuconfig
|
||||
make O=/home/name/build/kernel
|
||||
sudo make O=/home/name/build/kernel modules_install install
|
||||
|
||||
@@ -230,6 +230,20 @@ The possible values in this file are:
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||
enabled.
|
||||
* - 'Unknown: No mitigations'
|
||||
- The processor vulnerability status is unknown because it is
|
||||
out of Servicing period. Mitigation is not attempted.
|
||||
|
||||
Definitions:
|
||||
------------
|
||||
|
||||
Servicing period: The process of providing functional and security updates to
|
||||
Intel processors or platforms, utilizing the Intel Platform Update (IPU)
|
||||
process or other similar mechanisms.
|
||||
|
||||
End of Servicing Updates (ESU): ESU is the date at which Intel will no
|
||||
longer provide Servicing, such as through IPU or other similar update
|
||||
processes. ESU dates will typically be aligned to end of quarter.
|
||||
|
||||
If the processor is vulnerable then the following information is appended to
|
||||
the above information:
|
||||
|
||||
@@ -5331,6 +5331,8 @@
|
||||
rodata= [KNL]
|
||||
on Mark read-only kernel memory as read-only (default).
|
||||
off Leave read-only kernel memory writable for debugging.
|
||||
full Mark read-only kernel memory and aliases as read-only
|
||||
[arm64]
|
||||
|
||||
rockchip.usb_uart
|
||||
Enable the uart passthrough on the designated usb port
|
||||
|
||||
@@ -50,10 +50,10 @@ For a short example, users can monitor the virtual address space of a given
|
||||
workload as below. ::
|
||||
|
||||
# cd /sys/kernel/mm/damon/admin/
|
||||
# echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr
|
||||
# echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts
|
||||
# echo vaddr > kdamonds/0/contexts/0/operations
|
||||
# echo 1 > kdamonds/0/contexts/0/targets/nr
|
||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid
|
||||
# echo 1 > kdamonds/0/contexts/0/targets/nr_targets
|
||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid_target
|
||||
# echo on > kdamonds/0/state
|
||||
|
||||
Files Hierarchy
|
||||
@@ -366,12 +366,12 @@ memory rate becomes larger than 60%, or lower than 30%". ::
|
||||
# echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
|
||||
# cd kdamonds/0/contexts/0/schemes/0
|
||||
# # set the basic access pattern and the action
|
||||
# echo 4096 > access_patterns/sz/min
|
||||
# echo 8192 > access_patterns/sz/max
|
||||
# echo 0 > access_patterns/nr_accesses/min
|
||||
# echo 5 > access_patterns/nr_accesses/max
|
||||
# echo 10 > access_patterns/age/min
|
||||
# echo 20 > access_patterns/age/max
|
||||
# echo 4096 > access_pattern/sz/min
|
||||
# echo 8192 > access_pattern/sz/max
|
||||
# echo 0 > access_pattern/nr_accesses/min
|
||||
# echo 5 > access_pattern/nr_accesses/max
|
||||
# echo 10 > access_pattern/age/min
|
||||
# echo 20 > access_pattern/age/max
|
||||
# echo pageout > action
|
||||
# # set quotas
|
||||
# echo 10 > quotas/ms
|
||||
|
||||
@@ -271,7 +271,7 @@ poll cycle or the number of packets processed reaches netdev_budget.
|
||||
netdev_max_backlog
|
||||
------------------
|
||||
|
||||
Maximum number of packets, queued on the INPUT side, when the interface
|
||||
Maximum number of packets, queued on the INPUT side, when the interface
|
||||
receives packets faster than kernel can process them.
|
||||
|
||||
netdev_rss_key
|
||||
|
||||
@@ -242,44 +242,34 @@ HWCAP2_MTE3
|
||||
by Documentation/arm64/memory-tagging-extension.rst.
|
||||
|
||||
HWCAP2_SME
|
||||
|
||||
Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
|
||||
by Documentation/arm64/sme.rst.
|
||||
|
||||
HWCAP2_SME_I16I64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
|
||||
|
||||
HWCAP2_SME_F64F64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
|
||||
|
||||
HWCAP2_SME_I8I32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
|
||||
|
||||
HWCAP2_SME_F16F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_B16F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_F32F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_FA64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
|
||||
|
||||
HWCAP2_WFXT
|
||||
|
||||
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
|
||||
|
||||
HWCAP2_EBF16
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
|
||||
|
||||
4. Unused AT_HWCAP bits
|
||||
|
||||
@@ -52,6 +52,8 @@ stable kernels.
|
||||
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 |
|
||||
|
||||
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
|
||||
|
||||
- RMW operations that have a return value are fully ordered.
|
||||
|
||||
- RMW operations that are conditional are unordered on FAILURE,
|
||||
otherwise the above rules apply. In the case of test_and_set_bit_lock(),
|
||||
if the bit in memory is unchanged by the operation then it is deemed to have
|
||||
failed.
|
||||
- RMW operations that are conditional are fully ordered.
|
||||
|
||||
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
|
||||
clear_bit_unlock() which has RELEASE semantics.
|
||||
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
|
||||
clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
|
||||
ACQUIRE semantics.
|
||||
|
||||
Since a platform only has a single means of achieving atomic operations
|
||||
the same barriers as for atomic_t are used, see atomic_t.txt.
|
||||
|
||||
@@ -23,3 +23,4 @@ Block
|
||||
stat
|
||||
switching-sched
|
||||
writeback_cache_control
|
||||
ublk
|
||||
|
||||
253
Documentation/block/ublk.rst
Normal file
253
Documentation/block/ublk.rst
Normal file
@@ -0,0 +1,253 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================================
|
||||
Userspace block device driver (ublk driver)
|
||||
===========================================
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
ublk is a generic framework for implementing block device logic from userspace.
|
||||
The motivation behind it is that moving virtual block drivers into userspace,
|
||||
such as loop, nbd and similar can be very helpful. It can help to implement
|
||||
new virtual block device such as ublk-qcow2 (there are several attempts of
|
||||
implementing qcow2 driver in kernel).
|
||||
|
||||
Userspace block devices are attractive because:
|
||||
|
||||
- They can be written many programming languages.
|
||||
- They can use libraries that are not available in the kernel.
|
||||
- They can be debugged with tools familiar to application developers.
|
||||
- Crashes do not kernel panic the machine.
|
||||
- Bugs are likely to have a lower security impact than bugs in kernel
|
||||
code.
|
||||
- They can be installed and updated independently of the kernel.
|
||||
- They can be used to simulate block device easily with user specified
|
||||
parameters/setting for test/debug purpose
|
||||
|
||||
ublk block device (``/dev/ublkb*``) is added by ublk driver. Any IO request
|
||||
on the device will be forwarded to ublk userspace program. For convenience,
|
||||
in this document, ``ublk server`` refers to generic ublk userspace
|
||||
program. ``ublksrv`` [#userspace]_ is one of such implementation. It
|
||||
provides ``libublksrv`` [#userspace_lib]_ library for developing specific
|
||||
user block device conveniently, while also generic type block device is
|
||||
included, such as loop and null. Richard W.M. Jones wrote userspace nbd device
|
||||
``nbdublk`` [#userspace_nbdublk]_ based on ``libublksrv`` [#userspace_lib]_.
|
||||
|
||||
After the IO is handled by userspace, the result is committed back to the
|
||||
driver, thus completing the request cycle. This way, any specific IO handling
|
||||
logic is totally done by userspace, such as loop's IO handling, NBD's IO
|
||||
communication, or qcow2's IO mapping.
|
||||
|
||||
``/dev/ublkb*`` is driven by blk-mq request-based driver. Each request is
|
||||
assigned by one queue wide unique tag. ublk server assigns unique tag to each
|
||||
IO too, which is 1:1 mapped with IO of ``/dev/ublkb*``.
|
||||
|
||||
Both the IO request forward and IO handling result committing are done via
|
||||
``io_uring`` passthrough command; that is why ublk is also one io_uring based
|
||||
block driver. It has been observed that using io_uring passthrough command can
|
||||
give better IOPS than block IO; which is why ublk is one of high performance
|
||||
implementation of userspace block device: not only IO request communication is
|
||||
done by io_uring, but also the preferred IO handling in ublk server is io_uring
|
||||
based approach too.
|
||||
|
||||
ublk provides control interface to set/get ublk block device parameters.
|
||||
The interface is extendable and kabi compatible: basically any ublk request
|
||||
queue's parameter or ublk generic feature parameters can be set/get via the
|
||||
interface. Thus, ublk is generic userspace block device framework.
|
||||
For example, it is easy to setup a ublk device with specified block
|
||||
parameters from userspace.
|
||||
|
||||
Using ublk
|
||||
==========
|
||||
|
||||
ublk requires userspace ublk server to handle real block device logic.
|
||||
|
||||
Below is example of using ``ublksrv`` to provide ublk-based loop device.
|
||||
|
||||
- add a device::
|
||||
|
||||
ublk add -t loop -f ublk-loop.img
|
||||
|
||||
- format with xfs, then use it::
|
||||
|
||||
mkfs.xfs /dev/ublkb0
|
||||
mount /dev/ublkb0 /mnt
|
||||
# do anything. all IOs are handled by io_uring
|
||||
...
|
||||
umount /mnt
|
||||
|
||||
- list the devices with their info::
|
||||
|
||||
ublk list
|
||||
|
||||
- delete the device::
|
||||
|
||||
ublk del -a
|
||||
ublk del -n $ublk_dev_id
|
||||
|
||||
See usage details in README of ``ublksrv`` [#userspace_readme]_.
|
||||
|
||||
Design
|
||||
======
|
||||
|
||||
Control plane
|
||||
-------------
|
||||
|
||||
ublk driver provides global misc device node (``/dev/ublk-control``) for
|
||||
managing and controlling ublk devices with help of several control commands:
|
||||
|
||||
- ``UBLK_CMD_ADD_DEV``
|
||||
|
||||
Add a ublk char device (``/dev/ublkc*``) which is talked with ublk server
|
||||
WRT IO command communication. Basic device info is sent together with this
|
||||
command. It sets UAPI structure of ``ublksrv_ctrl_dev_info``,
|
||||
such as ``nr_hw_queues``, ``queue_depth``, and max IO request buffer size,
|
||||
for which the info is negotiated with the driver and sent back to the server.
|
||||
When this command is completed, the basic device info is immutable.
|
||||
|
||||
- ``UBLK_CMD_SET_PARAMS`` / ``UBLK_CMD_GET_PARAMS``
|
||||
|
||||
Set or get parameters of the device, which can be either generic feature
|
||||
related, or request queue limit related, but can't be IO logic specific,
|
||||
because the driver does not handle any IO logic. This command has to be
|
||||
sent before sending ``UBLK_CMD_START_DEV``.
|
||||
|
||||
- ``UBLK_CMD_START_DEV``
|
||||
|
||||
After the server prepares userspace resources (such as creating per-queue
|
||||
pthread & io_uring for handling ublk IO), this command is sent to the
|
||||
driver for allocating & exposing ``/dev/ublkb*``. Parameters set via
|
||||
``UBLK_CMD_SET_PARAMS`` are applied for creating the device.
|
||||
|
||||
- ``UBLK_CMD_STOP_DEV``
|
||||
|
||||
Halt IO on ``/dev/ublkb*`` and remove the device. When this command returns,
|
||||
ublk server will release resources (such as destroying per-queue pthread &
|
||||
io_uring).
|
||||
|
||||
- ``UBLK_CMD_DEL_DEV``
|
||||
|
||||
Remove ``/dev/ublkc*``. When this command returns, the allocated ublk device
|
||||
number can be reused.
|
||||
|
||||
- ``UBLK_CMD_GET_QUEUE_AFFINITY``
|
||||
|
||||
When ``/dev/ublkc`` is added, the driver creates block layer tagset, so
|
||||
that each queue's affinity info is available. The server sends
|
||||
``UBLK_CMD_GET_QUEUE_AFFINITY`` to retrieve queue affinity info. It can
|
||||
set up the per-queue context efficiently, such as bind affine CPUs with IO
|
||||
pthread and try to allocate buffers in IO thread context.
|
||||
|
||||
- ``UBLK_CMD_GET_DEV_INFO``
|
||||
|
||||
For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's
|
||||
responsibility to save IO target specific info in userspace.
|
||||
|
||||
Data plane
|
||||
----------
|
||||
|
||||
ublk server needs to create per-queue IO pthread & io_uring for handling IO
|
||||
commands via io_uring passthrough. The per-queue IO pthread
|
||||
focuses on IO handling and shouldn't handle any control & management
|
||||
tasks.
|
||||
|
||||
The's IO is assigned by a unique tag, which is 1:1 mapping with IO
|
||||
request of ``/dev/ublkb*``.
|
||||
|
||||
UAPI structure of ``ublksrv_io_desc`` is defined for describing each IO from
|
||||
the driver. A fixed mmaped area (array) on ``/dev/ublkc*`` is provided for
|
||||
exporting IO info to the server; such as IO offset, length, OP/flags and
|
||||
buffer address. Each ``ublksrv_io_desc`` instance can be indexed via queue id
|
||||
and IO tag directly.
|
||||
|
||||
The following IO commands are communicated via io_uring passthrough command,
|
||||
and each command is only for forwarding the IO and committing the result
|
||||
with specified IO tag in the command data:
|
||||
|
||||
- ``UBLK_IO_FETCH_REQ``
|
||||
|
||||
Sent from the server IO pthread for fetching future incoming IO requests
|
||||
destined to ``/dev/ublkb*``. This command is sent only once from the server
|
||||
IO pthread for ublk driver to setup IO forward environment.
|
||||
|
||||
- ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
|
||||
When an IO request is destined to ``/dev/ublkb*``, the driver stores
|
||||
the IO's ``ublksrv_io_desc`` to the specified mapped area; then the
|
||||
previous received IO command of this IO tag (either ``UBLK_IO_FETCH_REQ``
|
||||
or ``UBLK_IO_COMMIT_AND_FETCH_REQ)`` is completed, so the server gets
|
||||
the IO notification via io_uring.
|
||||
|
||||
After the server handles the IO, its result is committed back to the
|
||||
driver by sending ``UBLK_IO_COMMIT_AND_FETCH_REQ`` back. Once ublkdrv
|
||||
received this command, it parses the result and complete the request to
|
||||
``/dev/ublkb*``. In the meantime setup environment for fetching future
|
||||
requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
is reused for both fetching request and committing back IO result.
|
||||
|
||||
- ``UBLK_IO_NEED_GET_DATA``
|
||||
|
||||
With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly
|
||||
issued to ublk server without data copy. Then, IO backend of ublk server
|
||||
receives the request and it can allocate data buffer and embed its addr
|
||||
inside this new io command. After the kernel driver gets the command,
|
||||
data copy is done from request pages to this backend's buffer. Finally,
|
||||
backend receives the request again with data to be written and it can
|
||||
truly handle the request.
|
||||
|
||||
``UBLK_IO_NEED_GET_DATA`` adds one additional round-trip and one
|
||||
io_uring_enter() syscall. Any user thinks that it may lower performance
|
||||
should not enable UBLK_F_NEED_GET_DATA. ublk server pre-allocates IO
|
||||
buffer for each IO by default. Any new project should try to use this
|
||||
buffer to communicate with ublk driver. However, existing project may
|
||||
break or not able to consume the new buffer interface; that's why this
|
||||
command is added for backwards compatibility so that existing projects
|
||||
can still consume existing buffers.
|
||||
|
||||
- data copy between ublk server IO buffer and ublk block IO request
|
||||
|
||||
The driver needs to copy the block IO request pages into the server buffer
|
||||
(pages) first for WRITE before notifying the server of the coming IO, so
|
||||
that the server can handle WRITE request.
|
||||
|
||||
When the server handles READ request and sends
|
||||
``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy
|
||||
the server buffer (pages) read to the IO request pages.
|
||||
|
||||
Future development
|
||||
==================
|
||||
|
||||
Container-aware ublk deivice
|
||||
----------------------------
|
||||
|
||||
ublk driver doesn't handle any IO logic. Its function is well defined
|
||||
for now and very limited userspace interfaces are needed, which is also
|
||||
well defined too. It is possible to make ublk devices container-aware block
|
||||
devices in future as Stefan Hajnoczi suggested [#stefan]_, by removing
|
||||
ADMIN privilege.
|
||||
|
||||
Zero copy
|
||||
---------
|
||||
|
||||
Zero copy is a generic requirement for nbd, fuse or similar drivers. A
|
||||
problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace
|
||||
can't be remapped any more in kernel with existing mm interfaces. This can
|
||||
occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that
|
||||
big requests (IO size >= 256 KB) may benefit a lot from zero copy.
|
||||
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
.. [#userspace] https://github.com/ming1/ubdsrv
|
||||
|
||||
.. [#userspace_lib] https://github.com/ming1/ubdsrv/tree/master/lib
|
||||
|
||||
.. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk
|
||||
|
||||
.. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README
|
||||
|
||||
.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||
|
||||
.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||
@@ -86,6 +86,7 @@ if major >= 3:
|
||||
"__used",
|
||||
"__weak",
|
||||
"noinline",
|
||||
"__fix_address",
|
||||
|
||||
# include/linux/memblock.h:
|
||||
"__init_memblock",
|
||||
|
||||
@@ -48,7 +48,6 @@ required:
|
||||
- compatible
|
||||
- reg
|
||||
- reg-names
|
||||
- intel,vm-map
|
||||
- clocks
|
||||
- resets
|
||||
- "#thermal-sensor-cells"
|
||||
|
||||
@@ -60,6 +60,9 @@ properties:
|
||||
power-domains:
|
||||
maxItems: 1
|
||||
|
||||
resets:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
|
||||
@@ -24,8 +24,10 @@ properties:
|
||||
|
||||
interrupts:
|
||||
minItems: 1
|
||||
maxItems: 2
|
||||
description:
|
||||
Should be configured with type IRQ_TYPE_EDGE_RISING.
|
||||
If two interrupts are provided, expected order is INT1 and INT2.
|
||||
|
||||
required:
|
||||
- compatible
|
||||
|
||||
@@ -16,6 +16,7 @@ properties:
|
||||
compatible:
|
||||
enum:
|
||||
- goodix,gt1151
|
||||
- goodix,gt1158
|
||||
- goodix,gt5663
|
||||
- goodix,gt5688
|
||||
- goodix,gt911
|
||||
|
||||
@@ -35,6 +35,7 @@ patternProperties:
|
||||
description: List of regulators and its properties
|
||||
type: object
|
||||
$ref: regulator.yaml#
|
||||
unevaluatedProperties: false
|
||||
|
||||
properties:
|
||||
qcom,ocp-max-retries:
|
||||
@@ -100,8 +101,6 @@ patternProperties:
|
||||
SAW controlled gang leader. Will be configured as SAW regulator.
|
||||
type: boolean
|
||||
|
||||
unevaluatedProperties: false
|
||||
|
||||
required:
|
||||
- compatible
|
||||
|
||||
|
||||
@@ -17,9 +17,6 @@ description:
|
||||
acts as directory-based coherency manager.
|
||||
All the properties in ePAPR/DeviceTree specification applies for this platform.
|
||||
|
||||
allOf:
|
||||
- $ref: /schemas/cache-controller.yaml#
|
||||
|
||||
select:
|
||||
properties:
|
||||
compatible:
|
||||
@@ -33,11 +30,16 @@ select:
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
items:
|
||||
- enum:
|
||||
- sifive,fu540-c000-ccache
|
||||
- sifive,fu740-c000-ccache
|
||||
- const: cache
|
||||
oneOf:
|
||||
- items:
|
||||
- enum:
|
||||
- sifive,fu540-c000-ccache
|
||||
- sifive,fu740-c000-ccache
|
||||
- const: cache
|
||||
- items:
|
||||
- const: microchip,mpfs-ccache
|
||||
- const: sifive,fu540-c000-ccache
|
||||
- const: cache
|
||||
|
||||
cache-block-size:
|
||||
const: 64
|
||||
@@ -72,29 +74,46 @@ properties:
|
||||
The reference to the reserved-memory for the L2 Loosely Integrated Memory region.
|
||||
The reserved memory node should be defined as per the bindings in reserved-memory.txt.
|
||||
|
||||
if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: sifive,fu540-c000-ccache
|
||||
allOf:
|
||||
- $ref: /schemas/cache-controller.yaml#
|
||||
|
||||
then:
|
||||
properties:
|
||||
interrupts:
|
||||
description: |
|
||||
Must contain entries for DirError, DataError and DataFail signals.
|
||||
maxItems: 3
|
||||
cache-sets:
|
||||
const: 1024
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
enum:
|
||||
- sifive,fu740-c000-ccache
|
||||
- microchip,mpfs-ccache
|
||||
|
||||
else:
|
||||
properties:
|
||||
interrupts:
|
||||
description: |
|
||||
Must contain entries for DirError, DataError, DataFail, DirFail signals.
|
||||
minItems: 4
|
||||
cache-sets:
|
||||
const: 2048
|
||||
then:
|
||||
properties:
|
||||
interrupts:
|
||||
description: |
|
||||
Must contain entries for DirError, DataError, DataFail, DirFail signals.
|
||||
minItems: 4
|
||||
|
||||
else:
|
||||
properties:
|
||||
interrupts:
|
||||
description: |
|
||||
Must contain entries for DirError, DataError and DataFail signals.
|
||||
maxItems: 3
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: sifive,fu740-c000-ccache
|
||||
|
||||
then:
|
||||
properties:
|
||||
cache-sets:
|
||||
const: 2048
|
||||
|
||||
else:
|
||||
properties:
|
||||
cache-sets:
|
||||
const: 1024
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user