Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - MD pull requests via Song:
      - Cleanup redundant checks (Yu Kuai)
      - Remove deprecated headers (Marc Zyngier, Song Liu)
      - Concurrency fixes (Li Lingfeng)
      - Memory leak fix (Li Nan)
      - Refactor raid1 read_balance (Yu Kuai, Paul Luse)
      - Clean up and fix for md_ioctl (Li Nan)
      - Other small fixes (Gui-Dong Han, Heming Zhao)
      - MD atomic limits (Christoph)

 - NVMe pull request via Keith:
      - RDMA target enhancements (Max)
      - Fabrics fixes (Max, Guixin, Hannes)
      - Atomic queue_limits usage (Christoph)
      - Const use for class_register (Ricardo)
      - Identification error handling fixes (Shin'ichiro, Keith)

 - Improvement and cleanup for cached request handling (Christoph)

 - Moving towards atomic queue limits. Core changes and driver bits so
   far (Christoph)

 - Fix UAF issues in aoeblk (Chun-Yi)

 - Zoned fix and cleanups (Damien)

 - s390 dasd cleanups and fixes (Jan, Miroslav)

 - Block issue timestamp caching (me)

 - noio scope guarding for zoned IO (Johannes)

 - block/nvme PI improvements (Kanchan)

 - Ability to terminate long running discard loop (Keith)

 - bdev revalidation fix (Li)

 - Get rid of old nr_queues hack for kdump kernels (Ming)

 - Support for async deletion of ublk (Ming)

 - Improve IRQ bio recycling (Pavel)

 - Factor in CPU capacity for remote vs local completion (Qais)

 - Add shared_tags configfs entry for null_blk (Shin'ichiro

 - Fix for a regression in page refcounts introduced by the folio
   unification (Tony)

 - Misc fixes and cleanups (Arnd, Colin, John, Kunwu, Li, Navid,
   Ricardo, Roman, Tang, Uwe)

* tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux: (221 commits)
  block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC
  block/swim: Convert to platform remove callback returning void
  cdrom: gdrom: Convert to platform remove callback returning void
  block: remove disk_stack_limits
  md: remove mddev->queue
  md: don't initialize queue limits
  md/raid10: use the atomic queue limit update APIs
  md/raid5: use the atomic queue limit update APIs
  md/raid1: use the atomic queue limit update APIs
  md/raid0: use the atomic queue limit update APIs
  md: add queue limit helpers
  md: add a mddev_is_dm helper
  md: add a mddev_add_trace_msg helper
  md: add a mddev_trace_remap helper
  bcache: move calculation of stripe_size and io_opt into bcache_device_init
  virtio_blk: Do not use disk_set_max_open/active_zones()
  aoe: fix the potential use-after-free problem in aoecmd_cfg_pkts
  block: move capacity validation to blkpg_do_ioctl()
  block: prevent division by zero in blk_rq_stat_sum()
  drbd: atomically update queue limits in drbd_reconsider_queue_parameters
  ...
This commit is contained in:
Linus Torvalds
2024-03-11 11:43:44 -07:00
138 changed files with 3571 additions and 3298 deletions

View File

@@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = {
static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
{
struct queue_limits lim = {
.logical_block_size = bsize,
};
struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET;
int err = -ENOMEM;
@@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->bsize = bsize;
dev->bshift = ffs(bsize) - 10;
dev->disk = blk_alloc_disk(NUMA_NO_NODE);
if (!dev->disk)
dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(dev->disk)) {
err = PTR_ERR(dev->disk);
goto free_dev;
}
dev->disk->major = major_num;
dev->disk->first_minor = dev_id * 16;
@@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->disk->private_data = dev;
sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
blk_queue_logical_block_size(dev->disk->queue, bsize);
err = add_disk(dev->disk);
if (err)
goto out_cleanup_disk;

View File

@@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data)
static DEFINE_MUTEX(ubd_lock);
static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
static int ubd_open(struct gendisk *disk, blk_mode_t mode);
static void ubd_release(struct gendisk *disk);
static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
static const struct block_device_operations ubd_blops = {
.owner = THIS_MODULE,
.open = ubd_open,
.release = ubd_release,
.ioctl = ubd_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = ubd_getgeo,
};
/* Protected by ubd_lock */
static struct gendisk *ubd_gendisk[MAX_DEV];
#ifdef CONFIG_BLK_DEV_UBD_SYNC
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
.cl = 1 })
@@ -155,7 +148,6 @@ struct ubd {
* backing or the cow file. */
char *file;
char *serial;
int count;
int fd;
__u64 size;
struct openflags boot_openflags;
@@ -165,7 +157,7 @@ struct ubd {
unsigned no_trim:1;
struct cow cow;
struct platform_device pdev;
struct request_queue *queue;
struct gendisk *disk;
struct blk_mq_tag_set tag_set;
spinlock_t lock;
};
@@ -181,7 +173,6 @@ struct ubd {
#define DEFAULT_UBD { \
.file = NULL, \
.serial = NULL, \
.count = 0, \
.fd = -1, \
.size = -1, \
.boot_openflags = OPEN_FLAGS, \
@@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
ubd_dev->fd = fd;
if(ubd_dev->cow.file != NULL){
blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
err = -ENOMEM;
ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
if(ubd_dev->cow.bitmap == NULL){
@@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
if(err < 0) goto error;
ubd_dev->cow.fd = err;
}
if (ubd_dev->no_trim == 0) {
blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
return 0;
error:
os_close_file(ubd_dev->fd);
@@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = {
NULL,
};
static int ubd_disk_register(int major, u64 size, int unit,
struct gendisk *disk)
{
disk->major = major;
disk->first_minor = unit << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + unit);
ubd_devs[unit].pdev.id = unit;
ubd_devs[unit].pdev.name = DRIVER_NAME;
ubd_devs[unit].pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
platform_device_register(&ubd_devs[unit].pdev);
disk->private_data = &ubd_devs[unit];
disk->queue = ubd_devs[unit].queue;
return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
}
#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
static const struct blk_mq_ops ubd_mq_ops = {
@@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = {
static int ubd_add(int n, char **error_out)
{
struct ubd *ubd_dev = &ubd_devs[n];
struct queue_limits lim = {
.max_segments = MAX_SG,
.seg_boundary_mask = PAGE_SIZE - 1,
};
struct gendisk *disk;
int err = 0;
if(ubd_dev->file == NULL)
goto out;
if (ubd_dev->cow.file)
lim.max_hw_sectors = 8 * sizeof(long);
if (!ubd_dev->no_trim) {
lim.max_hw_discard_sectors = UBD_MAX_REQUEST;
lim.max_write_zeroes_sectors = UBD_MAX_REQUEST;
}
err = ubd_file_size(ubd_dev, &ubd_dev->size);
if(err < 0){
*error_out = "Couldn't determine size of device's file";
goto out;
}
err = ubd_open_dev(ubd_dev);
if (err) {
pr_err("ubd%c: Can't open \"%s\": errno = %d\n",
'a' + n, ubd_dev->file, -err);
goto out;
}
ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
ubd_dev->tag_set.ops = &ubd_mq_ops;
@@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out)
err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
if (err)
goto out;
goto out_close;
disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev);
disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_cleanup_tags;
}
ubd_dev->queue = disk->queue;
blk_queue_write_cache(ubd_dev->queue, true, false);
blk_queue_max_segments(ubd_dev->queue, MAX_SG);
blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_write_cache(disk->queue, true, false);
disk->major = UBD_MAJOR;
disk->first_minor = n << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, ubd_dev->size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + n);
disk->private_data = ubd_dev;
set_disk_ro(disk, !ubd_dev->openflags.w);
ubd_dev->pdev.id = n;
ubd_dev->pdev.name = DRIVER_NAME;
ubd_dev->pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev);
platform_device_register(&ubd_dev->pdev);
err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups);
if (err)
goto out_cleanup_disk;
ubd_gendisk[n] = disk;
return 0;
out_cleanup_disk:
put_disk(disk);
out_cleanup_tags:
blk_mq_free_tag_set(&ubd_dev->tag_set);
out_close:
ubd_close_dev(ubd_dev);
out:
return err;
}
@@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out)
static int ubd_remove(int n, char **error_out)
{
struct gendisk *disk = ubd_gendisk[n];
struct ubd *ubd_dev;
int err = -ENODEV;
@@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out)
if(ubd_dev->file == NULL)
goto out;
/* you cannot remove a open disk */
err = -EBUSY;
if(ubd_dev->count > 0)
goto out;
if (ubd_dev->disk) {
/* you cannot remove a open disk */
err = -EBUSY;
if (disk_openers(ubd_dev->disk))
goto out;
ubd_gendisk[n] = NULL;
if(disk != NULL){
del_gendisk(disk);
put_disk(disk);
del_gendisk(ubd_dev->disk);
ubd_close_dev(ubd_dev);
put_disk(ubd_dev->disk);
}
err = 0;
@@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){
device_initcall(ubd_driver_init);
static int ubd_open(struct gendisk *disk, blk_mode_t mode)
{
struct ubd *ubd_dev = disk->private_data;
int err = 0;
mutex_lock(&ubd_mutex);
if(ubd_dev->count == 0){
err = ubd_open_dev(ubd_dev);
if(err){
printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
disk->disk_name, ubd_dev->file, -err);
goto out;
}
}
ubd_dev->count++;
set_disk_ro(disk, !ubd_dev->openflags.w);
out:
mutex_unlock(&ubd_mutex);
return err;
}
static void ubd_release(struct gendisk *disk)
{
struct ubd *ubd_dev = disk->private_data;
mutex_lock(&ubd_mutex);
if(--ubd_dev->count == 0)
ubd_close_dev(ubd_dev);
mutex_unlock(&ubd_mutex);
}
static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
__u64 *cow_offset, unsigned long *bitmap,
__u64 bitmap_offset, unsigned long *bitmap_words,

View File

@@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir)
{
char tmp[2] = { '0' + which, 0 };
int err = -ENOMEM;
int err;
dev->fd = -1;
dev->filename = NULL;
spin_lock_init(&dev->lock);
dev->users = 0;
dev->gd = blk_alloc_disk(NUMA_NO_NODE);
if (!dev->gd)
dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (IS_ERR(dev->gd)) {
err = PTR_ERR(dev->gd);
goto out;
}
dev->gd->major = simdisk_major;
dev->gd->first_minor = which;
dev->gd->minors = SIMDISK_MINORS;

View File

@@ -383,7 +383,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)

View File

@@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
if (!bfqg_stats_waiting(stats))
return;
now = ktime_get_ns();
now = blk_time_get_ns();
if (now > stats->start_group_wait_time)
bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
@@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
return;
if (bfqg == curr_bfqg)
return;
stats->start_group_wait_time = ktime_get_ns();
stats->start_group_wait_time = blk_time_get_ns();
bfqg_stats_mark_waiting(stats);
}
@@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
if (!bfqg_stats_empty(stats))
return;
now = ktime_get_ns();
now = blk_time_get_ns();
if (now > stats->start_empty_time)
bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
@@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
if (bfqg_stats_empty(stats))
return;
stats->start_empty_time = ktime_get_ns();
stats->start_empty_time = blk_time_get_ns();
bfqg_stats_mark_empty(stats);
}
@@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) {
u64 now = ktime_get_ns();
u64 now = blk_time_get_ns();
if (now > stats->start_idle_time)
bfq_stat_add(&stats->idle_time,
@@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
stats->start_idle_time = ktime_get_ns();
stats->start_idle_time = blk_time_get_ns();
bfqg_stats_mark_idling(stats);
}
@@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns();
u64 now = blk_time_get_ns();
if (now > io_start_time_ns)
blkg_rwstat_add(&stats->service_time, opf,

View File

@@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next);
if (rq == last || ktime_get_ns() < rq->fifo_time)
if (rq == last || blk_time_get_ns() < rq->fifo_time)
return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* bfq_bfqq_update_budg_for_activation for
* details on the usage of the next variable.
*/
arrived_in_time = ktime_get_ns() <=
arrived_in_time = blk_time_get_ns() <=
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
@@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false;
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
@@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq)
bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
bfqd->last_empty_occupied_ns = blk_time_get_ns();
/*
* Start the state machine for measuring the
* total service time of rq: setting
@@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
else
timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
bfqd->last_budget_start = ktime_get();
bfqd->last_budget_start = blk_time_get();
bfqq->budget_timeout = jiffies +
bfqd->bfq_timeout * timeout_coeff;
@@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
else if (bfqq->wr_coeff > 1)
sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
bfqd->last_idling_start = ktime_get();
bfqd->last_idling_start = blk_time_get();
bfqd->last_idling_start_jiffies = jiffies;
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
@@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
struct request *rq)
{
if (rq != NULL) { /* new rq dispatch now, reset accordingly */
bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
bfqd->peak_rate_samples = 1;
bfqd->sequential_samples = 0;
bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@@ -3590,7 +3590,7 @@ reset_computation:
*/
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */
bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (compensate)
delta_ktime = bfqd->last_idling_start;
else
delta_ktime = ktime_get();
delta_ktime = blk_time_get();
delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
delta_usecs = ktime_to_us(delta_ktime);
@@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic, pid_t pid, int is_sync,
unsigned int act_idx)
{
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node);
@@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
*/
if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
return;
elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
@@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq);
@@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns;
@@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_update_inject_limit(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit;
if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {

View File

@@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
iter.pi_offset = bi->pi_offset;
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = bvec_kmap_local(&bv);

View File

@@ -16,7 +16,6 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
@@ -763,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio)
struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
put_cpu();
bio_free(bio);
return;
}
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
goto out_free;
bio_uninit(bio);
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
if (in_task()) {
bio_uninit(bio);
bio->bi_next = cache->free_list;
/* Not necessary but helps not to iopoll already freed bios */
bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
} else {
unsigned long flags;
} else if (in_hardirq()) {
lockdep_assert_irqs_disabled();
local_irq_save(flags);
bio_uninit(bio);
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
local_irq_restore(flags);
} else {
goto out_free;
}
put_cpu();
return;
out_free:
put_cpu();
bio_free(bio);
}
/**
@@ -1154,7 +1155,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_folio_all(fi, bio) {
struct page *page;
size_t done = 0;
size_t nr_pages;
if (mark_dirty) {
folio_lock(fi.folio);
@@ -1162,10 +1163,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_unlock(fi.folio);
}
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
do {
bio_release_page(bio, page++);
done += PAGE_SIZE;
} while (done < fi.length);
} while (--nr_pages != 0);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1371,21 +1373,12 @@ int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&done);
blk_wait_io(&done);
return blk_status_to_errno(bio->bi_status);
}

View File

@@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
bool clamp;
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
u64 exp;
u64 delay_nsec = 0;
int tok;

View File

@@ -19,6 +19,7 @@
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"
struct blkcg_gq;
struct blkg_policy_data;

View File

@@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work)
{
}
struct request_queue *blk_alloc_queue(int node_id)
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
struct request_queue *q;
int error;
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
node_id);
if (!q)
return NULL;
return ERR_PTR(-ENOMEM);
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
if (q->id < 0) {
error = q->id;
goto fail_q;
}
q->stats = blk_alloc_queue_stats();
if (!q->stats)
if (!q->stats) {
error = -ENOMEM;
goto fail_id;
}
error = blk_set_default_limits(lim);
if (error)
goto fail_stats;
q->limits = *lim;
q->node = node_id;
@@ -425,6 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@@ -435,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id)
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
*/
if (percpu_ref_init(&q->q_usage_counter,
error = percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats;
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
@@ -451,7 +462,7 @@ fail_id:
ida_free(&blk_queue_ida, q->id);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
return ERR_PTR(error);
}
/**
@@ -1083,6 +1094,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
if (tsk->plug)
return;
plug->cur_ktime = 0;
plug->mq_list = NULL;
plug->cached_rq = NULL;
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
@@ -1182,6 +1194,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
*/
if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug);
current->flags &= ~PF_BLOCK_TS;
}
/**
@@ -1229,8 +1243,7 @@ int __init blk_dev_init(void)
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);
blk_debugfs_root = debugfs_create_dir("block", NULL);

View File

@@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
part_stat_add(part, nsecs[STAT_FLUSH],
ktime_get_ns() - rq->start_time_ns);
blk_time_get_ns() - rq->start_time_ns);
part_stat_unlock();
}

View File

@@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
bi->pi_offset = template->pi_offset;
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);

View File

@@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
/* step up/down based on the vrate */
vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns();
now_ns = blk_time_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
if (!ioc->autop_too_fast_at)
@@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
unsigned seq;
u64 vrate;
now->now_ns = ktime_get();
now->now_ns = blk_time_get_ns();
now->now = ktime_to_us(now->now_ns);
vrate = atomic64_read(&ioc->vtime_rate);
@@ -2817,7 +2817,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
return;
}
on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@@ -2900,7 +2900,7 @@ static int blk_iocost_init(struct gendisk *disk)
ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get());
ioc->period_at = ktime_to_us(blk_time_get());
atomic64_set(&ioc->cur_period, 0);
atomic_set(&ioc->hweight_gen, 0);

View File

@@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
if (!iolat->blkiolat->enabled)
return;
now = ktime_to_ns(ktime_get());
now = blk_time_get_ns();
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css;
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
rcu_read_lock();
blkg_for_each_descendant_pre(blkg, pos_css,
@@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct rq_qos *rqos = iolat_rq_qos(blkg->q);
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
int cpu;
if (blk_queue_nonrot(blkg->q))

View File

@@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
static void await_bio_endio(struct bio *bio)
{
complete(bio->bi_private);
bio_put(bio);
}
/*
* await_bio_chain - ends @bio and waits for every chained bio to complete
*/
static void await_bio_chain(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = await_bio_endio;
bio_endio(bio);
blk_wait_io(&done);
}
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{
@@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
* is disabled.
*/
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@@ -120,32 +144,33 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
unsigned int max_sectors;
if (bdev_read_only(bdev))
return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
/* Ensure that max_sectors doesn't overflow bi_size */
max_sectors = bdev_write_zeroes_sectors(bdev);
if (max_write_zeroes_sectors == 0)
if (max_sectors == 0)
return -EOPNOTSUPP;
while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
if (nr_sects > max_write_zeroes_sectors) {
bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
nr_sects -= max_write_zeroes_sectors;
sector += max_write_zeroes_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
bio->bi_iter.bi_size = len << SECTOR_SHIFT;
nr_sects -= len;
sector += len;
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@@ -190,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
break;
}
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@@ -280,7 +309,7 @@ retry:
bio_put(bio);
}
blk_finish_plug(&plug);
if (ret && try_write_zeroes) {
if (ret && ret != -EINTR && try_write_zeroes) {
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false;
goto retry;
@@ -322,7 +351,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return -EPERM;
blk_start_plug(&plug);
for (;;) {
while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
@@ -331,12 +360,17 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector += len;
nr_sects -= len;
if (!nr_sects) {
ret = submit_bio_wait(bio);
bio_put(bio);
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
ret = -EINTR;
bio = NULL;
break;
}
cond_resched();
}
if (bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug);

View File

@@ -21,7 +21,6 @@
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
@@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->start_time_ns = blk_time_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
@@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init);
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
rq->start_time_ns = blk_time_get_ns();
else
rq->start_time_ns = 0;
@@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
alloc_time_ns = blk_time_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
alloc_time_ns = blk_time_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
@@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_mq_need_time_stamp(rq))
__blk_mq_end_request_acct(rq, ktime_get_ns());
__blk_mq_end_request_acct(rq, blk_time_get_ns());
blk_mq_finish_request(rq);
@@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
u64 now = 0;
if (iob->need_ts)
now = ktime_get_ns();
now = blk_time_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio);
@@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
if (force_irqthreads())
return false;
/* same CPU or cache domain? Complete locally */
/* same CPU or cache domain and capacity? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu)))
cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
@@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
!blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns();
rq->io_start_time_ns = blk_time_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
@@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
blk_mq_run_hw_queue(hctx, false);
if (blk_rq_is_poll(rq)) {
if (blk_rq_is_poll(rq))
blk_rq_poll_completion(rq, &wait.done);
} else {
/*
* Prevent hang_check timer from firing at us during very long
* I/O
*/
unsigned long hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait.done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&wait.done);
}
else
blk_wait_io(&wait.done);
return wait.ret;
}
@@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
};
struct request *rq;
if (blk_mq_attempt_bio_merge(q, bio, nsegs))
return NULL;
rq_qos_throttle(q, bio);
if (plug) {
@@ -2913,23 +2898,32 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
}
/*
* Check if we can use the passed on request for submitting the passed in bio,
* and remove it from the request list if it can be used.
* Check if there is a suitable cached request and return it.
*/
static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
struct request_queue *q, blk_opf_t opf)
{
enum hctx_type type = blk_mq_get_hctx_type(opf);
struct request *rq;
if (!plug)
return NULL;
rq = rq_list_peek(&plug->cached_rq);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
(type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
return rq;
}
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
enum hctx_type hctx_type = rq->mq_hctx->type;
WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
if (type != hctx_type &&
!(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
return false;
if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
return false;
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
@@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
blk_mq_rq_time_init(rq, 0);
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
return true;
}
/**
@@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
struct request *rq = NULL;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
if (plug) {
rq = rq_list_peek(&plug->cached_rq);
if (rq && rq->q != q)
rq = NULL;
}
if (rq) {
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
return;
}
if (!bio_integrity_prep(bio))
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
/*
* If the plug has a cached request for this queue, try use it.
*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto fail;
}
if (!bio_integrity_prep(bio))
goto fail;
}
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq)) {
fail:
blk_queue_exit(q);
return;
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
}
if (!bio_integrity_prep(bio))
goto queue_exit;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
goto queue_exit;
} else {
blk_mq_use_cached_rq(rq, plug, bio);
}
done:
trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
@@ -3037,6 +3023,15 @@ done:
} else {
blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
}
return;
queue_exit:
/*
* Don't drop the queue reference if we were trying to use a cached
* request and thus didn't acquire one.
*/
if (!rq)
blk_queue_exit(q);
}
#ifdef CONFIG_BLK_MQ_STACKING
@@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
blk_account_io_done(rq, ktime_get_ns());
blk_account_io_done(rq, blk_time_get_ns());
return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
@@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
{
struct queue_limits default_lim = { };
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
@@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
}
return q;
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
EXPORT_SYMBOL(blk_mq_alloc_queue);
/**
* blk_mq_destroy_queue - shutdown a request queue
* @q: request queue to shutdown
*
* This shuts down a request queue allocated by blk_mq_init_queue(). All future
* This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
* requests will be failed with -ENODEV. The caller is responsible for dropping
* the reference from blk_mq_init_queue() by calling blk_put_queue().
* the reference from blk_mq_alloc_queue() by calling blk_put_queue().
*
* Context: can sleep
*/
@@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
q = blk_mq_init_queue_data(set, queuedata);
q = blk_mq_alloc_queue(set, lim, queuedata);
if (IS_ERR(q))
return ERR_CAST(q);
@@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues && !is_kdump_kernel()) {
if (set->ops->map_queues) {
int i;
/*
@@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
* memory constrained environment. Limit us to 64 tags to prevent
* using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->nr_maps = 1;
if (is_kdump_kernel())
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus if we just have
* a single map
@@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
set->map[i].nr_queues = set->nr_hw_queues;
}
blk_mq_update_queue_map(set);

View File

@@ -25,53 +25,22 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_discard_segments = 1;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->max_user_sectors = lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->max_secure_erase_sectors = 0;
lim->discard_granularity = 512;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce = BLK_BOUNCE_NONE;
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = false;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
}
/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state. Should be used
* by stacking drivers like DM that have no internal limits.
* Prepare queue limits for applying limits from underlying devices using
* blk_stack_limits().
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
blk_set_default_limits(lim);
memset(lim, 0, sizeof(*lim));
lim->logical_block_size = SECTOR_SIZE;
lim->physical_block_size = SECTOR_SIZE;
lim->io_min = SECTOR_SIZE;
lim->discard_granularity = SECTOR_SIZE;
lim->dma_alignment = SECTOR_SIZE - 1;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
@@ -82,9 +51,239 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim)
{
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
static int blk_validate_zoned_limits(struct queue_limits *lim)
{
if (!lim->zoned) {
if (WARN_ON_ONCE(lim->max_open_zones) ||
WARN_ON_ONCE(lim->max_active_zones) ||
WARN_ON_ONCE(lim->zone_write_granularity) ||
WARN_ON_ONCE(lim->max_zone_append_sectors))
return -EINVAL;
return 0;
}
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
return -EINVAL;
if (lim->zone_write_granularity < lim->logical_block_size)
lim->zone_write_granularity = lim->logical_block_size;
if (lim->max_zone_append_sectors) {
/*
* The Zone Append size is limited by the maximum I/O size
* and the zone size given that it can't span zones.
*/
lim->max_zone_append_sectors =
min3(lim->max_hw_sectors,
lim->max_zone_append_sectors,
lim->chunk_sectors);
}
return 0;
}
/*
* Check that the limits in lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
*/
static int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
/*
* Unless otherwise specified, default to 512 byte logical blocks and a
* physical block size equal to the logical block size.
*/
if (!lim->logical_block_size)
lim->logical_block_size = SECTOR_SIZE;
if (lim->physical_block_size < lim->logical_block_size)
lim->physical_block_size = lim->logical_block_size;
/*
* The minimum I/O size defaults to the physical block size unless
* explicitly overridden.
*/
if (lim->io_min < lim->physical_block_size)
lim->io_min = lim->physical_block_size;
/*
* max_hw_sectors has a somewhat weird default for historical reason,
* but driver really should set their own instead of relying on this
* value.
*
* The block layer relies on the fact that every driver can
* handle at lest a page worth of data per I/O, and needs the value
* aligned to the logical block size.
*/
if (!lim->max_hw_sectors)
lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
return -EINVAL;
lim->max_hw_sectors = round_down(lim->max_hw_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* The actual max_sectors value is a complex beast and also takes the
* max_dev_sectors value (set by SCSI ULPs) and a user configurable
* value into account. The ->max_sectors value is always calculated
* from these, so directly setting it won't have any effect.
*/
max_hw_sectors = min_not_zero(lim->max_hw_sectors,
lim->max_dev_sectors);
if (lim->max_user_sectors) {
if (lim->max_user_sectors > max_hw_sectors ||
lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
} else {
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
lim->max_sectors = round_down(lim->max_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* Random default for the maximum number of segments. Driver should not
* rely on this and set their own.
*/
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
if (lim->discard_granularity < lim->physical_block_size)
lim->discard_granularity = lim->physical_block_size;
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
* that would break all the normal I/O patterns.
*/
if (!lim->seg_boundary_mask)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
return -EINVAL;
/*
* Devices that require a virtual boundary do not support scatter/gather
* I/O natively, but instead require a descriptor list entry for each
* page (which might not be identical to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
if (lim->virt_boundary_mask) {
if (WARN_ON_ONCE(lim->max_segment_size &&
lim->max_segment_size != UINT_MAX))
return -EINVAL;
lim->max_segment_size = UINT_MAX;
} else {
/*
* The maximum segment size has an odd historic 64k default that
* drivers probably should override. Just like the I/O size we
* require drivers to at least handle a full page per segment.
*/
if (!lim->max_segment_size)
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
return -EINVAL;
}
/*
* We require drivers to at least do logical block aligned I/O, but
* historically could not check for that due to the separate calls
* to set the limits. Once the transition is finished the check
* below should be narrowed down to check the logical block size.
*/
if (!lim->dma_alignment)
lim->dma_alignment = SECTOR_SIZE - 1;
if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
return -EINVAL;
if (lim->alignment_offset) {
lim->alignment_offset &= (lim->physical_block_size - 1);
lim->misaligned = 0;
}
return blk_validate_zoned_limits(lim);
}
/*
* Set the default limits for a newly allocated queue. @lim contains the
* initial limits set by the driver, which could be no limit in which case
* all fields are cleared to zero.
*/
int blk_set_default_limits(struct queue_limits *lim)
{
/*
* Most defaults are set by capping the bounds in blk_validate_limits,
* but max_user_discard_sectors is special and needs an explicit
* initialization to the max value here.
*/
lim->max_user_discard_sectors = UINT_MAX;
return blk_validate_limits(lim);
}
/**
* queue_limits_commit_update - commit an atomic update of queue limits
* @q: queue to update
* @lim: limits to apply
*
* Apply the limits in @lim that were obtained from queue_limits_start_update()
* and updated by the caller to @q.
*
* Returns 0 if successful, else a negative error code.
*/
int queue_limits_commit_update(struct request_queue *q,
struct queue_limits *lim)
__releases(q->limits_lock)
{
int error = blk_validate_limits(lim);
if (!error) {
q->limits = *lim;
if (q->disk)
blk_apply_bdi_limits(q->disk->bdi, lim);
}
mutex_unlock(&q->limits_lock);
return error;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
* queue_limits_set - apply queue limits to queue
* @q: queue to update
* @lim: limits to apply
*
* Apply the limits in @lim that were freshly initialized to @q.
* To update existing limits use queue_limits_start_update() and
* queue_limits_commit_update() instead.
*
* Returns 0 if successful, else a negative error code.
*/
int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
{
mutex_lock(&q->limits_lock);
return queue_limits_commit_update(q, lim);
}
EXPORT_SYMBOL_GPL(queue_limits_set);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
@@ -177,8 +376,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
q->limits.max_hw_discard_sectors = max_discard_sectors;
q->limits.max_discard_sectors = max_discard_sectors;
struct queue_limits *lim = &q->limits;
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_discard_sectors =
min(max_discard_sectors, lim->max_user_discard_sectors);
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@@ -393,15 +595,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
void disk_update_readahead(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
}
EXPORT_SYMBOL_GPL(disk_update_readahead);
@@ -689,33 +883,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
if (!t->zoned) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* queue_limits_stack_bdev - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
* @pfx: prefix to use for warnings logged
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
* This function is used by stacking drivers like MD and DM to ensure
* that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset, const char *pfx)
{
struct request_queue *t = disk->queue;
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + (offset >> 9)) < 0)
if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + offset))
pr_notice("%s: Warning: Device %pg is misaligned\n",
disk->disk_name, bdev);
disk_update_readahead(disk);
pfx, bdev);
}
EXPORT_SYMBOL(disk_stack_limits);
EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
/**
* blk_queue_update_dma_pad - update pad mask

View File

@@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
/* src is a per-cpu stat, mean isn't initialized */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
if (!src->nr_samples)
if (dst->nr_samples + src->nr_samples <= dst->nr_samples)
return;
dst->min = min(dst->min, src->min);

View File

@@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
static ssize_t queue_discard_max_store(struct request_queue *q,
const char *page, size_t count)
{
unsigned long max_discard;
ssize_t ret = queue_var_store(&max_discard, page, count);
unsigned long max_discard_bytes;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
return ret;
if (max_discard & (q->limits.discard_granularity - 1))
if (max_discard_bytes & (q->limits.discard_granularity - 1))
return -EINVAL;
max_discard >>= 9;
if (max_discard > UINT_MAX)
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
if (max_discard > q->limits.max_hw_discard_sectors)
max_discard = q->limits.max_hw_discard_sectors;
blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
q->limits.max_discard_sectors = max_discard;
if (err)
return err;
return ret;
}
@@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long var;
unsigned int max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_SHIFT - 10);
ssize_t ret = queue_var_store(&var, page, count);
unsigned long max_sectors_kb;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
max_sectors_kb = (unsigned int)var;
max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb,
q->limits.max_dev_sectors >> 1);
if (max_sectors_kb == 0) {
q->limits.max_user_sectors = 0;
max_sectors_kb = min(max_hw_sectors_kb,
BLK_DEF_MAX_SECTORS_CAP >> 1);
} else {
if (max_sectors_kb > max_hw_sectors_kb ||
max_sectors_kb < page_kb)
return -EINVAL;
q->limits.max_user_sectors = max_sectors_kb << 1;
}
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
if (q->disk)
q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(&q->queue_lock);
blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_sectors = max_sectors_kb << 1;
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
if (err)
return err;
return ret;
}

Some files were not shown because too many files have changed in this diff Show More