From d3a043733f25d743f3aa617c7f82dbcb5ee2211a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 16 May 2024 17:43:51 +0530 Subject: [PATCH 01/25] nvme-multipath: find NUMA path only for online numa-node In current native multipath design when a shared namespace is created, we loop through each possible numa-node, calculate the NUMA distance of that node from each nvme controller and then cache the optimal IO path for future reference while sending IO. The issue with this design is that we may refer to the NUMA distance table for an offline node which may not be populated at the time and so we may inadvertently end up finding and caching a non-optimal path for IO. Then latter when the corresponding numa-node becomes online and hence the NUMA distance table entry for that node is created, ideally we should re-calculate the multipath node distance for the newly added node however that doesn't happen unless we rescan/reset the controller. So essentially, we may keep using non-optimal IO path for a node which is made online after namespace is created. This patch helps fix this issue ensuring that when a shared namespace is created, we calculate the multipath node distance for each online numa-node instead of each possible numa-node. Then latter when a node becomes online and we receive any IO on that newly added node, we would calculate the multipath node distance for newly added node but this time NUMA distance table would have been already populated for newly added node. Hence we would be able to correctly calculate the multipath node distance and choose the optimal path for the IO. Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d16e976ae1a4..9c1e135b8df3 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -595,7 +595,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) int node, srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); - for_each_node(node) + for_each_online_node(node) __nvme_find_path(head, node); srcu_read_unlock(&head->srcu, srcu_idx); } From 2fe7b422460d14b33027d8770f7be8d26bcb2639 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 09:50:47 -0700 Subject: [PATCH 02/25] nvme: fix multipath batched completion accounting Batched completions were missing the io stats accounting and bio trace events. Move the common code to a helper and call it from the batched and non-batched functions. Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 954f850f113a..79cdd34dfa18 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -414,6 +414,14 @@ static inline void nvme_end_req_zoned(struct request *req) } } +static inline void __nvme_end_req(struct request *req) +{ + nvme_end_req_zoned(req); + nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); +} + static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); @@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req) else nvme_log_error(req); } - nvme_end_req_zoned(req); - nvme_trace_bio_complete(req); - if (req->cmd_flags & REQ_NVME_MPATH) - nvme_mpath_end_request(req); + __nvme_end_req(req); blk_mq_end_request(req, status); } @@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req) { trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); - nvme_end_req_zoned(req); + __nvme_end_req(req); } EXPORT_SYMBOL_GPL(nvme_complete_batch_req); From a2e4c5f5f68dbd206f132bc709b98dea64afc3b8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 11:02:28 -0700 Subject: [PATCH 03/25] nvme-multipath: fix io accounting on failover There are io stats accounting that needs to be handled, so don't call blk_mq_end_request() directly. Use the existing nvme_end_req() helper that already handles everything. Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 3 ++- drivers/nvme/host/nvme.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 79cdd34dfa18..7706df237349 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -422,7 +422,7 @@ static inline void __nvme_end_req(struct request *req) nvme_mpath_end_request(req); } -static inline void nvme_end_req(struct request *req) +void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9c1e135b8df3..1bee176fd850 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req) blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + nvme_req(req)->status = 0; + nvme_end_req(req); kblockd_schedule_work(&ns->head->requeue_work); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cacc56f4bbf4..fc31bd340a63 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -767,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl) } } +void nvme_end_req(struct request *req); void nvme_complete_rq(struct request *req); void nvme_complete_batch_req(struct request *req); From f97914e35fd98b2b18fb8a092e0a0799f73afdfe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 21 May 2024 23:20:28 +0300 Subject: [PATCH 04/25] nvmet: fix ns enable/disable possible hang When disabling an nvmet namespace, there is a period where the subsys->lock is released, as the ns disable waits for backend IO to complete, and the ns percpu ref to be properly killed. The original intent was to avoid taking the subsystem lock for a prolong period as other processes may need to acquire it (for example new incoming connections). However, it opens up a window where another process may come in and enable the ns, (re)intiailizing the ns percpu_ref, causing the disable sequence to hang. Solve this by taking the global nvmet_config_sem over the entire configfs enable/disable sequence. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7c43a0ad6877..bd87dfd173a4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, if (kstrtobool(page, &enable)) return -EINVAL; + /* + * take a global nvmet_config_sem because the disable routine has a + * window where it releases the subsys-lock, giving a chance to + * a parallel enable to concurrently execute causing the disable to + * have a misaccounting of the ns percpu_ref. + */ + down_write(&nvmet_config_sem); if (enable) ret = nvmet_ns_enable(ns); else nvmet_ns_disable(ns); + up_write(&nvmet_config_sem); return ret ? ret : count; } From 64e3d02b43b17390f0fa9af6708a4eafdf20ba01 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 24 May 2024 16:04:48 +0530 Subject: [PATCH 05/25] nvme: remove sgs and sws sgs/sws are unused, so remove these from nvme_ns_head structure. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fc31bd340a63..c43a30753d87 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -471,8 +471,6 @@ struct nvme_ns_head { u8 pi_type; u8 pi_offset; u8 guard_type; - u16 sgs; - u32 sws; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; #endif From 1bd293fcf3af84674e82ed022c049491f3768840 Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Thu, 23 May 2024 17:01:49 +0530 Subject: [PATCH 06/25] nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset bio_vec start offset may be relatively large particularly when large folio gets added to the bio. A bigger offset will result in avoiding the single-segment mapping optimization and end up using expensive mempool_alloc further. Rather than using absolute value, adjust bv_offset by NVME_CTRL_PAGE_SIZE while checking if segment can be fitted into one/two PRP entries. Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dff..102a9fb0c65f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); From 80e4e17ac9e05adba3f1f0e1793398086e6d4007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2024 15:16:06 -0700 Subject: [PATCH 07/25] block: remove blk_queue_max_integrity_segments This is unused now that all the atomic queue limit conversions are merged. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240521221606.393040-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e253e7bd0d17..7428cb43952d 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) return q->integrity.profile; } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d, static inline void blk_integrity_unregister(struct gendisk *d) { } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { From d9780064b163b91c28e4d44ec3115599db65b7fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:18 +0200 Subject: [PATCH 08/25] dm: move setting zoned_enabled to dm_table_set_restrictions Keep it together with the rest of the zoned code. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 3 --- drivers/md/dm-zone.c | 8 +++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cc66a27c363a..e291b78b307b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2040,9 +2040,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (blk_queue_is_zoned(q) && - !static_key_enabled(&zoned_enabled.key)) - static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 8e6bcb0d786a..3103360ce7f0 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -287,7 +287,13 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) queue_emulates_zone_append(q) ? "emulated" : "native"); } - return dm_revalidate_zones(md, t); + ret = dm_revalidate_zones(md, t); + if (ret < 0) + return ret; + + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); + return 0; } /* From 5e7a4bbcc33d7df6bcc8565a8938c196285e5423 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:19 +0200 Subject: [PATCH 09/25] dm: remove dm_check_zoned Fold it into the only caller in preparation to changes in the queue limits setup. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 59 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3103360ce7f0..0ee22494857d 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -160,37 +160,6 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) -{ - struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; - - /* Count conventional zones */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; - } - - /* - * If we only have conventional zones, expose the mapped device as - * a regular device. - */ - if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - disk->nr_zones = 0; - } - - return 0; -} - /* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer @@ -254,6 +223,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; int ret; /* @@ -272,14 +243,30 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) return 0; /* - * Check that the mapped device will indeed be zoned, that is, that it - * has sequential write required zones. + * Count conventional zones to check that the mapped device will indeed + * have sequential write required zones. */ - ret = dm_check_zoned(md, t); - if (ret) + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); return ret; - if (!blk_queue_is_zoned(q)) + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; return 0; + } if (!md->disk->nr_zones) { DMINFO("%s using %s zone append", From c8c1f7012b807ca4da0136eacab96961b56f25d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:20 +0200 Subject: [PATCH 10/25] dm: make dm_set_zones_restrictions work on the queue limits Don't stuff the values directly into the queue without any synchronization, but instead delay applying the queue limits in the caller and let dm_set_zones_restrictions work on the limit structure. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 12 ++++++------ drivers/md/dm-zone.c | 11 ++++++----- drivers/md/dm.h | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e291b78b307b..b2d5246cff21 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1981,10 +1981,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - r = queue_limits_set(q, limits); - if (r) - return r; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) @@ -2036,12 +2032,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. */ - if (blk_queue_is_zoned(q)) { - r = dm_set_zones_restrictions(t, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + r = dm_set_zones_restrictions(t, q, limits); if (r) return r; } + r = queue_limits_set(q, limits); + if (r) + return r; + dm_update_crypto_profile(q, t); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 0ee22494857d..5d66d916730e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -220,7 +220,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; @@ -236,7 +237,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; } if (!get_capacity(md->disk)) @@ -260,9 +261,9 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) * a regular device. */ if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; + lim->max_open_zones = 0; + lim->max_active_zones = 0; + lim->zoned = false; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); disk->nr_zones = 0; return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e0c57f19839b..53ef8207fe2c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -101,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * Zoned targets related functions. */ -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, From d9ff882b54f99f96787fa3df7cd938966843c418 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 May 2024 13:34:45 +0900 Subject: [PATCH 11/25] null_blk: Fix return value of nullb_device_power_store() When powering on a null_blk device that is not already on, the return value ret that is initialized to be count is reused to check the return value of null_add_dev(), leading to nullb_device_power_store() to return null_add_dev() return value (0 on success) instead of "count". So make sure to set ret to be equal to count when there are no errors. Fixes: a2db328b0839 ("null_blk: fix null-ptr-dereference while configuring 'power' and 'submit_queues'") Signed-off-by: Damien Le Moal Reviewed-by: Yu Kuai Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240527043445.235267-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eb023d267369..631dca2e4e84 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -494,6 +494,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; From 30a0e3135f9aa14ba745f767375183b3112d7440 Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 27 May 2024 16:45:33 +0800 Subject: [PATCH 12/25] block: delete redundant function declaration blk_stats_alloc_enable was used for block hybrid poll, the related function definition was removed by patch: commit 54bdd67d0f88 ("blk-mq: remove hybrid polling") but the function declaration was not deleted. Signed-off-by: hexue Link: https://lore.kernel.org/r/20240527084533.1485210-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- block/blk-stat.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-stat.h b/block/blk-stat.h index 17e1eb4ec7e2..5d7f18ba436d 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,7 +64,6 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); -bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); From 233e27b4d21c3e44eb863f03e566d3a22e81a7ae Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 28 May 2024 15:28:52 +0900 Subject: [PATCH 13/25] null_blk: Print correct max open zones limit in null_init_zoned_dev() When changing the maximum number of open zones, print that number instead of the total number of zones. Fixes: dc4d137ee3b7 ("null_blk: add support for max open/active zone limit for zoned devices") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240528062852.437599-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 5b5a63adacc1..79c8e5e99f7f 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -108,7 +108,7 @@ int null_init_zoned_dev(struct nullb_device *dev, if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { dev->zone_max_open = dev->zone_max_active; pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); + dev->zone_max_open); } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); From bafea1c58b24be594d97841ced1b7ae0347bf6e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:13 +0200 Subject: [PATCH 14/25] sd: also set max_user_sectors when setting max_sectors sd can set a max_sectors value that is lower than the max_hw_sectors limit based on the block limits VPD page. While this is rather unusual, it used to work until the max_user_sectors field was split out to cleanly deal with conflicting hardware and user limits when the hardware limit changes. Also set max_user_sectors to ensure the limit can properly be stacked. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240523182618.602003-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 332eb9dac22d..f6c822c9cbd2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3700,8 +3700,10 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->first_scan || q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors > q->limits.max_hw_sectors) { q->limits.max_sectors = rw_max; + q->limits.max_user_sectors = rw_max; + } sdkp->first_scan = 0; From e528bede6f4e6822afdf0fa80be46ea9199f0911 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:14 +0200 Subject: [PATCH 15/25] block: stack max_user_sectors The max_user_sectors is one of the three factors determining the actual max_sectors limit for READ/WRITE requests. Because of that it needs to be stacked at least for the device mapper multi-path case where requests are directly inserted on the lower device. For SCSI disks this is important because the sd driver actually sets it's own advisory limit that is lower than max_hw_sectors based on the block limits VPD page. While this is a bit odd an unusual, the same effect can happen if a user or udev script tweaks the value manually. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240523182618.602003-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index a7fe8e90240a..7a672021daee 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -611,6 +611,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_user_sectors = min_not_zero(t->max_user_sectors, + b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, From e993db2d6e5207f1ae061c2ac554ab1f714c741d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 24 May 2024 12:46:51 +0200 Subject: [PATCH 16/25] block: check for max_hw_sectors underflow The logical block size need to be smaller than the max_hw_sector setting, otherwise we can't even transfer a single LBA. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7a672021daee..effeb9a639bb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -104,6 +104,7 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; + unsigned int logical_block_sectors; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -134,8 +135,11 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; + logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; + if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) + return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the @@ -153,7 +157,7 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not From a14a68b76954e73031ca6399abace17dcb77c17a Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 28 May 2024 20:09:12 +0800 Subject: [PATCH 17/25] bcache: allow allocator to invalidate bucket in gc Currently, if the gc is running, when the allocator found free_inc is empty, allocator has to wait the gc finish. Before that, the IO is blocked. But actually, there would be some buckets is reclaimable before gc, and gc will never mark this kind of bucket to be unreclaimable. So we can put these buckets into free_inc in gc running to avoid IO being blocked. Signed-off-by: Dongsheng Yang Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 13 +++++-------- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 7 ++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ce13c272c387..32a46343097d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(!ca->set->gc_mark_valid); - - return (!GC_MARK(b) || - GC_MARK(b) == GC_MARK_RECLAIMABLE) && - !atomic_read(&b->pin) && - can_inc_bucket_gen(b); + return (ca->set->gc_mark_valid || b->reclaimable_in_gc) && + ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && can_inc_bucket_gen(b)); } void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); + b->reclaimable_in_gc = 0; } static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -352,8 +350,7 @@ static int bch_allocator_thread(void *arg) */ retry_invalidate: - allocator_wait(ca, ca->set->gc_mark_valid && - !ca->invalidate_needs_gc); + allocator_wait(ca, !ca->invalidate_needs_gc); invalidate_buckets(ca); /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4e6afa89921f..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -200,6 +200,7 @@ struct bucket { uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */ + uint16_t reclaimable_in_gc:1; }; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d011a7154d33..4e6ccf2c8a0b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,18 +1741,20 @@ static void btree_gc_start(struct cache_set *c) mutex_lock(&c->bucket_lock); - c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; + if (bch_can_invalidate_bucket(ca, b)) + b->reclaimable_in_gc = 1; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } + c->gc_mark_valid = 0; mutex_unlock(&c->bucket_lock); } @@ -1809,6 +1811,9 @@ static void bch_btree_gc_finish(struct cache_set *c) for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + if (b->reclaimable_in_gc) + b->reclaimable_in_gc = 0; + if (atomic_read(&b->pin)) continue; From 05356938a4be356adde4eab4425c6822f3c7d706 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:13 +0800 Subject: [PATCH 18/25] bcache: call force_wake_up_gc() if necessary in check_should_bypass() If there are extreme heavy write I/O continuously hit on relative small cache device (512GB in my testing), it is possible to make counter c->gc_stats.in_use continue to increase and exceed CUTOFF_CACHE_ADD. If 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, all following write requests will bypass the cache device because check_should_bypass() returns 'true'. Because all writes bypass the cache device, counter c->sectors_to_gc has no chance to be negative value, and garbage collection thread won't be waken up even the whole cache becomes clean after writeback accomplished. The aftermath is that all write I/Os go directly into backing device even the cache device is clean. To avoid the above situation, this patch uses a quite conservative way to fix: if 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, only wakes up garbage collection thread when the whole cache device is clean. Before the fix, the writes-always-bypass situation happens after 10+ hours write I/O pressure on 512GB Intel optane memory which acts as cache device. After this fix, such situation doesn't happen after 36+ hours testing. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 83d112bd2b1c..af345dc6fde1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct io *i; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio_op(bio) == REQ_OP_DISCARD)) goto skip; + if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { + /* + * If cached buckets are all clean now, 'true' will be + * returned and all requests will bypass the cache device. + * Then c->sectors_to_gc has no chance to be negative, and + * gc thread won't wake up and caching won't work forever. + * Here call force_wake_up_gc() to avoid such aftermath. + */ + if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN && + c->gc_mark_valid) + force_wake_up_gc(c); + + goto skip; + } + if (mode == CACHE_MODE_NONE || (mode == CACHE_MODE_WRITEAROUND && op_is_write(bio_op(bio)))) From 74d4ce92e08d5669d66fd890403724faa4286c21 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:14 +0800 Subject: [PATCH 19/25] bcache: code cleanup in __bch_bucket_alloc_set() In __bch_bucket_alloc_set() the lines after lable 'err:' indeed do nothing useful after multiple cache devices are removed from bcache code. This cleanup patch drops the useless code to save a bit CPU cycles. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 32a46343097d..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -498,8 +498,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ca = c->cache; b = bch_bucket_alloc(ca, reserve, wait); - if (b == -1) - goto err; + if (b < 0) + return -1; k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), @@ -508,10 +508,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, SET_KEY_PTRS(k, 1); return 0; -err: - bch_bucket_free(c, k); - bkey_put(c, k); - return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, From be647e2c76b27f409cdd520f66c95be888b553a3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 06:41:45 -0700 Subject: [PATCH 20/25] nvme: use srcu for iterating namespace list The nvme pci driver synchronizes with all the namespace queues during a reset to ensure that there's no pending timeout work. Meanwhile the timeout work potentially iterates those same namespaces to freeze their queues. Each of those namespace iterations use the same read lock. If a write lock should somehow get between the synchronize and freeze steps, then forward progress is deadlocked. We had been relying on the nvme controller state machine to ensure the reset work wouldn't conflict with timeout work. That guarantee may be a bit fragile to rely on, so iterate the namespace lists without taking potentially circular locks, as reported by lockdep. Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/ Reported-by: Shinichiro Kawasaki Tested-by: Shinichiro Kawasaki Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 99 +++++++++++++++++++++-------------- drivers/nvme/host/ioctl.c | 15 +++--- drivers/nvme/host/multipath.c | 21 ++++---- drivers/nvme/host/nvme.h | 4 +- 4 files changed, 83 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7706df237349..f5d150c62955 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -678,7 +678,7 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static inline bool nvme_get_ns(struct nvme_ns *ns) +bool nvme_get_ns(struct nvme_ns *ns) { return kref_get_unless_zero(&ns->kref); } @@ -3684,9 +3684,10 @@ out_unlock: struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue; @@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->head->ns_id > nsid) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); @@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { if (tmp->head->ns_id < ns->head->ns_id) { - list_add(&ns->list, &tmp->list); + list_add_rcu(&ns->list, &tmp->list); return; } } @@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (nvme_update_ns_info(ns, info)) goto out_unlink_ns; - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); /* * Ensure that no namespaces are added to the ctrl list after the queues * are frozen, thereby avoiding a deadlock between scan and reset. */ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) { - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns; } nvme_ns_add_to_ctrl_list(ns); - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); nvme_get_ctrl(ctrl); if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups)) @@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) out_cleanup_ns_from_list: nvme_put_ctrl(ctrl); - down_write(&ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); - down_write(&ns->ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ns->ctrl->namespaces_rwsem); + mutex_lock(&ns->ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_lock); + synchronize_srcu(&ns->ctrl->srcu); if (last_path) nvme_mpath_shutdown_disk(ns->head); @@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; LIST_HEAD(rm_list); - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { if (ns->head->ns_id > nsid) - list_move_tail(&ns->list, &rm_list); + list_splice_init_rcu(&ns->list, &rm_list, + synchronize_rcu); } - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &rm_list, list) nvme_ns_remove(ns); - } static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) @@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); - down_write(&ctrl->namespaces_rwsem); - list_splice_init(&ctrl->namespaces, &ns_list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &ns_list, list) nvme_ns_remove(ns); @@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev) key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); + cleanup_srcu_struct(&ctrl->srcu); nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); @@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->passthru_err_log_enabled = false; clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); + mutex_init(&ctrl->namespaces_lock); + + ret = init_srcu_struct(&ctrl->srcu); + if (ret) + return ret; + mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); xa_init(&ctrl->cels); - init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -4697,6 +4709,7 @@ out_release_instance: out: if (ctrl->discard_page) __free_page(ctrl->discard_page); + cleanup_srcu_struct(&ctrl->srcu); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mark_disk_dead(ns->disk); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_unfreeze_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } EXPORT_SYMBOL_GPL(nvme_unfreeze); @@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); if (timeout <= 0) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); @@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); void nvme_wait_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_freeze_queue_wait(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_wait_freeze); void nvme_start_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; set_bit(NVME_CTRL_FROZEN, &ctrl->flags); - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_freeze_queue_start(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); @@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_sync_io_queues); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 499a8bb7cac7..9d9d2a127c4e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, bool open_for_write) { struct nvme_ns *ns; - int ret; + int ret, srcu_idx; - down_read(&ctrl->namespaces_rwsem); + srcu_idx = srcu_read_lock(&ctrl->srcu); if (list_empty(&ctrl->namespaces)) { ret = -ENOTTY; goto out_unlock; } - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); @@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); + if (!nvme_get_ns(ns)) { + ret = -ENXIO; + goto out_unlock; + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; out_unlock: - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1bee176fd850..d8b6b4648eaf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (!ns->head->disk) continue; kblockd_schedule_work(&ns->head->requeue_work); if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) disk_uevent(ns->head->disk, KOBJ_CHANGE); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } static const char *nvme_ana_state_names[] = { @@ -194,13 +195,14 @@ out: void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { nvme_mpath_clear_current_path(ns); kblockd_schedule_work(&ns->head->requeue_work); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } void nvme_mpath_revalidate_paths(struct nvme_ns *ns) @@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; unsigned *nr_change_groups = data; struct nvme_ns *ns; + int srcu_idx; dev_dbg(ctrl->device, "ANA group %d: %s.\n", le32_to_cpu(desc->grpid), @@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { unsigned nsid; again: nsid = le32_to_cpu(desc->nsids[n]); @@ -706,7 +709,7 @@ again: if (ns->head->ns_id > nsid) goto again; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c43a30753d87..f3a41133ac3f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -282,7 +282,8 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; - struct rw_semaphore namespaces_rwsem; + struct mutex namespaces_lock; + struct srcu_struct srcu; struct device ctrl_device; struct device *device; /* char device */ #ifdef CONFIG_NVME_HWMON @@ -1160,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +bool nvme_get_ns(struct nvme_ns *ns); void nvme_put_ns(struct nvme_ns *ns); static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) From c758b77d4a0a0ed3a1292b3fd7a2aeccd1a169a4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 May 2024 22:38:52 +0300 Subject: [PATCH 21/25] nvmet: fix a possible leak when destroy a ctrl during qp establishment In nvmet_sq_destroy we capture sq->ctrl early and if it is non-NULL we know that a ctrl was allocated (in the admin connect request handler) and we need to release pending AERs, clear ctrl->sqs and sq->ctrl (for nvme-loop primarily), and drop the final reference on the ctrl. However, a small window is possible where nvmet_sq_destroy starts (as a result of the client giving up and disconnecting) concurrently with the nvme admin connect cmd (which may be in an early stage). But *before* kill_and_confirm of sq->ref (i.e. the admin connect managed to get an sq live reference). In this case, sq->ctrl was allocated however after it was captured in a local variable in nvmet_sq_destroy. This prevented the final reference drop on the ctrl. Solve this by re-capturing the sq->ctrl after all inflight request has completed, where for sure sq->ctrl reference is final, and move forward based on that. This issue was observed in an environment with many hosts connecting multiple ctrls simoutanuosly, creating a delay in allocating a ctrl leading up to this race window. Reported-by: Alex Turin Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2fde22323622..06f0c587f343 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + /* + * we must reference the ctrl again after waiting for inflight IO + * to complete. Because admin connect may have sneaked in after we + * store sq->ctrl locally, but before we killed the percpu_ref. the + * admin connect allocates and assigns sq->ctrl, which now needs a + * final ref put, as this ctrl is going away. + */ + ctrl = sq->ctrl; + if (ctrl) { /* * The teardown flow may take some time, and the host may not From b164316808ec5de391c3e7b0148ec937d32d280d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:32 +0900 Subject: [PATCH 22/25] null_blk: Do not allow runt zone with zone capacity smaller then zone size A zoned device with a smaller last zone together with a zone capacity smaller than the zone size does make any sense as that does not correspond to any possible setup for a real device: 1) For ZNS and zoned UFS devices, all zones are always the same size. 2) For SMR HDDs, all zones always have the same capacity. In other words, if we have a smaller last runt zone, then this zone capacity should always be equal to the zone size. Add a check in null_init_zoned_dev() to prevent a configuration to have both a smaller zone size and a zone capacity smaller than the zone size. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240530054035.491497-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 79c8e5e99f7f..f118d304f310 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -74,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev, return -EINVAL; } + /* + * If a smaller zone capacity was requested, do not allow a smaller last + * zone at the same time as such zone configuration does not correspond + * to any real zoned device. + */ + if (dev->zone_capacity != dev->zone_size && + dev->size & (dev->zone_size - 1)) { + pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n"); + return -EINVAL; + } + zone_capacity_sects = mb_to_sects(dev->zone_capacity); dev_capacity_sects = mb_to_sects(dev->size); dev->zone_size_sects = mb_to_sects(dev->zone_size); From cd6399936869b4a042dd1270078cbf2bb871a407 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:33 +0900 Subject: [PATCH 23/25] block: Fix validation of zoned device with a runt zone Commit ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") introduced checks to ensure that the capacity of the zones of a zoned device is constant for all zones. However, this check ignores the possibility that a zoned device has a smaller last zone with a size not equal to the capacity of other zones. Such device correspond in practice to an SMR drive with a smaller last zone and all zones with a capacity equal to the zone size, leading to the last zone capacity being different than the capacity of other zones. Correctly handle such device by fixing the check for the constant zone capacity in blk_revalidate_seq_zone() using the new helper function disk_zone_is_last(). This helper function is also used in blk_revalidate_zone_cb() when checking the zone size. Fixes: ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 03aa4eead39e..402a50a1ac4d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -450,6 +450,11 @@ static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); } +static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) +{ + return zone->start + zone->len >= get_capacity(disk); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -1693,11 +1698,13 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, /* * Remember the capacity of the first sequential zone and check - * if it is constant for all zones. + * if it is constant for all zones, ignoring the last zone as it can be + * smaller. */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (zone->capacity != args->zone_capacity) { + if (!disk_zone_is_last(disk, zone) && + zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; @@ -1732,7 +1739,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; - sector_t capacity = get_capacity(disk); sector_t zone_sectors = disk->queue->limits.chunk_sectors; int ret; @@ -1743,7 +1749,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - if (zone->start >= capacity || !zone->len) { + if (zone->start >= get_capacity(disk) || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; @@ -1753,7 +1759,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start + zone->len < capacity) { + if (!disk_zone_is_last(disk, zone)) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); From 29459c3eaa5c6261fbe0dea7bdeb9b48d35d862a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:34 +0900 Subject: [PATCH 24/25] block: Fix zone write plugging handling of devices with a runt zone A zoned device may have a last sequential write required zone that is smaller than other zones. However, all tests to check if a zone write plug write offset exceeds the zone capacity use the same capacity value stored in the gendisk zone_capacity field. This is incorrect for a zoned device with a last runt (smaller) zone. Add the new field last_zone_capacity to struct gendisk to store the capacity of the last zone of the device. blk_revalidate_seq_zone() and blk_revalidate_conv_zone() are both modified to get this value when disk_zone_is_last() returns true. Similarly to zone_capacity, the value is first stored using the last_zone_capacity field of struct blk_revalidate_zone_args. Once zone revalidation of all zones is done, this is used to set the gendisk last_zone_capacity field. The checks to determine if a zone is full or if a sector offset in a zone exceeds the zone capacity in disk_should_remove_zone_wplug(), disk_zone_wplug_abort_unaligned(), blk_zone_write_plug_init_request(), and blk_zone_wplug_prepare_bio() are modified to use the new helper functions disk_zone_is_full() and disk_zone_wplug_is_full(). disk_zone_is_full() uses the zone index to determine if the zone being tested is the last one of the disk and uses the either the disk zone_capacity or last_zone_capacity accordingly. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 35 +++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 402a50a1ac4d..52abebf56027 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -455,6 +455,20 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -548,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, return false; /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, @@ -669,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - unsigned int zone_capacity = disk->zone_capacity; unsigned int wp_offset = zwplug->wp_offset; struct bio_list bl = BIO_EMPTY_LIST; struct bio *bio; while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (wp_offset >= zone_capacity || + if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(zwplug, bio); @@ -914,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req) sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; - unsigned int zone_capacity = disk->zone_capacity; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; @@ -938,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req) * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); - while (zwplug->wp_offset < zone_capacity) { + while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; @@ -984,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ - if (zwplug->wp_offset >= disk->zone_capacity) + if (disk_zone_wplug_is_full(disk, zwplug)) goto err; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -1561,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; + disk->last_zone_capacity = 0; disk->nr_zones = 0; } @@ -1605,6 +1618,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; @@ -1622,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); if (disk->conv_zones_bitmap) nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, @@ -1673,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + if (!disk_need_zone_resources(disk)) return 0; @@ -1703,8 +1721,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (!disk_zone_is_last(disk, zone) && - zone->capacity != args->zone_capacity) { + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aefdda9f4ec7..24c36929920b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,6 +186,7 @@ struct gendisk { */ unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; unsigned long *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; From 0a751df4566c86e5a24f2a03290dad3d0f215692 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 May 2024 09:45:47 -0400 Subject: [PATCH 25/25] blk-throttle: Fix incorrect display of io.max Commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") attempts to revert the code change introduced by commit cd5ab1b0fcb4 ("blk-throttle: add .low interface"). However, it leaves behind the bps_conf[] and iops_conf[] fields in the throtl_grp structure which aren't set anywhere in the new blk-throttle.c code but are still being used by tg_prfill_limit() to display the limits in io.max. Now io.max always displays the following values if a block queue is used: : rbps=0 wbps=0 riops=0 wiops=0 Fix this problem by removing bps_conf[] and iops_conf[] and use bps[] and iops[] instead to complete the revert. Fixes: bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") Reported-by: Justin Forbes Closes: https://github.com/containers/podman/issues/22701#issuecomment-2120627789 Signed-off-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240530134547.970075-1-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 24 ++++++++++++------------ block/blk-throttle.h | 8 ++------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0be180f9a789..c1bf73f8c75d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1399,32 +1399,32 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, bps_dft = U64_MAX; iops_dft = UINT_MAX; - if (tg->bps_conf[READ] == bps_dft && - tg->bps_conf[WRITE] == bps_dft && - tg->iops_conf[READ] == iops_dft && - tg->iops_conf[WRITE] == iops_dft) + if (tg->bps[READ] == bps_dft && + tg->bps[WRITE] == bps_dft && + tg->iops[READ] == iops_dft && + tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); - if (tg->bps_conf[READ] == U64_MAX) + if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else - seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); + seq_printf(sf, " rbps=%llu", tg->bps[READ]); - if (tg->bps_conf[WRITE] == U64_MAX) + if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else - seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); + seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); - if (tg->iops_conf[READ] == UINT_MAX) + if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else - seq_printf(sf, " riops=%u", tg->iops_conf[READ]); + seq_printf(sf, " riops=%u", tg->iops[READ]); - if (tg->iops_conf[WRITE] == UINT_MAX) + if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else - seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); + seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 393c3d134b96..4d9ef5abdf21 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -95,15 +95,11 @@ struct throtl_grp { bool has_rules_bps[2]; bool has_rules_iops[2]; - /* internally used bytes per second rate limits */ + /* bytes per second rate limits */ uint64_t bps[2]; - /* user configured bps limits */ - uint64_t bps_conf[2]; - /* internally used IOPS limits */ + /* IOPS limits */ unsigned int iops[2]; - /* user configured IOPS limits */ - unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2];