From f1ba4e674febf5c0e9f725a75ca43b7722b4e963 Mon Sep 17 00:00:00 2001
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Date: Thu, 30 Mar 2023 17:49:52 -0400
Subject: [PATCH 1/7] virtio-blk: fix to match virtio spec

The merged patch series to support zoned block devices in virtio-blk
is not the most up to date version. The merged patch can be found at

https://lore.kernel.org/linux-block/20221016034127.330942-3-dmitry.fomichev@wdc.com/

but the latest and reviewed version is

https://lore.kernel.org/linux-block/20221110053952.3378990-3-dmitry.fomichev@wdc.com/

The reason is apparently that the correct mailing lists and
maintainers were not copied.

The differences between the two are mostly cleanups, but there is one
change that is very important in terms of compatibility with the
approved virtio-zbd specification.

Before it was approved, the OASIS virtio spec had a change in
VIRTIO_BLK_T_ZONE_APPEND request layout that is not reflected in the
current virtio-blk driver code. In the running code, the status is
the first byte of the in-header that is followed by some pad bytes
and the u64 that carries the sector at which the data has been written
to the zone back to the driver, aka the append sector.

This layout turned out to be problematic for implementing in QEMU and
the request status byte has been eventually made the last byte of the
in-header. The current code doesn't expect that and this causes the
append sector value always come as zero to the block layer. This needs
to be fixed ASAP.

Fixes: 95bfec41bd3d ("virtio-blk: add support for zoned block devices")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Message-Id: <20230330214953.1088216-2-dmitry.fomichev@wdc.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c      | 238 +++++++++++++++++++++-----------
 include/uapi/linux/virtio_blk.h |  18 +--
 2 files changed, 166 insertions(+), 90 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2723eede6f21..4f0dbbb3d4a5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -96,16 +96,14 @@ struct virtblk_req {
 
 		/*
 		 * The zone append command has an extended in header.
-		 * The status field in zone_append_in_hdr must have
-		 * the same offset in virtblk_req as the non-zoned
-		 * status field above.
+		 * The status field in zone_append_in_hdr must always
+		 * be the last byte.
 		 */
 		struct {
+			__virtio64 sector;
 			u8 status;
-			u8 reserved[7];
-			__le64 append_sector;
-		} zone_append_in_hdr;
-	};
+		} zone_append;
+	} in_hdr;
 
 	size_t in_hdr_len;
 
@@ -154,7 +152,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
 			sgs[num_out + num_in++] = vbr->sg_table.sgl;
 	}
 
-	sg_init_one(&in_hdr, &vbr->status, vbr->in_hdr_len);
+	sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len);
 	sgs[num_out + num_in++] = &in_hdr;
 
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
@@ -242,11 +240,14 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 				      struct request *req,
 				      struct virtblk_req *vbr)
 {
-	size_t in_hdr_len = sizeof(vbr->status);
+	size_t in_hdr_len = sizeof(vbr->in_hdr.status);
 	bool unmap = false;
 	u32 type;
 	u64 sector = 0;
 
+	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req)))
+		return BLK_STS_NOTSUPP;
+
 	/* Set fields for all request types */
 	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
 
@@ -287,7 +288,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 	case REQ_OP_ZONE_APPEND:
 		type = VIRTIO_BLK_T_ZONE_APPEND;
 		sector = blk_rq_pos(req);
-		in_hdr_len = sizeof(vbr->zone_append_in_hdr);
+		in_hdr_len = sizeof(vbr->in_hdr.zone_append);
 		break;
 	case REQ_OP_ZONE_RESET:
 		type = VIRTIO_BLK_T_ZONE_RESET;
@@ -297,7 +298,10 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 		type = VIRTIO_BLK_T_ZONE_RESET_ALL;
 		break;
 	case REQ_OP_DRV_IN:
-		/* Out header already filled in, nothing to do */
+		/*
+		 * Out header has already been prepared by the caller (virtblk_get_id()
+		 * or virtblk_submit_zone_report()), nothing to do here.
+		 */
 		return 0;
 	default:
 		WARN_ON_ONCE(1);
@@ -318,16 +322,28 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 	return 0;
 }
 
+/*
+ * The status byte is always the last byte of the virtblk request
+ * in-header. This helper fetches its value for all in-header formats
+ * that are currently defined.
+ */
+static inline u8 virtblk_vbr_status(struct virtblk_req *vbr)
+{
+	return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1);
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
-	blk_status_t status = virtblk_result(vbr->status);
+	blk_status_t status = virtblk_result(virtblk_vbr_status(vbr));
+	struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
 
 	virtblk_unmap_data(req, vbr);
 	virtblk_cleanup_cmd(req);
 
 	if (req_op(req) == REQ_OP_ZONE_APPEND)
-		req->__sector = le64_to_cpu(vbr->zone_append_in_hdr.append_sector);
+		req->__sector = virtio64_to_cpu(vblk->vdev,
+						vbr->in_hdr.zone_append.sector);
 
 	blk_mq_end_request(req, status);
 }
@@ -355,7 +371,7 @@ static int virtblk_handle_req(struct virtio_blk_vq *vq,
 
 		if (likely(!blk_should_fake_timeout(req->q)) &&
 		    !blk_mq_complete_request_remote(req) &&
-		    !blk_mq_add_to_batch(req, iob, vbr->status,
+		    !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
 					 virtblk_complete_batch))
 			virtblk_request_done(req);
 		req_done++;
@@ -550,7 +566,6 @@ static void virtio_queue_rqs(struct request **rqlist)
 #ifdef CONFIG_BLK_DEV_ZONED
 static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
 					  unsigned int nr_zones,
-					  unsigned int zone_sectors,
 					  size_t *buflen)
 {
 	struct request_queue *q = vblk->disk->queue;
@@ -558,7 +573,7 @@ static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
 	void *buf;
 
 	nr_zones = min_t(unsigned int, nr_zones,
-			 get_capacity(vblk->disk) >> ilog2(zone_sectors));
+			 get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors));
 
 	bufsize = sizeof(struct virtio_blk_zone_report) +
 		nr_zones * sizeof(struct virtio_blk_zone_descriptor);
@@ -592,7 +607,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
 		return PTR_ERR(req);
 
 	vbr = blk_mq_rq_to_pdu(req);
-	vbr->in_hdr_len = sizeof(vbr->status);
+	vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
 	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
 	vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
 
@@ -601,7 +616,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
 		goto out;
 
 	blk_execute_rq(req, false);
-	err = blk_status_to_errno(virtblk_result(vbr->status));
+	err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
 out:
 	blk_mq_free_request(req);
 	return err;
@@ -609,29 +624,72 @@ out:
 
 static int virtblk_parse_zone(struct virtio_blk *vblk,
 			       struct virtio_blk_zone_descriptor *entry,
-			       unsigned int idx, unsigned int zone_sectors,
-			       report_zones_cb cb, void *data)
+			       unsigned int idx, report_zones_cb cb, void *data)
 {
 	struct blk_zone zone = { };
 
-	if (entry->z_type != VIRTIO_BLK_ZT_SWR &&
-	    entry->z_type != VIRTIO_BLK_ZT_SWP &&
-	    entry->z_type != VIRTIO_BLK_ZT_CONV) {
-		dev_err(&vblk->vdev->dev, "invalid zone type %#x\n",
-			entry->z_type);
-		return -EINVAL;
+	zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start);
+	if (zone.start + vblk->zone_sectors <= get_capacity(vblk->disk))
+		zone.len = vblk->zone_sectors;
+	else
+		zone.len = get_capacity(vblk->disk) - zone.start;
+	zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap);
+	zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);
+
+	switch (entry->z_type) {
+	case VIRTIO_BLK_ZT_SWR:
+		zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+		break;
+	case VIRTIO_BLK_ZT_SWP:
+		zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF;
+		break;
+	case VIRTIO_BLK_ZT_CONV:
+		zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+		break;
+	default:
+		dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n",
+			zone.start, entry->z_type);
+		return -EIO;
 	}
 
-	zone.type = entry->z_type;
-	zone.cond = entry->z_state;
-	zone.len = zone_sectors;
-	zone.capacity = le64_to_cpu(entry->z_cap);
-	zone.start = le64_to_cpu(entry->z_start);
-	if (zone.cond == BLK_ZONE_COND_FULL)
+	switch (entry->z_state) {
+	case VIRTIO_BLK_ZS_EMPTY:
+		zone.cond = BLK_ZONE_COND_EMPTY;
+		break;
+	case VIRTIO_BLK_ZS_CLOSED:
+		zone.cond = BLK_ZONE_COND_CLOSED;
+		break;
+	case VIRTIO_BLK_ZS_FULL:
+		zone.cond = BLK_ZONE_COND_FULL;
 		zone.wp = zone.start + zone.len;
-	else
-		zone.wp = le64_to_cpu(entry->z_wp);
+		break;
+	case VIRTIO_BLK_ZS_EOPEN:
+		zone.cond = BLK_ZONE_COND_EXP_OPEN;
+		break;
+	case VIRTIO_BLK_ZS_IOPEN:
+		zone.cond = BLK_ZONE_COND_IMP_OPEN;
+		break;
+	case VIRTIO_BLK_ZS_NOT_WP:
+		zone.cond = BLK_ZONE_COND_NOT_WP;
+		break;
+	case VIRTIO_BLK_ZS_RDONLY:
+		zone.cond = BLK_ZONE_COND_READONLY;
+		zone.wp = ULONG_MAX;
+		break;
+	case VIRTIO_BLK_ZS_OFFLINE:
+		zone.cond = BLK_ZONE_COND_OFFLINE;
+		zone.wp = ULONG_MAX;
+		break;
+	default:
+		dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n",
+			zone.start, entry->z_state);
+		return -EIO;
+	}
 
+	/*
+	 * The callback below checks the validity of the reported
+	 * entry data, no need to further validate it here.
+	 */
 	return cb(&zone, idx, data);
 }
 
@@ -641,39 +699,47 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 {
 	struct virtio_blk *vblk = disk->private_data;
 	struct virtio_blk_zone_report *report;
-	unsigned int zone_sectors = vblk->zone_sectors;
-	unsigned int nz, i;
-	int ret, zone_idx = 0;
+	unsigned long long nz, i;
 	size_t buflen;
+	unsigned int zone_idx = 0;
+	int ret;
 
 	if (WARN_ON_ONCE(!vblk->zone_sectors))
 		return -EOPNOTSUPP;
 
-	report = virtblk_alloc_report_buffer(vblk, nr_zones,
-					     zone_sectors, &buflen);
+	report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen);
 	if (!report)
 		return -ENOMEM;
 
+	mutex_lock(&vblk->vdev_mutex);
+
+	if (!vblk->vdev) {
+		ret = -ENXIO;
+		goto fail_report;
+	}
+
 	while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) {
 		memset(report, 0, buflen);
 
 		ret = virtblk_submit_zone_report(vblk, (char *)report,
 						 buflen, sector);
-		if (ret) {
-			if (ret > 0)
-				ret = -EIO;
-			goto out_free;
-		}
-		nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones);
+		if (ret)
+			goto fail_report;
+
+		nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones),
+			   nr_zones);
 		if (!nz)
 			break;
 
 		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
 			ret = virtblk_parse_zone(vblk, &report->zones[i],
-						 zone_idx, zone_sectors, cb, data);
+						 zone_idx, cb, data);
 			if (ret)
-				goto out_free;
-			sector = le64_to_cpu(report->zones[i].z_start) + zone_sectors;
+				goto fail_report;
+
+			sector = virtio64_to_cpu(vblk->vdev,
+						 report->zones[i].z_start) +
+				 vblk->zone_sectors;
 			zone_idx++;
 		}
 	}
@@ -682,7 +748,8 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 		ret = zone_idx;
 	else
 		ret = -EINVAL;
-out_free:
+fail_report:
+	mutex_unlock(&vblk->vdev_mutex);
 	kvfree(report);
 	return ret;
 }
@@ -691,20 +758,28 @@ static void virtblk_revalidate_zones(struct virtio_blk *vblk)
 {
 	u8 model;
 
-	if (!vblk->zone_sectors)
-		return;
-
 	virtio_cread(vblk->vdev, struct virtio_blk_config,
 		     zoned.model, &model);
-	if (!blk_revalidate_disk_zones(vblk->disk, NULL))
-		set_capacity_and_notify(vblk->disk, 0);
+	switch (model) {
+	default:
+		dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model);
+		fallthrough;
+	case VIRTIO_BLK_Z_NONE:
+	case VIRTIO_BLK_Z_HA:
+		disk_set_zoned(vblk->disk, BLK_ZONED_NONE);
+		return;
+	case VIRTIO_BLK_Z_HM:
+		WARN_ON_ONCE(!vblk->zone_sectors);
+		if (!blk_revalidate_disk_zones(vblk->disk, NULL))
+			set_capacity_and_notify(vblk->disk, 0);
+	}
 }
 
 static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 				       struct virtio_blk *vblk,
 				       struct request_queue *q)
 {
-	u32 v;
+	u32 v, wg;
 	u8 model;
 	int ret;
 
@@ -713,16 +788,11 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	switch (model) {
 	case VIRTIO_BLK_Z_NONE:
+	case VIRTIO_BLK_Z_HA:
+		/* Present the host-aware device as non-zoned */
 		return 0;
 	case VIRTIO_BLK_Z_HM:
 		break;
-	case VIRTIO_BLK_Z_HA:
-		/*
-		 * Present the host-aware device as a regular drive.
-		 * TODO It is possible to add an option to make it appear
-		 * in the system as a zoned drive.
-		 */
-		return 0;
 	default:
 		dev_err(&vdev->dev, "unsupported zone model %d\n", model);
 		return -EINVAL;
@@ -735,32 +805,31 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	virtio_cread(vdev, struct virtio_blk_config,
 		     zoned.max_open_zones, &v);
-	disk_set_max_open_zones(vblk->disk, le32_to_cpu(v));
-
-	dev_dbg(&vdev->dev, "max open zones = %u\n", le32_to_cpu(v));
+	disk_set_max_open_zones(vblk->disk, v);
+	dev_dbg(&vdev->dev, "max open zones = %u\n", v);
 
 	virtio_cread(vdev, struct virtio_blk_config,
 		     zoned.max_active_zones, &v);
-	disk_set_max_active_zones(vblk->disk, le32_to_cpu(v));
-	dev_dbg(&vdev->dev, "max active zones = %u\n", le32_to_cpu(v));
+	disk_set_max_active_zones(vblk->disk, v);
+	dev_dbg(&vdev->dev, "max active zones = %u\n", v);
 
 	virtio_cread(vdev, struct virtio_blk_config,
-		     zoned.write_granularity, &v);
-	if (!v) {
+		     zoned.write_granularity, &wg);
+	if (!wg) {
 		dev_warn(&vdev->dev, "zero write granularity reported\n");
 		return -ENODEV;
 	}
-	blk_queue_physical_block_size(q, le32_to_cpu(v));
-	blk_queue_io_min(q, le32_to_cpu(v));
+	blk_queue_physical_block_size(q, wg);
+	blk_queue_io_min(q, wg);
 
-	dev_dbg(&vdev->dev, "write granularity = %u\n", le32_to_cpu(v));
+	dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
 
 	/*
 	 * virtio ZBD specification doesn't require zones to be a power of
 	 * two sectors in size, but the code in this driver expects that.
 	 */
-	virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, &v);
-	vblk->zone_sectors = le32_to_cpu(v);
+	virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors,
+		     &vblk->zone_sectors);
 	if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) {
 		dev_err(&vdev->dev,
 			"zoned device with non power of two zone size %u\n",
@@ -783,8 +852,15 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 			dev_warn(&vdev->dev, "zero max_append_sectors reported\n");
 			return -ENODEV;
 		}
-		blk_queue_max_zone_append_sectors(q, le32_to_cpu(v));
-		dev_dbg(&vdev->dev, "max append sectors = %u\n", le32_to_cpu(v));
+		if ((v << SECTOR_SHIFT) < wg) {
+			dev_err(&vdev->dev,
+				"write granularity %u exceeds max_append_sectors %u limit\n",
+				wg, v);
+			return -ENODEV;
+		}
+
+		blk_queue_max_zone_append_sectors(q, v);
+		dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 	}
 
 	return ret;
@@ -794,6 +870,7 @@ static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
 {
 	return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED);
 }
+
 #else
 
 /*
@@ -801,9 +878,11 @@ static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
  * We only need to define a few symbols to avoid compilation errors.
  */
 #define virtblk_report_zones       NULL
+
 static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
 {
 }
+
 static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
 			struct virtio_blk *vblk, struct request_queue *q)
 {
@@ -831,7 +910,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		return PTR_ERR(req);
 
 	vbr = blk_mq_rq_to_pdu(req);
-	vbr->in_hdr_len = sizeof(vbr->status);
+	vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
 	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 	vbr->out_hdr.sector = 0;
 
@@ -840,7 +919,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		goto out;
 
 	blk_execute_rq(req, false);
-	err = blk_status_to_errno(virtblk_result(vbr->status));
+	err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
 out:
 	blk_mq_free_request(req);
 	return err;
@@ -1504,9 +1583,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 			goto out_cleanup_disk;
 	}
 
-	dev_info(&vdev->dev, "blk config size: %zu\n",
-		sizeof(struct virtio_blk_config));
-
 	err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
 	if (err)
 		goto out_cleanup_disk;
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 5af2a0300bb9..3744e4da1b2a 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -140,11 +140,11 @@ struct virtio_blk_config {
 
 	/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
 	struct virtio_blk_zoned_characteristics {
-		__le32 zone_sectors;
-		__le32 max_open_zones;
-		__le32 max_active_zones;
-		__le32 max_append_sectors;
-		__le32 write_granularity;
+		__virtio32 zone_sectors;
+		__virtio32 max_open_zones;
+		__virtio32 max_active_zones;
+		__virtio32 max_append_sectors;
+		__virtio32 write_granularity;
 		__u8 model;
 		__u8 unused2[3];
 	} zoned;
@@ -241,11 +241,11 @@ struct virtio_blk_outhdr {
  */
 struct virtio_blk_zone_descriptor {
 	/* Zone capacity */
-	__le64 z_cap;
+	__virtio64 z_cap;
 	/* The starting sector of the zone */
-	__le64 z_start;
+	__virtio64 z_start;
 	/* Zone write pointer position in sectors */
-	__le64 z_wp;
+	__virtio64 z_wp;
 	/* Zone type */
 	__u8 z_type;
 	/* Zone state */
@@ -254,7 +254,7 @@ struct virtio_blk_zone_descriptor {
 };
 
 struct virtio_blk_zone_report {
-	__le64 nr_zones;
+	__virtio64 nr_zones;
 	__u8 reserved[56];
 	struct virtio_blk_zone_descriptor zones[];
 };

From 10805eb5d6d15fd4f61b05cc7aa269e12ab99848 Mon Sep 17 00:00:00 2001
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Date: Thu, 30 Mar 2023 17:49:53 -0400
Subject: [PATCH 2/7] virtio-blk: fix ZBD probe in kernels without ZBD support

When the kernel is built without support for zoned block devices,
virtio-blk probe needs to error out any host-managed device scans
to prevent such devices from appearing in the system as non-zoned.
The current virtio-blk code simply bypasses all ZBD checks if
CONFIG_BLK_DEV_ZONED is not defined and this leads to host-managed
block devices being presented as non-zoned in the OS. This is one of
the main problems this patch series is aimed to fix.

In this patch, make VIRTIO_BLK_F_ZONED feature defined even when
CONFIG_BLK_DEV_ZONED is not. This change makes the code compliant with
the voted revision of virtio-blk ZBD spec. Modify the probe code to
look at the situation when VIRTIO_BLK_F_ZONED is negotiated in a kernel
that is built without ZBD support. In this case, the code checks
the zoned model of the device and fails the probe is the device
is host-managed.

The patch also adds the comment to clarify that the call to perform
the zoned device probe is correctly placed after virtio_device ready().

Fixes: 95bfec41bd3d ("virtio-blk: add support for zoned block devices")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Message-Id: <20230330214953.1088216-3-dmitry.fomichev@wdc.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4f0dbbb3d4a5..2b918e28acaa 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -866,16 +866,12 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 	return ret;
 }
 
-static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
-{
-	return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED);
-}
-
 #else
 
 /*
  * Zoned block device support is not configured in this kernel.
- * We only need to define a few symbols to avoid compilation errors.
+ * Host-managed zoned devices can't be supported, but others are
+ * good to go as regular block devices.
  */
 #define virtblk_report_zones       NULL
 
@@ -886,12 +882,16 @@ static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
 static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
 			struct virtio_blk *vblk, struct request_queue *q)
 {
-	return -EOPNOTSUPP;
-}
+	u8 model;
 
-static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
-{
-	return false;
+	virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
+	if (model == VIRTIO_BLK_Z_HM) {
+		dev_err(&vdev->dev,
+			"virtio_blk: zoned devices are not supported");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
@@ -1577,7 +1577,11 @@ static int virtblk_probe(struct virtio_device *vdev)
 	virtblk_update_capacity(vblk, false);
 	virtio_device_ready(vdev);
 
-	if (virtblk_has_zoned_feature(vdev)) {
+	/*
+	 * All steps that follow use the VQs therefore they need to be
+	 * placed after the virtio_device_ready() call above.
+	 */
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
 		err = virtblk_probe_zoned_device(vdev, vblk, q);
 		if (err)
 			goto out_cleanup_disk;
@@ -1683,10 +1687,7 @@ static unsigned int features[] = {
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
 	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
-	VIRTIO_BLK_F_SECURE_ERASE,
-#ifdef CONFIG_BLK_DEV_ZONED
-	VIRTIO_BLK_F_ZONED,
-#endif /* CONFIG_BLK_DEV_ZONED */
+	VIRTIO_BLK_F_SECURE_ERASE, VIRTIO_BLK_F_ZONED,
 };
 
 static struct virtio_driver virtio_blk = {

From e508efc3ae7e44eb3caf595a086bfd3824da5b9a Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Mar 2023 21:06:18 -0500
Subject: [PATCH 3/7] vhost-scsi: Fix vhost_scsi struct use after free

If vhost_scsi_setup_vq_cmds fails we leave the tpg->vhost_scsi pointer
set. If the device is freed and then the user unmaps the LUN, the call to
vhost_scsi_port_unlink -> vhost_scsi_hotunplug will see the that
tpg->vhost_scsi is still set and try to use it.

This has us clear the vhost_scsi pointer in the failure path. It also
has us take tv_tpg_mutex in this failure path, because tv_tpg_vhost_count
is accessed under this mutex in vhost_scsi_drop_nexus and in the future
we will want to serialize access to tpg->vhost_scsi with that mutex
instead of the vhost_scsi_mutex.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Message-Id: <20230321020624.13323-2-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/scsi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index b244e7c0f514..5875241e1654 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1658,7 +1658,10 @@ undepend:
 	for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) {
 		tpg = vs_tpg[i];
 		if (tpg) {
+			mutex_lock(&tpg->tv_tpg_mutex);
+			tpg->vhost_scsi = NULL;
 			tpg->tv_tpg_vhost_count--;
+			mutex_unlock(&tpg->tv_tpg_mutex);
 			target_undepend_item(&tpg->se_tpg.tpg_group.cg_item);
 		}
 	}

From 4c363c81f66c77e0a2394b9a4efa707d122dc544 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Mar 2023 21:06:19 -0500
Subject: [PATCH 4/7] vhost-scsi: Fix crash during LUN unmapping

We normally clear the endpoint then unmap LUNs so the devices are fully
shutdown when the LUN is unmapped, but it's legal to unmap before
clearing. If the user does that while TMFs are running then we can end
up crashing.

vhost_scsi_port_unlink assumes that the LUN's tmf struct will always be on
the tmf_queue list. However, if a TMF is running then it will have been
removed while it's executing. If we do a LUN unmap at this time, then
we assume the entry is on the list and just start accessing it and free
it.

This fixes the bug by just allocating the vhost_scsi_tmf struct when it's
needed like is done with the se_tmr struct that's needed when we submit
the TMF. In this path perf is not an issue and we can use GFP_KERNEL
since it won't swing directly back on us, so we don't need to preallocate
the struct.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Message-Id: <20230321020624.13323-3-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/scsi.c | 36 ++++--------------------------------
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 5875241e1654..32d0be968103 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -125,7 +125,6 @@ struct vhost_scsi_tpg {
 	struct se_portal_group se_tpg;
 	/* Pointer back to vhost_scsi, protected by tv_tpg_mutex */
 	struct vhost_scsi *vhost_scsi;
-	struct list_head tmf_queue;
 };
 
 struct vhost_scsi_tport {
@@ -206,10 +205,8 @@ struct vhost_scsi {
 
 struct vhost_scsi_tmf {
 	struct vhost_work vwork;
-	struct vhost_scsi_tpg *tpg;
 	struct vhost_scsi *vhost;
 	struct vhost_scsi_virtqueue *svq;
-	struct list_head queue_entry;
 
 	struct se_cmd se_cmd;
 	u8 scsi_resp;
@@ -352,12 +349,9 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 
 static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf)
 {
-	struct vhost_scsi_tpg *tpg = tmf->tpg;
 	struct vhost_scsi_inflight *inflight = tmf->inflight;
 
-	mutex_lock(&tpg->tv_tpg_mutex);
-	list_add_tail(&tpg->tmf_queue, &tmf->queue_entry);
-	mutex_unlock(&tpg->tv_tpg_mutex);
+	kfree(tmf);
 	vhost_scsi_put_inflight(inflight);
 }
 
@@ -1194,19 +1188,11 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 		goto send_reject;
 	}
 
-	mutex_lock(&tpg->tv_tpg_mutex);
-	if (list_empty(&tpg->tmf_queue)) {
-		pr_err("Missing reserve TMF. Could not handle LUN RESET.\n");
-		mutex_unlock(&tpg->tv_tpg_mutex);
+	tmf = kzalloc(sizeof(*tmf), GFP_KERNEL);
+	if (!tmf)
 		goto send_reject;
-	}
 
-	tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf,
-			       queue_entry);
-	list_del_init(&tmf->queue_entry);
-	mutex_unlock(&tpg->tv_tpg_mutex);
-
-	tmf->tpg = tpg;
+	vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work);
 	tmf->vhost = vs;
 	tmf->svq = svq;
 	tmf->resp_iov = vq->iov[vc->out];
@@ -2035,19 +2021,11 @@ static int vhost_scsi_port_link(struct se_portal_group *se_tpg,
 {
 	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
 				struct vhost_scsi_tpg, se_tpg);
-	struct vhost_scsi_tmf *tmf;
-
-	tmf = kzalloc(sizeof(*tmf), GFP_KERNEL);
-	if (!tmf)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&tmf->queue_entry);
-	vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work);
 
 	mutex_lock(&vhost_scsi_mutex);
 
 	mutex_lock(&tpg->tv_tpg_mutex);
 	tpg->tv_tpg_port_count++;
-	list_add_tail(&tmf->queue_entry, &tpg->tmf_queue);
 	mutex_unlock(&tpg->tv_tpg_mutex);
 
 	vhost_scsi_hotplug(tpg, lun);
@@ -2062,16 +2040,11 @@ static void vhost_scsi_port_unlink(struct se_portal_group *se_tpg,
 {
 	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
 				struct vhost_scsi_tpg, se_tpg);
-	struct vhost_scsi_tmf *tmf;
 
 	mutex_lock(&vhost_scsi_mutex);
 
 	mutex_lock(&tpg->tv_tpg_mutex);
 	tpg->tv_tpg_port_count--;
-	tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf,
-			       queue_entry);
-	list_del(&tmf->queue_entry);
-	kfree(tmf);
 	mutex_unlock(&tpg->tv_tpg_mutex);
 
 	vhost_scsi_hotunplug(tpg, lun);
@@ -2332,7 +2305,6 @@ vhost_scsi_make_tpg(struct se_wwn *wwn, const char *name)
 	}
 	mutex_init(&tpg->tv_tpg_mutex);
 	INIT_LIST_HEAD(&tpg->tv_tpg_list);
-	INIT_LIST_HEAD(&tpg->tmf_queue);
 	tpg->tport = tport;
 	tpg->tport_tpgt = tpgt;
 

From 9513c55ce3e8f0f118c47423452c4a4bdaa80222 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@chromium.org>
Date: Wed, 15 Feb 2023 15:33:50 -0700
Subject: [PATCH 5/7] tools/virtio: fix typo in README instructions

We need to have a unique chardev for each data path, else the chardevs
will collide and qemu will die with this message:

  qemu-system-x86_64: -device
  virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel0,
  id=channel1,name=trace-path-cpu0:
  Property 'virtserialport.chardev' can't take value 'charchannel0':
  Device 'charchannel0' is in use

Signed-off-by: Ross Zwisler <zwisler@google.com>
Message-Id: <20230215223350.2658616-7-zwisler@google.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tools/virtio/virtio-trace/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README
index b64845b823ab..4fb9368bf751 100644
--- a/tools/virtio/virtio-trace/README
+++ b/tools/virtio/virtio-trace/README
@@ -61,7 +61,7 @@ and
       id=channel0,name=agent-ctl-path\
  ##data path##
      -chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\
-     -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel0,\
+     -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,\
       id=channel1,name=trace-path-cpu0\
       ...
 

From f0417e72add5aed5997092b6eb76298290d866c9 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Mon, 3 Apr 2023 14:40:39 +0300
Subject: [PATCH 6/7] vdpa/mlx5: Add and remove debugfs in setup/teardown
 driver

The right place to add the debugfs create is in
setup_driver() and remove it in teardown_driver().

Current code adds the debugfs when creating the device but resetting a
device will remove the debugfs subtree and subsequent set_driver will
not be able to create the files since the debugfs pointer is NULL.

Fixes: 294221004322 ("vdpa/mlx5: Add debugfs subtree")
Signed-off-by: Eli Cohen <elic@nvidia.com>

v3 -> v4:
Fix error flow in setup_driver()
Message-Id: <20230403114039.11102-1-elic@nvidia.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 520646ae7fa0..195963b82b63 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2467,10 +2467,11 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev)
 		err = 0;
 		goto out;
 	}
+	mlx5_vdpa_add_debugfs(ndev);
 	err = setup_virtqueues(mvdev);
 	if (err) {
 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
-		goto out;
+		goto err_setup;
 	}
 
 	err = create_rqt(ndev);
@@ -2500,6 +2501,8 @@ err_tir:
 	destroy_rqt(ndev);
 err_rqt:
 	teardown_virtqueues(ndev);
+err_setup:
+	mlx5_vdpa_remove_debugfs(ndev->debugfs);
 out:
 	return err;
 }
@@ -2513,6 +2516,8 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev)
 	if (!ndev->setup)
 		return;
 
+	mlx5_vdpa_remove_debugfs(ndev->debugfs);
+	ndev->debugfs = NULL;
 	teardown_steering(ndev);
 	destroy_tir(ndev);
 	destroy_rqt(ndev);
@@ -3261,7 +3266,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 	if (err)
 		goto err_reg;
 
-	mlx5_vdpa_add_debugfs(ndev);
 	mgtdev->ndev = ndev;
 	return 0;
 

From 9da667e50c7e62266f3c2f8ad57b32fca40716b1 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 29 Mar 2023 18:03:21 +0200
Subject: [PATCH 7/7] vdpa_sim_net: complete the initialization before register
 the device

Initialization must be completed before calling _vdpa_register_device()
since it can connect the device to the vDPA bus, so requests can arrive
after that call.

So for example vdpasim_net_work(), which uses the net->*_stats variables,
can be scheduled before they are initialized.

Let's move _vdpa_register_device() to the end of vdpasim_net_dev_add()
and add a comment to avoid future issues.

Fixes: 0899774cb360 ("vdpa_sim_net: vendor satistics")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20230329160321.187176-1-sgarzare@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index 862f405362de..dfe2ce341803 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -466,16 +466,21 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 
 	vdpasim_net_setup_config(simdev, config);
 
-	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
-	if (ret)
-		goto reg_err;
-
 	net = sim_to_net(simdev);
 
 	u64_stats_init(&net->tx_stats.syncp);
 	u64_stats_init(&net->rx_stats.syncp);
 	u64_stats_init(&net->cq_stats.syncp);
 
+	/*
+	 * Initialization must be completed before this call, since it can
+	 * connect the device to the vDPA bus, so requests can arrive after
+	 * this call.
+	 */
+	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
+	if (ret)
+		goto reg_err;
+
 	return 0;
 
 reg_err: