Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull more block updates from Jens Axboe:
 "This is a followup for block changes, that didn't make the initial
  pull request. It's a bit of a mixed bag, this contains:

   - A followup pull request from Sagi for NVMe. Outside of fixups for
     NVMe, it also includes a series for ensuring that we properly
     quiesce hardware queues when browsing live tags.

   - Set of integrity fixes from Dmitry (mostly), fixing various issues
     for folks using DIF/DIX.

   - Fix for a bug introduced in cciss, with the req init changes. From
     Christoph.

   - Fix for a bug in BFQ, from Paolo.

   - Two followup fixes for lightnvm/pblk from Javier.

   - Depth fix from Ming for blk-mq-sched.

   - Also from Ming, a fix for an mtip32xx performance regression
     introduced with the dynamic initialization of commands"

* 'for-linus' of git://git.kernel.dk/linux-block: (44 commits)
  block: call bio_uninit in bio_endio
  nvmet: avoid unneeded assignment of submit_bio return value
  nvme-pci: add module parameter for io queue depth
  nvme-pci: compile warnings in nvme_alloc_host_mem()
  nvmet_fc: Accept variable pad lengths on Create Association LS
  nvme_fc/nvmet_fc: revise Create Association descriptor length
  lightnvm: pblk: remove unnecessary checks
  lightnvm: pblk: control I/O flow also on tear down
  cciss: initialize struct scsi_req
  null_blk: fix error flow for shared tags during module_init
  block: Fix __blkdev_issue_zeroout loop
  nvme-rdma: unconditionally recycle the request mr
  nvme: split nvme_uninit_ctrl into stop and uninit
  virtio_blk: quiesce/unquiesce live IO when entering PM states
  mtip32xx: quiesce request queues to make sure no submissions are inflight
  nbd: quiesce request queues to make sure no submissions are inflight
  nvme: kick requeue list when requeueing a request instead of when starting the queues
  nvme-pci: quiesce/unquiesce admin_q instead of start/stop its hw queues
  nvme-loop: quiesce/unquiesce admin_q instead of start/stop its hw queues
  nvme-fc: quiesce/unquiesce admin_q instead of start/stop its hw queues
  ...
Author: Linus Torvalds
Date:   2017-07-11 15:36:52 -07:00
40 changed files with 601 additions and 441 deletions
Documentation/block/data-integrity.txt  +2 -4
@@ -192,7 +192,7 @@ will require extra work due to the application tag.
supported by the block device.
int bio_integrity_prep(bio);
bool bio_integrity_prep(bio);
To generate IMD for WRITE and to set up buffers for READ, the
filesystem must call bio_integrity_prep(bio).
@@ -201,9 +201,7 @@ will require extra work due to the application tag.
sector must be set, and the bio should have all data pages
added. It is up to the caller to ensure that the bio does not
change while I/O is in progress.
bio_integrity_prep() should only be called if
bio_integrity_enabled() returned 1.
Complete bio with error if prepare failed for some reason.
5.3 PASSING EXISTING INTEGRITY METADATA
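From the submitter's side the new contract looks like this. A minimal standalone sketch with toy stand-ins for the kernel types (not the real API; it only models the bool return convention exercised by the blk-core.c and blk-mq.c hunks further down):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the kernel types involved. */
struct bio {
	int bi_status;			/* 0 = OK, nonzero = error */
	bool has_integrity_profile;	/* toy flag, not a real bio field */
};

static void bio_endio(struct bio *bio)
{
	printf("bio completed, status=%d\n", bio->bi_status);
}

/*
 * Models the new bool bio_integrity_prep(): returns true when the
 * caller may proceed (including all "no integrity needed" cases),
 * and false after it has already failed the bio itself.
 */
static bool bio_integrity_prep(struct bio *bio)
{
	if (!bio->has_integrity_profile)
		return true;		/* nothing to do, just proceed */

	bio->bi_status = 1;		/* pretend the buffer allocation
					 * failed (BLK_STS_RESOURCE) */
	bio_endio(bio);			/* prep completes the bio itself */
	return false;
}

int main(void)
{
	struct bio bio = { .has_integrity_profile = true };

	if (!bio_integrity_prep(&bio))
		return 0;		/* caller just bails out */
	/* ... a real submitter would queue the bio here ... */
	return 0;
}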
block/bfq-iosched.c  +10 -4
@@ -3483,11 +3483,17 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
}
}
}
/* Update weight both if it must be raised and if it must be lowered */
/*
* To improve latency (for this or other queues), immediately
* update weight both if it must be raised and if it must be
* lowered. Since the entity may be on some active tree here and
* might have a pending change of its ioprio class, invoke the
* next function with the last parameter unset (see the
* comments on the function).
*/
if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
__bfq_entity_update_weight_prio(
bfq_entity_service_tree(entity),
entity);
__bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
entity, false);
}
/*
block/bfq-iosched.h  +2 -1
@@ -892,7 +892,8 @@ void bfq_put_idle_entity(struct bfq_service_tree *st,
struct bfq_entity *entity);
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
struct bfq_entity *entity);
struct bfq_entity *entity,
bool update_class_too);
void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
unsigned long time_ms);
block/bfq-wf2q.c  +34 -5
@@ -694,10 +694,28 @@ struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
return sched_data->service_tree + idx;
}
/*
* Update weight and priority of entity. If update_class_too is true,
* then update the ioprio_class of entity too.
*
* The reason why the update of ioprio_class is controlled through the
* last parameter is as follows. Changing the ioprio class of an
* entity implies changing the destination service trees for that
* entity. If such a change occurred when the entity is already on one
* of the service trees for its previous class, then the state of the
* entity would become more complex: none of the new possible service
* trees for the entity, according to bfq_entity_service_tree(), would
* match any of the possible service trees on which the entity
* is. Complex operations involving these trees, such as entity
* activations and deactivations, should take into account this
* additional complexity. To avoid this issue, this function is
* invoked with update_class_too unset at the points in the code where
* the entity may happen to be on some tree.
*/
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
struct bfq_entity *entity)
struct bfq_entity *entity,
bool update_class_too)
{
struct bfq_service_tree *new_st = old_st;
@@ -739,9 +757,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
bfq_weight_to_ioprio(entity->orig_weight);
}
if (bfqq)
if (bfqq && update_class_too)
bfqq->ioprio_class = bfqq->new_ioprio_class;
entity->prio_changed = 0;
/*
* Reset prio_changed only if the ioprio_class change
* is not pending any longer.
*/
if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class)
entity->prio_changed = 0;
/*
* NOTE: here we may be changing the weight too early,
@@ -867,7 +891,12 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
st = __bfq_entity_update_weight_prio(st, entity);
/*
* When this function is invoked, entity is not in any service
* tree, so it is safe to invoke the next function with the last
* parameter set (see the comments on the function).
*/
st = __bfq_entity_update_weight_prio(st, entity, true);
bfq_calc_finish(entity, entity->budget);
/*
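The comment above carries the whole argument. A heavily simplified model of why the class switch is deferred; the entity, tree and flag handling here are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct entity {
	int ioprio_class;	/* class currently used for tree lookup */
	int new_ioprio_class;	/* requested class, may differ */
	bool prio_changed;	/* a change is still pending */
	bool on_tree;		/* entity currently sits on a service tree */
};

/* bfq_entity_service_tree() derives the tree from the current class. */
static int service_tree_idx(const struct entity *e)
{
	return e->ioprio_class;
}

/*
 * Mirrors the update_class_too idea: fold new_ioprio_class into
 * ioprio_class only when asked to, and keep prio_changed set until
 * the class change has actually been applied.
 */
static void update_weight_prio(struct entity *e, bool update_class_too)
{
	if (update_class_too)
		e->ioprio_class = e->new_ioprio_class;

	if (e->ioprio_class == e->new_ioprio_class)
		e->prio_changed = false;	/* nothing pending any more */
}

int main(void)
{
	struct entity e = { .ioprio_class = 1, .new_ioprio_class = 2,
			    .prio_changed = true, .on_tree = true };

	/* On a tree: defer, so lookup still matches the tree we sit on. */
	update_weight_prio(&e, !e.on_tree);
	printf("on tree:  idx=%d pending=%d\n",
	       service_tree_idx(&e), e.prio_changed);

	/* Off the tree: now the switch is safe, and the flag clears. */
	e.on_tree = false;
	update_weight_prio(&e, true);
	printf("off tree: idx=%d pending=%d\n",
	       service_tree_idx(&e), e.prio_changed);
	return 0;
}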
block/bio-integrity.c  +75 -92
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
void bio_integrity_free(struct bio *bio)
static void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
@@ -120,8 +120,8 @@ void bio_integrity_free(struct bio *bio)
}
bio->bi_integrity = NULL;
bio->bi_opf &= ~REQ_INTEGRITY;
}
EXPORT_SYMBOL(bio_integrity_free);
/**
* bio_integrity_add_page - Attach integrity metadata
@@ -159,44 +159,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_integrity_add_page);
/**
* bio_integrity_enabled - Check whether integrity can be passed
* @bio: bio to check
*
* Description: Determines whether bio_integrity_prep() can be called
* on this bio or not. bio data direction and target device must be
* set prior to calling. The functions honors the write_generate and
* read_verify flags in sysfs.
*/
bool bio_integrity_enabled(struct bio *bio)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
return false;
if (!bio_sectors(bio))
return false;
/* Already protected? */
if (bio_integrity(bio))
return false;
if (bi == NULL)
return false;
if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL &&
(bi->flags & BLK_INTEGRITY_VERIFY))
return true;
if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL &&
(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
return false;
}
EXPORT_SYMBOL(bio_integrity_enabled);
/**
* bio_integrity_intervals - Return number of integrity intervals for a bio
* @bi: blk_integrity profile for device
@@ -222,10 +184,11 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
* @proc_iter: iterator to process
* @proc_fn: Pointer to the relevant processing function
*/
static blk_status_t bio_integrity_process(struct bio *bio,
integrity_processing_fn *proc_fn)
struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct blk_integrity_iter iter;
@@ -238,10 +201,10 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = bip_get_seed(bip);
iter.seed = proc_iter->bi_sector;
iter.prot_buf = prot_buf;
bio_for_each_segment(bv, bio, bviter) {
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = kmap_atomic(bv.bv_page);
iter.data_buf = kaddr + bv.bv_offset;
@@ -262,14 +225,15 @@ static blk_status_t bio_integrity_process(struct bio *bio,
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
*
* Description: Allocates a buffer for integrity metadata, maps the
* pages and attaches them to a bio. The bio must have data
* direction, target device and start sector set priot to calling. In
* the WRITE case, integrity metadata will be generated using the
* block device's integrity function. In the READ case, the buffer
* Description: Checks if the bio already has an integrity payload attached.
* If it does, the payload has been generated by another kernel subsystem,
* and we just pass it through. Otherwise allocates integrity payload.
* The bio must have data direction, target device and start sector set prior
* to calling. In the WRITE case, integrity metadata will be generated using
* the block device's integrity function. In the READ case, the buffer
* will be prepared for DMA and a suitable end_io handler set up.
*/
int bio_integrity_prep(struct bio *bio)
bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct blk_integrity *bi;
@@ -279,20 +243,41 @@ int bio_integrity_prep(struct bio *bio)
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
unsigned int intervals;
blk_status_t status;
bi = bdev_get_integrity(bio->bi_bdev);
q = bdev_get_queue(bio->bi_bdev);
BUG_ON(bi == NULL);
BUG_ON(bio_integrity(bio));
if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
return true;
if (!bio_sectors(bio))
return true;
/* Already protected? */
if (bio_integrity(bio))
return true;
if (bi == NULL)
return true;
if (bio_data_dir(bio) == READ) {
if (!bi->profile->verify_fn ||
!(bi->flags & BLK_INTEGRITY_VERIFY))
return true;
} else {
if (!bi->profile->generate_fn ||
!(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
}
intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
len = intervals * bi->tuple_size;
buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
status = BLK_STS_RESOURCE;
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
return -ENOMEM;
goto err_end_io;
}
end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -304,7 +289,8 @@ int bio_integrity_prep(struct bio *bio)
if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
return PTR_ERR(bip);
status = BLK_STS_RESOURCE;
goto err_end_io;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -330,7 +316,7 @@ int bio_integrity_prep(struct bio *bio)
bytes, offset);
if (ret == 0)
return 0;
return false;
if (ret < bytes)
break;
@@ -340,17 +326,18 @@ int bio_integrity_prep(struct bio *bio)
offset = 0;
}
/* Install custom I/O completion handler if read verify is enabled */
if (bio_data_dir(bio) == READ) {
bip->bip_end_io = bio->bi_end_io;
bio->bi_end_io = bio_integrity_endio;
}
/* Auto-generate integrity metadata if this is a write */
if (bio_data_dir(bio) == WRITE)
bio_integrity_process(bio, bi->profile->generate_fn);
if (bio_data_dir(bio) == WRITE) {
bio_integrity_process(bio, &bio->bi_iter,
bi->profile->generate_fn);
}
return true;
err_end_io:
bio->bi_status = status;
bio_endio(bio);
return false;
return 0;
}
EXPORT_SYMBOL(bio_integrity_prep);
@@ -368,16 +355,26 @@ static void bio_integrity_verify_fn(struct work_struct *work)
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct bvec_iter iter = bio->bi_iter;
bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn);
/*
* At the moment verify is called, the bio's iterator has been
* advanced during split and completion; we need to rewind the
* iterator to its original position.
*/
if (bio_rewind_iter(bio, &iter, iter.bi_done)) {
bio->bi_status = bio_integrity_process(bio, &iter,
bi->profile->verify_fn);
} else {
bio->bi_status = BLK_STS_IOERR;
}
/* Restore original bio completion handler */
bio->bi_end_io = bip->bip_end_io;
bio_integrity_free(bio);
bio_endio(bio);
}
/**
* bio_integrity_endio - Integrity I/O completion function
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
@@ -388,27 +385,19 @@ static void bio_integrity_verify_fn(struct work_struct *work)
* in process context. This function postpones completion
* accordingly.
*/
void bio_integrity_endio(struct bio *bio)
bool __bio_integrity_endio(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status) {
struct bio_integrity_payload *bip = bio_integrity(bio);
BUG_ON(bip->bip_bio != bio);
/* In case of an I/O error there is no point in verifying the
* integrity metadata. Restore original bio end_io handler
* and run it.
*/
if (bio->bi_status) {
bio->bi_end_io = bip->bip_end_io;
bio_endio(bio);
return;
INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bip->bip_work);
return false;
}
INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bip->bip_work);
bio_integrity_free(bio);
return true;
}
EXPORT_SYMBOL(bio_integrity_endio);
/**
* bio_integrity_advance - Advance integrity vector
@@ -425,6 +414,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
bip->bip_iter.bi_sector += bytes_done >> 9;
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
EXPORT_SYMBOL(bio_integrity_advance);
@@ -432,22 +422,15 @@ EXPORT_SYMBOL(bio_integrity_advance);
/**
* bio_integrity_trim - Trim integrity vector
* @bio: bio whose integrity vector to update
* @offset: offset to first data sector
* @sectors: number of data sectors
*
* Description: Used to trim the integrity vector in a cloned bio.
* The ivec will be advanced corresponding to 'offset' data sectors
* and the length will be truncated corresponding to 'len' data
* sectors.
*/
void bio_integrity_trim(struct bio *bio, unsigned int offset,
unsigned int sectors)
void bio_integrity_trim(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
bio_integrity_advance(bio, offset << 9);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
}
EXPORT_SYMBOL(bio_integrity_trim);
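The verify-side rework above hinges on restoring the iterator before reprocessing. A toy model of the advance/rewind bookkeeping; the real helpers are bvec_iter_advance() and bio_rewind_iter(), and this struct keeps only the fields the example needs:

#include <stdbool.h>
#include <stdio.h>

struct bvec_iter {
	unsigned long bi_sector;	/* current device sector */
	unsigned int bi_size;		/* bytes left to process */
	unsigned int bi_done;		/* bytes already completed */
};

static void iter_advance(struct bvec_iter *it, unsigned int bytes)
{
	it->bi_sector += bytes >> 9;
	it->bi_size -= bytes;
	it->bi_done += bytes;
}

/* Models bio_rewind_iter(): undo up to bi_done bytes of advancement. */
static bool iter_rewind(struct bvec_iter *it, unsigned int bytes)
{
	if (bytes > it->bi_done)
		return false;		/* cannot rewind past the start */
	it->bi_sector -= bytes >> 9;
	it->bi_size += bytes;
	it->bi_done -= bytes;
	return true;
}

int main(void)
{
	struct bvec_iter it = { .bi_sector = 2048, .bi_size = 8192 };

	iter_advance(&it, 4096);	/* splits/completion moved us along */

	/* Verify must run from the original position, seeded from there. */
	if (iter_rewind(&it, it.bi_done))
		printf("verify from sector %lu, %u bytes\n",
		       it.bi_sector, it.bi_size);
	return 0;
}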
block/bio.c  +9 -4
@@ -243,9 +243,6 @@ fallback:
void bio_uninit(struct bio *bio)
{
bio_disassociate_task(bio);
if (bio_integrity(bio))
bio_integrity_free(bio);
}
EXPORT_SYMBOL(bio_uninit);
@@ -1813,6 +1810,8 @@ void bio_endio(struct bio *bio)
again:
if (!bio_remaining_done(bio))
return;
if (!bio_integrity_endio(bio))
return;
/*
* Need to have a real endio function for chained bios, otherwise
@@ -1834,6 +1833,8 @@ again:
}
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
@@ -1868,7 +1869,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
split->bi_iter.bi_size = sectors << 9;
if (bio_integrity(split))
bio_integrity_trim(split, 0, sectors);
bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
@@ -1900,6 +1901,10 @@ void bio_trim(struct bio *bio, int offset, int size)
bio_advance(bio, offset << 9);
bio->bi_iter.bi_size = size;
if (bio_integrity(bio))
bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);
block/blk-core.c  +1 -4
@@ -1787,11 +1787,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
}
if (op_is_flush(bio->bi_opf)) {
spin_lock_irq(q->queue_lock);
block/blk-lib.c  +18 -5
@@ -261,6 +261,19 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return 0;
}
/*
* Convert a number of 512B sectors to a number of pages.
* The result is limited to a number of pages that can fit into a BIO.
* Also make sure that the result is always at least 1 (page) for the cases
* where nr_sects is lower than the number of sectors in a page.
*/
static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
{
sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1;
return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES);
}
/**
* __blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
@@ -307,18 +320,18 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
ret = 0;
while (nr_sects != 0) {
bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
gfp_mask);
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
nr_sects -= bi_size >> 9;
sector += bi_size >> 9;
if (bi_size < (sz << 9))
if (bi_size < sz)
break;
}
cond_resched();
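The helper's arithmetic is easy to check in isolation. A standalone sketch assuming 4K pages and BIO_MAX_PAGES of 256 (the kernel version expresses the clamp with min() on sector_t):

#include <stdio.h>

typedef unsigned long long sector_t;	/* 64-bit, as on a kernel build */

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define BIO_MAX_PAGES	256

/* Same rounding as the new helper: round up, clamp to one bio's worth. */
static unsigned int sectors_to_bio_pages(sector_t nr_sects)
{
	sector_t pages = ((nr_sects << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return pages < BIO_MAX_PAGES ? (unsigned int)pages : BIO_MAX_PAGES;
}

int main(void)
{
	printf("%u\n", sectors_to_bio_pages(1));	/* 1: never zero */
	printf("%u\n", sectors_to_bio_pages(8));	/* 1: one 4K page */
	printf("%u\n", sectors_to_bio_pages(9));	/* 2: partial page */
	printf("%u\n", sectors_to_bio_pages(1 << 20));	/* 256: clamped */
	return 0;
}

The old loop passed min(nr_sects, BIO_MAX_PAGES), a sector count, where next_bio() expects a page (bvec) count; the helper now converts sectors to pages first and keeps the result in [1, BIO_MAX_PAGES].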
block/blk-mq-sched.c  +5 -3
@@ -515,10 +515,12 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
}
/*
* Default to 256, since we don't split into sync/async like the
* old code did. Additionally, this is a per-hw queue depth.
* Default to double the smaller of the hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * BLKDEV_MAX_RQ;
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
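With BLKDEV_MAX_RQ at 128 this keeps the old 256 default for deep devices and shrinks it for shallow ones. A one-file check of the arithmetic (the helper name is illustrative):

#include <stdio.h>

#define BLKDEV_MAX_RQ	128	/* as defined in the kernel headers */

static unsigned int sched_nr_requests(unsigned int hw_queue_depth)
{
	unsigned int d = hw_queue_depth < BLKDEV_MAX_RQ ?
				hw_queue_depth : BLKDEV_MAX_RQ;

	return 2 * d;	/* double the smaller one, as in the hunk */
}

int main(void)
{
	printf("%u\n", sched_nr_requests(32));		/* 64, not 256 */
	printf("%u\n", sched_nr_requests(1024));	/* 256, as before */
	return 0;
}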
block/blk-mq.c  +1 -3
@@ -1547,10 +1547,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_io_error(bio);
if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
}
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
block/blk.h  +11
@@ -81,10 +81,21 @@ static inline void blk_queue_enter_live(struct request_queue *q)
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
static inline bool bio_integrity_endio(struct bio *bio)
{
if (bio_integrity(bio))
return __bio_integrity_endio(bio);
return true;
}
#else
static inline void blk_flush_integrity(void)
{
}
static inline bool bio_integrity_endio(struct bio *bio)
{
return true;
}
#endif
void blk_timeout_work(struct work_struct *work);
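The inline-wrapper-plus-__helper split keeps the common case cheap: bios without an integrity payload pay one NULL check and no function call. A standalone sketch of the shape (toy bio, illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct bio {
	void *bi_integrity;	/* NULL for the vast majority of bios */
};

static bool bio_integrity(const struct bio *bio)
{
	return bio->bi_integrity != NULL;
}

/* Out-of-line slow path, reached only for integrity-carrying bios. */
static bool __bio_integrity_endio(struct bio *bio)
{
	printf("deferring verification to a workqueue\n");
	return false;		/* completion is postponed */
}

/* Inline fast path, mirroring the blk.h hunk. */
static inline bool bio_integrity_endio(struct bio *bio)
{
	if (bio_integrity(bio))
		return __bio_integrity_endio(bio);
	return true;
}

int main(void)
{
	struct bio plain = { 0 };
	struct bio prot = { .bi_integrity = &plain };

	printf("plain: %d\n", bio_integrity_endio(&plain));	/* 1 */
	printf("prot:  %d\n", bio_integrity_endio(&prot));	/* 0 */
	return 0;
}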
block/t10-pi.c  +3 -6
@@ -28,9 +28,6 @@
typedef __be16 (csum_fn) (void *, unsigned int);
static const __be16 APP_ESCAPE = (__force __be16) 0xffff;
static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff;
static __be16 t10_pi_crc_fn(void *data, unsigned int len)
{
return cpu_to_be16(crc_t10dif(data, len));
@@ -82,7 +79,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
switch (type) {
case 1:
case 2:
if (pi->app_tag == APP_ESCAPE)
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
@@ -95,8 +92,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
}
break;
case 3:
if (pi->app_tag == APP_ESCAPE &&
pi->ref_tag == REF_ESCAPE)
if (pi->app_tag == T10_PI_APP_ESCAPE &&
pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
break;
}
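The renamed constants encode the T10 escape convention: an all-ones tag tells the checker to skip verification for that sector. A standalone sketch of the two rules (tuple fields simplified to host-endian; the real ones are __be16/__be32):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct t10_pi_tuple {
	uint16_t guard_tag;	/* CRC of the data interval */
	uint16_t app_tag;	/* application tag */
	uint32_t ref_tag;	/* low 32 bits of the target sector */
};

#define T10_PI_APP_ESCAPE	((uint16_t)0xffff)
#define T10_PI_REF_ESCAPE	((uint32_t)0xffffffff)

/* Types 1/2: the app-tag escape alone disables checking. */
static bool type1_skip(const struct t10_pi_tuple *pi)
{
	return pi->app_tag == T10_PI_APP_ESCAPE;
}

/* Type 3: both tags must carry the escape pattern. */
static bool type3_skip(const struct t10_pi_tuple *pi)
{
	return pi->app_tag == T10_PI_APP_ESCAPE &&
	       pi->ref_tag == T10_PI_REF_ESCAPE;
}

int main(void)
{
	struct t10_pi_tuple pi = {
		.app_tag = 0xffff,
		.ref_tag = 0xffffffff,
	};

	printf("type1 skip: %d\n", type1_skip(&pi));	/* 1 */
	printf("type3 skip: %d\n", type3_skip(&pi));	/* 1 */
	return 0;
}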
drivers/block/cciss.c  +8
@@ -1944,6 +1944,13 @@ static void cciss_get_serial_no(ctlr_info_t *h, int logvol,
return;
}
static void cciss_initialize_rq(struct request *rq)
{
struct scsi_request *sreq = blk_mq_rq_to_pdu(rq);
scsi_req_init(sreq);
}
/*
* cciss_add_disk sets up the block device queue for a logical drive
*/
@@ -1956,6 +1963,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
disk->queue->cmd_size = sizeof(struct scsi_request);
disk->queue->request_fn = do_cciss_request;
disk->queue->initialize_rq_fn = cciss_initialize_rq;
disk->queue->queue_lock = &h->lock;
queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, disk->queue);
if (blk_init_allocated_queue(disk->queue) < 0)
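A toy model of the hook being wired up: initialize_rq_fn runs for every freshly allocated request, so the scsi_request PDU is scrubbed before the driver ever sees it. Types here are simplified stand-ins:

#include <stdio.h>
#include <string.h>

struct scsi_request { unsigned char cmd[16]; unsigned int cmd_len; };
struct request { struct scsi_request pdu; };

static struct scsi_request *blk_mq_rq_to_pdu(struct request *rq)
{
	return &rq->pdu;
}

static void scsi_req_init(struct scsi_request *sreq)
{
	memset(sreq->cmd, 0, sizeof(sreq->cmd));
	sreq->cmd_len = 0;
}

/* The per-driver hook the hunk adds. */
static void cciss_initialize_rq(struct request *rq)
{
	scsi_req_init(blk_mq_rq_to_pdu(rq));
}

/* Models the block core invoking initialize_rq_fn at allocation time. */
static struct request *get_request(void (*initialize_rq_fn)(struct request *))
{
	static struct request rq;

	rq.pdu.cmd_len = 0xdead;	/* garbage, as from recycled memory */
	if (initialize_rq_fn)
		initialize_rq_fn(&rq);
	return &rq;
}

int main(void)
{
	struct request *rq = get_request(cciss_initialize_rq);

	printf("cmd_len=%u\n", rq->pdu.cmd_len);	/* 0: initialized */
	return 0;
}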
drivers/block/mtip32xx/mtip32xx.c  +17 -6
@@ -174,7 +174,6 @@ static void mtip_init_cmd_header(struct request *rq)
{
struct driver_data *dd = rq->q->queuedata;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
/* Point the command headers at the command tables. */
cmd->command_header = dd->port->command_list +
@@ -182,7 +181,7 @@ static void mtip_init_cmd_header(struct request *rq)
cmd->command_header_dma = dd->port->command_list_dma +
(sizeof(struct mtip_cmd_hdr) * rq->tag);
if (host_cap_64)
if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
@@ -386,6 +385,7 @@ static void mtip_init_port(struct mtip_port *port)
port->mmio + PORT_LST_ADDR_HI);
writel((port->rxfis_dma >> 16) >> 16,
port->mmio + PORT_FIS_ADDR_HI);
set_bit(MTIP_PF_HOST_CAP_64, &port->flags);
}
writel(port->command_list_dma & 0xFFFFFFFF,
@@ -950,7 +950,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
unsigned long to;
bool active = true;
blk_mq_stop_hw_queues(port->dd->queue);
blk_mq_quiesce_queue(port->dd->queue);
to = jiffies + msecs_to_jiffies(timeout);
do {
@@ -970,10 +970,10 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
break;
} while (time_before(jiffies, to));
blk_mq_start_stopped_hw_queues(port->dd->queue, true);
blk_mq_unquiesce_queue(port->dd->queue);
return active ? -EBUSY : 0;
err_fault:
blk_mq_start_stopped_hw_queues(port->dd->queue, true);
blk_mq_unquiesce_queue(port->dd->queue);
return -EFAULT;
}
@@ -2737,6 +2737,9 @@ static void mtip_abort_cmd(struct request *req, void *data,
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
if (!blk_mq_request_started(req))
return;
dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
clear_bit(req->tag, dd->port->cmds_to_issue);
@@ -2749,6 +2752,9 @@ static void mtip_queue_cmd(struct request *req, void *data,
{
struct driver_data *dd = data;
if (!blk_mq_request_started(req))
return;
set_bit(req->tag, dd->port->cmds_to_issue);
blk_abort_request(req);
}
@@ -2814,6 +2820,8 @@ restart_eh:
dev_warn(&dd->pdev->dev,
"Completion workers still active!");
blk_mq_quiesce_queue(dd->queue);
spin_lock(dd->queue->queue_lock);
blk_mq_tagset_busy_iter(&dd->tags,
mtip_queue_cmd, dd);
@@ -2826,6 +2834,8 @@ restart_eh:
mtip_abort_cmd, dd);
clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags);
blk_mq_unquiesce_queue(dd->queue);
}
if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
@@ -3995,8 +4005,9 @@ static int mtip_block_remove(struct driver_data *dd)
dd->disk->disk_name);
blk_freeze_queue_start(dd->queue);
blk_mq_stop_hw_queues(dd->queue);
blk_mq_quiesce_queue(dd->queue);
blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
blk_mq_unquiesce_queue(dd->queue);
/*
* Delete our gendisk structure. This also removes the device
drivers/block/mtip32xx/mtip32xx.h  +1
@@ -140,6 +140,7 @@ enum {
(1 << MTIP_PF_SE_ACTIVE_BIT) |
(1 << MTIP_PF_DM_ACTIVE_BIT) |
(1 << MTIP_PF_TO_ACTIVE_BIT)),
MTIP_PF_HOST_CAP_64 = 10, /* cache HOST_CAP_64 */
MTIP_PF_SVC_THD_ACTIVE_BIT = 4,
MTIP_PF_ISSUE_CMDS_BIT = 5,
drivers/block/nbd.c  +2 -2
@@ -661,9 +661,9 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
static void nbd_clear_que(struct nbd_device *nbd)
{
blk_mq_stop_hw_queues(nbd->disk->queue);
blk_mq_quiesce_queue(nbd->disk->queue);
blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
blk_mq_start_hw_queues(nbd->disk->queue);
blk_mq_unquiesce_queue(nbd->disk->queue);
dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
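nbd, virtio_blk and mtip32xx all make the same substitution, and the distinction matters: stopping hardware queues only prevents new dispatch runs from being scheduled, while quiescing also waits for every dispatch already in progress to finish. A toy single-threaded sketch of the contract (the real guarantee comes from an RCU/SRCU grace period inside blk_mq_quiesce_queue()):

#include <stdbool.h>
#include <stdio.h>

struct queue {
	bool stopped;	/* stop_hw_queues: no new runs, but a dispatch
			 * already started may still touch requests */
	bool quiesced;	/* quiesce: additionally drained all in-flight
			 * dispatch before returning */
};

static void quiesce(struct queue *q)
{
	q->stopped = true;
	/* ... here the kernel synchronizes with in-flight .queue_rq() ... */
	q->quiesced = true;
}

static void unquiesce(struct queue *q)
{
	q->quiesced = false;
	q->stopped = false;
}

static void clear_busy_requests(const struct queue *q)
{
	if (!q->quiesced) {
		printf("unsafe: dispatch may still be running\n");
		return;
	}
	printf("safe: walking busy tags with nothing in flight\n");
}

int main(void)
{
	struct queue q = { 0 };

	q.stopped = true;		/* what the old nbd code did ... */
	clear_busy_requests(&q);	/* ... which could race */
	q.stopped = false;

	quiesce(&q);			/* what the fixed code does */
	clear_busy_requests(&q);
	unquiesce(&q);
	return 0;
}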
drivers/block/null_blk.c  +13 -5
@@ -844,9 +844,6 @@ static int __init null_init(void)
queue_mode = NULL_Q_MQ;
}
if (queue_mode == NULL_Q_MQ && shared_tags)
null_init_tag_set(&tag_set);
if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
if (submit_queues < nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.",
@@ -858,11 +855,19 @@ static int __init null_init(void)
else if (!submit_queues)
submit_queues = 1;
if (queue_mode == NULL_Q_MQ && shared_tags) {
ret = null_init_tag_set(&tag_set);
if (ret)
return ret;
}
mutex_init(&lock);
null_major = register_blkdev(0, "nullb");
if (null_major < 0)
return null_major;
if (null_major < 0) {
ret = null_major;
goto err_tagset;
}
if (use_lightnvm) {
ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
@@ -891,6 +896,9 @@ err_dev:
kmem_cache_destroy(ppa_cache);
err_ppa:
unregister_blkdev(null_major, "nullb");
err_tagset:
if (queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
return ret;
}
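The fix is standard goto-based unwinding: the tag set is now initialized after the submit_queues fixups, its return value is checked, and a matching err_tagset label releases it on any later failure. A compact standalone model of the ordering (the failing register_blkdev() is simulated):

#include <stdbool.h>
#include <stdio.h>

static bool shared_tags = true;

static int  init_tag_set(void)  { printf("tag set ready\n"); return 0; }
static void free_tag_set(void)  { printf("tag set freed\n"); }
static int  register_blkdev_(void) { return -1; /* force the error path */ }

static int null_init(void)
{
	int ret;

	/* Acquire resources in order; each gets an unwind label. */
	if (shared_tags) {
		ret = init_tag_set();
		if (ret)
			return ret;	/* nothing to unwind yet */
	}

	ret = register_blkdev_();
	if (ret < 0)
		goto err_tagset;

	return 0;

err_tagset:
	if (shared_tags)
		free_tag_set();
	return ret;
}

int main(void)
{
	printf("null_init -> %d\n", null_init());
	return 0;
}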
drivers/block/virtio_blk.c  +2 -2
@@ -840,7 +840,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
/* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
blk_mq_stop_hw_queues(vblk->disk->queue);
blk_mq_quiesce_queue(vblk->disk->queue);
vdev->config->del_vqs(vdev);
return 0;
@@ -857,7 +857,7 @@ static int virtblk_restore(struct virtio_device *vdev)
virtio_device_ready(vdev);
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
blk_mq_unquiesce_queue(vblk->disk->queue);
return 0;
}
#endif
drivers/lightnvm/pblk-core.c  +48 -13
@@ -1670,13 +1670,10 @@ void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
queue_work(wq, &line_ws->ws);
}
void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap)
static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
int nr_ppas, int pos)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
struct pblk_lun *rlun = &pblk->luns[pos];
int ret;
/*
@@ -1690,14 +1687,8 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
ppa_list[0].g.ch != ppa_list[i].g.ch);
#endif
/* If the LUN has been locked for this same request, do no attempt to
* lock it again
*/
if (test_and_set_bit(pos, lun_bitmap))
return;
rlun = &pblk->luns[pos];
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
if (ret) {
switch (ret) {
case -ETIME:
@@ -1710,6 +1701,50 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
}
}
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
__pblk_down_page(pblk, ppa_list, nr_ppas, pos);
}
void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
/* If the LUN has been locked for this same request, do not attempt to
* lock it again
*/
if (test_and_set_bit(pos, lun_bitmap))
return;
__pblk_down_page(pblk, ppa_list, nr_ppas, pos);
}
void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
#ifdef CONFIG_NVM_DEBUG
int i;
for (i = 1; i < nr_ppas; i++)
WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
ppa_list[0].g.ch != ppa_list[i].g.ch);
#endif
rlun = &pblk->luns[pos];
up(&rlun->wr_sem);
}
void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap)
{
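Factoring out __pblk_down_page() lets the recovery path (next file) take a LUN write semaphore directly, while pblk_down_rq() keeps its bitmap check so a request never deadlocks on a LUN it already holds. A userspace approximation with POSIX semaphores; the kernel uses down_timeout(), and its test_and_set_bit() is atomic, unlike this single-threaded toy:

#include <semaphore.h>
#include <stdio.h>
#include <time.h>

#define NR_LUNS 4

static sem_t lun_sem[NR_LUNS];
static unsigned long lun_bitmap;	/* LUNs this request already holds */

/* Models __pblk_down_page(): bounded wait on one LUN's write semaphore. */
static int down_page(int pos)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += 30;		/* the hunk bumps 5s to 30s */
	return sem_timedwait(&lun_sem[pos], &ts);
}

/* Models pblk_down_rq(): skip LUNs already locked for this request. */
static int down_rq(int pos)
{
	if (lun_bitmap & (1UL << pos))
		return 0;		/* already held, don't lock again */
	lun_bitmap |= 1UL << pos;
	return down_page(pos);
}

int main(void)
{
	for (int i = 0; i < NR_LUNS; i++)
		sem_init(&lun_sem[i], 0, 1);

	printf("first take:  %d\n", down_rq(2));	/* 0: acquired */
	printf("second take: %d\n", down_rq(2));	/* 0: skipped */
	return 0;
}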
drivers/lightnvm/pblk-recovery.c  +21 -10
@@ -340,9 +340,14 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
struct pblk *pblk = pad_rq->pblk;
struct nvm_tgt_dev *dev = pblk->dev;
kref_put(&pad_rq->ref, pblk_recov_complete);
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
bio_put(rqd->bio);
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, WRITE);
atomic_dec(&pblk->inflight_io);
kref_put(&pad_rq->ref, pblk_recov_complete);
}
static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
@@ -385,7 +390,7 @@ next_pad_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
if (rq_ppas < pblk->min_write_pgs) {
pr_err("pblk: corrupted pad line %d\n", line->id);
goto free_rq;
goto fail_free_pad;
}
rq_len = rq_ppas * geo->sec_size;
@@ -393,7 +398,7 @@ next_pad_rq:
meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
if (!meta_list) {
ret = -ENOMEM;
goto free_data;
goto fail_free_pad;
}
ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
@@ -404,9 +409,9 @@ next_pad_rq:
ret = PTR_ERR(rqd);
goto fail_free_meta;
}
memset(rqd, 0, pblk_w_rq_size);
bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
goto fail_free_rqd;
@@ -453,15 +458,15 @@ next_pad_rq:
}
kref_get(&pad_rq->ref);
pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);
ret = pblk_submit_io(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
goto free_data;
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
goto fail_free_bio;
}
atomic_dec(&pblk->inflight_io);
left_line_ppas -= rq_ppas;
left_ppas -= rq_ppas;
if (left_ppas && left_line_ppas)
@@ -475,17 +480,23 @@ next_pad_rq:
ret = -ETIME;
}
if (!pblk_line_is_full(line))
pr_err("pblk: corrupted padded line: %d\n", line->id);
vfree(data);
free_rq:
kfree(pad_rq);
free_data:
vfree(data);
return ret;
fail_free_bio:
bio_put(bio);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, WRITE);
fail_free_meta:
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
fail_free_pad:
kfree(pad_rq);
vfree(data);
return ret;
}
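The pblk_end_io_recov() reordering above is a reference-counting discipline: drop your kref only after the last touch of anything the release side may free, since the waiter woken by the final put tears everything down. A minimal single-threaded kref sketch of that ordering (the kernel's struct kref is atomic):

#include <stdio.h>

struct kref { int refcount; };

static void kref_init(struct kref *k) { k->refcount = 1; }
static void kref_get(struct kref *k)  { k->refcount++; }

static void kref_put(struct kref *k, void (*release)(struct kref *))
{
	if (--k->refcount == 0)
		release(k);
}

struct pad_rq { struct kref ref; };

static void recov_complete(struct kref *k)
{
	/* In pblk this completes a completion; the waiting submitter
	 * then frees pad_rq and the padding buffers. */
	printf("last ref dropped, waiter may free everything\n");
}

static void end_io(struct pad_rq *pad)
{
	/* ... release bio, DMA buffers and rqd first ... */
	printf("per-request resources released\n");
	/* Only now is it safe to let the waiter run. */
	kref_put(&pad->ref, recov_complete);
}

int main(void)
{
	struct pad_rq pad;

	kref_init(&pad.ref);			/* submitter's reference */
	kref_get(&pad.ref);			/* one per in-flight rq */
	kref_put(&pad.ref, recov_complete);	/* submitter done submitting */
	end_io(&pad);				/* completion path */
	return 0;
}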

Some files were not shown because too many files have changed in this diff.