Merge branch 'for-4.2/core' of git://git.kernel.dk/linux-block

Pull core block IO update from Jens Axboe: "Nothing really major in here, mostly a collection of smaller optimizations and cleanups, mixed with various fixes. In more detail, this contains: - Addition of policy specific data to blkcg for block cgroups. From Arianna Avanzini. - Various cleanups around command types from Christoph. - Cleanup of the suspend block I/O path from Christoph. - Plugging updates from Shaohua and Jeff Moyer, for blk-mq. - Eliminating atomic inc/dec of both remaining IO count and reference count in a bio. From me. - Fixes for SG gap and chunk size support for data-less (discards) IO, so we can merge these better. From me. - Small restructuring of blk-mq shared tag support, freeing drivers from iterating hardware queues. From Keith Busch. - A few cfq-iosched tweaks, from Tahsin Erdogan and me. Makes the IOPS mode the default for non-rotational storage" * 'for-4.2/core' of git://git.kernel.dk/linux-block: (35 commits) cfq-iosched: fix other locations where blkcg_to_cfqgd() can return NULL cfq-iosched: fix sysfs oops when attempting to read unconfigured weights cfq-iosched: move group scheduling functions under ifdef cfq-iosched: fix the setting of IOPS mode on SSDs blktrace: Add blktrace.c to BLOCK LAYER in MAINTAINERS file block, cgroup: implement policy-specific per-blkcg data block: Make CFQ default to IOPS mode on SSDs block: add blk_set_queue_dying() to blkdev.h blk-mq: Shared tag enhancements block: don't honor chunk sizes for data-less IO block: only honor SG gap prevention for merges that contain data block: fix returnvar.cocci warnings block, dm: don't copy bios for request clones block: remove management of bi_remaining when restoring original bi_end_io block: replace trylock with mutex_lock in blkdev_reread_part() block: export blkdev_reread_part() and __blkdev_reread_part() suspend: simplify block I/O handling block: collapse bio bit space block: remove unused BIO_RW_BLOCK and BIO_EOF flags block: remove BIO_EOPNOTSUPP ...
2026-05-01 15:00:59 -07:00 · 2015-06-25 14:29:53 -07:00
parent cc8a0a9439 ae994ea972
commit bfffa1cc9d
64 changed files with 852 additions and 753 deletions
@@ -2075,6 +2075,7 @@ M:	Jens Axboe <axboe@kernel.dk>
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 S:	Maintained
 F:	block/
+F:	kernel/trace/blktrace.c

 BLOCK2MTD DRIVER
 M:	Joern Engel <joern@lazybastard.org>
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)

 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }

 /**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio_nodec(bio, error);
+		bio_endio(bio, error);

 		return;
 	}
@@ -270,8 +270,8 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
-	atomic_set(&bio->bi_cnt, 1);
+	atomic_set(&bio->__bi_remaining, 1);
+	atomic_set(&bio->__bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);

@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
 	__bio_free(bio);

 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
+	bio->bi_flags = flags | (1 << BIO_UPTODATE);
+	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);

@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
 	bio_put(bio);
 }

+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /**
 * bio_chain - chain bio completions
 * @bio: the target bio
@@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent)

 	bio->bi_private = parent;
 	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
+	bio_inc_remaining(parent);
 }
 EXPORT_SYMBOL(bio_chain);

@@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio);
 **/
 void bio_put(struct bio *bio)
 {
-	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
-	/*
-	 * last put frees it
-	 */
-	if (atomic_dec_and_test(&bio->bi_cnt))
+	if (!bio_flagged(bio, BIO_REFFED))
 		bio_free(bio);
+	else {
+		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+		/*
+		 * last put frees it
+		 */
+		if (atomic_dec_and_test(&bio->__bi_cnt))
+			bio_free(bio);
+	}
 }
 EXPORT_SYMBOL(bio_put);

@@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi)
 EXPORT_SYMBOL(bio_flush_dcache_pages);
 #endif

+static inline bool bio_remaining_done(struct bio *bio)
+{
+	/*
+	 * If we're not chaining, then ->__bi_remaining is always 1 and
+	 * we always end io on the first invocation.
+	 */
+	if (!bio_flagged(bio, BIO_CHAIN))
+		return true;
+
+	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+	if (atomic_dec_and_test(&bio->__bi_remaining)) {
+		clear_bit(BIO_CHAIN, &bio->bi_flags);
+		return true;
+	}
+
+	return false;
+}
+
 /**
 * bio_endio - end I/O on a bio
 * @bio:	bio
@@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
 void bio_endio(struct bio *bio, int error)
 {
 	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			error = -EIO;

-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
+		if (unlikely(!bio_remaining_done(bio)))
+			break;

 		/*
 		 * Need to have a real endio function for chained bios,
@@ -1789,21 +1821,6 @@ void bio_endio(struct bio *bio, int error)
 }
 EXPORT_SYMBOL(bio_endio);

-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	atomic_inc(&bio->bi_remaining);
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
 /**
 * bio_split - split a bio
 * @bio:	bio to split
@@ -9,6 +9,10 @@
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
+ *
+ * For policy-specific per-blkcg data:
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
 #include <linux/ioprio.h>
 #include <linux/kdev_t.h>
@@ -26,8 +30,7 @@

 static DEFINE_MUTEX(blkcg_pol_mutex);

-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
-			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
+struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);

 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
@@ -823,6 +826,8 @@ static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct blkcg *blkcg;
+	struct cgroup_subsys_state *ret;
+	int i;

 	if (!parent_css) {
 		blkcg = &blkcg_root;
@@ -830,17 +835,49 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	}

 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-	if (!blkcg)
-		return ERR_PTR(-ENOMEM);
+	if (!blkcg) {
+		ret = ERR_PTR(-ENOMEM);
+		goto free_blkcg;
+	}
+
+	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+		struct blkcg_policy_data *cpd;
+
+		/*
+		 * If the policy hasn't been attached yet, wait for it
+		 * to be attached before doing anything else. Otherwise,
+		 * check if the policy requires any specific per-cgroup
+		 * data: if it does, allocate and initialize it.
+		 */
+		if (!pol || !pol->cpd_size)
+			continue;
+
+		BUG_ON(blkcg->pd[i]);
+		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+		if (!cpd) {
+			ret = ERR_PTR(-ENOMEM);
+			goto free_pd_blkcg;
+		}
+		blkcg->pd[i] = cpd;
+		cpd->plid = i;
+		pol->cpd_init_fn(blkcg);
+	}

-	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
-	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);

 	return &blkcg->css;
+
+free_pd_blkcg:
+	for (i--; i >= 0; i--)
+		kfree(blkcg->pd[i]);
+
+free_blkcg:
+	kfree(blkcg);
+	return ret;
 }

 /**
@@ -958,8 +995,10 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
+	LIST_HEAD(cpds);
 	struct blkcg_gq *blkg, *new_blkg;
-	struct blkg_policy_data *pd, *n;
+	struct blkg_policy_data *pd, *nd;
+	struct blkcg_policy_data *cpd, *cnd;
 	int cnt = 0, ret;
 	bool preloaded;

@@ -1003,7 +1042,10 @@ int blkcg_activate_policy(struct request_queue *q,

 	spin_unlock_irq(q->queue_lock);

-	/* allocate policy_data for all existing blkgs */
+	/*
+	 * Allocate per-blkg and per-blkcg policy data
+	 * for all existing blkgs.
+	 */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1011,26 +1053,50 @@ int blkcg_activate_policy(struct request_queue *q,
 			goto out_free;
 		}
 		list_add_tail(&pd->alloc_node, &pds);
+
+		if (!pol->cpd_size)
+			continue;
+		cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
+		if (!cpd) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+		list_add_tail(&cpd->alloc_node, &cpds);
 	}

 	/*
-	 * Install the allocated pds.  With @q bypassing, no new blkg
+	 * Install the allocated pds and cpds. With @q bypassing, no new blkg
 	 * should have been created while the queue lock was dropped.
 	 */
 	spin_lock_irq(q->queue_lock);

 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds))) {
+		if (WARN_ON(list_empty(&pds)) ||
+		    WARN_ON(pol->cpd_size && list_empty(&cpds))) {
 			/* umm... this shouldn't happen, just abort */
 			ret = -ENOMEM;
 			goto out_unlock;
 		}
+		cpd = list_first_entry(&cpds, struct blkcg_policy_data,
+				       alloc_node);
+		list_del_init(&cpd->alloc_node);
 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 		list_del_init(&pd->alloc_node);

 		/* grab blkcg lock too while installing @pd on @blkg */
 		spin_lock(&blkg->blkcg->lock);

+		if (!pol->cpd_size)
+			goto no_cpd;
+		if (!blkg->blkcg->pd[pol->plid]) {
+			/* Per-policy per-blkcg data */
+			blkg->blkcg->pd[pol->plid] = cpd;
+			cpd->plid = pol->plid;
+			pol->cpd_init_fn(blkg->blkcg);
+		} else { /* must free it as it has already been extracted */
+			kfree(cpd);
+		}
+no_cpd:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
@@ -1045,8 +1111,10 @@ out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out_free:
 	blk_queue_bypass_end(q);
-	list_for_each_entry_safe(pd, n, &pds, alloc_node)
+	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
 		kfree(pd);
+	list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
+		kfree(cpd);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1087,6 +1155,8 @@ void blkcg_deactivate_policy(struct request_queue *q,

 		kfree(blkg->pd[pol->plid]);
 		blkg->pd[pol->plid] = NULL;
+		kfree(blkg->blkcg->pd[pol->plid]);
+		blkg->blkcg->pd[pol->plid] = NULL;

 		spin_unlock(&blkg->blkcg->lock);
 	}
@@ -23,11 +23,6 @@
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX

-/* CFQ specific, out here for blkcg->cfq_weight */
-#define CFQ_WEIGHT_MIN		10
-#define CFQ_WEIGHT_MAX		1000
-#define CFQ_WEIGHT_DEFAULT	500
-
 #ifdef CONFIG_BLK_CGROUP

 enum blkg_rwstat_type {
@@ -50,9 +45,7 @@ struct blkcg {
 	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;

-	/* TODO: per-policy storage in blkcg */
-	unsigned int			cfq_weight;	/* belongs to cfq */
-	unsigned int			cfq_leaf_weight;
+	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
 };

 struct blkg_stat {
@@ -87,6 +80,24 @@ struct blkg_policy_data {
 	struct list_head		alloc_node;
 };

+/*
+ * Policies that need to keep per-blkcg data which is independent
+ * from any request_queue associated to it must specify its size
+ * with the cpd_size field of the blkcg_policy structure and
+ * embed a blkcg_policy_data in it. blkcg core allocates
+ * policy-specific per-blkcg structures lazily the first time
+ * they are actually needed, so it handles them together with
+ * blkgs. cpd_init() is invoked to let each policy handle
+ * per-blkcg data.
+ */
+struct blkcg_policy_data {
+	/* the policy id this per-policy data belongs to */
+	int				plid;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
+};
+
 /* association between a blk cgroup and a request queue */
 struct blkcg_gq {
 	/* Pointer to the associated request_queue */
@@ -112,6 +123,7 @@ struct blkcg_gq {
 	struct rcu_head			rcu_head;
 };

+typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
@@ -122,10 +134,13 @@ struct blkcg_policy {
 	int				plid;
 	/* policy specific private data size */
 	size_t				pd_size;
+	/* policy specific per-blkcg data size */
+	size_t				cpd_size;
 	/* cgroup files for the policy */
 	struct cftype			*cftypes;

 	/* operations */
+	blkcg_pol_init_cpd_fn		*cpd_init_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
@@ -218,6 +233,12 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 	return blkg ? blkg->pd[pol->plid] : NULL;
 }

+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+						     struct blkcg_policy *pol)
+{
+	return blkcg ? blkcg->pd[pol->plid] : NULL;
+}
+
 /**
 * pdata_to_blkg - get blkg associated with policy private data
 * @pd: policy private data of interest
@@ -564,6 +585,9 @@ struct blkcg;
 struct blkg_policy_data {
 };

+struct blkcg_policy_data {
+};
+
 struct blkcg_gq {
 };

@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);

 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }

@@ -285,6 +286,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
 	q->request_fn(q);
 	q->request_fn_active--;
 }
+EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);

 /**
 * __blk_run_queue - run a single device queue
@@ -1525,7 +1527,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count)
+			    unsigned int *request_count,
+			    struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -1545,8 +1548,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		int el_ret;

-		if (rq->q == q)
+		if (rq->q == q) {
 			(*request_count)++;
+			/*
+			 * Only blk-mq multiple hardware queues case checks the
+			 * rq in the same queue, there should be only one such
+			 * rq in a queue
+			 **/
+			if (same_queue_rq)
+				*same_queue_rq = rq;
+		}

 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
 			continue;
@@ -1611,7 +1622,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;

 	spin_lock_irq(q->queue_lock);
@@ -1718,8 +1729,6 @@ static void handle_bad_sector(struct bio *bio)
 			bio->bi_rw,
 			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
-
-	set_bit(BIO_EOF, &bio->bi_flags);
 }

 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2904,95 +2913,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);

-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);

@@ -3034,21 +2970,20 @@ void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;

+	/*
+	 * If this is a nested plug, don't actually assign it.
+	 */
+	if (tsk->plug)
+		return;
+
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
-
 	/*
-	 * If this is a nested plug, don't actually assign it. It will be
-	 * flushed on its own.
+	 * Store ordering should not be needed here, since a potential
+	 * preempt will imply a full memory barrier
 	 */
-	if (!tsk->plug) {
-		/*
-		 * Store ordering should not be needed here, since a potential
-		 * preempt will imply a full memory barrier
-		 */
-		tsk->plug = plug;
-	}
+	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);

@@ -3195,10 +3130,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)

 void blk_finish_plug(struct blk_plug *plug)
 {
+	if (plug != current->plug)
+		return;
 	blk_flush_plug_list(plug, false);

-	if (plug == current->plug)
-		current->plug = NULL;
+	current->plug = NULL;
 }
 EXPORT_SYMBOL(blk_finish_plug);

@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-	bool is_pm_resume;

 	WARN_ON(irqs_disabled());
 	WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 		return;
 	}

-	/*
-	 * need to check this before __blk_run_queue(), because rq can
-	 * be freed before that returns.
-	 */
-	is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
-
 	spin_lock_irq(q->queue_lock);

 	if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,

 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
-	/* the queue is stopped so it won't be run */
-	if (is_pm_resume)
-		__blk_run_queue_uncond(q);
 	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;

-	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
+	/* Only check gaps if the bio carries data */
+	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
 		struct bio_vec *bprev;

 		bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }

+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;

+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;

@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;

 	int alloc_policy;
+	cpumask_var_t cpumask;
 };


@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;

 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)

 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;

-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;

-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -1237,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }

+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
 * Multiple hardware queue variant. This will not use per-process plugs,
 * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1248,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;

 	blk_queue_bounce(q, &bio);

@@ -1256,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}

+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1266,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}

+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;

 		blk_mq_bio_to_request(rq, bio);

 		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
+		 * we do limited pluging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
+		if (plug) {
+			/*
+			 * The plug list might get flushed before this. If that
+			 * happens, same_queue_rq is invalid and plug list is empty
+			 **/
+			if (same_queue_rq && !list_empty(&plug->mq_list)) {
+				old_rq = same_queue_rq;
+				list_del_init(&old_rq->queuelist);
 			}
-		}
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
+			return;
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}

 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1310,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }

@@ -1322,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;

-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);

 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1339,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}

-	if (use_plug && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;

 	rq = blk_mq_map_request(q, bio, &data);
@@ -1358,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
 		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}

 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1508,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;

 fail:
@@ -1792,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)

 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2056,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));

 	blk_mq_sysfs_unregister(q);

@@ -2173,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }

+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
@@ -2234,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;

 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}

 	kfree(set->tags);
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count);
+			    unsigned int *request_count,
+			    struct request **same_queue_rq);

 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);

 void blk_queue_congestion_threshold(struct request_queue *q);

-void __blk_run_queue_uncond(struct request_queue *q);
-
 int blk_dev_init(void);


@@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 	struct bio_vec *bvec, *org_vec;
 	int i;

-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
 	/*
 	 * free up bounce indirect pages used
 	 */
@@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool;
 #define sample_valid(samples)	((samples) > 80)
 #define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

+/* blkio-related constants */
+#define CFQ_WEIGHT_MIN          10
+#define CFQ_WEIGHT_MAX          1000
+#define CFQ_WEIGHT_DEFAULT      500
+
 struct cfq_ttime {
 	unsigned long last_end_request;

@@ -212,6 +217,15 @@ struct cfqg_stats {
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 };

+/* Per-cgroup data */
+struct cfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned int weight;
+	unsigned int leaf_weight;
+};
+
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
 	/* must be the first member */
@@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS

-static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
-}
-
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
-{
-	return pd_to_blkg(&cfqg->pd);
-}
-
 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

 /* cfqg stats flags */
@@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

 #ifdef CONFIG_CFQ_GROUP_IOSCHED

+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static struct cfq_group_data
+*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pd_to_blkg(&cfqg->pd);
+}
+
 static struct blkcg_policy blkcg_policy_cfq;

 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
@@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 }

+static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
+}
+
 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 {
 	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
 #endif
 }

+static void cfq_cpd_init(const struct blkcg *blkcg)
+{
+	struct cfq_group_data *cgd =
+		cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+
+	if (blkcg == &blkcg_root) {
+		cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
+		cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
+	} else {
+		cgd->weight = CFQ_WEIGHT_DEFAULT;
+		cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
+	}
+}
+
 static void cfq_pd_init(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);

 	cfq_init_cfqg_base(cfqg);
-	cfqg->weight = blkg->blkcg->cfq_weight;
-	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+	cfqg->weight = cgd->weight;
+	cfqg->leaf_weight = cgd->leaf_weight;
 	cfqg_stats_init(&cfqg->stats);
 	cfqg_stats_init(&cfqg->dead_stats);
 }
@@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)

 static int cfq_print_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+	unsigned int val = 0;
+
+	if (cgd)
+		val = cgd->weight;
+
+	seq_printf(sf, "%u\n", val);
 	return 0;
 }

 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+	unsigned int val = 0;
+
+	if (cgd)
+		val = cgd->leaf_weight;
+
+	seq_printf(sf, "%u\n", val);
 	return 0;
 }

@@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
+	struct cfq_group_data *cfqgd;
 	int ret;

 	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
@@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,

 	ret = -EINVAL;
 	cfqg = blkg_to_cfqg(ctx.blkg);
+	cfqgd = blkcg_to_cfqgd(blkcg);
+	if (!cfqg || !cfqgd)
+		goto err;
+
 	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
 		if (!is_leaf_weight) {
 			cfqg->dev_weight = ctx.v;
-			cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+			cfqg->new_weight = ctx.v ?: cfqgd->weight;
 		} else {
 			cfqg->dev_leaf_weight = ctx.v;
-			cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+			cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
 		}
 		ret = 0;
 	}

+err:
 	blkg_conf_finish(&ctx);
 	return ret ?: nbytes;
 }
@@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
+	struct cfq_group_data *cfqgd;
+	int ret = 0;

 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
 		return -EINVAL;

 	spin_lock_irq(&blkcg->lock);
+	cfqgd = blkcg_to_cfqgd(blkcg);
+	if (!cfqgd) {
+		ret = -EINVAL;
+		goto out;
+	}

 	if (!is_leaf_weight)
-		blkcg->cfq_weight = val;
+		cfqgd->weight = val;
 	else
-		blkcg->cfq_leaf_weight = val;
+		cfqgd->leaf_weight = val;

 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
 		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,

 		if (!is_leaf_weight) {
 			if (!cfqg->dev_weight)
-				cfqg->new_weight = blkcg->cfq_weight;
+				cfqg->new_weight = cfqgd->weight;
 		} else {
 			if (!cfqg->dev_leaf_weight)
-				cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+				cfqg->new_leaf_weight = cfqgd->leaf_weight;
 		}
 	}

+out:
 	spin_unlock_irq(&blkcg->lock);
-	return 0;
+	return ret;
 }

 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -4477,6 +4545,18 @@ out_free:
 	return ret;
 }

+static void cfq_registered_queue(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
+	struct cfq_data *cfqd = e->elevator_data;
+
+	/*
+	 * Default to IOPS mode with no idling for SSDs
+	 */
+	if (blk_queue_nonrot(q))
+		cfqd->cfq_slice_idle = 0;
+}
+
 /*
 * sysfs parts below -->
 */
@@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.elevator_registered_fn =	cfq_registered_queue,
 	},
 	.icq_size	=	sizeof(struct cfq_io_cq),
 	.icq_align	=	__alignof__(struct cfq_io_cq),
@@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
 	.pd_size		= sizeof(struct cfq_group),
+	.cpd_size		= sizeof(struct cfq_group_data),
 	.cftypes		= cfq_blkcg_files,

+	.cpd_init_fn		= cfq_cpd_init,
 	.pd_init_fn		= cfq_pd_init,
 	.pd_offline_fn		= cfq_pd_offline,
 	.pd_reset_stats_fn	= cfq_pd_reset_stats,
@@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
+		if (e->type->ops.elevator_registered_fn)
+			e->type->ops.elevator_registered_fn(q);
 	}
 	return error;
 }
@@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	}
 }

-static int blkdev_reread_part(struct block_device *bdev)
+/*
+ * This is an exported API for the block driver, and will not
+ * acquire bd_mutex. This API should be used in case that
+ * caller has held bd_mutex already.
+ */
+int __blkdev_reread_part(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int res;

 	if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (!mutex_trylock(&bdev->bd_mutex))
-		return -EBUSY;
-	res = rescan_partitions(disk, bdev);
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+	return rescan_partitions(disk, bdev);
+}
+EXPORT_SYMBOL(__blkdev_reread_part);
+
+/*
+ * This is an exported API for the block driver, and will
+ * try to acquire bd_mutex. If bd_mutex has been held already
+ * in current context, please call __blkdev_reread_part().
+ *
+ * Make sure the held locks in current context aren't required
+ * in open()/close() handler and I/O path for avoiding ABBA deadlock:
+ * - bd_mutex is held before calling block driver's open/close
+ *   handler
+ * - reading partition table may submit I/O to the block device
+ */
+int blkdev_reread_part(struct block_device *bdev)
+{
+	int res;
+
+	mutex_lock(&bdev->bd_mutex);
+	res = __blkdev_reread_part(bdev);
 	mutex_unlock(&bdev->bd_mutex);
+
 	return res;
 }
+EXPORT_SYMBOL(blkdev_reread_part);

 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
@@ -230,29 +230,40 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
+	u32 type;
+
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		type = NBD_CMD_DISC;
+	else if (req->cmd_flags & REQ_DISCARD)
+		type = NBD_CMD_TRIM;
+	else if (req->cmd_flags & REQ_FLUSH)
+		type = NBD_CMD_FLUSH;
+	else if (rq_data_dir(req) == WRITE)
+		type = NBD_CMD_WRITE;
+	else
+		type = NBD_CMD_READ;

 	memset(&request, 0, sizeof(request));
 	request.magic = htonl(NBD_REQUEST_MAGIC);
-	request.type = htonl(nbd_cmd(req));
-
-	if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) {
+	request.type = htonl(type);
+	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 		request.len = htonl(size);
 	}
 	memcpy(request.handle, &req, sizeof(req));

 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
-		req, nbdcmd_to_ascii(nbd_cmd(req)),
+		req, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 	result = sock_xmit(nbd, 1, &request, sizeof(request),
-			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
+			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Send control failed (result %d)\n", result);
 		return -EIO;
 	}

-	if (nbd_cmd(req) == NBD_CMD_WRITE) {
+	if (type == NBD_CMD_WRITE) {
 		struct req_iterator iter;
 		struct bio_vec bvec;
 		/*
@@ -352,7 +363,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
 	}

 	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
-	if (nbd_cmd(req) == NBD_CMD_READ) {
+	if (rq_data_dir(req) != WRITE) {
 		struct req_iterator iter;
 		struct bio_vec bvec;

@@ -452,23 +463,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;

-	nbd_cmd(req) = NBD_CMD_READ;
-	if (rq_data_dir(req) == WRITE) {
-		if ((req->cmd_flags & REQ_DISCARD)) {
-			WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
-			nbd_cmd(req) = NBD_CMD_TRIM;
-		} else
-			nbd_cmd(req) = NBD_CMD_WRITE;
-		if (nbd->flags & NBD_FLAG_READ_ONLY) {
-			dev_err(disk_to_dev(nbd->disk),
-				"Write on read-only\n");
-			goto error_out;
-		}
-	}
-
-	if (req->cmd_flags & REQ_FLUSH) {
-		BUG_ON(unlikely(blk_rq_sectors(req)));
-		nbd_cmd(req) = NBD_CMD_FLUSH;
+	if (rq_data_dir(req) == WRITE &&
+	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Write on read-only\n");
+		goto error_out;
 	}

 	req->errors = 0;
@@ -592,8 +591,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_SPECIAL;
-		nbd_cmd(&sreq) = NBD_CMD_DISC;
+		sreq.cmd_type = REQ_TYPE_DRV_PRIV;

 		/* Check again after getting mutex back.  */
 		if (!nbd->sock)
@@ -442,7 +442,7 @@ static char *pd_buf;		/* buffer for request in progress */

 static enum action do_pd_io_start(void)
 {
-	if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
+	if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		phase = pd_special;
 		return pd_special();
 	}
@@ -725,7 +725,7 @@ static int pd_special_command(struct pd_unit *disk,
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);

-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = func;

 	err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
@@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);

 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);

@@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;

 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);

@@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req)
 		req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
 		req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
 		req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
-	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+	} else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		req->errors = (error != 0);
 	}

@@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
-		case REQ_TYPE_SPECIAL:
+		case REQ_TYPE_DRV_PRIV:
 			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
@@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		return PTR_ERR(req);
 	}

-	req->cmd_type = REQ_TYPE_SPECIAL;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
 	err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
 	blk_put_request(req);

--- a/Show More
+++ b/Show More