Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block

Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the hole on blk/scsi-mq queue
     quiescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and ability to move bio chains around.

   - NVMe
        - Support for native multipath for NVMe (Christoph).
        - Userspace notifications for AENs (Keith).
        - Command side-effects support (Keith).
        - SGL support (Chaitanya Kulkarni)
        - FC fixes and improvements (James Smart)
        - Lots of fixes and tweaks (Various)

   - bcache
        - New maintainer (Michael Lyle)
        - Writeback control improvements (Michael)
        - Various fixes (Coly, Elena, Eric, Liang, et al)

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)

   - Writeback series that fix the much discussed hundreds of millions
     of sync-all units. This goes all the way, as discussed previously
     (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tuple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and on
     mq (where it's called mq-deadline). (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enable us to treat flush requests on blk-mq
     like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core and
     driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...
This commit is contained in:
Linus Torvalds
2017-11-14 15:32:19 -08:00
131 changed files with 5485 additions and 3104 deletions
+159 -68
View File
@@ -108,6 +108,7 @@
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#include "blk-wbt.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -724,6 +725,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
}
}
/*
 * Return the duration, in jiffies, of an interactive weight-raising
 * period for @bfqd.
 *
 * A positive bfq_wr_max_time is returned as is. Otherwise the duration
 * is RT_prod divided by peak_rate, clamped to the range [3 s, 13 s].
 */
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
	u64 wr_jiffies;

	/* An explicitly set maximum time overrides the computed value. */
	if (bfqd->bfq_wr_max_time > 0)
		return bfqd->bfq_wr_max_time;

	wr_jiffies = bfqd->RT_prod;
	do_div(wr_jiffies, bfqd->peak_rate);

	/*
	 * Keep the duration between 3 and 13 seconds. According to
	 * tests, going above 13 seconds often backfires: non-interactive
	 * and non-soft-real-time applications retain weight raising for
	 * too long, which worsens responsiveness.
	 */
	if (wr_jiffies > msecs_to_jiffies(13000))
		return msecs_to_jiffies(13000);

	/*
	 * Conversely, with less than 3 seconds most interactive tasks
	 * struggle to complete their jobs before weight raising ends.
	 */
	if (wr_jiffies < msecs_to_jiffies(3000))
		return msecs_to_jiffies(3000);

	return wr_jiffies;
}
/*
 * Move @bfqq from soft real-time weight raising back to interactive
 * weight raising: restore the interactive coefficient and duration of
 * @bfqd, and rewind the start of the period to the instant recorded in
 * wr_start_at_switch_to_srt.
 */
static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
					  struct bfq_data *bfqd)
{
	bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
}
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
struct bfq_io_cq *bic, bool bfq_already_existing)
@@ -750,10 +789,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
bfqq->wr_cur_max_time))) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"resume state: switching off wr");
bfqq->wr_coeff = 1;
if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
!bfq_bfqq_in_large_burst(bfqq) &&
time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
bfq_wr_duration(bfqd))) {
switch_back_to_interactive_wr(bfqq, bfqd);
} else {
bfqq->wr_coeff = 1;
bfq_log_bfqq(bfqq->bfqd, bfqq,
"resume state: switching off wr");
}
}
/* make sure weight will be updated, however we got here */
@@ -1173,33 +1218,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
return wr_or_deserves_wr;
}
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
/*
* Return the farthest future time instant according to jiffies
* macros.
*/
static unsigned long bfq_greatest_from_now(void)
{
u64 dur;
return jiffies + MAX_JIFFY_OFFSET;
}
if (bfqd->bfq_wr_max_time > 0)
return bfqd->bfq_wr_max_time;
dur = bfqd->RT_prod;
do_div(dur, bfqd->peak_rate);
/*
* Limit duration between 3 and 13 seconds. Tests show that
* higher values than 13 seconds often yield the opposite of
* the desired result, i.e., worsen responsiveness by letting
* non-interactive and non-soft-real-time applications
* preserve weight raising for a too long time interval.
*
* On the other end, lower values than 3 seconds make it
* difficult for most interactive tasks to complete their jobs
* before weight-raising finishes.
*/
if (dur > msecs_to_jiffies(13000))
dur = msecs_to_jiffies(13000);
else if (dur < msecs_to_jiffies(3000))
dur = msecs_to_jiffies(3000);
return dur;
/*
* Return the farthest past time instant according to jiffies
* macros.
*/
static unsigned long bfq_smallest_from_now(void)
{
return jiffies - MAX_JIFFY_OFFSET;
}
static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
@@ -1216,7 +1250,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
} else {
bfqq->wr_start_at_switch_to_srt = jiffies;
/*
* No interactive weight raising in progress
* here: assign minus infinity to
* wr_start_at_switch_to_srt, to make sure
* that, at the end of the soft-real-time
* weight raising period that is starting
* now, no interactive weight-raising period
* may be wrongly considered as still in
* progress (and thus actually started by
* mistake).
*/
bfqq->wr_start_at_switch_to_srt =
bfq_smallest_from_now();
bfqq->wr_coeff = bfqd->bfq_wr_coeff *
BFQ_SOFTRT_WEIGHT_FACTOR;
bfqq->wr_cur_max_time =
@@ -2016,10 +2062,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
bic->saved_wr_coeff = bfqq->wr_coeff;
bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
if (unlikely(bfq_bfqq_just_created(bfqq) &&
!bfq_bfqq_in_large_burst(bfqq))) {
/*
* bfqq being merged right after being created: bfqq
* would have deserved interactive weight raising, but
* did not make it to be set in a weight-raised state,
* because of this early merge. Store directly the
* weight-raising state that would have been assigned
* to bfqq, so as to prevent bfqq from unjustly failing
* to enjoy weight raising if split soon.
*/
bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
bic->saved_last_wr_start_finish = jiffies;
} else {
bic->saved_wr_coeff = bfqq->wr_coeff;
bic->saved_wr_start_at_switch_to_srt =
bfqq->wr_start_at_switch_to_srt;
bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
}
static void
@@ -2897,24 +2960,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
/*
 * Farthest time instant in the future, measured from the current value
 * of jiffies, according to the jiffies macros.
 */
static unsigned long bfq_greatest_from_now(void)
{
	const unsigned long now = jiffies;

	return now + MAX_JIFFY_OFFSET;
}
/*
 * Farthest time instant in the past, measured from the current value
 * of jiffies, according to the jiffies macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	const unsigned long now = jiffies;

	return now - MAX_JIFFY_OFFSET;
}
/**
* bfq_bfqq_expire - expire a queue.
* @bfqd: device owning the queue.
@@ -3489,11 +3534,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_wr_duration(bfqd)))
bfq_bfqq_end_wr(bfqq);
else {
/* switch back to interactive wr */
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
bfqq->last_wr_start_finish =
bfqq->wr_start_at_switch_to_srt;
switch_back_to_interactive_wr(bfqq, bfqd);
bfqq->entity.prio_changed = 1;
}
}
@@ -3685,16 +3726,37 @@ void bfq_put_queue(struct bfq_queue *bfqq)
if (bfqq->ref)
return;
if (bfq_bfqq_sync(bfqq))
/*
* The fact that this queue is being destroyed does not
* invalidate the fact that this queue may have been
* activated during the current burst. As a consequence,
* although the queue does not exist anymore, and hence
* needs to be removed from the burst list if there,
* the burst size has not to be decremented.
*/
if (!hlist_unhashed(&bfqq->burst_list_node)) {
hlist_del_init(&bfqq->burst_list_node);
/*
* Decrement also burst size after the removal, if the
* process associated with bfqq is exiting, and thus
* does not contribute to the burst any longer. This
* decrement helps filter out false positives of large
* bursts, when some short-lived process (often due to
* the execution of commands by some service) happens
* to start and exit while a complex application is
* starting, and thus spawning several processes that
* do I/O (and that *must not* be treated as a large
* burst, see comments on bfq_handle_burst).
*
* In particular, the decrement is performed only if:
* 1) bfqq is not a merged queue, because, if it is,
* then this free of bfqq is not triggered by the exit
* of the process bfqq is associated with, but exactly
* by the fact that bfqq has just been merged.
* 2) burst_size is greater than 0, to handle
* unbalanced decrements. Unbalanced decrements may
* happen in the following case: bfqq is inserted into
* the current burst list--without incrementing
* burst_size--because of a split, but the current
* burst list is not the burst list bfqq belonged to
* (see comments on the case of a split in
* bfq_set_request).
*/
if (bfqq->bic && bfqq->bfqd->burst_size > 0)
bfqq->bfqd->burst_size--;
}
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -4127,7 +4189,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
new_bfqq->allocated++;
bfqq->allocated--;
new_bfqq->ref++;
bfq_clear_bfqq_just_created(bfqq);
/*
* If the bic associated with the process
* issuing this request still points to bfqq
@@ -4139,6 +4200,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
bfq_clear_bfqq_just_created(bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
@@ -4424,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
else {
bfq_clear_bfqq_in_large_burst(bfqq);
if (bic->was_in_burst_list)
/*
* If bfqq was in the current
* burst list before being
* merged, then we have to add
* it back. And we do not need
* to increase burst_size, as
* we did not decrement
* burst_size when we removed
* bfqq from the burst list as
* a consequence of a merge
* (see comments in
* bfq_put_queue). In this
* respect, it would be rather
* costly to know whether the
* current burst list is still
* the same burst list from
* which bfqq was removed on
* the merge. To avoid this
* cost, if bfqq was in a
* burst list, then we add
* bfqq to the current burst
* list without any further
* check. This can cause
* inappropriate insertions,
* but rarely enough to not
* harm the detection of large
* bursts significantly.
*/
hlist_add_head(&bfqq->burst_list_node,
&bfqd->burst_list);
}
@@ -4775,7 +4866,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
wbt_disable_default(q);
return 0;
out_free:
+2 -5
View File
@@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
if (bs->bio_integrity_pool)
mempool_destroy(bs->bio_integrity_pool);
if (bs->bvec_integrity_pool)
mempool_destroy(bs->bvec_integrity_pool);
mempool_destroy(bs->bio_integrity_pool);
mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
+3 -37
View File
@@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
* @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
@@ -1931,11 +1931,8 @@ void bioset_free(struct bio_set *bs)
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
if (bs->bvec_pool)
mempool_destroy(bs->bvec_pool);
mempool_destroy(bs->bio_pool);
mempool_destroy(bs->bvec_pool);
bioset_integrity_free(bs);
bio_put_slab(bs);
@@ -2035,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
}
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
* bio_associate_current - associate a bio with %current
* @bio: target bio
*
* Associate @bio with %current if it hasn't been associated yet. Block
* layer will treat @bio as if it were issued by %current no matter which
* task actually issues it.
*
* This function takes an extra reference of @task's io_context and blkcg
* which will be put when @bio is released. The caller must own @bio,
* ensure %current->io_context exists, and is responsible for synchronizing
* calls to this function.
*/
int bio_associate_current(struct bio *bio)
{
struct io_context *ioc;
if (bio->bi_css)
return -EBUSY;
ioc = current->io_context;
if (!ioc)
return -ENOENT;
get_io_context_active(ioc);
bio->bi_ioc = ioc;
bio->bi_css = task_get_css(current, io_cgrp_id);
return 0;
}
EXPORT_SYMBOL_GPL(bio_associate_current);
/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
+7 -2
View File
@@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn are provided in pairs */
if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
goto err_unlock;
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1452,7 +1457,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
if (pol->cpd_alloc_fn) {
if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1497,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
if (pol->cpd_alloc_fn) {
if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+236 -38
View File
@@ -333,11 +333,13 @@ EXPORT_SYMBOL(blk_stop_queue);
void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->timeout);
cancel_work_sync(&q->timeout_work);
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
int i;
cancel_delayed_work_sync(&q->requeue_work);
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
} else {
@@ -346,6 +348,37 @@ void blk_sync_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_sync_queue);
/**
 * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
 * @q: request queue pointer
 *
 * The flag is tested and set under @q->queue_lock with interrupts
 * disabled.
 *
 * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
 * set and 1 if the flag was already set.
 */
int blk_set_preempt_only(struct request_queue *q)
{
	unsigned long irq_flags;
	int was_set;

	spin_lock_irqsave(q->queue_lock, irq_flags);
	was_set = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
	spin_unlock_irqrestore(q->queue_lock, irq_flags);

	return was_set;
}
EXPORT_SYMBOL_GPL(blk_set_preempt_only);
/**
 * blk_clear_preempt_only - clear QUEUE_FLAG_PREEMPT_ONLY
 * @q: request queue pointer
 *
 * Clears the flag under @q->queue_lock and, still holding the lock, wakes
 * up every waiter on @q->mq_freeze_wq.
 */
void blk_clear_preempt_only(struct request_queue *q)
{
	unsigned long irq_flags;

	spin_lock_irqsave(q->queue_lock, irq_flags);
	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
	/* Wake waiters only after the flag has been cleared. */
	wake_up_all(&q->mq_freeze_wq);
	spin_unlock_irqrestore(q->queue_lock, irq_flags);
}
EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
/**
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
* @q: The queue to run
@@ -610,6 +643,9 @@ void blk_set_queue_dying(struct request_queue *q)
}
spin_unlock_irq(q->queue_lock);
}
/* Make blk_queue_enter() reexamine the DYING flag. */
wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
@@ -718,7 +754,7 @@ static void free_request_size(void *element, void *data)
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask)
{
if (unlikely(rl->rq_pool))
if (unlikely(rl->rq_pool) || q->mq_ops)
return 0;
rl->q = q;
@@ -760,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);
int blk_queue_enter(struct request_queue *q, bool nowait)
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
*/
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
while (true) {
bool success = false;
int ret;
if (percpu_ref_tryget_live(&q->q_usage_counter))
rcu_read_lock_sched();
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
/*
* The code that sets the PREEMPT_ONLY flag is
* responsible for ensuring that that flag is globally
* visible before the queue is unfrozen.
*/
if (preempt || !blk_queue_preempt_only(q)) {
success = true;
} else {
percpu_ref_put(&q->q_usage_counter);
}
}
rcu_read_unlock_sched();
if (success)
return 0;
if (nowait)
if (flags & BLK_MQ_REQ_NOWAIT)
return -EBUSY;
/*
@@ -781,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
smp_rmb();
ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(&q->mq_freeze_depth) ||
(atomic_read(&q->mq_freeze_depth) == 0 &&
(preempt || !blk_queue_preempt_only(q))) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
@@ -844,6 +904,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_WORK(&q->timeout_work, NULL);
INIT_LIST_HEAD(&q->queue_head);
INIT_LIST_HEAD(&q->timeout_list);
INIT_LIST_HEAD(&q->icq_list);
@@ -1154,7 +1215,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* @rl: request list to allocate from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
* @gfp_mask: allocation mask
* @flags: BLK_MQ_REQ_* flags
*
* Get a free request from @q. This function may fail under memory
* pressure or if @q is dead.
@@ -1164,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *__get_request(struct request_list *rl, unsigned int op,
struct bio *bio, gfp_t gfp_mask)
struct bio *bio, blk_mq_req_flags_t flags)
{
struct request_queue *q = rl->q;
struct request *rq;
@@ -1173,6 +1234,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
struct io_cq *icq = NULL;
const bool is_sync = op_is_sync(op);
int may_queue;
gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
__GFP_DIRECT_RECLAIM;
req_flags_t rq_flags = RQF_ALLOCED;
lockdep_assert_held(q->queue_lock);
@@ -1255,6 +1318,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_set_rl(rq, rl);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
if (flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
/* init elvpriv */
if (rq_flags & RQF_ELVPRIV) {
@@ -1333,7 +1398,7 @@ rq_starved:
* @q: request_queue to allocate request from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
* @gfp_mask: allocation mask
* @flags: BLK_MQ_REQ_* flags.
*
* Get a free request from @q. Unless %BLK_MQ_REQ_NOWAIT is set in @flags,
* this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1343,7 +1408,7 @@ rq_starved:
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, unsigned int op,
struct bio *bio, gfp_t gfp_mask)
struct bio *bio, blk_mq_req_flags_t flags)
{
const bool is_sync = op_is_sync(op);
DEFINE_WAIT(wait);
@@ -1355,7 +1420,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
rl = blk_get_rl(q, bio); /* transferred to @rq on success */
retry:
rq = __get_request(rl, op, bio, gfp_mask);
rq = __get_request(rl, op, bio, flags);
if (!IS_ERR(rq))
return rq;
@@ -1364,7 +1429,7 @@ retry:
return ERR_PTR(-EAGAIN);
}
if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
return rq;
}
@@ -1391,20 +1456,28 @@ retry:
goto retry;
}
/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
static struct request *blk_old_get_request(struct request_queue *q,
unsigned int op, gfp_t gfp_mask)
unsigned int op, blk_mq_req_flags_t flags)
{
struct request *rq;
gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
__GFP_DIRECT_RECLAIM;
int ret = 0;
WARN_ON_ONCE(q->mq_ops);
/* create ioc upfront */
create_io_context(gfp_mask, q->node);
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
rq = get_request(q, op, NULL, gfp_mask);
rq = get_request(q, op, NULL, flags);
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
blk_queue_exit(q);
return rq;
}
@@ -1415,25 +1488,40 @@ static struct request *blk_old_get_request(struct request_queue *q,
return rq;
}
struct request *blk_get_request(struct request_queue *q, unsigned int op,
gfp_t gfp_mask)
/**
* blk_get_request_flags - allocate a request
* @q: request queue to allocate a request for
* @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
* @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
*/
struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags)
{
struct request *req;
WARN_ON_ONCE(op & REQ_NOWAIT);
WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
if (q->mq_ops) {
req = blk_mq_alloc_request(q, op,
(gfp_mask & __GFP_DIRECT_RECLAIM) ?
0 : BLK_MQ_REQ_NOWAIT);
req = blk_mq_alloc_request(q, op, flags);
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
q->mq_ops->initialize_rq_fn(req);
} else {
req = blk_old_get_request(q, op, gfp_mask);
req = blk_old_get_request(q, op, flags);
if (!IS_ERR(req) && q->initialize_rq_fn)
q->initialize_rq_fn(req);
}
return req;
}
EXPORT_SYMBOL(blk_get_request_flags);
/*
 * gfp_mask based wrapper around blk_get_request_flags(): the request is
 * allocated with BLK_MQ_REQ_NOWAIT unless @gfp_mask contains
 * __GFP_DIRECT_RECLAIM, in which case no BLK_MQ_REQ_* flag is passed.
 */
struct request *blk_get_request(struct request_queue *q, unsigned int op,
				gfp_t gfp_mask)
{
	blk_mq_req_flags_t flags = 0;

	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
		flags = BLK_MQ_REQ_NOWAIT;

	return blk_get_request_flags(q, op, flags);
}
EXPORT_SYMBOL(blk_get_request);
/**
@@ -1576,6 +1664,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
blk_free_request(rl, req);
freed_request(rl, sync, rq_flags);
blk_put_rl(rl);
blk_queue_exit(q);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1857,8 +1946,10 @@ get_rq:
* Grab a free request. This might sleep but cannot fail.
* Returns with the queue unlocked.
*/
req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
blk_queue_enter_live(q);
req = get_request(q, bio->bi_opf, bio, 0);
if (IS_ERR(req)) {
blk_queue_exit(q);
__wbt_done(q->rq_wb, wb_acct);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
@@ -2200,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
BLK_MQ_REQ_NOWAIT : 0;
if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
if (likely(blk_queue_enter(q, flags) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
@@ -2241,6 +2334,40 @@ out:
}
EXPORT_SYMBOL(generic_make_request);
/**
 * direct_make_request - hand a buffer directly to its device driver for I/O
 * @bio: The bio describing the location in memory and on the device.
 *
 * Works like generic_make_request() except that there is no protection
 * against recursion. Only use it when the target driver is known not to
 * call generic_make_request() (or direct_make_request()) again from within
 * its make_request function. Issuing another direct_make_request() from a
 * workqueue is fine, because that does not recurse.
 *
 * Returns the cookie produced by the driver's make_request function, or
 * BLK_QC_T_NONE if the bio failed the submission checks or the queue could
 * not be entered. In the latter case the bio is ended with BLK_STS_AGAIN
 * for a REQ_NOWAIT bio on a queue that is not dying, and with
 * BLK_STS_IOERR otherwise.
 */
blk_qc_t direct_make_request(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;
	bool no_wait = bio->bi_opf & REQ_NOWAIT;
	blk_qc_t cookie;

	if (!generic_make_request_checks(bio))
		return BLK_QC_T_NONE;

	if (unlikely(blk_queue_enter(q, no_wait ? BLK_MQ_REQ_NOWAIT : 0))) {
		/* Could not get a queue reference: complete the bio here. */
		bio->bi_status = (no_wait && !blk_queue_dying(q)) ?
				 BLK_STS_AGAIN : BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	cookie = q->make_request_fn(q, bio);
	blk_queue_exit(q);

	return cookie;
}
EXPORT_SYMBOL_GPL(direct_make_request);
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
@@ -2285,6 +2412,17 @@ blk_qc_t submit_bio(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio);
/*
 * Poll @q for completion of the request identified by @cookie.
 *
 * Returns false right away when the queue has no poll_fn or @cookie is not
 * valid. Otherwise any plug held by %current is flushed first and the
 * result of the queue's poll_fn is returned.
 */
bool blk_poll(struct request_queue *q, blk_qc_t cookie)
{
	if (q->poll_fn && blk_qc_t_valid(cookie)) {
		if (current->plug)
			blk_flush_plug_list(current->plug, false);

		return q->poll_fn(q, cookie);
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_poll);
/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
* for the new queue limits
@@ -2350,7 +2488,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
blk_mq_request_bypass_insert(rq);
blk_mq_request_bypass_insert(rq, true);
return BLK_STS_OK;
}
@@ -2464,20 +2602,22 @@ void blk_account_io_done(struct request *req)
* Don't process normal requests when queue is suspended
* or in the process of suspending/resuming
*/
static struct request *blk_pm_peek_request(struct request_queue *q,
struct request *rq)
static bool blk_pm_allow_request(struct request *rq)
{
if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
(q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM))))
return NULL;
else
return rq;
switch (rq->q->rpm_status) {
case RPM_RESUMING:
case RPM_SUSPENDING:
return rq->rq_flags & RQF_PM;
case RPM_SUSPENDED:
return false;
}
return true;
}
#else
static inline struct request *blk_pm_peek_request(struct request_queue *q,
struct request *rq)
static bool blk_pm_allow_request(struct request *rq)
{
return rq;
return true;
}
#endif
@@ -2517,6 +2657,48 @@ void blk_account_io_start(struct request *rq, bool new_io)
part_stat_unlock();
}
/*
* Pick the next request to hand out from the legacy (non-mq) queue @q.
*
* Returns the first request on q->queue_head that blk_pm_allow_request()
* accepts. Returns NULL if the queue has to be held for a running flush,
* if the queue is in bypass mode, or if the elevator's dispatch function
* returns zero.
*/
static struct request *elv_next_request(struct request_queue *q)
{
struct request *rq;
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
/* This function is not expected to run on a queue that has mq_ops set. */
WARN_ON_ONCE(q->mq_ops);
while (1) {
/*
* Scan the dispatch list for the first request that power management
* allows. Do not look past a rejected request that carries
* RQF_SOFTBARRIER.
*/
list_for_each_entry(rq, &q->queue_head, queuelist) {
if (blk_pm_allow_request(rq))
return rq;
if (rq->rq_flags & RQF_SOFTBARRIER)
break;
}
/*
* Flush request is running and flush request isn't queueable
* in the drive, we can hold the queue till flush request is
* finished. Even if we don't do this, the driver can't dispatch
* the next requests and will requeue them. And this can improve
* throughput too. For example, we have request flush1, write1,
* flush 2. flush1 is dispatched, then queue is held, write1
* isn't inserted to queue. After flush1 is finished, flush2
* will be dispatched. Since disk cache is already clean,
* flush2 will be finished very soon, so looks like flush2 is
* folded to flush1.
* Since the queue is held, a flag is set to indicate the queue
* should be restarted later. Please see flush_end_io() for
* details.
*/
if (fq->flush_pending_idx != fq->flush_running_idx &&
!queue_flush_queueable(q)) {
fq->flush_queue_delayed = 1;
return NULL;
}
/*
* Give up when the queue is bypassing or the elevator's dispatch
* function returns zero; otherwise go around and rescan the list.
*/
if (unlikely(blk_queue_bypass(q)) ||
!q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
return NULL;
}
}
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
@@ -2538,12 +2720,7 @@ struct request *blk_peek_request(struct request_queue *q)
lockdep_assert_held(q->queue_lock);
WARN_ON_ONCE(q->mq_ops);
while ((rq = __elv_next_request(q)) != NULL) {
rq = blk_pm_peek_request(q, rq);
if (!rq)
break;
while ((rq = elv_next_request(q)) != NULL) {
if (!(rq->rq_flags & RQF_STARTED)) {
/*
* This is the first time the device driver
@@ -2695,6 +2872,27 @@ struct request *blk_fetch_request(struct request_queue *q)
}
EXPORT_SYMBOL(blk_fetch_request);
/*
 * Detach the whole bio chain of @rq and append it to the tail of @list.
 * The request must not have been partially completed before.
 *
 * On return @rq has no bios attached and its data length is zero.
 */
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
	/* The request ends up empty whether or not it carried any bios. */
	rq->__data_len = 0;

	if (!rq->bio)
		return;

	/* Chain behind the current tail, or start the list if it is empty. */
	if (list->tail)
		list->tail->bi_next = rq->bio;
	else
		list->head = rq->bio;
	list->tail = rq->biotail;

	rq->bio = NULL;
	rq->biotail = NULL;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);
/**
* blk_update_request - Special helper function for request stacking drivers
* @req: the request being processed
+27 -10
View File
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
flush_rq->tag = -1;
if (!q->elevator) {
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
flush_rq->tag = -1;
} else {
blk_mq_put_driver_tag_hctx(hctx, flush_rq);
flush_rq->internal_tag = -1;
}
}
running = &fq->flush_queue[fq->flush_running_idx];
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
blk_rq_init(q, flush_rq);
/*
* Borrow tag from the first request since they can't
* be in flight at the same time. And acquire the tag's
* ownership for flush req.
* In case of none scheduler, borrow tag from the first request
* since they can't be in flight at the same time. And acquire
* the tag's ownership for flush req.
*
* In case of IO scheduler, flush rq need to borrow scheduler tag
* just for cheating put/get driver tag.
*/
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
fq->orig_rq = first_rq;
hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
if (!q->elevator) {
fq->orig_rq = first_rq;
flush_rq->tag = first_rq->tag;
hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
} else {
flush_rq->internal_tag = first_rq->internal_tag;
}
}
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
hctx = blk_mq_map_queue(q, ctx->cpu);
if (q->elevator) {
WARN_ON(rq->tag < 0);
blk_mq_put_driver_tag_hctx(hctx, rq);
}
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
@@ -463,7 +480,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops)
blk_mq_sched_insert_request(rq, false, true, false, false);
blk_mq_request_bypass_insert(rq, false);
else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
+82 -46
View File
@@ -275,51 +275,18 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
return min(pages, (sector_t)BIO_MAX_PAGES);
}
/**
* __blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @biop: pointer to anchor bio
* @flags: controls detailed behavior
*
* Description:
* Zero-fill a block range, either using hardware offload or by explicitly
* writing zeroes to the device.
*
* Note that this function may fail with -EOPNOTSUPP if the driver signals
* zeroing offload support, but the device fails to process the command (for
* some devices there is no non-destructive way to verify whether this
* operation is actually supported). In this case the caller should retry
* the call to blkdev_issue_zeroout() and the fallback path will be used.
*
* If a device is using logical block provisioning, the underlying space will
* not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
*
* If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
* -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
*/
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
unsigned flags)
static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
int ret;
int bi_size = 0;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
sector_t bs_mask;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
if (!q)
return -ENXIO;
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
biop, flags);
if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
goto out;
ret = 0;
while (nr_sects != 0) {
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
@@ -339,8 +306,46 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
}
*biop = bio;
out:
return ret;
return 0;
}
/**
 * __blkdev_issue_zeroout - generate a number of zero-filled write bios
 * @bdev:	block device to issue to
 * @sector:	start sector
 * @nr_sects:	number of sectors to write
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @biop:	pointer to anchor bio
 * @flags:	controls detailed behavior
 *
 * Description:
 *  Zero-fill a block range, either using hardware offload or by explicitly
 *  writing zeroes to the device.
 *
 *  Both @sector and @nr_sects must be aligned to the logical block size of
 *  @bdev; otherwise -EINVAL is returned and nothing is issued.
 *
 *  If a device is using logical block provisioning, the underlying space will
 *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
 *
 *  If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
 *  -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
 */
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
		unsigned flags)
{
	/* Mask of the 512-byte sector bits below one logical block. */
	sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
	int ret;

	if ((sector | nr_sects) & bs_mask)
		return -EINVAL;

	/* First choice: the write-zeroes offload. */
	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
			biop, flags);

	/* Only an unsupported offload, with fallback allowed, uses zero pages. */
	if (ret == -EOPNOTSUPP && !(flags & BLKDEV_ZERO_NOFALLBACK))
		return __blkdev_issue_zero_pages(bdev, sector, nr_sects,
						 gfp_mask, biop);

	return ret;
}
EXPORT_SYMBOL(__blkdev_issue_zeroout);
@@ -360,18 +365,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
int ret;
struct bio *bio = NULL;
int ret = 0;
sector_t bs_mask;
struct bio *bio;
struct blk_plug plug;
bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
retry:
bio = NULL;
blk_start_plug(&plug);
ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
&bio, flags);
if (try_write_zeroes) {
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
gfp_mask, &bio, flags);
} else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
gfp_mask, &bio);
} else {
/* No zeroing offload support */
ret = -EOPNOTSUPP;
}
if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug);
if (ret && try_write_zeroes) {
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false;
goto retry;
}
if (!bdev_write_zeroes_sectors(bdev)) {
/*
* Zeroing offload support was indicated, but the
* device reported ILLEGAL REQUEST (for some devices
* there is no non-destructive way to verify whether
* WRITE ZEROES is actually supported).
*/
ret = -EOPNOTSUPP;
}
}
return ret;
}
+1 -2
View File
@@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(STACKABLE),
QUEUE_FLAG_NAME(NONROT),
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(DISCARD),
@@ -75,6 +74,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PREEMPT_ONLY),
};
#undef QUEUE_FLAG_NAME
@@ -180,7 +180,6 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
HCTX_STATE_NAME(TAG_WAITING),
HCTX_STATE_NAME(START_ON_RUN),
};
#undef HCTX_STATE_NAME
+132 -73
View File
@@ -81,20 +81,103 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
} else
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
if (blk_mq_hctx_has_pending(hctx)) {
blk_mq_run_hw_queue(hctx, true);
return true;
}
return false;
return blk_mq_run_hw_queue(hctx, true);
}
/*
 * Dispatch requests from the I/O scheduler one at a time, for as long as
 * blk_mq_dispatch_rq_list() keeps returning true.
 *
 * A dispatch budget is obtained before each request is pulled from the
 * scheduler.  Only SCSI implements .get_budget and .put_budget, and SCSI
 * restarts its queue by itself in its completion handler, so the queue is
 * not restarted here when no budget can be obtained.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	for (;;) {
		struct request *rq;

		/* Stop as soon as the scheduler reports it has no work. */
		if (e->type->ops.mq.has_work &&
		    !e->type->ops.mq.has_work(hctx))
			return;

		if (!blk_mq_get_dispatch_budget(hctx))
			return;

		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq) {
			/* No request was handed out: return the unused budget. */
			blk_mq_put_dispatch_budget(hctx);
			return;
		}

		/*
		 * From here on the budget belongs to this rq; it has to be
		 * released if the rq is not queued to the driver via
		 * .queue_rq() in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		if (!blk_mq_dispatch_rq_list(q, &rq_list, true))
			return;
	}
}
/*
 * Return the software queue context that follows @ctx in @hctx->ctxs,
 * wrapping around to the first entry after the last one.
 */
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned next = ctx->index_hw;

	next++;
	if (next == hctx->nr_ctx)
		next = 0;

	return hctx->ctxs[next];
}
/*
 * Dispatch requests straight from the software queues, one request per
 * iteration, for as long as blk_mq_dispatch_rq_list() keeps returning true.
 *
 * Dequeuing starts at hctx->dispatch_from and moves on to the context after
 * the one that supplied the last request; the resulting position is stored
 * back in hctx->dispatch_from when the loop ends.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts its
 * queue by itself in its completion handler, so the queue is not restarted
 * here when no budget can be obtained.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	LIST_HEAD(rq_list);

	for (;;) {
		struct request *rq;

		/* No software queue is marked in ctx_map: nothing to do. */
		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			/* Nothing was dequeued: return the unused budget. */
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * From here on the budget belongs to this rq; it has to be
		 * released if the rq is not queued to the driver via
		 * .queue_rq() in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* Round robin for fair dispatch. */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

		if (!blk_mq_dispatch_rq_list(q, &rq_list, true))
			break;
	}

	WRITE_ONCE(hctx->dispatch_from, ctx);
}
/* Dispatch pending requests on @hctx; this function returns nothing. */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
bool did_work = false;
LIST_HEAD(rq_list);
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,29 +205,34 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
* scheduler, we can no longer merge or sort them. So it's best to
* leave them there for as long as we can. Mark the hw queue as
* needing a restart in that case.
*
* We want to dispatch from the scheduler if there was nothing
* on the dispatch list or we were able to dispatch from the
* dispatch list.
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
did_work = blk_mq_dispatch_rq_list(q, &rq_list);
} else if (!has_sched_dispatch) {
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
blk_mq_do_dispatch_sched(hctx);
else
blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched_dispatch) {
blk_mq_do_dispatch_sched(hctx);
} else if (q->mq_ops->get_budget) {
/*
* If a budget has to be obtained before queuing a request, we
* dequeue requests one by one from the sw queue, to avoid
* messing up I/O merging when dispatch runs out of resources.
*
* TODO: get more budgets, and dequeue more requests in
* one go.
*/
*/
blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
/*
* We want to dispatch from the scheduler if we had no work left
* on the dispatch list, OR if we did have work but weren't able
* to make progress.
*/
if (!did_work && has_sched_dispatch) {
do {
struct request *rq;
rq = e->type->ops.mq.dispatch_request(hctx);
if (!rq)
break;
list_add(&rq->queuelist, &rq_list);
} while (blk_mq_dispatch_rq_list(q, &rq_list));
blk_mq_dispatch_rq_list(q, &rq_list, false);
}
}
@@ -260,21 +348,21 @@ void blk_mq_sched_request_inserted(struct request *rq)
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
bool has_sched,
struct request *rq)
{
if (rq->tag == -1) {
rq->rq_flags |= RQF_SORTED;
return false;
/* dispatch flush rq directly */
if (rq->rq_flags & RQF_FLUSH_SEQ) {
spin_lock(&hctx->lock);
list_add(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
return true;
}
/*
* If we already have a real request tag, send directly to
* the dispatch list.
*/
spin_lock(&hctx->lock);
list_add(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
return true;
if (has_sched)
rq->rq_flags |= RQF_SORTED;
return false;
}
/**
@@ -339,21 +427,6 @@ done:
}
}
/*
* Add flush/fua to the queue. If we fail getting a driver tag, then
* punt to the requeue list. Requeue will re-invoke us from a context
* that's safe to block from.
*/
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
struct request *rq, bool can_block)
{
if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
blk_insert_flush(rq);
blk_mq_run_hw_queue(hctx, true);
} else
blk_mq_add_to_requeue_list(rq, false, true);
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block)
{
@@ -362,12 +435,15 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
blk_mq_sched_insert_flush(hctx, rq, can_block);
return;
/* flush rq in flush machinery need to be dispatched directly */
if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
blk_insert_flush(rq);
goto run;
}
if (e && blk_mq_sched_bypass_insert(hctx, rq))
WARN_ON(e && (rq->tag != -1));
if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run;
if (e && e->type->ops.mq.insert_requests) {
@@ -393,23 +469,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
struct elevator_queue *e = hctx->queue->elevator;
if (e) {
struct request *rq, *next;
/*
* We bypass requests that already have a driver tag assigned,
* which should only be flushes. Flushes are only ever inserted
* as single requests, so we shouldn't ever hit the
* WARN_ON_ONCE() below (but let's handle it just in case).
*/
list_for_each_entry_safe(rq, next, list, queuelist) {
if (WARN_ON_ONCE(rq->tag != -1)) {
list_del_init(&rq->queuelist);
blk_mq_sched_bypass_insert(hctx, rq);
}
}
}
if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false);
else
+5 -6
View File
@@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
int (reinit_request)(void *, struct request *))
int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data,
int (fn)(void *, struct request *))
{
int i, j, ret = 0;
if (WARN_ON_ONCE(!reinit_request))
if (WARN_ON_ONCE(!fn))
goto out;
for (i = 0; i < set->nr_hw_queues; i++) {
@@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
if (!tags->static_rqs[j])
continue;
ret = reinit_request(set->driver_data,
tags->static_rqs[j]);
ret = fn(data, tags->static_rqs[j]);
if (ret)
goto out;
}
@@ -326,7 +325,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
out:
return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset);
EXPORT_SYMBOL_GPL(blk_mq_tagset_iter);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
+1 -6
View File
@@ -44,14 +44,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
return sbq_wait_ptr(bt, &hctx->wait_index);
}
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
BLK_MQ_TAG_CACHE_MAX = 64,
};
enum {
BLK_MQ_TAG_FAIL = -1U,
BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
BLK_MQ_TAG_MIN = 1,
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
+250 -174
View File
File diff suppressed because it is too large Load Diff
+55 -5
View File
@@ -3,6 +3,7 @@
#define INT_BLK_MQ_H
#include "blk-stat.h"
#include "blk-mq-tag.h"
struct blk_mq_tag_set;
@@ -26,16 +27,16 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
bool wait);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
/*
* Internal helpers for allocating/freeing the request map
@@ -55,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
*/
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
void blk_mq_request_bypass_insert(struct request *rq);
void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
@@ -109,7 +110,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
unsigned int flags;
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
/* input & output parameter */
@@ -138,4 +139,53 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
/*
 * Give a dispatch budget back to the driver.  Does nothing when the
 * driver provides no ->put_budget hook.
 */
static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
	if (hctx->queue->mq_ops->put_budget)
		hctx->queue->mq_ops->put_budget(hctx);
}
/*
 * Ask the driver for a dispatch budget.  Returns the result of the
 * driver's ->get_budget hook, or true when the driver provides none.
 */
static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* Drivers without a budget hook can always dispatch. */
	if (!q->mq_ops->get_budget)
		return true;

	return q->mq_ops->get_budget(hctx);
}
/*
 * Release the driver tag held by @rq back to @hctx->tags and mark the
 * request as having no tag (rq->tag = -1).  If the request was flagged
 * RQF_MQ_INFLIGHT, the flag is cleared and hctx->nr_active is decremented.
 */
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
					   struct request *rq)
{
	/* Release the tag first; rq->tag is still needed for the call. */
	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
	rq->tag = -1;

	if (!(rq->rq_flags & RQF_MQ_INFLIGHT))
		return;

	rq->rq_flags &= ~RQF_MQ_INFLIGHT;
	atomic_dec(&hctx->nr_active);
}
/*
 * Release @rq's driver tag on the given @hctx, but only when the request
 * has both a driver tag and an internal tag (neither is -1).
 */
static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
					      struct request *rq)
{
	if (rq->tag != -1 && rq->internal_tag != -1)
		__blk_mq_put_driver_tag(hctx, rq);
}
/*
 * Release @rq's driver tag, looking up the hardware context from the
 * request's queue and the CPU of its software context.  Nothing is done
 * unless the request has both a driver tag and an internal tag (neither
 * is -1).
 */
static inline void blk_mq_put_driver_tag(struct request *rq)
{
	if (rq->tag != -1 && rq->internal_tag != -1) {
		struct blk_mq_hw_ctx *hctx;

		hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
		__blk_mq_put_driver_tag(hctx, rq);
	}
}
#endif
+1 -1
View File
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
* Caveat:
* The driver that does this *must* be able to deal appropriately
* with buffers in "highmemory". This can be accomplished by either calling
* __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
* kmap_atomic() to get a temporary kernel mapping, or by calling
* blk_queue_bounce() to create a buffer in normal memory.
**/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
+7 -38
View File
@@ -11,8 +11,6 @@
#include "blk-mq.h"
#include "blk.h"
#define BLK_RQ_STAT_BATCH 64
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
{
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
stat->batch = stat->nr_batch = 0;
}
static void blk_stat_flush_batch(struct blk_rq_stat *stat)
{
const s32 nr_batch = READ_ONCE(stat->nr_batch);
const s32 nr_samples = READ_ONCE(stat->nr_samples);
if (!nr_batch)
return;
if (!nr_samples)
stat->mean = div64_s64(stat->batch, nr_batch);
else {
stat->mean = div64_s64((stat->mean * nr_samples) +
stat->batch,
nr_batch + nr_samples);
}
stat->nr_samples += nr_batch;
stat->nr_batch = stat->batch = 0;
stat->batch = 0;
}
/* src is a per-cpu stat, mean isn't initialized */
static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
blk_stat_flush_batch(src);
if (!src->nr_samples)
return;
dst->min = min(dst->min, src->min);
dst->max = max(dst->max, src->max);
if (!dst->nr_samples)
dst->mean = src->mean;
else {
dst->mean = div64_s64((src->mean * src->nr_samples) +
(dst->mean * dst->nr_samples),
dst->nr_samples + src->nr_samples);
}
dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
dst->nr_samples + src->nr_samples);
dst->nr_samples += src->nr_samples;
}
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
if (stat->batch + value < stat->batch ||
stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
blk_stat_flush_batch(stat);
stat->batch += value;
stat->nr_batch++;
stat->nr_samples++;
}
void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
s64 now, value;
u64 now, value;
now = __blk_stat_time(ktime_to_ns(ktime_get()));
if (now < blk_stat_time(&rq->issue_stat))
+10 -2
View File
@@ -2113,8 +2113,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (bio->bi_css)
if (bio->bi_css) {
if (bio->bi_cg_private)
blkg_put(tg_to_blkg(bio->bi_cg_private));
bio->bi_cg_private = tg;
blkg_get(tg_to_blkg(tg));
}
blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
#endif
}
@@ -2284,8 +2288,10 @@ void blk_throtl_bio_endio(struct bio *bio)
start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
finish_time = __blk_stat_time(finish_time_ns) >> 10;
if (!start_time || finish_time <= start_time)
if (!start_time || finish_time <= start_time) {
blkg_put(tg_to_blkg(tg));
return;
}
lat = finish_time - start_time;
/* this is only for bio based driver */
@@ -2315,6 +2321,8 @@ void blk_throtl_bio_endio(struct bio *bio)
tg->bio_cnt /= 2;
tg->bad_bio_cnt /= 2;
}
blkg_put(tg_to_blkg(tg));
}
#endif
+1 -4
View File
@@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work)
struct request *rq, *tmp;
int next_set = 0;
if (blk_queue_enter(q, true))
return;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
blk_queue_exit(q);
}
/**
@@ -211,7 +208,7 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
WRITE_ONCE(req->deadline, jiffies + req->timeout);
/*
* Only the non-mq case needs to add the request to a protected list.
+1 -1
View File
@@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
}
/*
* Disable wbt, if enabled by default. Only called from CFQ.
* Disable wbt, if enabled by default.
*/
void wbt_disable_default(struct request_queue *q)
{
+7 -39
View File
@@ -123,8 +123,15 @@ void blk_account_io_done(struct request *req);
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
	/*
	 * REQ_ATOM_COMPLETE and REQ_ATOM_STARTED must stay the first two
	 * entries.  Their numeric values are not relied upon, but both bits
	 * have to live in the same byte of storage so that writes to them
	 * are ordered; placing them first guarantees that.
	 */
	REQ_ATOM_COMPLETE = 0,
	REQ_ATOM_STARTED,

	REQ_ATOM_POLL_SLEPT,
};
@@ -149,45 +156,6 @@ static inline void blk_clear_rq_complete(struct request *rq)
void blk_insert_flush(struct request *rq);
/*
 * Return the request at the head of @q->queue_head, asking the elevator to
 * dispatch more requests onto it whenever it is empty.  Returns NULL when
 * the queue is being held for a running flush, when the queue is in bypass
 * mode, or when the elevator dispatches nothing.  Not for blk-mq queues.
 */
static inline struct request *__elv_next_request(struct request_queue *q)
{
	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);

	WARN_ON_ONCE(q->mq_ops);

	for (;;) {
		if (!list_empty(&q->queue_head))
			return list_entry_rq(q->queue_head.next);

		/*
		 * A flush request is running and flush requests aren't
		 * queueable in the drive, so we can hold the queue until the
		 * flush request has finished. Even if we didn't, the driver
		 * couldn't dispatch the next requests and would requeue them.
		 * Holding the queue can improve throughput too. For example,
		 * take the requests flush1, write1, flush2: flush1 is
		 * dispatched, then the queue is held and write1 isn't
		 * inserted into the queue. After flush1 has finished, flush2
		 * is dispatched. Since the disk cache is already clean,
		 * flush2 finishes very soon, so it looks as if flush2 was
		 * folded into flush1.
		 * Since the queue is held, a flag is set to indicate that the
		 * queue should be restarted later. Please see flush_end_io()
		 * for details.
		 */
		if (fq->flush_pending_idx != fq->flush_running_idx &&
		    !queue_flush_queueable(q)) {
			fq->flush_queue_delayed = 1;
			return NULL;
		}

		if (unlikely(blk_queue_bypass(q)))
			return NULL;

		/* Ask the elevator for more; give up if it dispatched nothing. */
		if (!q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
			return NULL;
	}
}
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
+8 -10
View File
@@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
fmode_t has_write_perm)
fmode_t mode)
{
struct scsi_request *req = scsi_req(rq);
@@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
return -EFAULT;
if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
if (blk_verify_command(req->cmd, has_write_perm))
if (blk_verify_command(req->cmd, mode))
return -EPERM;
} else if (!capable(CAP_SYS_RAWIO))
return -EPERM;
@@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
* map sg_io_v4 to a request.
*/
static struct request *
bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
{
struct request_queue *q = bd->queue;
struct request *rq, *next_rq = NULL;
@@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
if (IS_ERR(rq))
return rq;
ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode);
if (ret)
goto out;
@@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
}
static int __bsg_write(struct bsg_device *bd, const char __user *buf,
size_t count, ssize_t *bytes_written,
fmode_t has_write_perm)
size_t count, ssize_t *bytes_written, fmode_t mode)
{
struct bsg_command *bc;
struct request *rq;
@@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf,
/*
* get a request, fill in the blanks, and add to request queue
*/
rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm);
rq = bsg_map_hdr(bd, &bc->hdr, mode);
if (IS_ERR(rq)) {
ret = PTR_ERR(rq);
rq = NULL;
@@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
bsg_set_block(bd, file);
bytes_written = 0;
ret = __bsg_write(bd, buf, count, &bytes_written,
file->f_mode & FMODE_WRITE);
ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
*ppos = bytes_written;
@@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (copy_from_user(&hdr, uarg, sizeof(hdr)))
return -EFAULT;
rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE);
rq = bsg_map_hdr(bd, &hdr, file->f_mode);
if (IS_ERR(rq))
return PTR_ERR(rq);

Some files were not shown because too many files have changed in this diff Show More