Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removing
     rrpc 1.2 in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.

   - Support for blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.

   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's alwways been problematic on dm since
     it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it to reused at the wrong time.

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.

   - sgl_alloc/free helpers from Bart.

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
This commit is contained in:
Linus Torvalds
2018-01-29 11:51:49 -08:00
124 changed files with 3878 additions and 4723 deletions
+6 -3
View File
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
unsigned long flags;
int i;
if (!entity) /* root group */
return;
spin_lock_irqsave(&bfqd->lock, flags);
if (!entity) /* root group */
goto put_async_queues;
/*
* Empty all service_trees belonging to this group before
* deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
}
__bfq_deactivate_entity(entity, false);
put_async_queues:
bfq_put_async_queues(bfqd, bfqg);
spin_unlock_irqrestore(&bfqd->lock, flags);
+374 -155
View File
File diff suppressed because it is too large Load Diff
+19
View File
@@ -337,6 +337,11 @@ struct bfq_queue {
* last transition from idle to backlogged.
*/
unsigned long service_from_backlogged;
/*
* Cumulative service received from the @bfq_queue since its
* last transition to weight-raised state.
*/
unsigned long service_from_wr;
/*
* Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
unsigned long wr_start_at_switch_to_srt;
unsigned long split_time; /* time of last split */
unsigned long first_IO_time; /* time of first I/O for this queue */
};
/**
@@ -627,6 +634,18 @@ struct bfq_data {
struct bfq_io_cq *bio_bic;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
/*
* Cached sbitmap shift, used to compute depth limits in
* bfq_update_depths.
*/
unsigned int sb_shift;
/*
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
unsigned int word_depths[2][2];
};
enum bfqq_state_flags {
+7
View File
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
if (!bfqq->service_from_backlogged)
bfqq->first_IO_time = jiffies;
if (bfqq->wr_coeff > 1)
bfqq->service_from_wr += served;
bfqq->service_from_backlogged += served;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);
-1
View File
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
+1 -29
View File
@@ -970,34 +970,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(bio_advance);
/**
* bio_alloc_pages - allocates a single page for each bvec in a bio
* @bio: bio to allocate pages for
* @gfp_mask: flags for allocation
*
* Allocates pages up to @bio->bi_vcnt.
*
* Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
* freed.
*/
int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
__free_page(bv->bv_page);
return -ENOMEM;
}
}
return 0;
}
EXPORT_SYMBOL(bio_alloc_pages);
/**
* bio_copy_data - copy contents of data buffers from one chain of bios to
* another
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(bio, BIO_TRACE_COMPLETION);
bio_set_flag(split, BIO_TRACE_COMPLETION);
return split;
}
+52 -35
View File
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
seqcount_init(&rq->gstate_seq);
u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
/*
* make sure all in-progress dispatch are completed because
* blk_freeze_queue() can only complete all requests, and
* dispatch may still be in-progress since we dispatch requests
* from more than one contexts
*/
if (q->mq_ops)
blk_mq_quiesce_queue(q);
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
lockdep_assert_held(q->queue_lock);
blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
elv_completed_request(q, req);
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
{
if (part->policy && op_is_write(bio_op(bio))) {
char b[BDEVNAME_SIZE];
printk(KERN_ERR
"generic_make_request: Trying to write "
"to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), part->partno);
return true;
}
return false;
}
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
struct hd_struct *p;
int ret = 0;
rcu_read_lock();
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
bio_check_ro(bio, p))) {
ret = -EIO;
goto out;
}
/*
* Zone reset does not include bi_size so bio_sectors() is always 0.
* Include a test for the reset op code and perform the remap if needed.
*/
if (!bio->bi_partno ||
(!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
return 0;
if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
goto out;
rcu_read_lock();
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
bio->bi_iter.bi_sector += p->start_sect;
bio->bi_partno = 0;
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
bio->bi_iter.bi_sector - p->start_sect);
} else {
printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
ret = -EIO;
}
bio->bi_iter.bi_sector += p->start_sect;
bio->bi_partno = 0;
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
bio->bi_iter.bi_sector - p->start_sect);
out:
rcu_read_unlock();
return ret;
}
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
*/
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
goto end_io;
if (blk_partition_remap(bio))
goto end_io;
if (!bio->bi_partno) {
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
goto end_io;
} else {
if (blk_partition_remap(bio))
goto end_io;
}
if (bio_check_eod(bio, nr_sectors))
goto end_io;
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
blk_mq_request_bypass_insert(rq, true);
return BLK_STS_OK;
return blk_mq_request_issue_directly(rq);
}
spin_lock_irqsave(q->queue_lock, flags);
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
BUG_ON(blk_rq_is_complete(req));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work(kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work);
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
+1 -1
View File
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
blk_mq_sched_insert_request(rq, at_head, true, false, false);
blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}
+12
View File
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
+2 -2
View File
@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret;
int ret = -EINVAL;
if (!iter_is_iovec(iter))
goto fail;
@@ -148,7 +148,7 @@ unmap_rq:
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
return -EINVAL;
return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
+6 -7
View File
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
nsegs++;
sectors = max_sectors;
}
if (sectors)
goto split;
/* Make this single bvec as the 1st segment */
goto split;
}
if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
nsegs++;
bvprv = bv;
bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
}
do_split = false;
@@ -174,6 +171,8 @@ split:
bio = new;
}
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
+10 -12
View File
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_TIMEOUT_EXPIRED),
RQF_NAME(MQ_POLL_SLEPT),
};
#undef RQF_NAME
#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
static const char *const rqaf_name[] = {
RQAF_NAME(COMPLETE),
RQAF_NAME(STARTED),
RQAF_NAME(POLL_SLEPT),
};
#undef RQAF_NAME
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
seq_puts(m, ", .atomic_flags=");
blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
if (mq_ops->show_rq)
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
const struct show_busy_params *params = data;
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
blk_mq_rq_state(rq) != MQ_RQ_IDLE)
__blk_mq_debugfs_rq_show(params->m,
list_entry_rq(&rq->queuelist));
}
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
const struct blk_mq_debugfs_attr *attr = m->private;
void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
if (!attr->write)
/*
* Attributes that only implement .seq_ops are read-only and 'attr' is
* the same with 'data' in this case.
*/
if (attr == data || !attr->write)
return -EPERM;
return attr->write(data, buf, count, ppos);
+1 -2
View File
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
WRITE_ONCE(hctx->dispatch_from, ctx);
}
/* return true if hw queue need to be run again */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
@@ -428,7 +427,7 @@ done:
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block)
bool run_queue, bool async)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
+1 -1
View File
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block);
bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async);
+1 -8
View File
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
q->mq_sysfs_init_done = false;
}
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
mutex_lock(&q->sysfs_lock);
__blk_mq_unregister_dev(dev, q);
mutex_unlock(&q->sysfs_lock);
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+7 -6
View File
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL;
do {
prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
prepare_to_wait_exclusive(&ws->wait, &wait,
TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
if (data->ctx)
blk_mq_put_ctx(data->ctx);
+438 -231
View File
File diff suppressed because it is too large Load Diff
+50 -2
View File
@@ -27,6 +27,20 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
/*
* Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
* and the upper bits the generation number.
*/
enum mq_rq_state {
MQ_RQ_IDLE = 0,
MQ_RQ_IN_FLIGHT = 1,
MQ_RQ_COMPLETE = 2,
MQ_RQ_STATE_BITS = 2,
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
};
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq);
/*
* CPU -> queue mappings
*/
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
/**
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
static inline int blk_mq_rq_state(struct request *rq)
{
return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
}
/**
* blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
* @rq: target request.
* @state: new state to set.
*
* Set @rq's state to @state. The caller is responsible for ensuring that
* there are no other updaters. A request can transition into IN_FLIGHT
* only from IDLE and doing so increments the generation number.
*/
static inline void blk_mq_rq_update_state(struct request *rq,
enum mq_rq_state state)
{
u64 old_val = READ_ONCE(rq->gstate);
u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
if (state == MQ_RQ_IN_FLIGHT) {
WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
new_val += MQ_RQ_GEN_INC;
}
/* avoid exposing interim values */
WRITE_ONCE(rq->gstate, new_val);
}
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
+39 -8
View File
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
*/
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
goto unlock;
return ret;
}
}
ret = 0;
@@ -921,7 +926,15 @@ unlock:
mutex_unlock(&q->sysfs_lock);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
/**
* blk_unregister_queue - counterpart of blk_register_queue()
* @disk: Disk of which the request queue should be unregistered from sysfs.
*
* Note: the caller is responsible for guaranteeing that this function is called
* after blk_register_queue() has finished.
*/
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
/* Return early if disk->queue was never registered. */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
return;
/*
* Since sysfs_remove_dir() prevents adding new directory entries
* before removal of existing entries starts, protect against
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
wbt_exit(q);
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
spin_unlock_irq(q->queue_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
wbt_exit(q);
mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
kobject_put(&disk_to_dev(disk)->kobj);
}
+89 -55
View File
@@ -216,9 +216,9 @@ struct throtl_data
unsigned int scale;
struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets;
struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
@@ -1510,11 +1510,21 @@ static struct cftype throtl_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes,
},
{
.name = "throttle.io_service_bytes_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes_recursive,
},
{
.name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
{
.name = "throttle.io_serviced_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios_recursive,
},
{ } /* terminate */
};
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
int i, cpu;
unsigned long last_latency = 0;
unsigned long latency;
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
int i, cpu, rw;
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
if (!blk_queue_nonrot(td->queue))
return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->last_calculate_time = jiffies;
memset(avg_latency, 0, sizeof(avg_latency));
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
struct latency_bucket *tmp = &td->tmp_buckets[i];
for (rw = READ; rw <= WRITE; rw++) {
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
for_each_possible_cpu(cpu) {
struct latency_bucket *bucket;
for_each_possible_cpu(cpu) {
struct latency_bucket *bucket;
/* this isn't race free, but ok in practice */
bucket = per_cpu_ptr(td->latency_buckets, cpu);
tmp->total_latency += bucket[i].total_latency;
tmp->samples += bucket[i].samples;
bucket[i].total_latency = 0;
bucket[i].samples = 0;
}
/* this isn't race free, but ok in practice */
bucket = per_cpu_ptr(td->latency_buckets[rw],
cpu);
tmp->total_latency += bucket[i].total_latency;
tmp->samples += bucket[i].samples;
bucket[i].total_latency = 0;
bucket[i].samples = 0;
}
if (tmp->samples >= 32) {
int samples = tmp->samples;
if (tmp->samples >= 32) {
int samples = tmp->samples;
latency = tmp->total_latency;
latency[rw] = tmp->total_latency;
tmp->total_latency = 0;
tmp->samples = 0;
latency /= samples;
if (latency == 0)
continue;
avg_latency[i].latency = latency;
tmp->total_latency = 0;
tmp->samples = 0;
latency[rw] /= samples;
if (latency[rw] == 0)
continue;
avg_latency[rw][i].latency = latency[rw];
}
}
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
if (!avg_latency[i].latency) {
if (td->avg_buckets[i].latency < last_latency)
td->avg_buckets[i].latency = last_latency;
continue;
for (rw = READ; rw <= WRITE; rw++) {
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
if (!avg_latency[rw][i].latency) {
if (td->avg_buckets[rw][i].latency < last_latency[rw])
td->avg_buckets[rw][i].latency =
last_latency[rw];
continue;
}
if (!td->avg_buckets[rw][i].valid)
latency[rw] = avg_latency[rw][i].latency;
else
latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
avg_latency[rw][i].latency) >> 3;
td->avg_buckets[rw][i].latency = max(latency[rw],
last_latency[rw]);
td->avg_buckets[rw][i].valid = true;
last_latency[rw] = td->avg_buckets[rw][i].latency;
}
if (!td->avg_buckets[i].valid)
latency = avg_latency[i].latency;
else
latency = (td->avg_buckets[i].latency * 7 +
avg_latency[i].latency) >> 3;
td->avg_buckets[i].latency = max(latency, last_latency);
td->avg_buckets[i].valid = true;
last_latency = td->avg_buckets[i].latency;
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
throtl_log(&td->service_queue,
"Latency bucket %d: latency=%ld, valid=%d", i,
td->avg_buckets[i].latency, td->avg_buckets[i].valid);
"Latency bucket %d: read latency=%ld, read valid=%d, "
"write latency=%ld, write valid=%d", i,
td->avg_buckets[READ][i].latency,
td->avg_buckets[READ][i].valid,
td->avg_buckets[WRITE][i].latency,
td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
struct latency_bucket *latency;
int index;
if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
if (!td || td->limit_index != LIMIT_LOW ||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue))
return;
index = request_bucket_index(size);
latency = get_cpu_ptr(td->latency_buckets);
latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time;
latency[index].samples++;
put_cpu_ptr(td->latency_buckets);
put_cpu_ptr(td->latency_buckets[op]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long finish_time;
unsigned long start_time;
unsigned long lat;
int rw = bio_data_dir(bio);
tg = bio->bi_cg_private;
if (!tg)
@@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat));
threshold = tg->td->avg_buckets[bucket].latency +
threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
tg->bad_bio_cnt++;
@@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets) {
if (!td->latency_buckets[READ]) {
kfree(td);
return -ENOMEM;
}
td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets[WRITE]) {
free_percpu(td->latency_buckets[READ]);
kfree(td);
return -ENOMEM;
}
@@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) {
free_percpu(td->latency_buckets);
free_percpu(td->latency_buckets[READ]);
free_percpu(td->latency_buckets[WRITE]);
kfree(td);
}
return ret;
@@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
free_percpu(q->td->latency_buckets);
free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
@@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
}
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
td->track_bio_latency = !q->mq_ops && !q->request_fn;
td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
}

Some files were not shown because too many files have changed in this diff Show More