Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block

Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags.  But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan.  Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun.  This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow.  Hence I will be awal the remainder of this week,
  but back at work again on Monday the 15th.  CC'ing Tejun, since any
  potential "surprises" will most likely be from the blk-cgroup work.
  But it's been brewing for a while and sitting in my tree and
  linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
This commit is contained in:
Linus Torvalds
2013-07-11 13:03:24 -07:00
12 changed files with 910 additions and 439 deletions
+40 -65
View File
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
* subtree.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (!blkg)
return;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd = blkg->pd[i];
if (!pd)
continue;
if (pol && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
kfree(pd);
}
for (i = 0; i < BLKCG_MAX_POLS; i++)
kfree(blkg->pd[i]);
blk_exit_rl(&blkg->rl);
kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->pd[i] = pd;
pd->blkg = blkg;
pd->plid = i;
/* invoke per-policy init */
if (pol->pd_init_fn)
pol->pd_init_fn(blkg);
}
return blkg;
@@ -158,8 +124,8 @@ err_free:
* @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success.
*/
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, bool update_hint)
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint)
{
struct blkcg_gq *blkg;
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
}
blkg = new_blkg;
/* link parent and insert */
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
blkg = ERR_PTR(-EINVAL);
ret = -EINVAL;
goto err_put_css;
}
blkg_get(blkg->parent);
}
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_init_fn)
pol->pd_init_fn(blkg);
}
/* insert */
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) {
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
q->root_rl.blkg = NULL;
}
static void blkg_rcu_free(struct rcu_head *rcu_head)
/*
* A group is RCU protected, but having an rcu lock does not mean that one
* can access all the fields of blkg and assume these are valid. For
* example, don't try to follow throtl_data and request queue links.
*
* Having a reference to blkg under an rcu allows accesses to only values
* local to groups like group stats and group rate limits.
*/
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}
struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
int i;
/* tell policies that this one is being freed */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
}
void __blkg_release(struct blkcg_gq *blkg)
{
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
if (blkg->parent) {
spin_lock_irq(blkg->q->queue_lock);
blkg_put(blkg->parent);
spin_unlock_irq(blkg->q->queue_lock);
}
/*
* A group is freed in rcu manner. But having an rcu lock does not
* mean that one can access all the fields of blkg and assume these
* are valid. For example, don't try to follow throtl_data and
* request queue links.
*
* Having a reference to blkg under an rcu allows acess to only
* values local to groups like group stats and group rate limits
*/
call_rcu(&blkg->rcu_head, blkg_rcu_free);
blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release);
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
.subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
.module = THIS_MODULE,
/*
* blkio subsystem is utterly broken in terms of hierarchy support.
* It treats all cgroups equally regardless of where they're
* located in the hierarchy - all cgroups are treated as if they're
* right below the root. Fix it and remove the following.
*/
.broken_hierarchy = true,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
+36 -2
View File
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
blkg->refcnt++;
}
void __blkg_release(struct blkcg_gq *blkg);
void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
lockdep_assert_held(blkg->q->queue_lock);
WARN_ON_ONCE(blkg->refcnt <= 0);
if (!--blkg->refcnt)
__blkg_release(blkg);
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
* subtree.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Similar to blkg_for_each_descendant_pre() but performs post-order
* traversal instead. Synchronization rules are the same.
*/
#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
+9 -2
View File
@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
*/
max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) {
max_depth -= 2;
if (!max_depth)
switch (max_depth) {
case 2:
max_depth = 1;
break;
case 3:
max_depth = 2;
break;
default:
max_depth -= 2;
}
if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1;
}
+747 -323
View File
File diff suppressed because it is too large Load Diff
+15 -4
View File
@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
kfree(cfqd);
}
static int cfq_init_queue(struct request_queue *q)
static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct cfq_data *cfqd;
struct blkcg_gq *blkg __maybe_unused;
int i, ret;
struct elevator_queue *eq;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = cfqd;
cfqd->queue = q;
q->elevator->elevator_data = cfqd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
out_free:
kfree(cfqd);
kobject_put(&eq->kobj);
return ret;
}
+13 -3
View File
@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
static int deadline_init_queue(struct request_queue *q)
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!dd)
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
q->elevator->elevator_data = dd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
+5 -20
View File
@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
static struct kobj_type elv_ktype;
static struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e)
{
struct elevator_queue *eq;
@@ -170,6 +170,7 @@ err:
elevator_put(e);
return NULL;
}
EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj)
{
@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
}
}
q->elevator = elevator_alloc(q, e);
if (!q->elevator)
return -ENOMEM;
err = e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
return err;
}
err = e->ops.elevator_init_fn(q, e);
return 0;
}
EXPORT_SYMBOL(elevator_init);
@@ -935,17 +927,10 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
spin_unlock_irq(q->queue_lock);
/* allocate, init and register new elevator */
err = -ENOMEM;
q->elevator = elevator_alloc(q, new_e);
if (!q->elevator)
err = new_e->ops.elevator_init_fn(q, new_e);
if (err)
goto fail_init;
err = new_e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
goto fail_init;
}
if (registered) {
err = elv_register_queue(q);
if (err)
+15 -4
View File
@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
return list_entry(rq->queuelist.next, struct request, queuelist);
}
static int noop_init_queue(struct request_queue *q)
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
struct elevator_queue *eq;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd)
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
q->elevator->elevator_data = nd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}