dm stats: collect and report histogram of IO latencies

Add an option to dm statistics to collect and report a histogram of
IO latencies.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
This commit is contained in:
Mikulas Patocka
2015-06-09 17:22:05 -04:00
committed by Mike Snitzer
parent c96aec344d
commit dfcfac3e4c
2 changed files with 199 additions and 25 deletions
+183 -22
View File
@@ -29,6 +29,7 @@ struct dm_stat_percpu {
unsigned long long io_ticks[2];
unsigned long long io_ticks_total;
unsigned long long time_in_queue;
unsigned long long *histogram;
};
struct dm_stat_shared {
@@ -45,11 +46,14 @@ struct dm_stat {
sector_t start;
sector_t end;
sector_t step;
unsigned n_histogram_entries;
unsigned long long *histogram_boundaries;
const char *program_id;
const char *aux_data;
struct rcu_head rcu_head;
size_t shared_alloc_size;
size_t percpu_alloc_size;
size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
struct dm_stat_shared stat_shared[0];
};
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
kfree(s->program_id);
kfree(s->aux_data);
for_each_possible_cpu(cpu)
for_each_possible_cpu(cpu) {
dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
}
dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
dm_kvfree(s, s->shared_alloc_size);
}
@@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
sector_t step, unsigned stat_flags,
unsigned n_histogram_entries,
unsigned long long *histogram_boundaries,
const char *program_id, const char *aux_data,
void (*suspend_callback)(struct mapped_device *),
void (*resume_callback)(struct mapped_device *),
@@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
size_t ni;
size_t shared_alloc_size;
size_t percpu_alloc_size;
size_t histogram_alloc_size;
struct dm_stat_percpu *p;
int cpu;
int ret_id;
@@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
return -EOVERFLOW;
if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
return -EOVERFLOW;
if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
return -ENOMEM;
s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
@@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
s->step = step;
s->shared_alloc_size = shared_alloc_size;
s->percpu_alloc_size = percpu_alloc_size;
s->histogram_alloc_size = histogram_alloc_size;
s->n_histogram_entries = n_histogram_entries;
s->histogram_boundaries = kmemdup(histogram_boundaries,
s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
if (!s->histogram_boundaries) {
r = -ENOMEM;
goto out;
}
s->program_id = kstrdup(program_id, GFP_KERNEL);
if (!s->program_id) {
@@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
}
if (s->n_histogram_entries) {
unsigned long long *hi;
hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
if (!hi) {
r = -ENOMEM;
goto out;
}
for (ni = 0; ni < n_entries; ni++) {
s->stat_shared[ni].tmp.histogram = hi;
hi += s->n_histogram_entries + 1;
}
}
for_each_possible_cpu(cpu) {
p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
if (!p) {
@@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
goto out;
}
s->stat_percpu[cpu] = p;
if (s->n_histogram_entries) {
unsigned long long *hi;
hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
if (!hi) {
r = -ENOMEM;
goto out;
}
for (ni = 0; ni < n_entries; ni++) {
p[ni].histogram = hi;
hi += s->n_histogram_entries + 1;
}
}
}
/*
@@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
* vfree can't be called from RCU callback
*/
for_each_possible_cpu(cpu)
if (is_vmalloc_addr(s->stat_percpu))
if (is_vmalloc_addr(s->stat_percpu) ||
is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
goto do_sync_free;
if (is_vmalloc_addr(s)) {
if (is_vmalloc_addr(s) ||
is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
synchronize_rcu_expedited();
dm_stat_free(&s->rcu_head);
@@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
dm_stat_round(s, shared, p);
atomic_inc(&shared->in_flight[idx]);
} else {
unsigned long long duration;
dm_stat_round(s, shared, p);
atomic_dec(&shared->in_flight[idx]);
p->sectors[idx] += len;
p->ios[idx] += 1;
p->merges[idx] += stats_aux->merged;
if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
p->ticks[idx] += duration_jiffies;
else
duration = jiffies_to_msecs(duration_jiffies);
} else {
p->ticks[idx] += stats_aux->duration_ns;
duration = stats_aux->duration_ns;
}
if (s->n_histogram_entries) {
unsigned lo = 0, hi = s->n_histogram_entries + 1;
while (lo + 1 < hi) {
unsigned mid = (lo + hi) / 2;
if (s->histogram_boundaries[mid - 1] > duration) {
hi = mid;
} else {
lo = mid;
}
}
p->histogram[lo]++;
}
}
#if BITS_PER_LONG == 32
@@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
dm_stat_round(s, shared, p);
local_irq_enable();
memset(&shared->tmp, 0, sizeof(shared->tmp));
shared->tmp.sectors[READ] = 0;
shared->tmp.sectors[WRITE] = 0;
shared->tmp.ios[READ] = 0;
shared->tmp.ios[WRITE] = 0;
shared->tmp.merges[READ] = 0;
shared->tmp.merges[WRITE] = 0;
shared->tmp.ticks[READ] = 0;
shared->tmp.ticks[WRITE] = 0;
shared->tmp.io_ticks[READ] = 0;
shared->tmp.io_ticks[WRITE] = 0;
shared->tmp.io_ticks_total = 0;
shared->tmp.time_in_queue = 0;
if (s->n_histogram_entries)
memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
for_each_possible_cpu(cpu) {
p = &s->stat_percpu[cpu][x];
shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
if (s->n_histogram_entries) {
unsigned i;
for (i = 0; i < s->n_histogram_entries + 1; i++)
shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
}
}
}
@@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
p->io_ticks_total -= shared->tmp.io_ticks_total;
p->time_in_queue -= shared->tmp.time_in_queue;
local_irq_enable();
if (s->n_histogram_entries) {
unsigned i;
for (i = 0; i < s->n_histogram_entries + 1; i++) {
local_irq_disable();
p = &s->stat_percpu[smp_processor_id()][x];
p->histogram[i] -= shared->tmp.histogram[i];
local_irq_enable();
}
}
}
}
@@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
__dm_stat_init_temporary_percpu_totals(shared, s, x);
DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
(unsigned long long)start,
(unsigned long long)step,
shared->tmp.ios[READ],
@@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
if (s->n_histogram_entries) {
unsigned i;
for (i = 0; i < s->n_histogram_entries + 1; i++) {
DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
}
}
DMEMIT("\n");
if (unlikely(sz + 1 >= maxlen))
goto buffer_overflow;
@@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
return 0;
}
static int parse_histogram(const char *h, unsigned *n_histogram_entries,
unsigned long long **histogram_boundaries)
{
const char *q;
unsigned n;
unsigned long long last;
*n_histogram_entries = 1;
for (q = h; *q; q++)
if (*q == ',')
(*n_histogram_entries)++;
*histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
if (!*histogram_boundaries)
return -ENOMEM;
n = 0;
last = 0;
while (1) {
unsigned long long hi;
int s;
char ch;
s = sscanf(h, "%llu%c", &hi, &ch);
if (!s || (s == 2 && ch != ','))
return -EINVAL;
if (hi <= last)
return -EINVAL;
last = hi;
(*histogram_boundaries)[n] = hi;
if (s == 1)
return 0;
h = strchr(h, ',') + 1;
n++;
}
}
static int message_stats_create(struct mapped_device *md,
unsigned argc, char **argv,
char *result, unsigned maxlen)
{
int r;
int id;
char dummy;
unsigned long long start, end, len, step;
@@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
const char *program_id, *aux_data;
unsigned stat_flags = 0;
unsigned n_histogram_entries = 0;
unsigned long long *histogram_boundaries = NULL;
struct dm_arg_set as, as_backup;
const char *a;
unsigned feature_args;
@@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
*/
if (argc < 3)
return -EINVAL;
goto ret_einval;
as.argc = argc;
as.argv = argv;
@@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
len = 1;
} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
start != (sector_t)start || len != (sector_t)len)
return -EINVAL;
goto ret_einval;
end = start + len;
if (start >= end)
return -EINVAL;
goto ret_einval;
a = dm_shift_arg(&as);
if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
@@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
step = 1;
} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
step != (sector_t)step || !step)
return -EINVAL;
goto ret_einval;
as_backup = as;
a = dm_shift_arg(&as);
@@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
while (feature_args--) {
a = dm_shift_arg(&as);
if (!a)
return -EINVAL;
goto ret_einval;
if (!strcasecmp(a, "precise_timestamps"))
stat_flags |= STAT_PRECISE_TIMESTAMPS;
else
return -EINVAL;
else if (!strncasecmp(a, "histogram:", 10)) {
if (n_histogram_entries)
goto ret_einval;
if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
goto ret;
} else
goto ret_einval;
}
} else {
as = as_backup;
@@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
aux_data = a;
if (as.argc)
return -EINVAL;
goto ret_einval;
/*
* If a buffer overflow happens after we created the region,
@@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
* leaked). So we must detect buffer overflow in advance.
*/
snprintf(result, maxlen, "%d", INT_MAX);
if (dm_message_test_buffer_overflow(result, maxlen))
return 1;
if (dm_message_test_buffer_overflow(result, maxlen)) {
r = 1;
goto ret;
}
id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
n_histogram_entries, histogram_boundaries, program_id, aux_data,
dm_internal_suspend_fast, dm_internal_resume_fast, md);
if (id < 0)
return id;
if (id < 0) {
r = id;
goto ret;
}
snprintf(result, maxlen, "%d", id);
return 1;
r = 1;
goto ret;
ret_einval:
r = -EINVAL;
ret:
kfree(histogram_boundaries);
return r;
}
static int message_stats_delete(struct mapped_device *md,