lightnvm: physical block device (pblk) target

This patch introduces pblk, a host-side translation layer for
Open-Channel SSDs to expose them like block devices. The translation
layer allows data placement decisions, and I/O scheduling to be
managed by the host, enabling users to optimize the SSD for their
specific workloads.

An open-channel SSD has a set of LUNs (parallel units) and a
collection of blocks. Each block can be read in any order, but
writes must be sequential. Writes may also fail, and if a block
requires it, must also be reset before new writes can be
applied.

To manage the constraints, pblk maintains a logical to
physical address (L2P) table, a write cache, garbage
collection logic, recovery scheme, and logic to rate-limit
user I/Os versus garbage collection I/Os.

The L2P table is fully-associative and manages sectors at a
4KB granularity. Pblk stores the L2P table in two places, in
the out-of-band area of the media and on the last page of a
line. In the case of a power failure, pblk will perform a
scan to recover the L2P table.

The user data is organized into lines. A line is data
striped across blocks and LUNs. The lines enable the host to
reduce the amount of metadata to maintain besides the user
data and makes it easier to implement RAID or erasure coding
in the future.

pblk implements multi-tenant support and can be instantiated
multiple times on the same drive. Each instance owns a
portion of the SSD - both regarding I/O bandwidth and
capacity - providing I/O isolation for each case.

Finally, pblk also exposes a sysfs interface that allows
user-space to peek into the internals of pblk. The interface
is available at /sys/block/*/pblk/ where * is the block
device name exposed.

This work also contains contributions from:
  Matias Bjørling <matias@cnexlabs.com>
  Simon A. F. Lund <slund@cnexlabs.com>
  Young Tack Jin <youngtack.jin@gmail.com>
  Huaicheng Li <huaicheng@cs.uchicago.edu>

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
This commit is contained in:
Javier González
2017-04-15 20:55:50 +02:00
committed by Jens Axboe
parent 6eb082452d
commit a4bd217b43
15 changed files with 8044 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
pblk: Physical Block Device Target
==================================
pblk implements a fully associative, host-based FTL that exposes a traditional
block I/O interface. Its primary responsibilities are:
- Map logical addresses onto physical addresses (4KB granularity) in a
logical-to-physical (L2P) table.
- Maintain the integrity and consistency of the L2P table as well as its
recovery from normal tear down and power outage.
- Deal with controller- and media-specific constraints.
- Handle I/O errors.
- Implement garbage collection.
- Maintain consistency across the I/O stack during synchronization points.
For more information please refer to:
http://lightnvm.io
which maintains updated FAQs, manual pages, technical documentation, tools,
contacts, etc.

View File

@@ -33,4 +33,13 @@ config NVM_RRPC
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target"
---help---
Allows an open-channel SSD to be exposed as a block device to the
host. The target assumes the device exposes raw flash and must be
explicitly managed by the host.
Please note the disk format is considered EXPERIMENTAL for now.
endif # NVM

View File

@@ -4,3 +4,8 @@
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \
pblk-gc.o pblk-recovery.o pblk-map.o \
pblk-rl.o pblk-sysfs.o

View File

@@ -0,0 +1,114 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-cache.c - pblk's write cache
*/
#include "pblk.h"
/*
 * Copy the payload of a user bio into the write buffer, one
 * PBLK_EXPOSED_PAGE_SIZE chunk per buffer entry, then kick the write thread.
 *
 * The caller yields with io_schedule() for as long as
 * pblk_rb_may_write_user() answers NVM_IO_REQUEUE. A bio that carries no data
 * skips the copy and only kicks the writer.
 *
 * Returns whatever pblk_rb_may_write_user() returned on its last invocation.
 */
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
{
	struct pblk_w_ctx w_ctx;
	sector_t lba = pblk_get_lba(bio);
	unsigned int bpos, pos;
	int nr_entries = pblk_get_secs(bio);
	int i, ret;

	/* Update the write buffer head (mem) with the entries that we can
	 * write. The write in itself cannot fail, so there is no need to
	 * rollback from here on.
	 */
	for (;;) {
		ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries,
					     &bpos);
		if (ret != NVM_IO_REQUEUE)
			break;
		io_schedule();
	}

	if (likely(bio_has_data(bio))) {
		w_ctx.flags = flags;
		pblk_ppa_set_empty(&w_ctx.ppa);

		for (i = 0; i < nr_entries; i++) {
			void *data = bio_data(bio);

			w_ctx.lba = lba + i;

			/* Entry positions wrap around the ring buffer */
			pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
			pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);

			bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
		}

#ifdef CONFIG_NVM_DEBUG
		atomic_long_add(nr_entries, &pblk->inflight_writes);
		atomic_long_add(nr_entries, &pblk->req_writes);
#endif
	}

	pblk_write_should_kick(pblk);
	return ret;
}
/*
 * GC write path into the write buffer. On GC the incoming lbas are not
 * necessarily sequential, and slots that the GC thread marked as ADDR_EMPTY
 * are skipped: they consume neither a buffer entry nor a page of @data.
 *
 * Space for @nr_rec_entries entries is requested up front; the caller yields
 * with io_schedule() until pblk_rb_may_write_gc() grants it. A one-time
 * warning is raised if the number of entries actually copied differs from
 * @nr_rec_entries.
 *
 * Always returns NVM_IO_OK.
 */
int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
			   unsigned int nr_entries, unsigned int nr_rec_entries,
			   struct pblk_line *gc_line, unsigned long flags)
{
	struct pblk_w_ctx w_ctx;
	unsigned int bpos, pos;
	int i, valid_entries = 0;

	/* Update the write buffer head (mem) with the entries that we can
	 * write. The write in itself cannot fail, so there is no need to
	 * rollback from here on.
	 */
	while (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos))
		io_schedule();

	w_ctx.flags = flags;
	pblk_ppa_set_empty(&w_ctx.ppa);

	for (i = 0; i < nr_entries; i++) {
		if (lba_list[i] != ADDR_EMPTY) {
			w_ctx.lba = lba_list[i];

			pos = pblk_rb_wrap_pos(&pblk->rwb,
					       bpos + valid_entries);
			pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx,
					       gc_line, pos);

			/* Only valid entries advance the data cursor */
			data += PBLK_EXPOSED_PAGE_SIZE;
			valid_entries++;
		}
	}

	WARN_ONCE(nr_rec_entries != valid_entries,
		  "pblk: inconsistent GC write\n");

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(valid_entries, &pblk->inflight_writes);
	atomic_long_add(valid_entries, &pblk->recov_gc_writes);
#endif

	pblk_write_should_kick(pblk);
	return NVM_IO_OK;
}

1655
drivers/lightnvm/pblk-core.c Normal file

File diff suppressed because it is too large Load Diff

555
drivers/lightnvm/pblk-gc.c Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

136
drivers/lightnvm/pblk-map.c Normal file
View File

@@ -0,0 +1,136 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-map.c - pblk's lba-ppa mapping strategy
*
*/
#include "pblk.h"
/*
 * Map one write unit of pblk->min_write_pgs sectors onto the current data line.
 *
 * @sentry:     position of the first write buffer entry covered by this unit
 * @ppa_list:   output, receives one device address per sector of the unit
 * @lun_bitmap: handed unchanged to pblk_down_rq()
 * @meta_list:  output, per-sector metadata; the lba field is filled in
 * @valid_secs: number of leading sectors in the unit that carry buffer data;
 *              the remaining sectors are treated as padding
 *
 * The starting line-relative address comes from pblk_alloc_page() and is
 * incremented once per sector. Valid sectors take a reference on the line,
 * record their ppa in the write context, store the lba in both meta_list and
 * the line's emeta lba list, and bump emeta->nr_valid_lbas. Padding sectors
 * are recorded as ADDR_EMPTY and passed to pblk_map_pad_invalidate().
 *
 * NOTE(review): when the line becomes full and pblk_line_replace_data()
 * returns NULL, the function returns without calling pblk_down_rq() and has
 * no way to report this to the caller - confirm that callers/completion
 * (which calls pblk_up_rq()) tolerate this.
 */
static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
struct ppa_addr *ppa_list,
unsigned long *lun_bitmap,
struct pblk_sec_meta *meta_list,
unsigned int valid_secs)
{
struct pblk_line *line = pblk_line_get_data(pblk);
struct line_emeta *emeta = line->emeta;
struct pblk_w_ctx *w_ctx;
__le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
u64 paddr;
int nr_secs = pblk->min_write_pgs;
int i;
paddr = pblk_alloc_page(pblk, line, nr_secs);
for (i = 0; i < nr_secs; i++, paddr++) {
/* ppa to be sent to the device */
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
/* Write context for target bio completion on write buffer. Note
* that the write buffer is protected by the sync backpointer,
* and a single writer thread have access to each specific entry
* at a time. Thus, it is safe to modify the context for the
* entry we are setting up for submission without taking any
* lock or memory barrier.
*/
if (i < valid_secs) {
/* One line reference is taken per valid sector mapped */
kref_get(&line->ref);
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
w_ctx->ppa = ppa_list[i];
meta_list[i].lba = cpu_to_le64(w_ctx->lba);
lba_list[paddr] = cpu_to_le64(w_ctx->lba);
le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
} else {
/* Padding sector: no lba, invalidated right away */
meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
pblk_map_pad_invalidate(pblk, line, paddr);
}
}
/* Switch to a new data line once the current one has been filled */
if (pblk_line_is_full(line)) {
line = pblk_line_replace_data(pblk);
if (!line)
return;
}
pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
}
/*
 * Map the sectors of @rqd starting at index @off, one write unit of
 * pblk->min_write_pgs sectors per iteration.
 *
 * A unit that would extend beyond @valid_secs is mapped with only
 * (valid_secs % min_write_pgs) valid sectors; pblk_map_page_data() treats the
 * rest of that unit as padding.
 */
void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
		 unsigned long *lun_bitmap, unsigned int valid_secs,
		 unsigned int off)
{
	struct pblk_sec_meta *meta_list = rqd->meta_list;
	int unit = pblk->min_write_pgs;
	int i = off;

	while (i < rqd->nr_ppas) {
		unsigned int map_secs = unit;

		if (i + unit > valid_secs)
			map_secs = valid_secs % unit;

		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
				   lun_bitmap, &meta_list[i], map_secs);
		i += unit;
	}
}
/*
 * Map a write request and, while doing so, select one block of the next data
 * line (as returned by pblk_line_get_data_next()) to be erased. The chosen
 * address is reported through @erase_ppa.
 *
 * Only if erase_ppa is set from within the mapping loop is the erase
 * semaphore acquired (via down_trylock()). Once a block has been chosen, the
 * remaining write units are mapped through pblk_map_rq().
 *
 * If the loop completes and @erase_ppa is still empty, the first block that is
 * not yet set in the next line's erase bitmap (if any) is selected instead.
 */
void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
		       unsigned int sentry, unsigned long *lun_bitmap,
		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line *e_line = pblk_line_get_data_next(pblk);
	struct pblk_sec_meta *meta_list = rqd->meta_list;
	struct pblk_line_meta *lm;
	unsigned int map_secs;
	int min = pblk->min_write_pgs;
	int i, erase_lun;

	for (i = 0; i < rqd->nr_ppas; i += min) {
		map_secs = min;
		if (i + min > valid_secs)
			map_secs = valid_secs % min;

		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
				   lun_bitmap, &meta_list[i], map_secs);

		erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
			    rqd->ppa_list[i].g.ch;

		/* Block on this LUN already accounted for in the next line */
		if (test_bit(erase_lun, e_line->erase_bitmap))
			continue;

		/* Could not get the erase semaphore; try on a later unit */
		if (down_trylock(&pblk->erase_sem))
			continue;

		set_bit(erase_lun, e_line->erase_bitmap);
		e_line->left_eblks--;

		*erase_ppa = rqd->ppa_list[i];
		erase_ppa->g.blk = e_line->id;

		/* Avoid evaluating e_line->left_eblks */
		pblk_map_rq(pblk, rqd, sentry, lun_bitmap, valid_secs,
			    i + min);
		return;
	}

	if (likely(!ppa_empty(*erase_ppa)))
		return;

	/* Erase blocks that are bad in this line but might not be in next */
	lm = &pblk->lm;

	i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line);
	if (i == lm->blk_per_line)
		return;

	set_bit(i, e_line->erase_bitmap);
	e_line->left_eblks--;

	*erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
	erase_ppa->g.blk = e_line->id;
}

852
drivers/lightnvm/pblk-rb.c Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

182
drivers/lightnvm/pblk-rl.c Normal file
View File

@@ -0,0 +1,182 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-rl.c - pblk's rate limiter for user I/O
*
*/
#include "pblk.h"
/* (Re)arm the user I/O timer so that it expires 5000 ms from now. */
static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
{
	unsigned long expires = jiffies + msecs_to_jiffies(5000);

	mod_timer(&rl->u_timer, expires);
}
/*
 * Check whether @nr_entries more user entries fit within the user share of the
 * write buffer (rl->rb_user_max).
 *
 * Returns non-zero if the insertion is allowed, 0 otherwise.
 */
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
{
	int in_buffer = atomic_read(&rl->rb_user_cnt);

	return in_buffer + nr_entries <= rl->rb_user_max;
}
/*
 * Check whether GC may insert @nr_entries entries into the write buffer.
 *
 * GC is held to rl->rb_gc_max only while user I/O is flagged as active;
 * otherwise the insertion is always allowed.
 *
 * Returns non-zero if the insertion is allowed, 0 otherwise.
 */
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
{
	int gc_in_buffer = atomic_read(&rl->rb_gc_cnt);
	int user_active = READ_ONCE(rl->rb_user_active);

	/* If there is no user I/O let GC take over space on the write buffer */
	if (!user_active)
		return 1;

	return gc_in_buffer + nr_entries <= rl->rb_gc_max;
}
/*
 * Account for @nr_entries user entries entering the write buffer, flag user
 * I/O as active and restart the user I/O timer.
 */
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
{
	atomic_add(nr_entries, &rl->rb_user_cnt);

	/* Publish the active state with release semantics; the flag is read
	 * on the GC insertion path (pblk_rl_gc_may_insert()).
	 */
	smp_store_release(&rl->rb_user_active, 1);

	pblk_rl_kick_u_timer(rl);
}
/* Account for @nr_entries GC entries entering the write buffer. */
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
{
	atomic_add(nr_entries, &rl->rb_gc_cnt);
}
/*
 * Account for entries leaving the write buffer: @nr_user is subtracted from
 * the user counter and @nr_gc from the GC counter.
 */
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
{
	atomic_sub(nr_user, &rl->rb_user_cnt);
	atomic_sub(nr_gc, &rl->rb_gc_cnt);
}
/* Return the current value of the rate limiter's free-block counter. */
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
{
	unsigned long nr_free = atomic_read(&rl->free_blocks);

	return nr_free;
}
/*
 * Recompute how the write buffer budget @max is split between user I/O and
 * GC, based on the total number of free blocks in the pblk instance.
 *
 * - At or above rl->high free blocks: user I/O gets everything except the GC
 *   reserve (rl->rb_gc_rsv) and the state becomes PBLK_RL_HIGH.
 * - Below rl->high: the user share is scaled down with the free-block count,
 *   GC receives the remainder but never less than its reserve, and the state
 *   becomes PBLK_RL_MID (more than rl->low free blocks) or PBLK_RL_LOW.
 *
 * Returns the new rl->rb_state.
 */
static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
{
	unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
	int shift, user_windows, user_max, gc_max;

	if (free_blocks >= rl->high) {
		rl->rb_user_max = max - rl->rb_gc_rsv;
		rl->rb_gc_max = rl->rb_gc_rsv;
		rl->rb_state = PBLK_RL_HIGH;

		return rl->rb_state;
	}

	/* Fewer free blocks than the high threshold: throttle user I/O */
	shift = rl->high_pw - rl->rb_windows_pw;
	user_windows = free_blocks >> shift;
	user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;

	rl->rb_user_max = user_max;
	gc_max = max - rl->rb_user_max;
	rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);

	rl->rb_state = (free_blocks > rl->low) ? PBLK_RL_MID : PBLK_RL_LOW;

	return rl->rb_state;
}
/*
 * Set the number of write buffer entries reserved for GC. The current GC
 * maximum is set to the same value.
 */
void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
{
	rl->rb_gc_max = rsv;
	rl->rb_gc_rsv = rl->rb_gc_max;
}
/*
 * Add the blocks of @line (line->blk_in_line) to the free-block counter,
 * refresh the user/GC rates and start or stop GC according to the resulting
 * rate limiter state.
 */
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
	struct pblk *pblk = container_of(rl, struct pblk, rl);
	int ret;

	atomic_add(line->blk_in_line, &rl->free_blocks);

	/* Rates will not change that often - no need to lock update */
	ret = pblk_rl_update_rates(rl, rl->rb_budget);

	/* pblk_rl_update_rates() returns exactly one state, so it has to be
	 * compared against each state individually. Testing for equality with
	 * (PBLK_RL_MID | PBLK_RL_LOW) does not match MID and LOW separately,
	 * which kept GC from being started from here.
	 */
	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
		pblk_gc_should_start(pblk);
	else
		pblk_gc_should_stop(pblk);
}
/*
 * Subtract the blocks of @line (line->blk_in_line) from the free-block
 * counter, refresh the user/GC rates and start or stop GC according to the
 * resulting rate limiter state.
 */
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
{
	struct pblk *pblk = container_of(rl, struct pblk, rl);
	int ret;

	atomic_sub(line->blk_in_line, &rl->free_blocks);

	/* Rates will not change that often - no need to lock update */
	ret = pblk_rl_update_rates(rl, rl->rb_budget);

	/* pblk_rl_update_rates() returns exactly one state, so it has to be
	 * compared against each state individually. Testing for equality with
	 * (PBLK_RL_MID | PBLK_RL_LOW) does not match MID and LOW separately,
	 * which kept GC from being started from here.
	 */
	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
		pblk_gc_should_start(pblk);
	else
		pblk_gc_should_stop(pblk);
}
/* Return the rate limiter's high free-block threshold (rl->high). */
int pblk_rl_gc_thrs(struct pblk_rl *rl)
{
	int thrs = rl->high;

	return thrs;
}
/* Return the current user I/O share of the write buffer (rl->rb_user_max). */
int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
{
	int user_rate = rl->rb_user_max;

	return user_rate;
}
/*
 * User I/O timer callback. @data carries the struct pblk_rl pointer that was
 * registered with setup_timer(). Expiry clears the user-active flag.
 */
static void pblk_rl_u_timer(unsigned long data)
{
	struct pblk_rl *rl;

	rl = (struct pblk_rl *)data;

	/* Clear the active state with release semantics; the flag is read on
	 * the GC insertion path.
	 */
	smp_store_release(&rl->rb_user_active, 0);
}
/*
 * Tear down the rate limiter by deactivating the user I/O timer.
 *
 * NOTE(review): del_timer() does not wait for a handler that is already
 * running; confirm that @rl outlives any in-flight pblk_rl_u_timer() call.
 */
void pblk_rl_free(struct pblk_rl *rl)
{
	del_timer(&rl->u_timer);
}
/*
 * Initialize the rate limiter for a write buffer of @budget entries.
 *
 * rl->total_blocks must already be set by the caller: the high/low free-block
 * thresholds are derived from it. The GC reserve (rl->rb_gc_rsv) is not
 * touched here.
 */
void pblk_rl_init(struct pblk_rl *rl, int budget)
{
	unsigned int rb_windows;

	/* Free-block thresholds and the order of the high threshold */
	rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
	rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
	rl->high_pw = get_count_order(rl->high);

	/* This will always be a power-of-2 */
	rb_windows = budget / PBLK_MAX_REQ_ADDRS;
	rl->rb_windows_pw = get_count_order(rb_windows) + 1;

	/* To start with, all buffer is available to user I/O writers */
	rl->rb_budget = budget;
	rl->rb_state = PBLK_RL_HIGH;

	rl->rb_user_max = budget;
	rl->rb_gc_max = 0;

	atomic_set(&rl->rb_user_cnt, 0);
	atomic_set(&rl->rb_gc_cnt, 0);

	/* The timer is only registered here; pblk_rl_user_in() arms it */
	setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
	rl->rb_user_active = 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,411 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-write.c - pblk's write path from write buffer to media
*/
#include "pblk.h"
/*
 * Account for one sector of @line having been synced. When the line's
 * left_ssecs counter reaches zero, pblk_line_close_ws is scheduled for the
 * line through pblk_line_run_ws().
 */
static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
{
#ifdef CONFIG_NVM_DEBUG
	atomic_long_inc(&pblk->sync_writes);
#endif

	/* Counter protected by rb sync lock */
	if (--line->left_ssecs)
		return;

	pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
}
static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct bio *original_bio;
unsigned long ret;
int i;
for (i = 0; i < c_ctx->nr_valid; i++) {
struct pblk_w_ctx *w_ctx;
struct ppa_addr p;
struct pblk_line *line;
w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
p = rqd->ppa_list[i];
line = &pblk->lines[pblk_dev_ppa_to_line(p)];
pblk_sync_line(pblk, line);
while ((original_bio = bio_list_pop(&w_ctx->bios)))
bio_endio(original_bio);
}
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
#endif
ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
if (rqd->meta_list)
nvm_dev_dma_free(dev->parent, rqd->meta_list,
rqd->dma_meta_list);
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, WRITE);
return ret;
}
/*
 * Finish a write request that had been parked on the completion list:
 * unlink its context from the list, then complete it like any other write.
 *
 * Returns the value of pblk_end_w_bio().
 */
static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
					   struct nvm_rq *rqd,
					   struct pblk_c_ctx *c_ctx)
{
	unsigned long new_sync;

	list_del(&c_ctx->list);
	new_sync = pblk_end_w_bio(pblk, rqd, c_ctx);

	return new_sync;
}
/*
 * Complete a write request, keeping completions in write buffer order.
 *
 * After releasing the request's per-LUN resources through pblk_up_rq(), the
 * current sync position is read via pblk_rb_sync_init(). If this request
 * starts exactly at that position it is ended immediately, and the completion
 * list is then searched repeatedly for a parked request that starts at the new
 * sync position; each match is ended and the search restarts from the head of
 * the list. Otherwise the request is parked at the tail of pblk->compl_list
 * until the requests ahead of it have completed.
 *
 * Everything between pblk_rb_sync_init() and pblk_rb_sync_end() uses the same
 * flags word (presumably an irqsave-style sync lock - confirm in pblk-rb.c).
 */
static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct pblk_c_ctx *c, *r;
unsigned long flags;
unsigned long pos;
#ifdef CONFIG_NVM_DEBUG
atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif
pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
pos = pblk_rb_sync_init(&pblk->rwb, &flags);
if (pos == c_ctx->sentry) {
/* In order: end it now; pos becomes the new sync position */
pos = pblk_end_w_bio(pblk, rqd, c_ctx);
retry:
list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
rqd = nvm_rq_from_c_ctx(c);
if (c->sentry == pos) {
/* The list was modified: rescan from the head with the new pos */
pos = pblk_end_queued_w_bio(pblk, rqd, c);
goto retry;
}
}
} else {
/* Out of order: park the context until its turn comes */
WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
list_add_tail(&c_ctx->list, &pblk->compl_list);
}
pblk_rb_sync_end(&pblk->rwb, &flags);
}
/* When a write fails, we are not sure whether the block has grown bad or a page
 * range is more susceptible to write errors. If a high number of pages fail, we
 * assume that the block is bad and we mark it accordingly. In all cases, we
 * remap and resubmit the failed entries as fast as possible; if a flush is
 * waiting on a completion, the whole stack would stall otherwise.
 *
 * Each bit set in rqd->ppa_status identifies a failed sector. The matching
 * write buffer entries are collected on a recovery context, which is handed to
 * pblk_recov_setup_rq() and then queued as work (pblk_submit_rec) on
 * pblk->kw_wq. The original request is completed through
 * pblk_complete_write() in every case except when the recovery context itself
 * cannot be allocated.
 *
 * The recovery context is returned to pblk->rec_pool on every path that does
 * not queue the recovery work; once queued, it is no longer touched here.
 *
 * NOTE(review): if mempool_alloc() fails the function returns without
 * completing the request - confirm this is the intended behaviour.
 */
static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
{
	void *comp_bits = &rqd->ppa_status;
	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
	struct pblk_rec_ctx *recovery;
	struct ppa_addr *ppa_list = rqd->ppa_list;
	int nr_ppas = rqd->nr_ppas;
	unsigned int c_entries;
	int bit, ret;

	/* Single-sector requests carry their address inline */
	if (unlikely(nr_ppas == 1))
		ppa_list = &rqd->ppa_addr;

	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
	if (!recovery) {
		pr_err("pblk: could not allocate recovery context\n");
		return;
	}

	INIT_LIST_HEAD(&recovery->failed);

	bit = -1;
	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
		struct pblk_rb_entry *entry;
		struct ppa_addr ppa;

		/* Logic error */
		if (bit > c_ctx->nr_valid) {
			/* WARN_ONCE() takes the condition first; passing only
			 * the string made it the condition and dropped the
			 * message.
			 */
			WARN_ONCE(1, "pblk: corrupted write request\n");
			mempool_free(recovery, pblk->rec_pool);
			goto out;
		}

		ppa = ppa_list[bit];
		entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
		if (!entry) {
			pr_err("pblk: could not scan entry on write failure\n");
			mempool_free(recovery, pblk->rec_pool);
			goto out;
		}

		/* The list is filled first and emptied afterwards. No need for
		 * protecting it with a lock
		 */
		list_add_tail(&entry->index, &recovery->failed);
	}

	/* Index of the first failed sector in the request */
	c_entries = find_first_bit(comp_bits, nr_ppas);
	ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
	if (ret) {
		pr_err("pblk: could not recover from write failure\n");
		mempool_free(recovery, pblk->rec_pool);
		goto out;
	}

	INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
	queue_work(pblk->kw_wq, &recovery->ws_rec);

out:
	pblk_complete_write(pblk, rqd, c_ctx);
}
/*
 * Completion callback for write requests (installed as rqd->end_io).
 *
 * A request that completed with an error is logged and routed to the write
 * failure path; otherwise it is completed normally. Debug builds additionally
 * warn once if the bio reports an error although the request did not.
 */
static void pblk_end_io_write(struct nvm_rq *rqd)
{
	struct pblk *pblk = rqd->private;
	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);

	if (rqd->error) {
		pblk_log_write_err(pblk, rqd);
		pblk_end_w_fail(pblk, rqd);
		return;
	}

#ifdef CONFIG_NVM_DEBUG
	WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
#endif

	pblk_complete_write(pblk, rqd, c_ctx);
}
/*
 * Fill in the generic fields of a write request for @nr_secs sectors and
 * allocate its DMA metadata buffer.
 *
 * For multi-sector requests the ppa list is placed inside the same DMA
 * buffer, pblk_dma_meta_size past its start; single-sector requests leave
 * the ppa list pointers untouched.
 *
 * Returns 0 on success or -ENOMEM if the DMA buffer cannot be allocated.
 */
static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
			   unsigned int nr_secs)
{
	struct nvm_tgt_dev *dev = pblk->dev;

	/* Setup write request */
	rqd->opcode = NVM_OP_PWRITE;
	rqd->nr_ppas = nr_secs;
	rqd->flags = pblk_set_progr_mode(pblk, WRITE);
	rqd->private = pblk;
	rqd->end_io = pblk_end_io_write;

	rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
					   &rqd->dma_meta_list);
	if (!rqd->meta_list)
		return -ENOMEM;

	if (likely(nr_secs != 1)) {
		rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
		rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
	}

	return 0;
}
/*
 * Prepare a write request for submission: allocate the LUN bitmap and the
 * request's DMA buffers, map the buffered sectors to physical addresses and,
 * if the next data line still has blocks left to erase, submit one
 * asynchronous erase alongside the write.
 *
 * If the erase submission fails, the bookkeeping done while choosing the
 * block is undone (left_eblks, erase bitmap bit, erase semaphore).
 *
 * Allocation failures return immediately. The erase address is only
 * initialised after both allocations succeed, so the erase handling must not
 * be reached on those paths.
 *
 * Returns 0 on success, -ENOMEM if the LUN bitmap cannot be allocated, or the
 * error returned by pblk_alloc_w_rq().
 */
static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
			   struct pblk_c_ctx *c_ctx)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line *e_line = pblk_line_get_data_next(pblk);
	struct ppa_addr erase_ppa;
	unsigned int valid = c_ctx->nr_valid;
	unsigned int padded = c_ctx->nr_padded;
	unsigned int nr_secs = valid + padded;
	unsigned long *lun_bitmap;
	int ret;

	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
	if (!lun_bitmap)
		return -ENOMEM;
	c_ctx->lun_bitmap = lun_bitmap;

	ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
	if (ret) {
		kfree(lun_bitmap);
		return ret;
	}

	ppa_set_empty(&erase_ppa);
	if (likely(!e_line || !e_line->left_eblks))
		pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
	else
		pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
				  valid, &erase_ppa);

	if (unlikely(e_line && !ppa_empty(erase_ppa))) {
		if (pblk_blk_erase_async(pblk, erase_ppa)) {
			struct nvm_tgt_dev *dev = pblk->dev;
			struct nvm_geo *geo = &dev->geo;
			int bit;

			/* Erase was not submitted: roll back the selection */
			e_line->left_eblks++;
			bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
			WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
			up(&pblk->erase_sem);
		}
	}

	return 0;
}
/*
 * Prepare a recovery write request: allocate the LUN bitmap and the request's
 * DMA buffers, map the c_ctx->nr_valid buffered sectors, and reset the
 * per-sector completion status.
 *
 * rqd->nr_ppas must already be set by the caller.
 *
 * Returns 0 on success, -ENOMEM if the LUN bitmap cannot be allocated, or the
 * error returned by pblk_alloc_w_rq(). The LUN bitmap is freed again if
 * pblk_alloc_w_rq() fails.
 */
int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
			struct pblk_c_ctx *c_ctx)
{
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned long *lun_bitmap;
	int ret;

	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
	if (!lun_bitmap)
		return -ENOMEM;
	c_ctx->lun_bitmap = lun_bitmap;

	ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
	if (ret) {
		/* Do not leak the bitmap when request setup fails */
		kfree(lun_bitmap);
		return ret;
	}

	pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);

	rqd->ppa_status = (u64)0;
	rqd->flags = pblk_set_progr_mode(pblk, WRITE);

	return ret;
}
/*
 * Compute how many sectors to submit in the next write, given @secs_avail
 * sectors available in the write buffer and @secs_to_flush sectors pending on
 * a sync point. The calculation itself is delegated to pblk_calc_secs().
 *
 * Debug builds log an error when the result looks inconsistent: zero although
 * a flush is pending, negative, or larger than what is available while no
 * flush is pending.
 *
 * Returns the value of pblk_calc_secs().
 */
static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
				  unsigned int secs_to_flush)
{
	int secs_to_sync;

	secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);

#ifdef CONFIG_NVM_DEBUG
	if ((!secs_to_sync && secs_to_flush)
			|| (secs_to_sync < 0)
			|| (secs_to_sync > secs_avail && !secs_to_flush)) {
		/* secs_avail and secs_to_flush are unsigned: print with %u */
		pr_err("pblk: bad sector calculation (a:%u,s:%d,f:%u)\n",
		       secs_avail, secs_to_sync, secs_to_flush);
	}
#endif

	return secs_to_sync;
}
/*
 * Build and submit one write request from the contents of the write buffer.
 *
 * Nothing is submitted when the buffer is empty, or when it holds fewer than
 * pblk->min_write_pgs sectors and no sync point is pending.
 *
 * Returns 0 if a request was submitted, 1 otherwise (nothing to do, or any
 * failure along the way). The caller in this file (pblk_write_ts()) sleeps on
 * a non-zero return.
 *
 * NOTE(review): the failure labels release the padded pages, the bio and the
 * request, but not rqd->meta_list or c_ctx->lun_bitmap, which
 * pblk_setup_w_rq() has allocated by the time pblk_submit_io() can fail; the
 * read pointer commit (pblk_rb_read_commit()) is not rolled back either.
 * Confirm whether these are handled elsewhere.
 */
static int pblk_submit_write(struct pblk *pblk)
{
struct bio *bio;
struct nvm_rq *rqd;
struct pblk_c_ctx *c_ctx;
unsigned int pgs_read;
unsigned int secs_avail, secs_to_sync, secs_to_com;
unsigned int secs_to_flush;
unsigned long pos;
int err;
/* If there are no sectors in the cache, flushes (bios without data)
* will be cleared on the cache threads
*/
secs_avail = pblk_rb_read_count(&pblk->rwb);
if (!secs_avail)
return 1;
/* Without a pending sync point, wait until a full write unit is buffered */
secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;
rqd = pblk_alloc_rqd(pblk, WRITE);
if (IS_ERR(rqd)) {
pr_err("pblk: cannot allocate write req.\n");
return 1;
}
/* The completion context lives in the request's private data area */
c_ctx = nvm_rq_to_pdu(rqd);
bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
if (!bio) {
pr_err("pblk: cannot allocate write bio\n");
goto fail_free_rqd;
}
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
rqd->bio = bio;
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pr_err("pblk: bad buffer sync calculation\n");
goto fail_put_bio;
}
/* Never commit more sectors than the buffer currently holds */
secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
secs_to_sync, secs_avail);
if (!pgs_read) {
pr_err("pblk: corrupted write bio\n");
goto fail_put_bio;
}
/* Back the padded sectors with pages so the bio covers secs_to_sync */
if (c_ctx->nr_padded)
if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
goto fail_put_bio;
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, c_ctx);
if (err) {
pr_err("pblk: could not setup write request\n");
goto fail_free_bio;
}
err = pblk_submit_io(pblk, rqd);
if (err) {
pr_err("pblk: I/O submission failed: %d\n", err);
goto fail_free_bio;
}
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif
return 0;
/* Unwind in reverse order of acquisition */
fail_free_bio:
if (c_ctx->nr_padded)
pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
fail_put_bio:
bio_put(bio);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, WRITE);
return 1;
}
/*
 * Write thread main loop. @data is the struct pblk instance.
 *
 * Submits writes back to back for as long as pblk_submit_write() succeeds
 * (returns 0). When it reports that nothing was submitted, the thread marks
 * itself TASK_INTERRUPTIBLE and yields until it is woken up again. The loop
 * ends once kthread_should_stop() is true.
 *
 * Always returns 0.
 */
int pblk_write_ts(void *data)
{
	struct pblk *pblk = data;

	while (!kthread_should_stop()) {
		if (pblk_submit_write(pblk)) {
			/* Nothing submitted: sleep until kicked */
			set_current_state(TASK_INTERRUPTIBLE);
			io_schedule();
		}
	}

	return 0;
}

1121
drivers/lightnvm/pblk.h Normal file

File diff suppressed because it is too large Load Diff