Merge tag 'bcachefs-2024-07-18.2' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs updates from Kent Overstreet:

 - Metadata version 1.8: Stripe sectors accounting, BCH_DATA_unstriped

   This splits out the accounting of dirty sectors and stripe sectors in
   alloc keys; this lets us see stripe buckets that still have unstriped
   data in them.

   This is needed for ensuring that erasure coding is working correctly,
   as well as completing stripe creation after a crash.

 - Metadata version 1.9: Disk accounting rewrite

   The previous disk accounting scheme relied heavily on percpu counters
   that were also sharded by outstanding journal buffer; it was fast but
   not extensible or scalable, and meant that all accounting counters
   were recorded in every journal entry.

   The new disk accounting scheme stores accounting as normal btree
   keys; updates are deltas until they are flushed by the btree write
   buffer.

   This means we have no practical limit on the number of counters, and
   a new tagged union format that's easy to extend.

   We now have counters for compression type/ratio, per-snapshot-id
   usage, per-btree-id usage, and pending rebalance work.

 - Self healing on read IO/checksum error

   Data is now automatically rewritten if we get a read error and then a
   successful retry

 - Mount API conversion (thanks to Thomas Bertschinger)

 - Better lockdep coverage

   Previously, btree node locks were tracked individually by lockdep,
   like any other lock. But we may take _many_ btree node locks
   simultaneously, we easily blow through the limit of 48 locks that
   lockdep can track, leading to lockdep turning itself off.

   Tracking each btree node lock individually isn't really necessary
   since we have our own cycle detector for deadlock avoidance and
   centralized tracking of btree node locks, so we now have a single
   lockdep_map in btree_trans for "any btree nodes are locked".

 - Some more small incremental work towards online check_allocations

 - Lots more debugging improvements

 - Fixes, including:
    - undefined behaviour fixes, originally noted as breaking userspace
      LTO builds
    - fix a spurious warning in fsck_err, reported by Marcin
    - fix an integer overflow on trans->nr_updates, also reported by
      Marcin; this broke during deletion of highly fragmented indirect
      extents

* tag 'bcachefs-2024-07-18.2' of https://evilpiepirate.org/git/bcachefs: (120 commits)
  lockdep: Add comments for lockdep_set_no{validate,track}_class()
  bcachefs: Fix integer overflow on trans->nr_updates
  bcachefs: silence silly kdoc warning
  bcachefs: Fix fsck warning about btree_trans not passed to fsck error
  bcachefs: Add an error message for insufficient rw journal devs
  bcachefs: varint: Avoid left-shift of a negative value
  bcachefs: darray: Don't pass NULL to memcpy()
  bcachefs: Kill bch2_assert_btree_nodes_not_locked()
  bcachefs: Rename BCH_WRITE_DONE -> BCH_WRITE_SUBMITTED
  bcachefs: __bch2_read(): call trans_begin() on every loop iter
  bcachefs: show none if label is not set
  bcachefs: drop packed, aligned from bkey_inode_buf
  bcachefs: btree node scan: fall back to comparing by journal seq
  bcachefs: Add lockdep support for btree node locks
  lockdep: lockdep_set_notrack_class()
  bcachefs: Improve copygc_wait_to_text()
  bcachefs: Convert clock code to u64s
  bcachefs: Improve startup message
  bcachefs: Self healing on read IO error
  bcachefs: Make read_only a mount option again, but hidden
  ...
This commit is contained in:
Linus Torvalds
2024-07-18 17:27:43 -07:00
115 changed files with 3915 additions and 2690 deletions

View File

@@ -3720,7 +3720,6 @@ F: drivers/md/bcache/
BCACHEFS
M: Kent Overstreet <kent.overstreet@linux.dev>
R: Brian Foster <bfoster@redhat.com>
L: linux-bcachefs@vger.kernel.org
S: Supported
C: irc://irc.oftc.net/bcache

View File

@@ -29,10 +29,11 @@ bcachefs-y := \
clock.o \
compress.o \
darray.o \
data_update.o \
debug.o \
dirent.o \
disk_accounting.o \
disk_groups.o \
data_update.o \
ec.o \
errcode.o \
error.o \

View File

@@ -346,7 +346,6 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
@@ -354,6 +353,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
int ret;
mutex_lock(&inode->ei_update_lock);
struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
acl = _acl;
@@ -394,8 +394,8 @@ btree_err:
set_cached_acl(&inode->v, type, acl);
err:
mutex_unlock(&inode->ei_update_lock);
bch2_trans_put(trans);
mutex_unlock(&inode->ei_update_lock);
return ret;
}

View File

@@ -15,6 +15,7 @@
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
@@ -268,27 +269,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
i == READ ? "read" : "write",
a.v->io_time[i], LRU_TIME_MAX);
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
offsetof(struct bch_alloc_v4, stripe_sectors)
? a.v->stripe_sectors
: 0;
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
bkey_fsck_err_on(stripe_sectors ||
a.v->dirty_sectors ||
a.v->cached_sectors ||
a.v->stripe,
c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
"empty data type free but have data %u.%u.%u %u",
stripe_sectors,
a.v->dirty_sectors,
a.v->cached_sectors,
a.v->stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
bkey_fsck_err_on(!a.v->dirty_sectors &&
!stripe_sectors,
c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
bch2_bucket_sectors_dirty(*a.v) ||
a.v->dirty_sectors ||
stripe_sectors ||
a.v->stripe,
c, err, alloc_key_cached_inconsistency,
"data type inconsistency");
@@ -319,6 +334,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
a->fragmentation_lru = swab64(a->fragmentation_lru);
a->stripe_sectors = swab32(a->stripe_sectors);
bps = alloc_v4_backpointers(a);
for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@@ -343,6 +359,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
prt_printf(out, "stripe %u\n", a->stripe);
prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
@@ -460,7 +477,8 @@ err:
}
__flatten
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
@@ -468,7 +486,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@@ -579,8 +597,6 @@ int bch2_alloc_read(struct bch_fs *c)
struct bch_dev *ca = NULL;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_prefetch, k, ({
@@ -629,7 +645,6 @@ int bch2_alloc_read(struct bch_fs *c)
bch2_dev_put(ca);
bch2_trans_put(trans);
up_read(&c->gc_lock);
bch_err_fn(c, ret);
return ret;
@@ -744,6 +759,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
enum bch_data_type data_type,
s64 delta_buckets,
s64 delta_sectors,
s64 delta_fragmented, unsigned flags)
{
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_dev_data_type,
.dev_data_type.dev = ca->dev_idx,
.dev_data_type.data_type = data_type,
};
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
}
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
const struct bch_alloc_v4 *old,
const struct bch_alloc_v4 *new,
unsigned flags)
{
s64 old_sectors = bch2_bucket_sectors(*old);
s64 new_sectors = bch2_bucket_sectors(*new);
if (old->data_type != new->data_type) {
int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
if (ret)
return ret;
} else if (old_sectors != new_sectors) {
int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
0,
new_sectors - old_sectors,
bch2_bucket_sectors_fragmented(ca, *new) -
bch2_bucket_sectors_fragmented(ca, *old), flags);
if (ret)
return ret;
}
s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
if (old_unstriped != new_unstriped) {
int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
!!new_unstriped - !!old_unstriped,
new_unstriped - old_unstriped,
0,
flags);
if (ret)
return ret;
}
return 0;
}
int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
@@ -759,10 +829,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
if (flags & BTREE_TRIGGER_transactional) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
alloc_data_type_set(new_a, new_a->data_type);
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
@@ -819,22 +888,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
goto err;
}
/*
* need to know if we're getting called from the invalidate path or
* not:
*/
if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
old_a->cached_sectors) {
ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
-((s64) old_a->cached_sectors));
ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
-((s64) old_a->cached_sectors),
flags & BTREE_TRIGGER_gc);
if (ret)
goto err;
}
ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
if (ret)
goto err;
}
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
@@ -863,26 +931,22 @@ int bch2_trigger_alloc(struct btree_trans *trans,
c->journal.flushed_seq_ondisk,
new.k->p.inode, new.k->p.offset,
bucket_journal_seq);
if (ret) {
bch2_fs_fatal_error(c,
"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
if (bch2_fs_fatal_err_on(ret, c,
"setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
goto err;
}
}
percpu_down_read(&c->mark_lock);
if (new_a->gen != old_a->gen) {
rcu_read_lock();
u8 *gen = bucket_gen(ca, new.k->p.offset);
if (unlikely(!gen)) {
percpu_up_read(&c->mark_lock);
rcu_read_unlock();
goto invalid_bucket;
}
*gen = new_a->gen;
rcu_read_unlock();
}
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
percpu_up_read(&c->mark_lock);
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
@@ -905,31 +969,16 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bch2_gc_gens_async(c);
}
if ((flags & BTREE_TRIGGER_gc) &&
(flags & BTREE_TRIGGER_bucket_invalidate)) {
struct bch_alloc_v4 new_a_convert;
const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
percpu_down_read(&c->mark_lock);
if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
rcu_read_lock();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
if (unlikely(!g)) {
percpu_up_read(&c->mark_lock);
rcu_read_unlock();
goto invalid_bucket;
}
g->gen_valid = 1;
bucket_lock(g);
g->gen_valid = 1;
g->gen = new_a->gen;
g->data_type = new_a->data_type;
g->stripe = new_a->stripe;
g->stripe_redundancy = new_a->stripe_redundancy;
g->dirty_sectors = new_a->dirty_sectors;
g->cached_sectors = new_a->cached_sectors;
bucket_unlock(g);
percpu_up_read(&c->mark_lock);
g->gen = new_a->gen;
rcu_read_unlock();
}
err:
printbuf_exit(&buf);
@@ -1063,7 +1112,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
if (fsck_err_on(!ca,
c, alloc_key_to_missing_dev_bucket,
trans, alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
ret = bch2_btree_delete_at(trans, alloc_iter, 0);
@@ -1083,7 +1132,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(k.k->type != discard_key_type,
c, need_discard_key_wrong,
trans, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
@@ -1113,7 +1162,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(k.k->type != freespace_key_type,
c, freespace_key_wrong,
trans, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
@@ -1144,7 +1193,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
c, bucket_gens_key_wrong,
trans, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
@@ -1185,7 +1234,6 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
struct bpos *end,
struct btree_iter *freespace_iter)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
@@ -1203,7 +1251,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
if (fsck_err_on(k.k->type != KEY_TYPE_set,
c, freespace_hole_missing,
trans, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
@@ -1239,7 +1287,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
struct bpos *end,
struct btree_iter *bucket_gens_iter)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
unsigned i, gens_offset, gens_end_offset;
@@ -1263,7 +1310,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
bkey_reassemble(&g.k_i, k);
for (i = gens_offset; i < gens_end_offset; i++) {
if (fsck_err_on(g.v.gens[i], c,
if (fsck_err_on(g.v.gens[i], trans,
bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
@@ -1321,8 +1368,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
if (ret)
return ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
need_discard_freespace_key_to_invalid_dev_bucket,
if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
trans, need_discard_freespace_key_to_invalid_dev_bucket,
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
goto delete;
@@ -1331,8 +1378,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
if (fsck_err_on(a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a)), c,
need_discard_freespace_key_bad,
genbits != alloc_freespace_genbits(*a)),
trans, need_discard_freespace_key_bad,
"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
bch2_btree_id_str(iter->btree_id),
@@ -1379,7 +1426,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
if (!ca) {
if (fsck_err(c, bucket_gens_to_invalid_dev,
if (fsck_err(trans, bucket_gens_to_invalid_dev,
"bucket_gens key for invalid device:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1387,8 +1434,8 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
bucket_gens_to_invalid_buckets,
start >= ca->mi.nbuckets,
trans, bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1396,16 +1443,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
for (b = start; b < ca->mi.first_bucket; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
bucket_gens_nonzero_for_invalid_buckets,
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
}
for (b = ca->mi.nbuckets; b < end; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
bucket_gens_nonzero_for_invalid_buckets,
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@@ -1585,8 +1632,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (a->data_type != BCH_DATA_cached)
return 0;
if (fsck_err_on(!a->io_time[READ], c,
alloc_key_cached_but_read_time_zero,
if (fsck_err_on(!a->io_time[READ],
trans, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
@@ -1960,7 +2007,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
a = bch2_trans_start_alloc_update(trans, bucket);
a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1981,6 +2028,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.gen++;
a->v.data_type = 0;
a->v.dirty_sectors = 0;
a->v.stripe_sectors = 0;
a->v.cached_sectors = 0;
a->v.io_time[READ] = bch2_current_io_time(c, READ);
a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
@@ -2336,6 +2384,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity);
c->reserved = reserved_sectors;
c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max;

View File

@@ -41,6 +41,7 @@ static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
{
dst->gen = src.gen;
dst->data_type = src.data_type;
dst->stripe_sectors = src.stripe_sectors;
dst->dirty_sectors = src.dirty_sectors;
dst->cached_sectors = src.cached_sectors;
dst->stripe = src.stripe;
@@ -50,6 +51,7 @@ static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket s
{
dst->gen = src.gen;
dst->data_type = src.data_type;
dst->stripe_sectors = src.stripe_sectors;
dst->dirty_sectors = src.dirty_sectors;
dst->cached_sectors = src.cached_sectors;
dst->stripe = src.stripe;
@@ -80,30 +82,49 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
bucket_data_type(bucket) != bucket_data_type(ptr);
}
static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
return a.dirty_sectors + a.cached_sectors;
return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
}
static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
{
return a.dirty_sectors;
return a.stripe_sectors + a.dirty_sectors;
}
static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached
? a.cached_sectors
: bch2_bucket_sectors_dirty(a);
}
static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
struct bch_alloc_v4 a)
{
int d = bch2_bucket_sectors_dirty(a);
int d = bch2_bucket_sectors(a);
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
{
int d = a.stripe_sectors + a.dirty_sectors;
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
}
static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
enum bch_data_type data_type)
{
if (a.stripe)
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
if (a.dirty_sectors)
if (bch2_bucket_sectors_dirty(a))
return data_type;
if (a.cached_sectors)
return BCH_DATA_cached;
@@ -185,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
enum btree_iter_update_trigger_flags);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -270,6 +292,9 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
const struct bch_alloc_v4 *,
const struct bch_alloc_v4 *, unsigned);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);

View File

@@ -70,6 +70,8 @@ struct bch_alloc_v4 {
__u32 stripe;
__u32 nr_external_backpointers;
__u64 fragmentation_lru;
__u32 stripe_sectors;
__u32 pad;
} __packed __aligned(8);
#define BCH_ALLOC_V4_U64s_V0 6

View File

@@ -1589,7 +1589,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
}
}
static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = ob_dev(c, ob);
unsigned data_type = ob->data_type;
@@ -1706,15 +1706,15 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 24);
percpu_down_read(&c->mark_lock);
prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data));
prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
percpu_up_read(&c->mark_lock);
prt_printf(out, "capacity\t%llu\n", c->capacity);
prt_printf(out, "reserved\t%llu\n", c->reserved);
prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
prt_newline(out);
prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");

View File

@@ -222,6 +222,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);

View File

@@ -395,7 +395,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
struct bpos bucket;
if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
if (fsck_err(c, backpointer_to_missing_device,
if (fsck_err(trans, backpointer_to_missing_device,
"backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, bp_iter, 0);
@@ -407,8 +407,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
if (ret)
goto out;
if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
backpointer_to_missing_alloc,
if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4,
trans, backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -505,7 +505,7 @@ found:
struct nonce nonce = extent_nonce(extent.k->version, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
c, dup_backpointer_to_bad_csum_extent,
trans, dup_backpointer_to_bad_csum_extent,
"%s", buf.buf))
ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
fsck_err:
@@ -647,7 +647,7 @@ missing:
prt_printf(&buf, "\n want: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
goto out;
@@ -762,12 +762,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
for (enum btree_id btree = start.btree;
btree < BTREE_ID_NR && !ret;
btree++) {
unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
struct btree_iter iter;
struct btree *b;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
if (!(BIT_ULL(btree) & btree_leaf_mask) &&
!(BIT_ULL(btree) & btree_interior_mask))
continue;
bch2_trans_begin(trans);
@@ -908,7 +908,7 @@ static int check_one_backpointer(struct btree_trans *trans,
if (ret)
goto out;
if (fsck_err(c, backpointer_to_missing_ptr,
if (fsck_err(trans, backpointer_to_missing_ptr,
"backpointer for missing %s\n %s",
bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
@@ -951,8 +951,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
while (1) {
ret = bch2_get_btree_in_memory_pos(trans,
(1U << BTREE_ID_extents)|
(1U << BTREE_ID_reflink),
BIT_ULL(BTREE_ID_extents)|
BIT_ULL(BTREE_ID_reflink),
~0,
start, &end);
if (ret)

View File

@@ -205,6 +205,7 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
#include "nocow_locking_types.h"
@@ -266,6 +267,8 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
void bch2_print_str(struct bch_fs *, const char *);
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
@@ -535,8 +538,8 @@ struct bch_dev {
/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for dev_ptr_stale():
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
@@ -544,9 +547,7 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage *usage_base;
struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_dev_usage __percpu *usage_gc;
struct bch_dev_usage __percpu *usage;
/* Allocator: */
u64 new_fs_bucket_idx;
@@ -592,6 +593,8 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \
x(btree_running) \
x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
x(was_rw) \
@@ -670,8 +673,6 @@ struct btree_trans_buf {
struct btree_trans *trans;
};
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
@@ -741,15 +742,14 @@ struct bch_fs {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
struct bch_accounting_mem accounting;
struct bch_replicas_cpu replicas;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
mempool_t replicas_delta_pool;
struct journal_entry_res btree_root_journal_res;
struct journal_entry_res replicas_journal_res;
struct journal_entry_res clock_journal_res;
struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
@@ -872,6 +872,7 @@ struct bch_fs {
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
u64 reserved; /* sectors */
/*
* When capacity _decreases_ (due to a disk being removed), we
@@ -889,15 +890,9 @@ struct bch_fs {
struct percpu_rw_semaphore mark_lock;
seqcount_t usage_lock;
struct bch_fs_usage *usage_base;
struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_fs_usage __percpu *usage_gc;
struct bch_fs_usage_base __percpu *usage;
u64 __percpu *online_reserved;
/* single element mempool: */
struct mutex usage_scratch_lock;
struct bch_fs_usage_online *usage_scratch;
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */

View File

@@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k)
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33)
x(logged_op_finsert, 33) \
x(accounting, 34)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -467,18 +468,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
/* LRU btree: */
struct bch_lru {
struct bch_val v;
__le64 idx;
} __packed __aligned(8);
#define LRU_ID_STRIPES (1U << 16)
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -505,6 +494,9 @@ struct bch_sb_field {
x(downgrade, 14)
#include "alloc_background_format.h"
#include "dirent_format.h"
#include "disk_accounting_format.h"
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "dirent_format.h"
@@ -512,6 +504,7 @@ struct bch_sb_field {
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
#include "lru_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
@@ -602,48 +595,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
x(cached, 5) \
x(parity, 6) \
x(stripe, 7) \
x(need_gc_gens, 8) \
x(need_discard, 9)
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
BCH_DATA_TYPES()
#undef x
BCH_DATA_NR
};
static inline bool data_type_is_empty(enum bch_data_type type)
{
switch (type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
return true;
default:
return false;
}
}
static inline bool data_type_is_hidden(enum bch_data_type type)
{
switch (type) {
case BCH_DATA_sb:
case BCH_DATA_journal:
return true;
default:
return false;
}
}
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -722,7 +673,9 @@ struct bch_sb_field_ext {
x(member_seq, BCH_VERSION(1, 4)) \
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
x(btree_subvolume_children, BCH_VERSION(1, 6)) \
x(mi_btree_bitmap, BCH_VERSION(1, 7))
x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
x(disk_accounting_v2, BCH_VERSION(1, 9))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1174,7 +1127,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
switch (e->type) {
case BCH_JSET_ENTRY_btree_keys:
case BCH_JSET_ENTRY_btree_root:
case BCH_JSET_ENTRY_overwrite:
case BCH_JSET_ENTRY_write_buffer_keys:
return true;
}
@@ -1375,7 +1327,9 @@ enum btree_id_flags {
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set))
BIT_ULL(KEY_TYPE_set)) \
x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,

View File

@@ -5,6 +5,7 @@
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
#include "bkey_types.h"
/*
* Flags common to multiple ioctls:
@@ -85,6 +86,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -251,12 +253,18 @@ struct bch_replicas_usage {
struct bch_replicas_entry_v1 r;
} __packed;
static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
{
return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
}
static inline struct bch_replicas_usage *
replicas_usage_next(struct bch_replicas_usage *u)
{
return (void *) u + replicas_entry_bytes(&u->r) + 8;
return (void *) u + replicas_usage_bytes(u);
}
/* Obsolete */
/*
* BCH_IOCTL_FS_USAGE: query filesystem disk space usage
*
@@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage {
struct bch_replicas_usage replicas[];
};
/* Obsolete */
/*
* BCH_IOCTL_DEV_USAGE: query device disk space usage
*
@@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage {
} d[10];
};
/* Obsolete */
struct bch_ioctl_dev_usage_v2 {
__u64 dev;
__u32 flags;
@@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online {
__u64 opts; /* string */
};
/*
* BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @replica_entries_bytes - size, in bytes, allocated for replica usage entries
*
* On success, @replica_entries_bytes will be changed to indicate the number of
* bytes actually used.
*
* Returns -ERANGE if @replica_entries_bytes was too small
*/
struct bch_ioctl_query_accounting {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u32 accounting_u64s; /* input parameter */
__u32 accounting_types_mask; /* input parameter */
struct bkey_i_accounting accounting[];
};
#endif /* _BCACHEFS_IOCTL_H */

View File

@@ -7,6 +7,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"

View File

@@ -602,8 +602,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current)
old = NULL;
if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
goto success;
if (!cl) {
@@ -614,8 +614,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current) {
old = NULL;
if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
/* We raced */
closure_wake_up(&bc->alloc_wait);
goto success;
@@ -1257,6 +1257,14 @@ const char *bch2_btree_id_str(enum btree_id btree)
return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}
void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
if (btree < BTREE_ID_NR)
prt_str(out, __bch2_btree_ids[btree]);
else
prt_printf(out, "(unknown btree %u)", btree);
}
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
prt_printf(out, "%s level %u/%u\n ",

View File

@@ -132,6 +132,8 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
}
const char *bch2_btree_id_str(enum btree_id);
void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);

File diff suppressed because it is too large Load Diff

View File

@@ -47,17 +47,10 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
};
}
/*
* GC position of the pointers within a btree node: note, _not_ for &b->key
* itself, that lives in the parent node:
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}
static inline int gc_btree_order(enum btree_id btree)
{
if (btree == BTREE_ID_alloc)
return -2;
if (btree == BTREE_ID_stripes)
return -1;
return btree;
@@ -65,11 +58,11 @@ static inline int gc_btree_order(enum btree_id btree)
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
return cmp_int(l.phase, r.phase) ?:
cmp_int(gc_btree_order(l.btree),
gc_btree_order(r.btree)) ?:
-cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
return cmp_int(l.phase, r.phase) ?:
cmp_int(gc_btree_order(l.btree),
gc_btree_order(r.btree)) ?:
cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
@@ -85,6 +78,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
void bch2_fs_gc_init(struct bch_fs *);

View File

@@ -4,11 +4,16 @@
#include <linux/generic-radix-tree.h>
#define GC_PHASES() \
x(not_running) \
x(start) \
x(sb) \
x(btree)
enum gc_phase {
GC_PHASE_not_running,
GC_PHASE_start,
GC_PHASE_sb,
GC_PHASE_btree,
#define x(n) GC_PHASE_##n,
GC_PHASES()
#undef x
};
struct gc_pos {

View File

@@ -46,8 +46,6 @@ void bch2_btree_node_io_unlock(struct btree *b)
void bch2_btree_node_io_lock(struct btree *b)
{
bch2_assert_btree_nodes_not_locked();
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -66,16 +64,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b)
void bch2_btree_node_wait_on_read(struct btree *b)
{
bch2_assert_btree_nodes_not_locked();
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
bch2_assert_btree_nodes_not_locked();
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -534,7 +528,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
printbuf_indent_add(out, 2);
prt_printf(out, "\nnode offset %u/%u",
b->written, btree_ptr_sectors_written(&b->key));
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
if (k)
@@ -585,7 +579,7 @@ static int __btree_err(int ret,
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret = !silent
? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
: -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
@@ -689,6 +683,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
@@ -732,11 +727,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
if (!write &&
btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node")) {
"bset past end of btree node (offset %u len %u but written %zu)",
offset, sectors, ptr_written ?: btree_sectors(c))) {
i->u64s = 0;
ret = 0;
goto out;
@@ -1002,7 +999,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
u64 start_time = local_clock();
@@ -1178,6 +1176,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_add(iter,
vstruct_idx(i, 0),
vstruct_last(i));
max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
}
if (ptr_written) {
@@ -1214,6 +1214,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
BUG_ON(b->nr.live_u64s != u64s);
@@ -1796,15 +1797,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
unsigned long old, new;
old = READ_ONCE(b->will_make_reachable);
do {
old = new = v;
new = old;
if (!(old & 1))
break;
new &= ~1UL;
} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
} while (!try_cmpxchg(&b->will_make_reachable, &old, new));
if (old & 1)
closure_put(&((struct btree_update *) new)->cl);
@@ -1815,14 +1817,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new, v;
unsigned long old, new;
unsigned type = 0;
bch2_btree_complete_write(c, b, w);
v = READ_ONCE(b->flags);
old = READ_ONCE(b->flags);
do {
old = new = v;
new = old;
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
@@ -1842,7 +1844,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
new &= ~(1U << BTREE_NODE_write_in_flight);
new &= ~(1U << BTREE_NODE_write_in_flight_inner);
}
} while ((v = cmpxchg(&b->flags, old, new)) != old);
} while (!try_cmpxchg(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
@@ -2014,8 +2016,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
* dirty bit requires a write lock, we can't race with other threads
* redirtying it:
*/
old = READ_ONCE(b->flags);
do {
old = new = READ_ONCE(b->flags);
new = old;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
@@ -2046,7 +2049,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
} while (!try_cmpxchg_acquire(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_need_write))
return;
@@ -2133,7 +2136,7 @@ do_write:
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);

View File

@@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b
atomic_dec(&c->btree_cache.dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
{
return k->k.type == KEY_TYPE_btree_ptr_v2
? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
return k.k->type == KEY_TYPE_btree_ptr_v2
? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
: 0;
}

Some files were not shown because too many files have changed in this diff Show More