From 3de8fd4a33c9caf5ca798373800a37e4f206d8ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2024 14:08:38 -0400 Subject: [PATCH 001/120] bcachefs: Print allocator stuck on timeout in fallocate path same as in io_write.c, if we're waiting on the allocator for an excessive amount of time, print what's going on Signed-off-by: Kent Overstreet --- fs/bcachefs/io_misc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 4583c9386e8c..2cf6297756f8 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -126,7 +126,11 @@ err_noprint: if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock_long(trans); - closure_sync(&cl); + + if (closure_sync_timeout(&cl, HZ * 10)) { + bch2_print_allocator_stuck(c); + closure_sync(&cl); + } } return ret; From 52fd0f96206831ed4b75d2060f2e2a96bf2624b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2024 18:03:11 -0400 Subject: [PATCH 002/120] bcachefs: btree ids are 64 bit bitmasks Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 6d8b1bc90be0..018b19f7c346 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -762,12 +762,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, for (enum btree_id btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 
0 : 1; struct btree_iter iter; struct btree *b; - if (!((1U << btree) & btree_leaf_mask) && - !((1U << btree) & btree_interior_mask)) + if (!(BIT_ULL(btree) & btree_leaf_mask) && + !(BIT_ULL(btree) & btree_interior_mask)) continue; bch2_trans_begin(trans); @@ -951,8 +951,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) while (1) { ret = bch2_get_btree_in_memory_pos(trans, - (1U << BTREE_ID_extents)| - (1U << BTREE_ID_reflink), + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink), ~0, start, &end); if (ret) From 8a3c8303e2f1834a79e00305d94ef562946c2ef4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 14:37:46 -0400 Subject: [PATCH 003/120] bcachefs: uninline fallocate functions better stack traces Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ef20b64033e0..ec7901265da5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -508,7 +508,7 @@ static int inode_update_times_fn(struct btree_trans *trans, return 0; } -static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 end = offset + len; @@ -547,7 +547,7 @@ err: return ret; } -static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, +static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { @@ -583,7 +583,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, return ret; } -static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, +static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -704,7 +704,7 @@ bkey_err: return ret; } -static long bchfs_fallocate(struct bch_inode_info 
*inode, int mode, +static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; From 26a170aa6182209723f7654eaeddcab7b58a9d83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2024 16:00:46 -0400 Subject: [PATCH 004/120] bcachefs: add capacity, reserved to fs_alloc_debug_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + fs/bcachefs/alloc_foreground.c | 2 ++ fs/bcachefs/bcachefs.h | 1 + 3 files changed, 4 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 658f11aebda1..9c12ce5f4da3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2336,6 +2336,7 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); + c->reserved = reserved_sectors; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 27d97c22ae27..ae59536cac08 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1706,6 +1706,8 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 24); + prt_printf(out, "capacity\t%llu\n", c->capacity); + prt_printf(out, "reserved\t%llu\n", c->reserved); percpu_down_read(&c->mark_lock); prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1106fec6e155..4d93889f3bae 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -872,6 +872,7 @@ struct bch_fs { struct bch_devs_mask rw_devs[BCH_DATA_NR]; u64 capacity; /* sectors */ + u64 reserved; /* sectors */ /* * When capacity _decreases_ (due to a disk being removed), 
we From a1e7a97f22bf688a1234e698d342726e70f8a25b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 21:40:00 -0400 Subject: [PATCH 005/120] bcachefs: sysfs internal/trigger_journal_writes another debugging knob - trigger the journal to do ready journal writes Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 93ca74d108b1..3f54203f0499 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -140,6 +140,7 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(trigger_journal_flush); +write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); rw_attribute(gc_gens_pos); @@ -497,6 +498,9 @@ STORE(bch2_fs) bch2_journal_meta(&c->journal); } + if (attr == &sysfs_trigger_journal_writes) + bch2_journal_do_writes(&c->journal); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -615,6 +619,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_trigger_journal_flush, + &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, From d06a26d24db090d8be0ea4c9bfa1457e334940b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2024 16:11:45 -0400 Subject: [PATCH 006/120] bcachefs: sysfs trigger_freelist_wakeup another debugging knob Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3f54203f0499..97e9ef4acc49 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -143,6 +143,7 @@ write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); 
write_attribute(trigger_btree_key_cache_shrink); +write_attribute(trigger_freelist_wakeup); rw_attribute(gc_gens_pos); read_attribute(uuid); @@ -501,6 +502,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_journal_writes) bch2_journal_do_writes(&c->journal); + if (attr == &sysfs_trigger_freelist_wakeup) + closure_wake_up(&c->freelist_wait); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -622,6 +626,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, + &sysfs_trigger_freelist_wakeup, &sysfs_gc_gens_pos, From cdda2126ab0dfeafc52c725f808baed7ea26d0b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2024 16:30:41 -0400 Subject: [PATCH 007/120] bcachefs: bch2_btree_reserve_cache_to_text() Add a pretty printer so the btree reserve cache can be seen in sysfs; as it pins open_buckets we need it for tracking down open_buckets issues. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/alloc_foreground.h | 1 + fs/bcachefs/btree_update_interior.c | 22 ++++++++++++++++++++++ fs/bcachefs/btree_update_interior.h | 2 ++ fs/bcachefs/sysfs.c | 6 ++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ae59536cac08..991e07a79064 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1589,7 +1589,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) } } -static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); unsigned data_type = ob->data_type; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index a42c9730d32a..6da9e7e29026 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -222,6 +222,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); +void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 60b8544cea48..d5f7992969d1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2647,6 +2647,28 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, return end; } +static void bch2_btree_alloc_to_text(struct printbuf *out, + struct bch_fs *c, + struct btree_alloc *a) +{ + printbuf_indent_add(out, 2); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k)); + prt_newline(out); + + struct open_bucket *ob; + unsigned i; + 
open_bucket_for_each(c, &a->ob, ob, i) + bch2_open_bucket_to_text(out, c, ob); + + printbuf_indent_sub(out, 2); +} + +void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++) + bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]); +} + void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { if (c->btree_node_rewrite_worker) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index b5b76ce01cfc..02c6ecada97c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -335,6 +335,8 @@ struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); +void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_btree_interior_update_exit(struct bch_fs *); void bch2_fs_btree_interior_update_init_early(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 97e9ef4acc49..91f1516ada8f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -17,6 +17,7 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -170,6 +171,7 @@ read_attribute(compression_stats); read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); +read_attribute(btree_reserve_cache); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); @@ -390,6 +392,9 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + if (attr == &sysfs_btree_reserve_cache) + bch2_btree_reserve_cache_to_text(out, c); + if (attr == &sysfs_stripes_heap) bch2_stripes_heap_to_text(out, c); @@ 
-607,6 +612,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_debug, &sysfs_btree_cache, &sysfs_btree_key_cache, + &sysfs_btree_reserve_cache, &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, From f369de82679f4f71d4f5e8a3149a80ebc1bfc987 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Jul 2024 17:10:29 -0400 Subject: [PATCH 008/120] bcachefs: fix ei_update_lock lock ordering ei_update_lock is largely vestigal and will probably be removed, but we're not ready for that just yet. this fixes some lockdep splats with the new lockdep support for btree node locks; they're harmless, since we were taking ei_update_lock before actually locking any btree nodes, but "any btree nodes locked" are now tracked at the btree_trans level. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 4 ++-- fs/bcachefs/fs.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 250d6c6d3a3a..a7b425d3c8a0 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -346,7 +346,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -354,6 +353,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); acl = _acl; @@ -394,8 +394,8 @@ btree_err: set_cached_acl(&inode->v, type, acl); err: - mutex_unlock(&inode->ei_update_lock); bch2_trans_put(trans); + mutex_unlock(&inode->ei_update_lock); return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index fa1fee05cf8f..78f2d80b8bb7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -517,11 +517,11 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct 
dentry *dentry) { - struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, 0, bch2_link_trans(trans, @@ -568,11 +568,12 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + struct btree_trans *trans = bch2_trans_get(c); + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, @@ -595,8 +596,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_put(trans); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } @@ -681,14 +682,14 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - trans = bch2_trans_get(c); - bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir, src_inode, dst_inode); + trans = bch2_trans_get(c); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); if (ret) From 12e7ff1a1ee5dda5880116db451a3f3d23a79267 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 18 Apr 2024 16:31:03 +0800 Subject: [PATCH 009/120] bcachefs: Fix missing spaces in journal_entry_dev_usage_to_text Fixed missing spaces displayed in journal_entry_dev_usage_to_text while adjusting the display format to improve readability. before: ``` # bcachefs list_journal -a -t alloc:1:0 /dev/sdb ... 
dev_usage: dev=0free: buckets=233180 sectors=0 fragmented=0sb: buckets=13 sectors=6152 fragmented=504journal: buckets=1847 sectors=945664 fragmented=0btree: buckets=20 sectors=10240 fragmented=0user: buckets=1419 sectors=726513 fragmented=15cached: buckets=0 sectors=0 fragmented=0parity: buckets=0 sectors=0 fragmented=0stripe: buckets=0 sectors=0 fragmented=0need_gc_gens: buckets=0 sectors=0 fragmented=0need_discard: buckets=1 sectors=0 fragmented=0 ``` after: ``` # bcachefs list_journal -a -t alloc:1:0 /dev/sdb ... dev_usage: dev=0 free: buckets=233180 sectors=0 fragmented=0 sb: buckets=13 sectors=6152 fragmented=504 journal: buckets=1847 sectors=945664 fragmented=0 btree: buckets=20 sectors=10240 fragmented=0 user: buckets=1419 sectors=726513 fragmented=15 cached: buckets=0 sectors=0 fragmented=0 parity: buckets=0 sectors=0 fragmented=0 stripe: buckets=0 sectors=0 fragmented=0 need_gc_gens: buckets=0 sectors=0 fragmented=0 need_discard: buckets=1 sectors=0 fragmented=0 ``` Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2326e2cb9cd2..63608ee44f08 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -724,13 +724,16 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + printbuf_indent_add(out, 2); for (i = 0; i < nr_types; i++) { + prt_newline(out); bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } + printbuf_indent_sub(out, 2); } static int journal_entry_log_validate(struct bch_fs *c, From da6fa380d369a7e4d61f7a654efb48a018e3e563 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 18 Apr 2024 08:50:55 +0800 Subject: [PATCH 010/120] bcachefs: Align the display format of 
`btrees/inodes/keys` Before patch: ``` #cat btrees/inodes/keys u64s 17 type inode_v3 0:4096:U32_MAX len 0 ver 0: mode=40755 flags= (16300000) bi_size=0 ``` After patch: ``` #cat btrees/inodes/keys u64s 17 type inode_v3 0:4096:U32_MAX len 0 ver 0: mode=40755 flags=(16300000) bi_size=0 ``` Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index aafa79fa6351..4fd8c736744a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -534,12 +534,13 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { + prt_printf(out, "\n"); printbuf_indent_add(out, 2); prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)\n", inode->bi_flags); + prt_printf(out, "(%x)\n", inode->bi_flags); prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); prt_printf(out, "bi_size=%llu\n", inode->bi_size); From 630d565ddad52c1777ec3b48de04ecdc21004991 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 25 Apr 2024 17:16:59 +0800 Subject: [PATCH 011/120] bcachefs: Use filemap_read() to simplify the execution flow Using filemap_read() can reduce unnecessary code execution for non IOCB_DIRECT paths. 
Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 049b61bc9a5b..e246b1e05aa2 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -179,7 +179,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; size_t count = iov_iter_count(iter); - ssize_t ret; + ssize_t ret = 0; if (!count) return 0; /* skip atime */ @@ -205,7 +205,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += ret; } else { bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); + ret = filemap_read(iocb, iter, ret); bch2_pagecache_add_put(inode); } out: From 546b65378d0436f55cd6fcbbba890525b77d65f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 13:58:17 -0400 Subject: [PATCH 012/120] bcachefs: fix missing include fs-common.h needs dirent.h for enum bch_rename_mode Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index dde237859514..c934e807b380 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_FS_COMMON_H #define _BCACHEFS_FS_COMMON_H +#include "dirent.h" + struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) From e76a2b65b0565f55ea668ec46d54f6a00b8ea9fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jun 2024 16:35:42 -0400 Subject: [PATCH 013/120] bcachefs: add might_sleep() annotations for fsck_err() Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 2 ++ fs/bcachefs/error.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d95c40f1b6af..46cd9dcb48fc 100644 --- 
a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -210,6 +210,8 @@ int bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + might_sleep(); + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) flags |= fsck_flags_extra[err]; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 777711504c35..ba6a4f5257f4 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -136,7 +136,10 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, _err_type, ...) \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) +({ \ + might_sleep(); \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ +}) \ #define need_fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) From 68573b936d3fceda9cd5cce3a577e035d19ad426 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 23 May 2024 11:19:26 +0200 Subject: [PATCH 014/120] bcachefs: Use try_cmpxchg() family of functions instead of cmpxchg() Use try_cmpxchg() family of functions instead of cmpxchg (*ptr, old, new) == old. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg() implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. No functional change intended. 
Signed-off-by: Uros Bizjak Cc: Kent Overstreet Cc: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 ++++---- fs/bcachefs/btree_io.c | 20 +++++++++++--------- fs/bcachefs/btree_trans_commit.c | 8 ++++---- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/buckets.c | 17 ++++++++--------- fs/bcachefs/buckets.h | 4 ++-- fs/bcachefs/io_write.c | 7 +++---- fs/bcachefs/journal.c | 17 ++++++++--------- fs/bcachefs/journal.h | 8 ++++---- fs/bcachefs/journal_io.c | 9 +++++---- fs/bcachefs/two_state_shared_lock.h | 11 +++++------ 11 files changed, 58 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4f5e411771ba..6a9b248217a0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -602,8 +602,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure struct btree_cache *bc = &c->btree_cache; struct task_struct *old; - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) goto success; if (!cl) { @@ -614,8 +614,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure closure_wait(&bc->alloc_wait, cl); /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) { + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { /* We raced */ closure_wake_up(&bc->alloc_wait); goto success; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7bca15c604f5..0f3d01225878 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1796,15 +1796,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { - unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + unsigned long old, 
new; + old = READ_ONCE(b->will_make_reachable); do { - old = new = v; + new = old; if (!(old & 1)) break; new &= ~1UL; - } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); if (old & 1) closure_put(&((struct btree_update *) new)->cl); @@ -1815,14 +1816,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - unsigned long old, new, v; + unsigned long old, new; unsigned type = 0; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && @@ -1842,7 +1843,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); } - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); @@ -2014,8 +2015,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) * dirty bit requires a write lock, we can't race with other threads * redirtying it: */ + old = READ_ONCE(b->flags); do { - old = new = READ_ONCE(b->flags); + new = old; if (!(old & (1 << BTREE_NODE_dirty))) return; @@ -2046,7 +2048,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) new |= (1 << BTREE_NODE_write_in_flight_inner); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); - } while (cmpxchg_acquire(&b->flags, old, new) != old); + } while (!try_cmpxchg_acquire(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_need_write)) return; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c 
index 74e1ff225674..5e67dcb30f33 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -228,14 +228,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new, v; + unsigned long old, new; unsigned idx = w - b->writes; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if (!(old & (1 << BTREE_NODE_dirty)) || !!(old & (1 << BTREE_NODE_write_idx)) != idx || @@ -245,7 +245,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_journal_reclaim; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d5f7992969d1..c58ceb328645 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1356,7 +1356,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct bch_fs *c = as->c; struct bkey_packed *k; struct printbuf buf = PRINTBUF; - unsigned long old, new, v; + unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); @@ -1395,14 +1395,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_interior; new |= 1 << BTREE_NODE_need_write; - } 
while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); printbuf_exit(&buf); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 314ee3e0187f..5145066330ed 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -916,13 +916,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) */ s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { - u64 old, new, v = atomic64_read(&c->sectors_available); + u64 old, new; + old = atomic64_read(&c->sectors_available); do { - old = v; new = max_t(s64, 0, old - should_not_have_added); - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, new)); added -= should_not_have_added; warn = true; @@ -1523,7 +1523,7 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; - u64 old, v, get; + u64 old, get; s64 sectors_available; int ret; @@ -1534,17 +1534,16 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (sectors <= pcpu->sectors_available) goto out; - v = atomic64_read(&c->sectors_available); + old = atomic64_read(&c->sectors_available); do { - old = v; get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { preempt_enable(); goto recalculate; } - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, old - get)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, old - get)); pcpu->sectors_available += get; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8ad4be73860c..05a1c98754f2 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -432,13 +432,13 @@ static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reserv #ifdef __KERNEL__ u64 old, new; + old = this_cpu_read(c->pcpu->sectors_available); do { - old = 
this_cpu_read(c->pcpu->sectors_available); if (sectors > old) return __bch2_disk_reservation_add(c, res, sectors, flags); new = old - sectors; - } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 05e0cbef420b..c6197e6aa0b8 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) u64 io_latency = time_after64(now, submit_time) ? now - submit_time : 0; - u64 old, new, v = atomic64_read(latency); + u64 old, new; + old = atomic64_read(latency); do { - old = v; - /* * If the io latency was reasonably close to the current * latency, skip doing the update and atomic operation - most of @@ -84,7 +83,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) break; new = ewma_add(old, io_latency, 5); - } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + } while (!atomic64_try_cmpxchg(latency, &old, new)); bch2_congested_acct(ca, io_latency, now, rw); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 10b19791ec98..649e3a01608a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -230,7 +230,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && @@ -238,15 +237,16 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t lockdep_assert_held(&j->lock); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; new.cur_entry_offset = closed_val; if 
(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || old.cur_entry_offset == new.cur_entry_offset) return; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); if (!__journal_entry_is_open(old)) return; @@ -353,7 +353,6 @@ static int journal_entry_open(struct journal *j) ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; - u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -432,9 +431,9 @@ static int journal_entry_open(struct journal *j) */ j->cur_entry_u64s = u64s; - v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); @@ -446,8 +445,8 @@ static int journal_entry_open(struct journal *j) /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); if (nr_unwritten_journal_entries(j) == 1) mod_delayed_work(j->wq, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index bc6b9c39dcb4..377a3750406e 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -327,10 +327,10 @@ static inline int journal_res_get_fast(struct journal *j, unsigned flags) { union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; /* * Check if there is still room in the current journal @@ -356,8 +356,8 @@ static inline int journal_res_get_fast(struct journal *j, if (flags & JOURNAL_RES_GET_CHECK) return 1; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + 
&old.v, new.v)); res->ref = true; res->idx = old.idx; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63608ee44f08..3fa63cacfd94 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1588,7 +1588,7 @@ static CLOSURE_CALLBACK(journal_write_done) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq = le64_to_cpu(w->data->seq); + u64 seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1647,14 +1647,15 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; BUG_ON(journal_state_count(new, new.unwritten_idx)); BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); closure_wake_up(&w->wait); completed = true; diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h index 905801772002..7f647846b511 100644 --- a/fs/bcachefs/two_state_shared_lock.h +++ b/fs/bcachefs/two_state_shared_lock.h @@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) { long i = s ? 1 : -1; - long v = atomic_long_read(&lock->v), old; + long old; + old = atomic_long_read(&lock->v); do { - old = v; - - if (i > 0 ? v < 0 : v > 0) + if (i > 0 ? 
old < 0 : old > 0) return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); + } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); + return true; } From 9cc8eb3098b8e78c727bc136f8e97ce5178fcce8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 May 2024 09:27:09 -0400 Subject: [PATCH 015/120] bcachefs: Check for bsets past bch_btree_ptr_v2.sectors_written Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0f3d01225878..f523039f1be9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -689,6 +689,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); + unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -732,11 +733,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > btree_sectors(c), + if (!write && + btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, bset_past_end_of_btree_node, - "bset past end of btree node")) { + "bset past end of btree node (offset %u len %u but written %zu)", + offset, sectors, ptr_written ?: btree_sectors(c))) { i->u64s = 0; ret = 0; goto out; From 652bc7fabc28294e6a6798c4d37861db605bda8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2024 18:04:22 -0400 Subject: [PATCH 016/120] bcachefs: btree_ptr_sectors_written() now takes bkey_s_c this is for the userspace metadata dump tool Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 8 ++++---- fs/bcachefs/btree_io.h | 6 +++--- fs/bcachefs/btree_update_interior.c | 2 
+- fs/bcachefs/move.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f523039f1be9..95a141c12e1d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -534,7 +534,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, printbuf_indent_add(out, 2); prt_printf(out, "\nnode offset %u/%u", - b->written, btree_ptr_sectors_written(&b->key)); + b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); if (k) @@ -689,7 +689,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); - unsigned ptr_written = btree_ptr_sectors_written(&b->key); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -1005,7 +1005,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned ptr_written = btree_ptr_sectors_written(&b->key); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; u64 start_time = local_clock(); @@ -2138,7 +2138,7 @@ do_write: if (!b->written && b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 2b8b564fc560..63d76f5c6403 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, 
struct btree *b atomic_dec(&c->btree_cache.dirty); } -static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) { - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + return k.k->type == KEY_TYPE_btree_ptr_v2 + ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) : 0; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c58ceb328645..6d340f36aacf 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1359,7 +1359,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && - !btree_ptr_sectors_written(insert)); + !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index e714e3bd5bbb..7d3920e03742 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -780,7 +780,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; - unsigned sectors = btree_ptr_sectors_written(&b->key); + unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); From 03ec0927fa15e1af1268c8637f61b0ac98082907 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Sun, 26 May 2024 13:08:19 -0600 Subject: [PATCH 017/120] bcachefs: make offline fsck set read_only fs flag A subsequent change will remove "read_only" as a mount option in favor of the standard option "ro", meaning the userspace fsck command cannot pass it to the fsck ioctl. Instead, in offline fsck, set "read_only" kernel-side without trying to parse it as a mount option. 
For compatibility with versions of the "bcachefs fsck" command that try to pass the "read_only" mount opt, remove it from the mount options string prior to parsing when it is present. Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 6d82e1165adc..8c5b13cd3205 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -213,6 +213,18 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + char *ro, *rest; + + /* + * If passed a "read_only" mount option, remove it because it is + * no longer a valid mount option, and the filesystem will be + * set "read_only" regardless. + */ + ro = strstr(optstr, "read_only"); + if (ro) { + rest = ro + strlen("read_only"); + memmove(ro, rest, strlen(rest) + 1); + } ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, optstr); @@ -224,6 +236,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a } opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); /* We need request_key() to be called before we punt to kthread: */ opt_set(thr->opts, nostart, true); From babe30fe8db62b79f7e3df4144acc344b3672f60 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Sun, 26 May 2024 13:08:20 -0600 Subject: [PATCH 018/120] bcachefs: don't expose "read_only" as a mount option When "read_only" is exposed as a mount option, it is redundant with the standard option "ro" and gives users multiple ways to specify that a bcachefs filesystem should be mounted read-only. This presents the risk of having inconsistent options specified. This can be seen when remounting a read-only filesystem in read-write mode, using mount(8) from util-linux. 
Because mount(8) parses the existing mount options from `/proc/mounts` and applies them when remounting, it can end up applying both "read_only" and "rw": $ mount img -o ro /mnt $ strace mount -o remount,rw /mnt ... fsconfig(4, FSCONFIG_SET_FLAG, "read_only", NULL, 0) = 0 fsconfig(4, FSCONFIG_SET_FLAG, "rw", NULL, 0) = 0 ... Making "read_only" no longer a mount option means this edge case cannot occur. Fixes: 62719cf33c3a ("bcachefs: Fix nochanges/read_only interaction") Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b197ec90d4cb..c9da6267894b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -406,7 +406,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ From 3811f48aa3d6ab97b199bdf4bdacce7abe7cfdeb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 22:20:34 -0400 Subject: [PATCH 019/120] bcachefs: bch2_printbuf_strip_trailing_newline() Add a new helper to fix inode_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 ++ fs/bcachefs/printbuf.c | 14 ++++++++++++++ fs/bcachefs/printbuf.h | 1 + 3 files changed, 17 insertions(+) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4fd8c736744a..84d5385e1046 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -551,6 +551,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x + + bch2_printbuf_strip_trailing_newline(out); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 9f529e4c1b16..4cf5a2af1e6f 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -316,6 +316,20 @@ void 
bch2_prt_newline(struct printbuf *buf) buf->cur_tabstop = 0; } +void bch2_printbuf_strip_trailing_newline(struct printbuf *out) +{ + for (int p = out->pos - 1; p >= 0; --p) { + if (out->buf[p] == '\n') { + out->pos = p; + break; + } + if (out->buf[p] != ' ') + break; + } + + printbuf_nul_terminate_reserved(out); +} + static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9ecc56bc9635..1d570387b77f 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -115,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); +void bch2_printbuf_strip_trailing_newline(struct printbuf *); void bch2_prt_tab(struct printbuf *); void bch2_prt_tab_rjust(struct printbuf *); From 51fc436c806c638512ea745579dbc4098a60b2d5 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Sat, 25 May 2024 13:36:19 -0600 Subject: [PATCH 020/120] bcachefs: allow passing full device path for target options The output of mount options such as "metadata_target" in `/proc/mounts` uses the full path to the device. mount(8) from util-linux uses the output from `/proc/mounts` to pass existing mount options when performing a remount, so bcachefs should accept as input the same form that it prints as output. Without this change: $ mount -t bcachefs -o metadata_target=vdb /dev/vdb /mnt $ strace mount -o remount /mnt ... fsconfig(4, FSCONFIG_SET_STRING, "metadata_target", "/dev/vdb", 0) = -1 EINVAL (Invalid argument) ... 
Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index da735608d47c..bbc9e5a926bb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -2041,6 +2041,9 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { + if (!strncmp(name, "/dev/", strlen("/dev/"))) + name += strlen("/dev/"); + for_each_member_device(c, ca) if (!strcmp(name, ca->name)) return ca; From c13d526d9dc1aa0c4962b017c881c28c1e23ca26 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 18:11:37 -0400 Subject: [PATCH 021/120] bcachefs: check_key_has_inode() Consolidate duplicated checks for extents/dirents/xattrs - these keys should all have a corresponding inode of the correct type. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 203 ++++++++++++++++++--------------- fs/bcachefs/sb-errors_format.h | 4 +- 2 files changed, 113 insertions(+), 94 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 921bcdb3e5e4..aeb59da74e52 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -455,33 +455,44 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub return 0; } -static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked new_inode; + unsigned i_mode = S_IFREG; + u64 i_size = 0; + switch (btree) { + case BTREE_ID_extents: { + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + i_size = k.k->p.offset << 9; + break; + } + 
case BTREE_ID_dirents: + i_mode = S_IFDIR; + break; + case BTREE_ID_xattrs: + break; + default: + BUG(); + } + + struct bch_inode_unpacked new_inode; bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); - new_inode.bi_size = size; + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + new_inode.bi_size = i_size; new_inode.bi_inum = inum; return __bch2_fsck_write_inode(trans, &new_inode, snapshot); } -static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) -{ - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) - return ret; - - return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); -} - struct snapshots_seen { struct bpos pos; snapshot_id_list ids; @@ -1170,6 +1181,70 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } +static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) +{ + switch (btree) { + case BTREE_ID_extents: + return S_ISREG(mode) || S_ISLNK(mode); + case BTREE_ID_dirents: + return S_ISDIR(mode); + case BTREE_ID_xattrs: + return true; + default: + BUG(); + } +} + +static int check_key_has_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct inode_walker *inode, + struct inode_walker_entry *i, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = 
-BCH_ERR_transaction_restart_nested; + goto err; + } + + if (fsck_err_on(!i, c, key_in_missing_inode, + "key in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + c, key_in_wrong_inode_type, + "key for wrong inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); + goto out; +} + static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1476,43 +1551,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + ret = check_key_has_inode(trans, iter, inode, i, k); if (ret) goto err; if (k.k->type != KEY_TYPE_whiteout) { - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; - goto err; - } - - if (fsck_err_on(!i, c, extent_in_missing_inode, - "extent in missing inode:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; - - if (fsck_err_on(i && - !S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), - c, extent_in_non_reg_inode, - "extent in non regular inode mode %o:\n %s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto 
delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) @@ -1525,7 +1577,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, * didn't have one, iterate over all inodes: */ if (!i) - i = inode->inodes.data + inode->inodes.nr - 1; + i = &darray_last(inode->inodes); for (; inode->inodes.data && i >= inode->inodes.data; @@ -1574,9 +1626,6 @@ fsck_err: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; -delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - goto out; } /* @@ -2009,49 +2058,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; - if (dir->first_this_inode && dir->inodes.nr) - *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); - dir->first_this_inode = false; - - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - dir->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; + ret = check_key_has_inode(trans, iter, dir, i, k); + if (ret) goto err; - } - - if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, - "dirent in nonexisting directory:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - goto out; - } if (!i) goto out; - if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), - c, dirent_in_non_dir_inode, - "dirent in non directory inode type %s:\n%s", - bch2_d_type_str(inode_d_type(&i->inode)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } + if 
(dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + dir->first_this_inode = false; ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) @@ -2156,20 +2177,18 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (inode->first_this_inode && inode->inodes.nr) - *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); - inode->first_this_inode = false; - - if (fsck_err_on(!i, c, xattr_in_missing_inode, - "xattr for missing inode %llu", - k.k->p.inode)) - return bch2_btree_delete_at(trans, iter, 0); + ret = check_key_has_inode(trans, iter, inode, i, k); + if (ret) + return ret; if (!i) return 0; + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + inode->first_this_inode = false; + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); -fsck_err: bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d54121ec093f..d1b2f2aa397a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -227,8 +227,8 @@ enum bch_fsck_flags { x(deleted_inode_is_dir, 213, 0) \ x(deleted_inode_not_unlinked, 214, 0) \ x(extent_overlapping, 215, 0) \ - x(extent_in_missing_inode, 216, 0) \ - x(extent_in_non_reg_inode, 217, 0) \ + x(key_in_missing_inode, 216, 0) \ + x(key_in_wrong_inode_type, 217, 0) \ x(extent_past_end_of_inode, 218, 0) \ x(dirent_empty_name, 219, 0) \ x(dirent_val_too_big, 220, 0) \ From 55f7962da3bb2d34525c1973189413a113667a24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Nov 2023 17:21:23 -0500 Subject: [PATCH 022/120] bcachefs: bch_alloc->stripe_sectors Add a separate counter to bch_alloc_v4 for amount of striped data; this lets us separately track striped and unstriped data in a bucket, which lets us see when erasure coding has failed to update extents with stripe pointers, and also find buckets to continue 
updating if we crash mid way through creating a new stripe. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 25 +++++++++++++++++++++---- fs/bcachefs/alloc_background.h | 8 +++++--- fs/bcachefs/alloc_background_format.h | 2 ++ fs/bcachefs/btree_gc.c | 2 ++ fs/bcachefs/buckets.c | 16 +++++++++------- fs/bcachefs/buckets_types.h | 3 ++- 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9c12ce5f4da3..2af0f0a631f6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -268,27 +268,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, i == READ ? "read" : "write", a.v->io_time[i], LRU_TIME_MAX); + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + offsetof(struct bch_alloc_v4, stripe_sectors) + ? a.v->stripe_sectors + : 0; + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, + bkey_fsck_err_on(stripe_sectors || + a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, alloc_key_empty_but_have_data, - "empty data type free but have data"); + "empty data type free but have data %u.%u.%u %u", + stripe_sectors, + a.v->dirty_sectors, + a.v->cached_sectors, + a.v->stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + bkey_fsck_err_on(!a.v->dirty_sectors && + !stripe_sectors, c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - bch2_bucket_sectors_dirty(*a.v) || + a.v->dirty_sectors || + stripe_sectors || a.v->stripe, c, err, alloc_key_cached_inconsistency, "data type inconsistency"); @@ -319,6 +333,7 @@ void 
bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->fragmentation_lru = swab64(a->fragmentation_lru); + a->stripe_sectors = swab32(a->stripe_sectors); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -343,6 +358,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); prt_printf(out, "cached_sectors %u\n", a->cached_sectors); prt_printf(out, "stripe %u\n", a->stripe); prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); @@ -1981,6 +1997,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.gen++; a->v.data_type = 0; a->v.dirty_sectors = 0; + a->v.stripe_sectors = 0; a->v.cached_sectors = 0; a->v.io_time[READ] = bch2_current_io_time(c, READ); a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ba2c5557a3f0..d7eb9feb7a7e 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -41,6 +41,7 @@ static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { dst->gen = src.gen; dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = src.stripe; @@ -50,6 +51,7 @@ static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket s { dst->gen = src.gen; dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = src.stripe; @@ -82,12 +84,12 @@ static inline bool 
bucket_data_type_mismatch(enum bch_data_type bucket, static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) { - return a.dirty_sectors + a.cached_sectors; + return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; } static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) { - return a.dirty_sectors; + return a.stripe_sectors + a.dirty_sectors; } static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, @@ -103,7 +105,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, { if (a.stripe) return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; - if (a.dirty_sectors) + if (bch2_bucket_sectors_dirty(a)) return data_type; if (a.cached_sectors) return BCH_DATA_cached; diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index b4ec20be93b8..47d9d006502c 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -70,6 +70,8 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; __u64 fragmentation_lru; + __u32 stripe_sectors; + __u32 pad; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a0deb8266011..4f1c567ce6f8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -872,6 +872,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, l.oldest_gen != r.oldest_gen || l.data_type != r.data_type || l.dirty_sectors != r.dirty_sectors || + l.stripe_sectors != r.stripe_sectors || l.cached_sectors != r.cached_sectors || l.stripe_redundancy != r.stripe_redundancy || l.stripe != r.stripe; @@ -941,6 +942,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, copy_bucket_field(alloc_key_gen_wrong, gen); copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); 
copy_bucket_field(alloc_key_stripe_wrong, stripe); copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5145066330ed..8017faf56b0f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -533,6 +533,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; g->data_type = 0; + g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } else { @@ -578,6 +579,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; g->data_type = data_type; + g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } else { @@ -990,14 +992,14 @@ need_mark: static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, - const struct bch_extent_ptr *ptr, + const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, struct bch_alloc_v4 *a) { - u32 *dst_sectors = !ptr->cached - ? &a->dirty_sectors - : &a->cached_sectors; - int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type, + u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : + !p->ptr.cached ? 
&a->dirty_sectors : + &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, a->gen, a->data_type, dst_sectors); if (ret) @@ -1034,7 +1036,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); if (ret) goto err; @@ -1057,7 +1059,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); if (!ret) { alloc_to_bucket(g, new); bch2_dev_usage_update(c, ca, &old, &new, 0, true); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index f636e17c4caf..6c4ee3163d0f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -16,7 +16,8 @@ struct bucket { u32 stripe; u32 dirty_sectors; u32 cached_sectors; -}; + u32 stripe_sectors; +} __aligned(sizeof(long)); struct bucket_array { struct rcu_head rcu; From 2612e29142ff718e6f120c62e6792f0a67fd3005 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Nov 2023 16:34:03 -0500 Subject: [PATCH 023/120] bcachefs: BCH_DATA_unstriped Add a new pseudo data type, to track buckets that are members of a stripe, but have unstriped data in them. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 5 +++++ fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/buckets.c | 14 +++++++++++--- fs/bcachefs/chardev.c | 2 +- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d7eb9feb7a7e..a766eaf48863 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -100,6 +100,11 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, return d ? max(0, ca->mi.bucket_size - d) : 0; } +static inline unsigned bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; +} + static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, enum bch_data_type data_type) { diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e3b1bde489c3..1d580e529da5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -612,7 +612,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); x(parity, 6) \ x(stripe, 7) \ x(need_gc_gens, 8) \ - x(need_discard, 9) + x(need_discard, 9) \ + x(unstriped, 10) enum bch_data_type { #define x(t, n) BCH_DATA_##t, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8017faf56b0f..33b75f92b6ae 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -309,12 +309,20 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - u->d[BCH_DATA_cached].sectors += new->cached_sectors; - u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); + u->d[BCH_DATA_cached].sectors -= old->cached_sectors; + u->d[BCH_DATA_cached].sectors 
+= new->cached_sectors; + + unsigned old_unstriped = bch2_bucket_sectors_unstriped(*old); + u->d[BCH_DATA_unstriped].buckets -= old_unstriped != 0; + u->d[BCH_DATA_unstriped].sectors -= old_unstriped; + + unsigned new_unstriped = bch2_bucket_sectors_unstriped(*new); + u->d[BCH_DATA_unstriped].buckets += new_unstriped != 0; + u->d[BCH_DATA_unstriped].sectors += new_unstriped; + preempt_enable(); } diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 8c5b13cd3205..7fd80f6de221 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -619,7 +619,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - for (i = 0; i < BCH_DATA_NR; i++) { + for (i = 0; i < ARRAY_SIZE(arg.d); i++) { arg.d[i].buckets = src.d[i].buckets; arg.d[i].sectors = src.d[i].sectors; arg.d[i].fragmented = src.d[i].fragmented; From 7773df19c35fabdcc8c36176a480a1b19ad32866 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Apr 2024 20:45:00 -0400 Subject: [PATCH 024/120] bcachefs: metadata version bucket_stripe_sectors New on disk format version for bch_alloc->stripe_sectors and BCH_DATA_unstriped - accounting for unstriped data in stripe buckets. Upgrade/downgrade requires regenerating alloc info - but only if erasure coding is in use. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_iter.h | 8 ++++ fs/bcachefs/btree_update.h | 8 ---- fs/bcachefs/recovery.c | 5 ++ fs/bcachefs/sb-downgrade.c | 88 +++++++++++++++++++++++++++++++---- fs/bcachefs/sb-downgrade.h | 1 + 7 files changed, 96 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4d93889f3bae..6eec526c45d4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -592,6 +592,7 @@ struct bch_dev { #define BCH_FS_FLAGS() \ x(new_fs) \ x(started) \ + x(btree_running) \ x(may_go_rw) \ x(rw) \ x(was_rw) \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1d580e529da5..cdcb9bc4cc14 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -723,7 +723,8 @@ struct bch_sb_field_ext { x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ x(btree_subvolume_children, BCH_VERSION(1, 6)) \ - x(mi_btree_bitmap, BCH_VERSION(1, 7)) + x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ + x(bucket_stripe_sectors, BCH_VERSION(1, 8)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 798eb1c47966..699c1b8ef112 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -866,6 +866,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _p; \ }) +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ + _ret; \ +}) + void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b4894e4d5447..cbe0ee3c7168 100644 --- 
a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -178,14 +178,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1f9d044ed920..34c4899c4599 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -810,6 +810,10 @@ use_clean: if (ret) goto err; + set_bit(BCH_FS_btree_running, &c->flags); + + ret = bch2_sb_set_upgrade_extra(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; @@ -969,6 +973,7 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4710b61631f0..9596f470e166 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -56,7 +56,9 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_btree_bitmap_not_marked) -#define DOWNGRADE_TABLE() +#define DOWNGRADE_TABLE() \ + x(bucket_stripe_sectors, \ + 0) struct upgrade_downgrade_entry { u64 recovery_passes; @@ -80,6 +82,37 @@ UPGRADE_TABLE() #undef x }; +static int have_stripes(struct bch_fs *c) +{ + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); +} + +int bch2_sb_set_upgrade_extra(struct bch_fs *c) +{ + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = c->sb.version; + bool write_sb = false; + int ret = 0; + + mutex_lock(&c->sb_lock); + struct 
bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && + new_version >= bcachefs_metadata_version_bucket_stripe_sectors && + (ret = have_stripes(c) > 0)) { + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); + write_sb = true; + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret < 0 ? ret : 0; +} + void bch2_sb_set_upgrade(struct bch_fs *c, unsigned old_version, unsigned new_version) @@ -101,16 +134,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c, ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - for (const u16 *e = i->errors; - e < i->errors + i->nr_errors; - e++) { - __set_bit(*e, c->sb.errors_silent); - ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); - } + for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) + __set_bit_le64(*e, ext->errors_silent); } } -#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) 
static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x @@ -125,6 +154,37 @@ DOWNGRADE_TABLE() #undef x }; +static int downgrade_table_extra(struct bch_fs *c, darray_char *table) +{ + struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); + int ret = 0; + + unsigned nr_errors = le16_to_cpu(dst->nr_errors); + + switch (le16_to_cpu(dst->version)) { + case bcachefs_metadata_version_bucket_stripe_sectors: + if (have_stripes(c)) { + bytes += sizeof(dst->errors[0]) * 2; + + ret = darray_make_room(table, bytes); + if (ret) + return ret; + + /* open coded __set_bit_le64, as dst is packed and + * dst->recovery_passes is misaligned */ + unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; + dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); + + dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); + } + break; + } + + dst->nr_errors = cpu_to_le16(nr_errors); + return ret; +} + static inline const struct bch_sb_field_downgrade_entry * downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) { @@ -210,6 +270,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { int bch2_sb_downgrade_update(struct bch_fs *c) { + if (!test_bit(BCH_FS_btree_running, &c->flags)) + return 0; + darray_char table = {}; int ret = 0; @@ -234,7 +297,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c) for (unsigned i = 0; i < src->nr_errors; i++) dst->errors[i] = cpu_to_le16(src->errors[i]); - table.nr += bytes; + downgrade_table_extra(c, &table); + + if (!dst->recovery_passes[0] && + !dst->recovery_passes[1] && + !dst->nr_errors) + continue; + + table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); } struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index 
57e6c916fc73..095b7cc9bb47 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); +int bch2_sb_set_upgrade_extra(struct bch_fs *); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ From 9b7f0b5d3d220ccba3151b95a5532780e04e1954 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Mon, 27 May 2024 22:36:09 -0600 Subject: [PATCH 025/120] bcachefs: add printbuf arg to bch2_parse_mount_opts() Mount options that take the name of a device that may be part of a filesystem, for example "metadata_target", cannot be validated until after the filesystem has been opened. However, an attempt to parse those options may be made prior to the filesystem being opened. This change adds a printbuf parameter to bch2_parse_mount_opts() which will be used to save those mount options, when they are supplied prior to the FS being opened, so that they can be parsed later. This functionality is not currently needed, but will be used after bcachefs starts using the new mount API to parse mount options. This is because using the new mount API, we will process mount options prior to opening the FS, but the new API doesn't provide a convenient way to "replay" mount option parsing. So we save these options ourselves to accomplish this. This change also splits out the code to parse a single option into bch2_parse_one_mount_opt(), which will be useful when using the new mount API which deals with a single mount option at a time. 
Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 +- fs/bcachefs/fs.c | 6 +-- fs/bcachefs/opts.c | 105 +++++++++++++++++++++++++----------------- fs/bcachefs/opts.h | 5 +- 4 files changed, 71 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 7fd80f6de221..77d7186b4ba3 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -227,7 +227,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a } ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, optstr); + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); if (!IS_ERR(optstr)) kfree(optstr); @@ -864,7 +864,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, optstr); + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); if (!IS_ERR(optstr)) kfree(optstr); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 78f2d80b8bb7..d2fccdc6524f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1730,7 +1730,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) struct bch_opts opts = bch2_opts_empty(); int ret; - ret = bch2_parse_mount_opts(c, &opts, data); + ret = bch2_parse_mount_opts(c, &opts, NULL, data); if (ret) goto err; @@ -1903,7 +1903,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(NULL, &opts, data); + ret = bch2_parse_mount_opts(NULL, &opts, NULL, data); if (ret) { ret = bch2_err_class(ret); return ERR_PTR(ret); @@ -1937,7 +1937,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, } /* Some options can't be parsed until after the fs is started: */ - ret = bch2_parse_mount_opts(c, &opts, data); + ret = bch2_parse_mount_opts(c, &opts, NULL, data); if (ret) { 
bch2_fs_stop(c); sb = ERR_PTR(ret); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index bb068fd72465..e794706276cf 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -460,14 +460,70 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, + struct printbuf *parse_later, + const char *name, const char *val) +{ + struct printbuf err = PRINTBUF; + u64 v; + int ret, id; + + id = bch2_mount_opt_lookup(name); + + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; + } + + /* Unknown options are ignored: */ + if (id < 0) + return 0; + + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + + if (opts) + bch2_opt_set_by_id(opts, id, v); + + ret = 0; + goto out; + +bad_opt: + pr_err("Bad mount option %s", name); + ret = -BCH_ERR_option_name; + goto out; + +bad_val: + pr_err("Invalid mount option %s", err.buf); + ret = -BCH_ERR_option_value; + +out: + printbuf_exit(&err); + return ret; +} + int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - char *options) + struct printbuf *parse_later, char *options) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret, id; - struct printbuf err = PRINTBUF; - u64 v; + int ret; if (!options) return 0; @@ -488,53 +544,16 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, name = strsep(&opt, "="); val = opt; - id = bch2_mount_opt_lookup(name); - - /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = 
bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) - continue; - - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); if (ret < 0) - goto bad_val; - - bch2_opt_set_by_id(opts, id, v); + goto out; } ret = 0; goto out; -bad_opt: - pr_err("Bad mount option %s", name); - ret = -BCH_ERR_option_name; - goto out; -bad_val: - pr_err("Invalid mount option %s", err.buf); - ret = -BCH_ERR_option_value; - goto out; out: kfree(copied_opts_start); - printbuf_exit(&err); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c9da6267894b..edd509561ea2 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -566,7 +566,10 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); +int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, + struct printbuf *, const char *, const char *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, + char *); /* inode opts: */ From 1c12d1caf8d627d8b791f4dc25af2522dac7cd10 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Mon, 27 May 2024 22:36:10 -0600 Subject: [PATCH 026/120] bcachefs: Add error code to defer option parsing This introduces a new error code, option_needs_open_fs, which is used to indicate that an attempt was made to parse a mount option prior to opening a filesystem, when that mount option requires an open filesystem in order to be validated. 
Returning this error results in bch2_parse_one_mount_opt() saving that option for later parsing, after the filesystem is opened. Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 +- fs/bcachefs/errcode.h | 3 ++- fs/bcachefs/opts.c | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 521a86df5e52..5df8de0b8c02 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -511,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return -EINVAL; if (!c) - return 0; + return -BCH_ERR_option_needs_open_fs; if (!strlen(val) || !strcmp(val, "none")) { *res = 0; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 58612abf7927..a268af3e52bf 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, need_inode_lock) \ - x(0, invalid_snapshot_node) + x(0, invalid_snapshot_node) \ + x(0, option_needs_open_fs) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e794706276cf..e10fc1da71b1 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -378,6 +378,10 @@ int bch2_opt_parse(struct bch_fs *c, break; case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); + + if (ret == -BCH_ERR_option_needs_open_fs) + return ret; + if (ret < 0) { if (err) prt_printf(err, "%s: parse error", @@ -495,6 +499,17 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, goto bad_opt; ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + goto out; + } + if (ret < 0) goto bad_val; From 929d954330142a6273697b2cbf855f0f904a12f5 
Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Mon, 27 May 2024 22:36:11 -0600 Subject: [PATCH 027/120] bcachefs: use new mount API This updates bcachefs to use the new mount API: - Update the file_system_type to use the new init_fs_context() function. - Define the new fs_context_operations functions. - No longer register bch2_mount() and bch2_remount(); these are now called via the new fs_context functions. - Define a new helper type, bch2_opts_parse that includes a struct bch_opts and additionally a printbuf used to save options that can't be parsed until after the FS is opened. This enables us to parse as many options as possible prior to opening the filesystem while saving those options that need the open FS for later parsing. Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 115 ++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/opts.h | 7 +++ 2 files changed, 100 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d2fccdc6524f..6a92c0d434d9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1724,15 +1725,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, char *data) +static int bch2_remount(struct super_block *sb, int *flags, + struct bch_opts opts) { struct bch_fs *c = sb->s_fs_info; - struct bch_opts opts = bch2_opts_empty(); - int ret; - - ret = bch2_parse_mount_opts(c, &opts, NULL, data); - if (ret) - goto err; + int ret = 0; opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); @@ -1859,7 +1856,6 @@ static const struct super_operations bch_super_operations = { .statfs = bch2_statfs, .show_devname = bch2_show_devname, .show_options = bch2_show_options, - .remount_fs = bch2_remount, .put_super = bch2_put_super, .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, @@ -1893,22 
+1889,17 @@ static int bch2_test_super(struct super_block *s, void *data) } static struct dentry *bch2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + int flags, const char *dev_name, + struct bch2_opts_parse opts_parse) { struct bch_fs *c; struct super_block *sb; struct inode *vinode; - struct bch_opts opts = bch2_opts_empty(); + struct bch_opts opts = opts_parse.opts; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(NULL, &opts, NULL, data); - if (ret) { - ret = bch2_err_class(ret); - return ERR_PTR(ret); - } - if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); @@ -1937,7 +1928,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, } /* Some options can't be parsed until after the fs is started: */ - ret = bch2_parse_mount_opts(c, &opts, NULL, data); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse.parse_later.buf); if (ret) { bch2_fs_stop(c); sb = ERR_PTR(ret); @@ -2054,12 +2045,92 @@ static void bch2_kill_sb(struct super_block *sb) bch2_fs_free(c); } +static void bch2_fs_context_free(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = fc->fs_private; + + if (opts) { + printbuf_exit(&opts->parse_later); + kfree(opts); + } +} + +static int bch2_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + /* + * the "source" param, i.e., the name of the device(s) to mount, + * is handled by the VFS layer. 
+ */ + if (!strcmp(param->key, "source")) + return -ENOPARAM; + + struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = NULL; + + /* for reconfigure, we already have a struct bch_fs */ + if (fc->root) + c = fc->root->d_sb->s_fs_info; + + int ret = bch2_parse_one_mount_opt(c, &opts->opts, + &opts->parse_later, param->key, + param->string); + + return bch2_err_class(ret); +} + +static int bch2_fs_get_tree(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = fc->fs_private; + const char *dev_name = fc->source; + struct dentry *root; + + root = bch2_mount(fc->fs_type, fc->sb_flags, dev_name, *opts); + + if (IS_ERR(root)) + return PTR_ERR(root); + + fc->root = root; + + return 0; +} + +static int bch2_fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct bch2_opts_parse *opts = fc->fs_private; + + return bch2_remount(sb, &fc->sb_flags, opts->opts); +} + +static const struct fs_context_operations bch2_context_ops = { + .free = bch2_fs_context_free, + .parse_param = bch2_fs_parse_param, + .get_tree = bch2_fs_get_tree, + .reconfigure = bch2_fs_reconfigure, +}; + +static int bch2_init_fs_context(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); + + if (!opts) + return -ENOMEM; + + opts->parse_later = PRINTBUF; + + fc->ops = &bch2_context_ops; + fc->fs_private = opts; + + return 0; +} + static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .mount = bch2_mount, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "bcachefs", + .init_fs_context = bch2_init_fs_context, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bcachefs"); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index edd509561ea2..840dfd756760 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -488,6 +488,13 @@ struct bch_opts { #undef x }; +struct bch2_opts_parse { + struct bch_opts 
opts; + + /* to save opts that can't be parsed before the FS is opened: */ + struct printbuf parse_later; +}; + static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ From 2744e5c9eb1a1090b5f61c955e934c70bfe6b04c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Dec 2023 18:31:46 -0500 Subject: [PATCH 028/120] bcachefs: KEY_TYPE_accounting New key type for the disk space accounting rewrite. - Holds a variable sized array of u64s (may be more than one for accounting e.g. compressed and uncompressed size, or buckets and sectors for a given data type) - Updates are deltas, not new versions of the key: this means updates to accounting can happen via the btree write buffer, which we'll be teaching to accumulate deltas. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 +- fs/bcachefs/bcachefs_format.h | 56 ++--------- fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/disk_accounting.c | 70 +++++++++++++ fs/bcachefs/disk_accounting.h | 52 ++++++++++ fs/bcachefs/disk_accounting_format.h | 144 +++++++++++++++++++++++++++ fs/bcachefs/recovery.c | 1 + fs/bcachefs/sb-downgrade.c | 25 ++++- 8 files changed, 303 insertions(+), 49 deletions(-) create mode 100644 fs/bcachefs/disk_accounting.c create mode 100644 fs/bcachefs/disk_accounting.h create mode 100644 fs/bcachefs/disk_accounting_format.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 66ca0bbee639..0ab533a2b03b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -29,10 +29,11 @@ bcachefs-y := \ clock.o \ compress.o \ darray.o \ + data_update.o \ debug.o \ dirent.o \ + disk_accounting.o \ disk_groups.o \ - data_update.o \ ec.o \ errcode.o \ error.o \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cdcb9bc4cc14..6ad5104e97a1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,7 +417,8 @@ static inline void 
bkey_init(struct bkey *k) x(bucket_gens, 30) \ x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) + x(logged_op_finsert, 33) \ + x(accounting, 34) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -505,6 +506,9 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "dirent_format.h" +#include "disk_accounting_format.h" +#include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" #include "dirent_format.h" @@ -602,49 +606,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) \ - x(unstriped, 10) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -724,7 +685,8 @@ struct bch_sb_field_ext { x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ x(btree_subvolume_children, BCH_VERSION(1, 6)) \ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ - x(bucket_stripe_sectors, BCH_VERSION(1, 8)) + x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ + x(disk_accounting_v2, BCH_VERSION(1, 9)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1377,7 +1339,9 @@ enum 
btree_id_flags { x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) + BIT_ULL(KEY_TYPE_set)) \ + x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index bd32aac05192..5f07cf853d0c 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c new file mode 100644 index 000000000000..db501c6cf3ee --- /dev/null +++ b/fs/bcachefs/disk_accounting.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_accounting.h" +#include "replicas.h" + +static const char * const disk_accounting_type_strs[] = { +#define x(t, n, ...) 
[n] = #t, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + NULL +}; + +int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags, + struct printbuf *err) +{ + return 0; +} + +void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) +{ + if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { + prt_printf(out, "unknown type %u", k->type); + return; + } + + prt_str(out, disk_accounting_type_strs[k->type]); + prt_str(out, " "); + + switch (k->type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); + break; + case BCH_DISK_ACCOUNTING_replicas: + bch2_replicas_entry_to_text(out, &k->replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); + bch2_prt_data_type(out, k->dev_data_type.data_type); + break; + case BCH_DISK_ACCOUNTING_dev_stripe_buckets: + prt_printf(out, "dev=%u", k->dev_stripe_buckets.dev); + break; + } +} + +void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + bch2_accounting_key_to_text(out, &acc_k); + + for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) + prt_printf(out, " %lli", acc.v->d[i]); +} + +void bch2_accounting_swab(struct bkey_s k) +{ + for (u64 *p = (u64 *) k.v; + p < (u64 *) bkey_val_end(k); + p++) + *p = swab64(*p); +} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h new file mode 100644 index 000000000000..178ec25b7ef4 --- /dev/null +++ b/fs/bcachefs/disk_accounting.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_H +#define _BCACHEFS_DISK_ACCOUNTING_H + +static inline unsigned bch2_accounting_counters(const struct bkey *k) +{ + return 
bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); +} + +static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, + struct bkey_s_c_accounting src) +{ + EBUG_ON(dst->k.u64s != src.k->u64s); + + for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.version, src.k->version) < 0) + dst->k.version = src.k->version; +} + +static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) +{ + acc->_pad = p; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&acc->_pad); +#endif +} + +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +{ + struct bpos ret = k->_pad; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&ret); +#endif + return ret; +} + +int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags, struct printbuf *); +void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); +void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_accounting_swab(struct bkey_s); + +#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ + .key_invalid = bch2_accounting_invalid, \ + .val_to_text = bch2_accounting_to_text, \ + .swab = bch2_accounting_swab, \ + .min_val_size = 8, \ +}) + +#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h new file mode 100644 index 000000000000..4ff42466f2a6 --- /dev/null +++ b/fs/bcachefs/disk_accounting_format.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H + +#include "replicas_format.h" + +/* + * Disk accounting - KEY_TYPE_accounting - on disk format: + * + * Here, the key has considerably more structure than a typical key (bpos); an + * accounting key is 'struct disk_accounting_pos', which 
is a union of bpos. + * + * More specifically: a key is just a multiword integer (where word endianness + * matches native byte order), so we're treating bpos as an opaque 20 byte + * integer and mapping bch_accounting_key to that. + * + * This is a type-tagged union of all our various subtypes; a disk accounting + * key can be device counters, replicas counters, et cetera - it's extensible. + * + * The value is a list of u64s or s64s; the number of counters is specific to a + * given accounting type. + * + * Unlike with other key types, updates are _deltas_, and the deltas are not + * resolved until the update to the underlying btree, done by btree write buffer + * flush or journal replay. + * + * Journal replay in particular requires special handling. The journal tracks a + * range of entries which may possibly have not yet been applied to the btree + * yet - it does not know definitively whether individual entries are dirty and + * still need to be applied. + * + * To handle this, we use the version field of struct bkey, and give every + * accounting update a unique version number - a total ordering in time; the + * version number is derived from the key's position in the journal. Then + * journal replay can compare the version number of the key from the journal + * with the version number of the key in the btree to determine if a key needs + * to be replayed. + * + * For this to work, we must maintain this strict time ordering of updates as + * they are flushed to the btree, both via write buffer flush and via journal + * replay. This has complications for the write buffer code while journal replay + * is still in progress; the write buffer cannot flush any accounting keys to + * the btree until journal replay has finished replaying its accounting keys, or + * the (newer) version number of the keys from the write buffer will cause + * updates from journal replay to be lost. 
+ */ + +struct bch_accounting { + struct bch_val v; + __u64 d[]; +}; + +#define BCH_ACCOUNTING_MAX_COUNTERS 3 + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) \ + x(unstriped, 10) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +#define BCH_DISK_ACCOUNTING_TYPES() \ + x(nr_inodes, 0) \ + x(persistent_reserved, 1) \ + x(replicas, 2) \ + x(dev_data_type, 3) \ + x(dev_stripe_buckets, 4) + +enum disk_accounting_type { +#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + BCH_DISK_ACCOUNTING_TYPE_NR, +}; + +struct bch_nr_inodes { +}; + +struct bch_persistent_reserved { + __u8 nr_replicas; +}; + +struct bch_dev_data_type { + __u8 dev; + __u8 data_type; +}; + +struct bch_dev_stripe_buckets { + __u8 dev; +}; + +struct disk_accounting_pos { + union { + struct { + __u8 type; + union { + struct bch_nr_inodes nr_inodes; + struct bch_persistent_reserved persistent_reserved; + struct bch_replicas_entry_v1 replicas; + struct bch_dev_data_type dev_data_type; + struct bch_dev_stripe_buckets dev_stripe_buckets; + }; + }; + struct bpos _pad; + }; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 34c4899c4599..d336a7c69edd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -90,6 +90,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) 
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); bch2_write_super(c); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9596f470e166..dfbbd33c8731 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -54,11 +54,32 @@ BCH_FSCK_ERR_subvol_children_not_set) \ x(mi_btree_bitmap, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) + BCH_FSCK_ERR_btree_bitmap_not_marked) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ - 0) + 0) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_bkey_version_in_future) struct upgrade_downgrade_entry { u64 recovery_passes; From 9dec2a473bd1ba6a111382928e3ceaddfbb720ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Dec 2023 20:59:01 -0500 Subject: [PATCH 029/120] bcachefs: Accumulate accounting keys in journal replay Until accounting keys hit the btree, they are deltas, not new 
versions of the existing key; this means we have to teach journal replay to accumulate them. Additionally, the journal doesn't track precisely which entries have been flushed to the btree; it only tracks a range of entries that may possibly still need to be flushed. That means we need to compare accounting keys against the version in the btree and only flush updates that are newer. There's another wrinkle with the write buffer: if the write buffer starts flushing accounting keys before journal replay has finished flushing accounting keys, journal replay will see the version number from the new updates and updates from the journal will be lost. To avoid this, journal replay has to flush accounting keys first, and we'll be adding a flag so that write buffer flush knows to hold accounting keys until then. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 23 ++++------ fs/bcachefs/btree_journal_iter.h | 15 +++++++ fs/bcachefs/btree_trans_commit.c | 26 +++++++++--- fs/bcachefs/btree_update.h | 14 ++++++- fs/bcachefs/recovery.c | 72 +++++++++++++++++++++++++++++++- 5 files changed, 126 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 332dbf164929..74933490aaba 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,21 +16,6 @@ * operations for the regular btree iter code to use: */ -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} - -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -548,7 +533,13 @@ static void __journal_keys_sort(struct 
journal_keys *keys) struct journal_key *dst = keys->data; darray_for_each(*keys, src) { - if (src + 1 < &darray_top(*keys) && + /* + * We don't accumulate accounting keys here because we have to + * compare each individual accounting key against the version in + * the btree during replay: + */ + if (src->k->k.type != KEY_TYPE_accounting && + src + 1 < &darray_top(*keys) && !journal_key_cmp(src, src + 1)) continue; diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1ba4a79b0ef9..5b66c8f85fc1 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -26,6 +26,21 @@ struct btree_and_journal_iter { bool prefetch; }; +static inline int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 5e67dcb30f33..05e819174697 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -785,8 +785,15 @@ revert_fs_usage: static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { + /* + * Accounting keys aren't deduped in the journal: we have to compare + * each individual update against what's in the btree to see if it has + * been applied yet, and accounting updates also don't overwrite, + * they're deltas that accumulate. 
+ */ trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + if (i->k->k.type != KEY_TYPE_accounting) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -993,15 +1000,24 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - int ret = 0; trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - break; + return ret; } - return ret; + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (i->type == BCH_JSET_ENTRY_btree_keys || + i->type == BCH_JSET_ENTRY_write_buffer_keys) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); + if (ret) + return ret; + } + + return 0; } int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index cbe0ee3c7168..3758af7bbde8 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -130,7 +130,19 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { - if (unlikely(trans->journal_replay_not_finished)) + /* + * Most updates skip the btree write buffer until journal replay is + * finished because synchronization with journal replay relies on having + * a btree node locked - if we're overwriting a key in the journal that + * journal replay hasn't yet replayed, we have to mark it as + * overwritten. 
+ * + * But accounting updates don't overwrite, they're deltas, and they have + * to be flushed to the btree strictly in order for journal replay to be + * able to tell which updates need to be applied: + */ + if (k->k.type != KEY_TYPE_accounting && + unlikely(trans->journal_replay_not_finished)) return bch2_btree_insert_clone_trans(trans, btree, k); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d336a7c69edd..0091af3beeef 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "btree_io.h" #include "buckets.h" #include "dirent.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "fs-common.h" @@ -135,6 +136,47 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } +static int bch2_journal_replay_accounting_key(struct btree_trans *trans, + struct journal_key *k) +{ + struct journal_keys *keys = &trans->c->journal_keys; + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); + + /* Has this delta already been applied to the btree? 
*/ + if (bversion_cmp(old.k->version, k->k->k.version) >= 0) { + ret = 0; + goto out; + } + + struct bkey_i *new = k->k; + if (old.k->type == KEY_TYPE_accounting) { + new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + bch2_accounting_accumulate(bkey_i_to_accounting(new), + bkey_s_c_to_accounting(old)); + } + + trans->journal_res.seq = k->journal_seq; + + ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_journal_replay_key(struct btree_trans *trans, struct journal_key *k) { @@ -185,6 +227,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; + if (k->k->k.type == KEY_TYPE_accounting) { + ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + goto out; + } + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); @@ -222,6 +269,27 @@ int bch2_journal_replay(struct bch_fs *c) move_gap(keys, keys->nr); trans = bch2_trans_get(c); + /* + * Replay accounting keys first: we can't allow the write buffer to + * flush accounting keys until we're done + */ + darray_for_each(*keys, k) { + if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) + continue; + + cond_resched(); + + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_journal_res, + bch2_journal_replay_accounting_key(trans, k)); + if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) + goto err; + + k->overwritten = true; + } + /* * First, attempt to replay keys in sorted order. This is more * efficient - better locality of btree access - but some might fail if @@ -244,7 +312,7 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_journal_reclaim| (!k->allocated ? 
BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten); + BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); if (ret) { ret = darray_push(&keys_sorted, k); if (ret) @@ -281,7 +349,7 @@ int bch2_journal_replay(struct bch_fs *c) if (ret) goto err; - BUG_ON(!k->overwritten); + BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } /* From 5d9667d1d6eaca3f6cd3c63cd6a0f309147c7f5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Nov 2023 00:23:07 -0500 Subject: [PATCH 030/120] bcachefs: btree write buffer knows how to accumulate bch_accounting keys Teach the btree write buffer how to accumulate accounting keys - instead of having the newer key overwrite the older key as we do with other updates, we need to add them together. Also, add a flag so that write buffer flush knows when journal replay is finished flushing accounting, and teach it to hold accounting keys until that flag is set. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_write_buffer.c | 84 ++++++++++++++++++++++++++++---- fs/bcachefs/recovery.c | 3 ++ 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6eec526c45d4..b1a74d3ebc12 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -593,6 +593,7 @@ struct bch_dev { x(new_fs) \ x(started) \ x(btree_running) \ + x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ x(was_rw) \ diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index d0e92d948002..e9e36d8aded9 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -6,6 +6,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "disk_accounting.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, 
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb, - bool *write_locked, size_t *fast) + bool *write_locked, + bool *accounting_accumulated, + size_t *fast) { struct btree_path *path; int ret; @@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite if (ret) return ret; + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); + + if (k.k->type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), + bkey_s_c_to_accounting(k)); + } + *accounting_accumulated = true; + /* * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse(): @@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_iter iter = { NULL }; - size_t skipped = 0, fast = 0, slowpath = 0; + size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; + bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; bch2_trans_unlock(trans); @@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) BUG_ON(!k->journal_seq); + if (!accounting_replay_done && + k->k.k.type == KEY_TYPE_accounting) { + slowpath++; + continue; + } + if (i + 1 < &darray_top(wb->sorted) && wb_key_eq(i, i + 1)) { struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - skipped++; + if (k->k.k.type == KEY_TYPE_accounting && + n->k.k.type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), + bkey_i_to_s_c_accounting(&k->k)); + + overwritten++; n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); k->journal_seq = 0; continue; @@ -340,13 +365,15 
@@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bch2_btree_iter_set_pos(&iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; + bool accounting_accumulated = false; do { if (race_fault()) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); + ret = wb_flush_one(trans, &iter, k, &write_locked, + &accounting_accumulated, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); @@ -387,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) if (!i->journal_seq) continue; - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); + if (!accounting_replay_done && + i->k.k.type == KEY_TYPE_accounting) { + could_not_insert++; + continue; + } + + if (!could_not_insert) + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); bch2_trans_begin(trans); @@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) btree_write_buffered_insert(trans, i)); if (ret) goto err; + + i->journal_seq = 0; + } + + /* + * If journal replay hasn't finished with accounting keys we + * can't flush accounting keys at all - condense them and leave + * them for next time. + * + * Q: Can the write buffer overflow? + * A: Shouldn't be any actual risk. It's just new accounting + * updates that the write buffer can't flush, and those are only + * going to be generated by interior btree node updates as + * journal replay has to split/rewrite nodes to make room for + * its updates. + * + * And for those new accounting updates, updates to the same + * counters get accumulated as they're flushed from the journal + * to the write buffer - see the patch for eytzinger tree
So we could only overflow if the number of + * distinct counters touched somehow was very large. + */ + if (could_not_insert) { + struct btree_write_buffered_key *dst = wb->flushing.keys.data; + + darray_for_each(wb->flushing.keys, i) + if (i->journal_seq) + *dst++ = *i; + wb->flushing.keys.nr = dst - wb->flushing.keys.data; } } err: + if (ret || !could_not_insert) { + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; + } + bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0091af3beeef..5c6bfa9e69d5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -290,6 +290,8 @@ int bch2_journal_replay(struct bch_fs *c) k->overwritten = true; } + set_bit(BCH_FS_accounting_replay_done, &c->flags); + /* * First, attempt to replay keys in sorted order. This is more * efficient - better locality of btree access - but some might fail if @@ -1060,6 +1062,7 @@ int bch2_fs_initialize(struct bch_fs *c) * set up the journal.pin FIFO and journal.cur pointer: */ bch2_fs_journal_start(&c->journal, 1); + set_bit(BCH_FS_accounting_replay_done, &c->flags); bch2_journal_set_replay_done(&c->journal); ret = bch2_fs_read_write_early(c); From 1d16c605cc55ef26f0c65b362665a6c99080ccbc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Nov 2023 14:22:46 -0500 Subject: [PATCH 031/120] bcachefs: Disk space accounting rewrite Main part of the disk accounting rewrite. This is a wholesale rewrite of the existing disk space accounting, which relies on percepu counters that are sharded by journal buffer, and rolled up and added to each journal write. 
With the new scheme, every set of counters is a distinct key in the accounting btree; this fixes scaling limitations of the old scheme, where counters took up space in each journal entry and required multiple percpu counters. Now, in memory accounting requires a single set of percpu counters - not multiple for each in flight journal buffer - and in the future we'll probably also have counters that don't use in memory percpu counters, they're not strictly required. An accounting update is now a normal btree update, using the btree write buffer path. At transaction commit time, we apply accounting updates to the in memory counters, which are percpu counters indexed in an eytzinger tree by the accounting key. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 79 +++++- fs/bcachefs/alloc_background.h | 27 +- fs/bcachefs/bcachefs.h | 6 +- fs/bcachefs/bcachefs_format.h | 1 - fs/bcachefs/bcachefs_ioctl.h | 7 +- fs/bcachefs/btree_gc.c | 5 +- fs/bcachefs/btree_iter.c | 9 - fs/bcachefs/btree_trans_commit.c | 65 +++-- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update.h | 9 +- fs/bcachefs/buckets.c | 291 ++++----------------- fs/bcachefs/buckets.h | 26 +- fs/bcachefs/disk_accounting.c | 366 ++++++++++++++++++++++++++- fs/bcachefs/disk_accounting.h | 131 ++++++++++ fs/bcachefs/disk_accounting_format.h | 3 +- fs/bcachefs/disk_accounting_types.h | 20 ++ fs/bcachefs/ec.c | 26 +- fs/bcachefs/inode.c | 9 +- fs/bcachefs/recovery.c | 17 +- fs/bcachefs/recovery_passes.c | 1 + fs/bcachefs/recovery_passes_types.h | 1 + fs/bcachefs/replicas.c | 44 +--- fs/bcachefs/replicas.h | 11 +- fs/bcachefs/replicas_types.h | 16 -- fs/bcachefs/super.c | 49 ++-- 25 files changed, 796 insertions(+), 424 deletions(-) create mode 100644 fs/bcachefs/disk_accounting_types.h diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2af0f0a631f6..3df1099750af 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -15,6 +15,7 @@ 
#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "lru.h" @@ -760,6 +761,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } +static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, + enum bch_data_type data_type, + s64 delta_buckets, + s64 delta_sectors, + s64 delta_fragmented, unsigned flags) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = data_type, + }; + s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; + + return bch2_disk_accounting_mod(trans, &acc, d, 3); +} + +int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + unsigned flags) +{ + s64 old_sectors = bch2_bucket_sectors(*old); + s64 new_sectors = bch2_bucket_sectors(*new); + if (old->data_type != new->data_type) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: + bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, + -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } else if (old_sectors != new_sectors) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 0, + new_sectors - old_sectors, + bch2_bucket_sectors_fragmented(ca, *new) - + bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } + + s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); + s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); + if (old_unstriped != new_unstriped) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, + !!new_unstriped - !!old_unstriped, + new_unstriped - old_unstriped, + 0, + flags); + if (ret) + return 
ret; + } + + return 0; +} + int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, @@ -835,18 +891,17 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { - ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, - -((s64) old_a->cached_sectors)); + ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, + -((s64) old_a->cached_sectors)); if (ret) goto err; } + + ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); + if (ret) + goto err; } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { @@ -886,19 +941,17 @@ int bch2_trigger_alloc(struct btree_trans *trans, } } - percpu_down_read(&c->mark_lock); if (new_a->gen != old_a->gen) { + rcu_read_lock(); u8 *gen = bucket_gen(ca, new.k->p.offset); if (unlikely(!gen)) { - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); goto invalid_bucket; } *gen = new_a->gen; + rcu_read_unlock(); } - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); - percpu_up_read(&c->mark_lock); - #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) @@ -946,6 +999,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_unlock(g); percpu_up_read(&c->mark_lock); + + bch2_dev_usage_update(c, ca, old_a, new_a); } err: printbuf_exit(&buf); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index a766eaf48863..dd7d14363a68 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -82,25 +82,39 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != 
bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) +static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; } -static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors; } -static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, +static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached + ? a.cached_sectors + : bch2_bucket_sectors_dirty(a); +} + +static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, struct bch_alloc_v4 a) { - int d = bch2_bucket_sectors_dirty(a); + int d = bch2_bucket_sectors(a); return d ? max(0, ca->mi.bucket_size - d) : 0; } -static inline unsigned bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) +static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) +{ + int d = a.stripe_sectors + a.dirty_sectors; + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + +static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_stripe ? 
a.dirty_sectors : 0; } @@ -277,6 +291,9 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, unsigned); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b1a74d3ebc12..03e4f15f34f5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" #include "nocow_locking_types.h" @@ -672,8 +673,6 @@ struct btree_trans_buf { struct btree_trans *trans; }; -#define REPLICAS_DELTA_LIST_MAX (1U << 16) - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) @@ -743,10 +742,11 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_accounting_mem accounting; + struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; - mempool_t replicas_delta_pool; struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6ad5104e97a1..8d8444fa0ec2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1138,7 +1138,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e) switch (e->type) { case BCH_JSET_ENTRY_btree_keys: case BCH_JSET_ENTRY_btree_root: - case BCH_JSET_ENTRY_overwrite: case BCH_JSET_ENTRY_write_buffer_keys: return true; } diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 4b8fba754b1c..0b82a4dd099f 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -251,10 +251,15 @@ struct bch_replicas_usage { 
struct bch_replicas_entry_v1 r; } __packed; +static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) +{ + return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); +} + static inline struct bch_replicas_usage * replicas_usage_next(struct bch_replicas_usage *u) { - return (void *) u + replicas_entry_bytes(&u->r) + 8; + return (void *) u + replicas_usage_bytes(u); } /* diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4f1c567ce6f8..b5fe6785d3e4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -583,7 +583,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k.k->version.lo > atomic64_read(&c->journal.seq)); - if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(btree_id != BTREE_ID_accounting && + k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded %llu\n %s", atomic64_read(&c->key_version), @@ -915,7 +916,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) - bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); + bch2_dev_usage_update(c, ca, &old_gc, &gc); percpu_up_read(&c->mark_lock); gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 19352a08ea20..d6e63aa01940 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3240,15 +3240,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - if (trans->fs_usage_deltas) { - if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == - REPLICAS_DELTA_LIST_MAX) - mempool_free(trans->fs_usage_deltas, - &c->replicas_delta_pool); - else - kfree(trans->fs_usage_deltas); - } - if (unlikely(trans->journal_replay_not_finished)) 
bch2_journal_keys_put(c); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 05e819174697..92305c12cb75 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -10,6 +10,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -620,6 +621,14 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) return 0; } +static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, @@ -628,7 +637,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; struct btree_trans_commit_hook *h; unsigned u64s = 0; - int ret; + int ret = 0; bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); @@ -693,21 +702,38 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.version = MAX_VERSION; } - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - h = trans->hooks; while (h) { ret = h->fn(trans, h); if (ret) - goto revert_fs_usage; + return ret; h = h->next; } + struct jset_entry *entry = trans->journal_entries; + + if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + percpu_down_read(&c->mark_lock); + + for (entry = trans->journal_entries; + entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (jset_entry_is_key(entry) 
&& entry->start->k.type == KEY_TYPE_accounting) { + struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); + + a->k.version = journal_pos_to_bversion(&trans->journal_res, + (u64 *) entry - (u64 *) trans->journal_entries); + BUG_ON(bversion_zero(a->k.version)); + ret = bch2_accounting_mem_mod(trans, accounting_i_to_s_c(a)); + if (ret) + goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + } + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); @@ -776,10 +802,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, return 0; fatal_err: - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); + percpu_down_read(&c->mark_lock); revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + for (struct jset_entry *entry2 = trans->journal_entries; + entry2 != entry; + entry2 = vstruct_next(entry2)) + if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) { + struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod(trans, a.c); + bch2_accounting_neg(a); + } + percpu_up_read(&c->mark_lock); return ret; } @@ -929,7 +965,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_journal_res_get_blocked: /* @@ -1033,8 +1069,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) !trans->journal_entries_u64s) goto out_reset; - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - ret = 
bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; @@ -1131,6 +1165,7 @@ retry: bch2_trans_verify_not_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 48cb1a7d31c5..b0b5c46aec62 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -523,7 +523,6 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ - struct replicas_delta_list *fs_usage_deltas; /* Entries before this are zeroed out on every bch2_trans_get() call */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3758af7bbde8..b907f4c1312b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -29,6 +29,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ @@ -207,14 +208,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->journal_entries_u64s = 0; trans->hooks = NULL; trans->extra_disk_res = 0; - - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } } static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, diff --git 
a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 33b75f92b6ae..c3dac9d1c45b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -13,6 +13,7 @@ #include "btree_update.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "inode.h" @@ -25,24 +26,16 @@ #include -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) { - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); } void bch2_fs_usage_initialize(struct bch_fs *c) @@ -67,24 +60,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c) struct bch_dev_usage dev = bch2_dev_usage_read(ca); usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } percpu_up_write(&c->mark_lock); } -static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, - unsigned journal_seq, - bool gc) -{ - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
ca->usage_gc - : ca->usage[journal_seq & JOURNAL_BUF_MASK]); -} - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { struct bch_fs *c = ca->fs; @@ -267,11 +249,6 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_init(struct bch_dev *ca) -{ - ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; -} - void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); @@ -287,21 +264,20 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new, - u64 journal_seq, bool gc) + const struct bch_alloc_v4 *new) { struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage = this_cpu_ptr(c->usage_gc); if (data_type_is_hidden(old->data_type)) fs_usage->b.hidden -= ca->mi.bucket_size; if (data_type_is_hidden(new->data_type)) fs_usage->b.hidden += ca->mi.bucket_size; - u = dev_usage_ptr(ca, journal_seq, gc); + u = this_cpu_ptr(ca->usage_gc); u->d[old->data_type].buckets--; u->d[new->data_type].buckets++; @@ -326,24 +302,8 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static inline int __update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - int idx = bch2_replicas_entry_idx(c, r); - - if (idx < 0) - return -1; - - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - return 0; -} - int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry_v1 *r, s64 sectors, - unsigned journal_seq, bool gc) + struct bch_replicas_entry_v1 *r, s64 sectors) { struct bch_fs_usage *fs_usage; int idx, ret = 0; @@ -370,7 +330,7 @@ int 
bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, } preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage = this_cpu_ptr(c->usage_gc); fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); @@ -383,94 +343,13 @@ fsck_err: static inline int update_cached_sectors(struct bch_fs *c, struct bkey_s_c k, - unsigned dev, s64 sectors, - unsigned journal_seq, bool gc) + unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); -} - -static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, - gfp_t gfp) -{ - struct replicas_delta_list *d = trans->fs_usage_deltas; - unsigned new_size = d ? (d->size + more) * 2 : 128; - unsigned alloc_size = sizeof(*d) + new_size; - - WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); - - if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, gfp|__GFP_ZERO); - - if (unlikely(!d)) { - if (alloc_size > REPLICAS_DELTA_LIST_MAX) - return -ENOMEM; - - d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); - if (!d) - return -ENOMEM; - - memset(d, 0, REPLICAS_DELTA_LIST_MAX); - - if (trans->fs_usage_deltas) - memcpy(d, trans->fs_usage_deltas, - trans->fs_usage_deltas->size + sizeof(*d)); - - new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); - kfree(trans->fs_usage_deltas); - } - - d->size = new_size; - trans->fs_usage_deltas = d; - } - - return 0; -} - -int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -{ - return allocate_dropping_locks_errcode(trans, - __replicas_deltas_realloc(trans, more, _gfp)); -} - -int bch2_update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - struct replicas_delta_list *d; - struct replicas_delta *n; - unsigned b; - int ret; - - if (!sectors) - return 0; - - b = replicas_entry_bytes(r) + 8; - ret = 
bch2_replicas_deltas_realloc(trans, b); - if (ret) - return ret; - - d = trans->fs_usage_deltas; - n = (void *) d->d + d->used; - n->delta = sectors; - unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), - r, replicas_entry_bytes(r), - "flexible array member embedded in strcuct with padding"); - bch2_replicas_entry_sort(&n->r); - d->used += b; - return 0; -} - -int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - return bch2_update_replicas_list(trans, &r.e, sectors); + return bch2_update_replicas(c, k, &r.e, sectors); } static int bch2_check_fix_ptr(struct btree_trans *trans, @@ -865,47 +744,6 @@ err: goto out; } -void bch2_trans_fs_usage_revert(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *dst; - struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; - s64 added = 0; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - /* revert changes: */ - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); - } - - dst->b.nr_inodes -= deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added -= deltas->persistent_reserved[i]; - dst->b.reserved -= deltas->persistent_reserved[i]; - dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; - } - - if (added > 0) { - trans->disk_res->sectors += added; - this_cpu_add(*c->online_reserved, added); - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); -} - void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -959,43 +797,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans 
*trans) should_not_have_added, disk_res_sectors); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - - dst->b.nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - dst->b.reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; -need_mark: - /* revert changes: */ - for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) - BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return -1; -} - /* KEY_TYPE_extent: */ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, @@ -1070,7 +871,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); if (!ret) { alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); + bch2_dev_usage_update(c, ca, &old, &new); } bucket_unlock(g); err_unlock: @@ -1114,10 +915,12 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = bch2_update_replicas_list(trans, &r.e, sectors); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = data_type; + ret = 
bch2_disk_accounting_mod(trans, &acc, §ors, 1); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1154,7 +957,7 @@ err: mutex_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + bch2_update_replicas(c, k, &r.e, sectors); } return 0; @@ -1170,16 +973,18 @@ static int __trigger_extent(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bch_replicas_padded r; enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; s64 replicas_sectors = 0; int ret = 0; - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + .replicas.data_type = data_type, + .replicas.nr_devs = 0, + .replicas.nr_required = 1, + }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = 0; @@ -1192,8 +997,8 @@ static int __trigger_extent(struct btree_trans *trans, if (p.ptr.cached) { if (!stale) { ret = !gc - ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) - : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); + ? 
bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors) + : update_cached_sectors(c, k, p.ptr.dev, disk_sectors); bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors", bch2_err_str(ret)); if (ret) @@ -1201,7 +1006,7 @@ static int __trigger_extent(struct btree_trans *trans, } } else if (!p.has_ec) { replicas_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; + acc.replicas.devs[acc.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -1212,14 +1017,14 @@ static int __trigger_extent(struct btree_trans *trans, * if so they're not required for mounting if we have an * erasure coded pointer in this extent: */ - r.e.nr_required = 0; + acc.replicas.nr_required = 0; } } - if (r.e.nr_devs) { + if (acc.replicas.nr_devs) { ret = !gc - ? bch2_update_replicas_list(trans, &r.e, replicas_sectors) - : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true); + ? bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1) + : bch2_update_replicas(c, k, &acc.replicas, replicas_sectors); if (unlikely(ret && gc)) { struct printbuf buf = PRINTBUF; @@ -1281,23 +1086,23 @@ static int __trigger_reservation(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size * replicas; + s64 sectors = (s64) k.k->size; if (flags & BTREE_TRIGGER_overwrite) sectors = -sectors; if (flags & BTREE_TRIGGER_transactional) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = replicas, + }; - struct replicas_delta_list *d = trans->fs_usage_deltas; - replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; + return bch2_disk_accounting_mod(trans, &acc, §ors, 1); } if (flags & 
BTREE_TRIGGER_gc) { + sectors *= replicas; + percpu_down_read(&c->mark_lock); preempt_disable(); @@ -1392,7 +1197,7 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); + bch2_dev_usage_update(c, ca, &old, &new); percpu_up_read(&c->mark_lock); return 0; err: diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 05a1c98754f2..b61942fc3090 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -212,7 +212,6 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_init(struct bch_dev *); void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) @@ -315,29 +314,9 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, u64, bool); - -/* key/bucket marking: */ - -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - + const struct bch_alloc_v4 *); int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, - struct bch_replicas_entry_v1 *, s64, - unsigned, bool); -int bch2_update_replicas_list(struct btree_trans *, struct bch_replicas_entry_v1 *, s64); -int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); -int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); @@ -369,9 +348,6 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, void bch2_trans_account_disk_usage_change(struct btree_trans *); -void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); - int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, enum bch_data_type, unsigned, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index db501c6cf3ee..2cc2e0f8cb53 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -1,11 +1,63 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bcachefs_ioctl.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "disk_accounting.h" +#include "error.h" +#include "journal_io.h" #include "replicas.h" +/* + * Notes on disk accounting: + * + * We have two parallel sets of counters to be concerned with, and both must be + * kept in sync. + * + * - Persistent/on disk accounting, stored in the accounting btree and updated + * via btree write buffer updates that treat new accounting keys as deltas to + * apply to existing values. 
But reading from a write buffer btree is + * expensive, so we also have + * + * - In memory accounting, where accounting is stored as an array of percpu + * counters, indexed by an eytzinger array of disk acounting keys/bpos (which + * are the same thing, excepting byte swabbing on big endian). + * + * Cheap to read, but non persistent. + * + * Disk accounting updates are generated by transactional triggers; these run as + * keys enter and leave the btree, and can compare old and new versions of keys; + * the output of these triggers are deltas to the various counters. + * + * Disk accounting updates are done as btree write buffer updates, where the + * counters in the disk accounting key are deltas that will be applied to the + * counter in the btree when the key is flushed by the write buffer (or journal + * replay). + * + * To do a disk accounting update: + * - initialize a disk_accounting_key, to specify which counter is being update + * - initialize counter deltas, as an array of 1-3 s64s + * - call bch2_disk_accounting_mod() + * + * This queues up the accounting update to be done at transaction commit time. + * Underneath, it's a normal btree write buffer update. + * + * The transaction commit path is responsible for propagating updates to the in + * memory counters, with bch2_accounting_mem_mod(). + * + * The commit path also assigns every disk accounting update a unique version + * number, based on the journal sequence number and offset within that journal + * buffer; this is used by journal replay to determine which updates have been + * done. + * + * The transaction commit path also ensures that replicas entry accounting + * updates are properly marked in the superblock (so that we know whether we can + * mount without data being unavailable); it will update the superblock if + * bch2_accounting_mem_mod() tells it to. + */ + static const char * const disk_accounting_type_strs[] = { #define x(t, n, ...) 
[n] = #t, BCH_DISK_ACCOUNTING_TYPES() @@ -13,6 +65,44 @@ static const char * const disk_accounting_type_strs[] = { NULL }; +int bch2_disk_accounting_mod(struct btree_trans *trans, + struct disk_accounting_pos *k, + s64 *d, unsigned nr) +{ + /* Normalize: */ + switch (k->type) { + case BCH_DISK_ACCOUNTING_replicas: + bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); + break; + } + + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + + struct { + __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); + } k_i; + struct bkey_i_accounting *acc = bkey_accounting_init(&k_i.k); + + acc->k.p = disk_accounting_pos_to_bpos(k); + set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + + memcpy_u64s_small(acc->v.d, d, nr); + + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &acc->k_i); +} + +int bch2_mod_dev_cached_sectors(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + bch2_replicas_entry_cached(&acc.replicas, dev); + + return bch2_disk_accounting_mod(trans, &acc, &sectors, 1); +} + int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags, struct printbuf *err) @@ -43,9 +133,6 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); bch2_prt_data_type(out, k->dev_data_type.data_type); break; - case BCH_DISK_ACCOUNTING_dev_stripe_buckets: - prt_printf(out, "dev=%u", k->dev_stripe_buckets.dev); - break; } } @@ -68,3 +155,276 @@ void bch2_accounting_swab(struct bkey_s k) p++) *p = swab64(*p); } + +static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_replicas: + unsafe_memcpy(r, &acc_k.replicas, + replicas_entry_bytes(&acc_k.replicas), + "variable length struct"); + 
return true; + default: + return false; + } +} + +static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) +{ + struct bch_replicas_padded r; + return accounting_to_replicas(&r.e, p) + ? bch2_mark_replicas(c, &r.e) + : 0; +} + +/* + * Ensure accounting keys being updated are present in the superblock, when + * applicable (i.e. replicas updates) + */ +int bch2_accounting_update_sb(struct btree_trans *trans) +{ + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { + int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); + if (ret) + return ret; + } + + return 0; +} + +static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_replicas_padded r; + + if (accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + struct bch_accounting_mem *acc = &c->accounting; + unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); + + u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!new_counters) + return -BCH_ERR_ENOMEM_disk_accounting; + + preempt_disable(); + memcpy(this_cpu_ptr(new_counters), + bch2_acc_percpu_u64s(acc->v, acc->nr_counters), + acc->nr_counters * sizeof(u64)); + preempt_enable(); + + struct accounting_pos_offset n = { + .pos = a.k->p, + .version = a.k->version, + .offset = acc->nr_counters, + .nr_counters = bch2_accounting_counters(a.k), + }; + if (darray_push(&acc->k, n)) { + free_percpu(new_counters); + return -BCH_ERR_ENOMEM_disk_accounting; + } + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + free_percpu(acc->v); + acc->v = new_counters; + acc->nr_counters = new_nr_counters; + + for (unsigned i = 0; 
i < n.nr_counters; i++) + this_cpu_add(acc->v[n.offset + i], a.v->d[i]); + return 0; +} + +int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + percpu_up_read(&c->mark_lock); + percpu_down_write(&c->mark_lock); + int ret = __bch2_accounting_mem_mod_slowpath(c, a); + percpu_up_write(&c->mark_lock); + percpu_down_read(&c->mark_lock); + return ret; +} + +/* + * Read out accounting keys for replicas entries, as an array of + * bch_replicas_usage entries. + * + * Note: this may be deprecated/removed at some point in the future and replaced + * with something more general, it exists to support the ioctl used by the + * 'bcachefs fs usage' command. + */ +int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + darray_init(usage); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct { + struct bch_replicas_usage r; + u8 pad[BCH_BKEY_PTRS_MAX]; + } u; + + if (!accounting_to_replicas(&u.r.r, i->pos)) + continue; + + u64 sectors; + bch2_accounting_mem_read(c, i->pos, &sectors, 1); + u.r.sectors = sectors; + + ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); + if (ret) + break; + + memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); + usage->nr += replicas_usage_bytes(&u.r); + } + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(usage); + return ret; +} + +static int accounting_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + if (k.k->type != KEY_TYPE_accounting) + return 0; + + percpu_down_read(&c->mark_lock); + int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k)); + percpu_up_read(&c->mark_lock); + + if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && + ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = 0; + + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + if (fsck_err_on(ret == 
-BCH_ERR_btree_insert_need_mark_replicas, + c, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) + ret = bch2_accounting_update_sb_one(c, k.k->p); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * At startup time, initialize the in memory accounting from the btree (and + * journal) + */ +int bch2_accounting_read(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + accounting_read_key(c, k); + }))); + if (ret) + goto err; + + struct journal_keys *keys = &c->journal_keys; + move_gap(keys, keys->nr); + darray_for_each(*keys, i) { + if (i->k->k.type == KEY_TYPE_accounting) { + struct bkey_s_c k = bkey_i_to_s_c(i->k); + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, + sizeof(acc->k.data[0]), + accounting_pos_cmp, &k.k->p); + + bool applied = idx < acc->k.nr && + bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0; + + if (applied) + continue; + + ret = accounting_read_key(c, k); + if (ret) + goto err; + } + } + + percpu_down_read(&c->mark_lock); + preempt_disable(); + struct bch_fs_usage_base *usage = &c->usage_base->b; + + for (unsigned i = 0; i < acc->k.nr; i++) { + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v)); + + switch (k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + usage->reserved += v[0] * k.persistent_reserved.nr_replicas; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); + break; + } + } + preempt_enable(); + percpu_up_read(&c->mark_lock); +err: + 
bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) +{ + return bch2_trans_run(c, + bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + acc.type == BCH_DISK_ACCOUNTING_dev_data_type && + acc.dev_data_type.dev == dev + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) + : 0; + })) ?: + bch2_btree_write_buffer_flush_sync(trans)); +} + +int bch2_dev_usage_init(struct bch_dev *ca) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; + + return bch2_trans_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v))); +} + +void bch2_fs_accounting_exit(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + darray_exit(&acc->k); + free_percpu(acc->v); +} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 178ec25b7ef4..ec1b8ae2aeee 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -2,11 +2,32 @@ #ifndef _BCACHEFS_DISK_ACCOUNTING_H #define _BCACHEFS_DISK_ACCOUNTING_H +#include "eytzinger.h" + +static inline void bch2_u64s_neg(u64 *v, unsigned nr) +{ + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; +} + static inline unsigned bch2_accounting_counters(const struct bkey *k) { return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); } +static inline void bch2_accounting_neg(struct bkey_s_accounting a) +{ + bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); +} + +static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) +{ + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + if 
(a.v->d[i]) + return false; + return true; +} + static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, struct bkey_s_c_accounting src) { @@ -18,6 +39,26 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, dst->k.version = src.k->version; } +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) { acc->_pad = p; @@ -36,6 +77,12 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos return ret; } +int bch2_disk_accounting_mod(struct btree_trans *, + struct disk_accounting_pos *, + s64 *, unsigned); +int bch2_mod_dev_cached_sectors(struct btree_trans *trans, + unsigned dev, s64 sectors); + int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags, struct printbuf *); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); @@ -49,4 +96,88 @@ void bch2_accounting_swab(struct bkey_s); .min_val_size = 8, \ }) +int bch2_accounting_update_sb(struct btree_trans *); + +static inline int accounting_pos_cmp(const void *_l, const void *_r) +{ + const struct bpos *l = _l, *r = _r; + + return bpos_cmp(*l, *r); +} + +int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting); + +static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p); + if (unlikely(idx >= acc->k.nr)) + return 
bch2_accounting_mem_mod_slowpath(c, a); + + unsigned offset = acc->k.data[idx].offset; + + EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(acc->v[offset + i], a.v->d[i]); + return 0; +} + +/* + * Update in memory counters so they match the btree update we're doing; called + * from transaction commit path + */ +static inline int bch2_accounting_mem_mod(struct btree_trans *trans, struct + bkey_s_c_accounting a) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; + } + return __bch2_accounting_mem_mod(trans->c, a); +} + +static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, + unsigned idx, + u64 *v, unsigned nr) +{ + memset(v, 0, sizeof(*v) * nr); + + struct bch_accounting_mem *acc = &c->accounting; + if (unlikely(idx >= acc->k.nr)) + return; + + unsigned offset = acc->k.data[idx].offset; + nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters); + + for (unsigned i = 0; i < nr; i++) + v[i] = percpu_u64_get(acc->v + offset + i); +} + +static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, + u64 *v, unsigned nr) +{ + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p); + + bch2_accounting_mem_read_counters(c, idx, v, nr); +} + +int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); + +int bch2_accounting_read(struct bch_fs *); + +int bch2_dev_usage_remove(struct bch_fs *, unsigned); +int bch2_dev_usage_init(struct bch_dev *); +void bch2_fs_accounting_exit(struct bch_fs *); + #endif 
/* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 4ff42466f2a6..af5f5789fe5d 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -99,8 +99,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(nr_inodes, 0) \ x(persistent_reserved, 1) \ x(replicas, 2) \ - x(dev_data_type, 3) \ - x(dev_stripe_buckets, 4) + x(dev_data_type, 3) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h new file mode 100644 index 000000000000..5656ac540a10 --- /dev/null +++ b/fs/bcachefs/disk_accounting_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H +#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H + +#include "darray.h" + +struct accounting_pos_offset { + struct bpos pos; + struct bversion version; + u32 offset:24, + nr_counters:8; +}; + +struct bch_accounting_mem { + DARRAY(struct accounting_pos_offset) k; + u64 __percpu *v; + unsigned nr_counters; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 83e279d41829..819d61b9ab83 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -13,6 +13,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -302,7 +303,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); if (!ret) { alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); + bch2_dev_usage_update(c, ca, &old, &new); } bucket_unlock(g); err_unlock: @@ -384,21 +385,25 @@ int bch2_trigger_stripe(struct btree_trans *trans, new_s->nr_redundant != old_s->nr_redundant)); if (new_s) { - s64 sectors = 
le16_to_cpu(new_s->sectors); + s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, new); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, new); + int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1); if (ret) return ret; } if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, old); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, old); + int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1); if (ret) return ret; } @@ -481,8 +486,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; ret = bch2_update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - 0, true); + ((s64) m->sectors * m->nr_redundant)); if (ret) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 84d5385e1046..4ad55ca15775 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -8,6 +8,7 @@ #include "buckets.h" #include "compress.h" #include "dirent.h" +#include "disk_accounting.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -603,11 +604,13 @@ int bch2_trigger_inode(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_nr_inodes + }; + + int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1); if (ret) return ret; - - trans->fs_usage_deltas->nr_inodes += nr; } bool old_deleted = bkey_is_deleted_inode(old); diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5c6bfa9e69d5..2d03b4ff31dd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -139,8 +139,6 @@ static void replay_now_at(struct journal *j, u64 seq) static int bch2_journal_replay_accounting_key(struct btree_trans *trans, struct journal_key *k) { - struct journal_keys *keys = &trans->c->journal_keys; - struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -282,6 +280,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| BCH_TRANS_COMMIT_no_journal_res, bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) @@ -312,6 +311,7 @@ int bch2_journal_replay(struct bch_fs *c) commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); @@ -342,6 +342,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? 
BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), @@ -1050,9 +1051,6 @@ int bch2_fs_initialize(struct bch_fs *c) for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc_fake(c, i, 0); - for_each_member_device(c, ca) - bch2_dev_usage_init(ca); - ret = bch2_fs_journal_alloc(c); if (ret) goto err; @@ -1069,6 +1067,15 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; + for_each_member_device(c, ca) { + ret = bch2_dev_usage_init(ca); + bch_err_msg(c, ret, "initializing device usage"); + if (ret) { + bch2_dev_put(ca); + goto err; + } + } + /* * Write out the superblock and journal buckets, now that we can do * btree updates diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 4a9eb9582b6e..4a59f52f8d56 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -5,6 +5,7 @@ #include "backpointers.h" #include "btree_gc.h" #include "btree_node_scan.h" +#include "disk_accounting.h" #include "ec.h" #include "fsck.h" #include "inode.h" diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 773aea9a0080..8c7dee5983d2 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -15,6 +15,7 @@ #define BCH_RECOVERY_PASSES() \ x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ x(initialize_subvolumes, 2, 0) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 57a1f09cca09..9cf1d118f146 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -243,23 +243,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } +bool bch2_replicas_marked_locked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + verify_replicas_entry(search); + + return !search->nr_devs || + (__replicas_has_entry(&c->replicas, search) && + 
(likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search))); +} + bool bch2_replicas_marked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { - bool marked; - - if (!search->nr_devs) - return true; - - verify_replicas_entry(search); - percpu_down_read(&c->mark_lock); - marked = __replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); + bool ret = bch2_replicas_marked_locked(c, search); percpu_up_read(&c->mark_lock); - return marked; + return ret; } static void __replicas_table_update(struct bch_fs_usage *dst, @@ -457,20 +459,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) ? 0 : bch2_mark_replicas_slowpath(c, r); } -/* replicas delta list: */ - -int bch2_replicas_delta_list_mark(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - int ret = 0; - - for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) - ret = bch2_mark_replicas(c, &d->r); - return ret; -} - /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: @@ -1046,8 +1034,6 @@ void bch2_fs_replicas_exit(struct bch_fs *c) kfree(c->usage_base); kfree(c->replicas.entries); kfree(c->replicas_gc.entries); - - mempool_exit(&c->replicas_delta_pool); } int bch2_fs_replicas_init(struct bch_fs *c) @@ -1056,7 +1042,5 @@ int bch2_fs_replicas_init(struct bch_fs *c) &c->replicas_journal_res, reserve_journal_replicas(c, &c->replicas)); - return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, - REPLICAS_DELTA_LIST_MAX) ?: - replicas_table_update(c, &c->replicas); + return replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 654a4b26d3a3..0a24ebcf71bd 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs 
*, void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); + +bool bch2_replicas_marked_locked(struct bch_fs *, + struct bch_replicas_entry_v1 *); bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry_v1 *); -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); - void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index ac90d142c4e8..fed71c861fe7 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -8,20 +8,4 @@ struct bch_replicas_cpu { struct bch_replicas_entry_v1 *entries; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry_v1 r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[]; -}; - #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bbc9e5a926bb..597addbf2bca 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -25,6 +25,7 @@ #include "clock.h" #include "compress.h" #include "debug.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); + bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -1615,7 +1617,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, 
struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL); + BTREE_TRIGGER_norun, NULL) ?: + bch2_dev_usage_remove(c, ca->dev_idx); bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1652,6 +1655,16 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (ret) goto err; + /* + * We need to flush the entire journal to get rid of keys that reference + * the device being removed before removing the superblock entry + */ + bch2_journal_flush_all_pins(&c->journal); + + /* + * this is really just needed for the bch2_replicas_gc_(start|end) + * calls, and could be cleaned up: + */ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) @@ -1694,17 +1707,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_dev_free(ca); - /* - * At this point the device object has been removed in-core, but the - * on-disk journal might still refer to the device index via sb device - * usage entries. Recovery fails if it sees usage information for an - * invalid device. Flush journal pins to push the back of the journal - * past now invalid device index references before we update the - * superblock, but after the device object has been removed so any - * further journal writes elide usage info for the device. 
- */ - bch2_journal_flush_all_pins(&c->journal); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1766,8 +1768,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } - bch2_dev_usage_init(ca); - ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) goto err; @@ -1851,6 +1851,10 @@ have_slot: bch2_dev_usage_journal_reserve(c); + ret = bch2_dev_usage_init(ca); + if (ret) + goto err_late; + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) @@ -2021,15 +2025,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; + + ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets) ?: + bch2_trans_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v))); if (ret) goto err; - - /* - * XXX: this is all wrong transactionally - we'll be able to do - * this correctly after the disk space accounting rewrite - */ - ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; } bch2_recalc_capacity(c); From 2e8d686a4a13c01d9a2b329507a0f5ce6455b5a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jun 2024 18:31:13 -0400 Subject: [PATCH 032/120] bcachefs: Coalesce accounting keys before journal replay This fixes a performance regression in journal replay; without coalescing accounting keys we have multiple keys at the same position, which means journal_keys_peek_upto() has to skip past many overwritten keys - turning journal replay into an O(n^2) algorithm. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.h | 2 ++ fs/bcachefs/disk_accounting.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 5b66c8f85fc1..1653de9d609b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H #define _BCACHEFS_BTREE_JOURNAL_ITER_H +#include "bkey.h" + struct journal_iter { struct list_head list; enum btree_id btree_id; diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 2cc2e0f8cb53..dbdc16f2fc1c 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bcachefs_ioctl.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" @@ -344,7 +345,9 @@ int bch2_accounting_read(struct bch_fs *c) goto err; struct journal_keys *keys = &c->journal_keys; + struct journal_key *dst = keys->data; move_gap(keys, keys->nr); + darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -358,11 +361,26 @@ int bch2_accounting_read(struct bch_fs *c) if (applied) continue; + if (i + 1 < &darray_top(*keys) && + i[1].k->k.type == KEY_TYPE_accounting && + !journal_key_cmp(i, i + 1)) { + BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0); + + i[1].journal_seq = i[0].journal_seq; + + bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), + bkey_s_c_to_accounting(k)); + continue; + } + ret = accounting_read_key(c, k); if (ret) goto err; } + + *dst++ = *i; } + keys->gap = keys->nr = dst - keys->data; percpu_down_read(&c->mark_lock); preempt_disable(); From f5095b9f85a1674a92d00e7ab466499a8ba49ce1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Jan 2024 19:42:37 -0500 Subject: [PATCH 033/120] bcachefs: dev_usage updated by new accounting 
Reading disk accounting now requires an eytzinger lookup (see: bch2_accounting_mem_read()), but the per-device counters are used frequently enough that we'd like to still be able to read them with just a percpu sum, as in the old code. This patch special cases the device counters; when we update in-memory accounting we also update the old style percpu counters if it's a device counter update. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +-- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/buckets.c | 36 +++++------------------------------ fs/bcachefs/buckets_types.h | 2 +- fs/bcachefs/disk_accounting.c | 15 +++++++++++++++ fs/bcachefs/disk_accounting.h | 14 +++++++++++++- fs/bcachefs/recovery.c | 17 ----------------- fs/bcachefs/sb-clean.c | 17 ----------------- 8 files changed, 36 insertions(+), 70 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 03e4f15f34f5..8ffb683c368a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -545,8 +545,7 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage *usage_base; - struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage; struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b5fe6785d3e4..fe7293166e37 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -773,7 +773,7 @@ static int bch2_gc_done(struct bch_fs *c) bch2_fs_usage_acc_to_base(c, i); __for_each_member_device(c, ca) { - struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *dst = this_cpu_ptr(ca->usage); struct bch_dev_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, dev_usage_u64s()); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c3dac9d1c45b..37347159ef4d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -69,15 +69,8 @@ void bch2_fs_usage_initialize(struct bch_fs 
*c) void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { - struct bch_fs *c = ca->fs; - unsigned seq, i, u64s = dev_usage_u64s(); - - do { - seq = read_seqcount_begin(&c->usage_lock); - memcpy(usage, ca->usage_base, u64s * sizeof(u64)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); - } while (read_seqcount_retry(&c->usage_lock, seq)); + memset(usage, 0, sizeof(*usage)); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) @@ -147,16 +140,6 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) (u64 __percpu *) c->usage[idx], u64s); percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) { - u64s = dev_usage_u64s(); - - acc_u64s_percpu((u64 *) ca->usage_base, - (u64 __percpu *) ca->usage[idx], u64s); - percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); - } - rcu_read_unlock(); - write_seqcount_end(&c->usage_lock); preempt_enable(); } @@ -1488,23 +1471,14 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { kvfree(ca->buckets_nouse); kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - - for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) - free_percpu(ca->usage[i]); - kfree(ca->usage_base); + free_percpu(ca->usage); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); - if (!ca->usage_base) + ca->usage = alloc_percpu(struct bch_dev_usage); + if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; - for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { - ca->usage[i] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[i]) - return -BCH_ERR_ENOMEM_usage_init; - } - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6c4ee3163d0f..a15e3f79b9cb 100644 
--- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -36,7 +36,7 @@ struct bucket_gens { }; struct bch_dev_usage { - struct { + struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ /* diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index dbdc16f2fc1c..ccd3f8d878d1 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -400,6 +400,21 @@ int bch2_accounting_read(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + if (ca) { + struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; + percpu_u64_set(&d->buckets, v[0]); + percpu_u64_set(&d->sectors, v[1]); + percpu_u64_set(&d->fragmented, v[2]); + + if (k.dev_data_type.data_type == BCH_DATA_sb || + k.dev_data_type.data_type == BCH_DATA_journal) + usage->hidden += v[0] * ca->mi.bucket_size; + } + rcu_read_unlock(); + break; } } preempt_enable(); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index ec1b8ae2aeee..9f2078c970f3 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -3,6 +3,7 @@ #define _BCACHEFS_DISK_ACCOUNTING_H #include "eytzinger.h" +#include "sb-members.h" static inline void bch2_u64s_neg(u64 *v, unsigned nr) { @@ -131,6 +132,7 @@ static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_ac static inline int bch2_accounting_mem_mod(struct btree_trans *trans, struct bkey_s_c_accounting a) { + struct bch_fs *c = trans->c; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); @@ -141,8 +143,18 @@ static inline int bch2_accounting_mem_mod(struct btree_trans *trans, struct case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); 
break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } + rcu_read_unlock(); + break; } - return __bch2_accounting_mem_mod(trans->c, a); + return __bch2_accounting_mem_mod(c, a); } static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2d03b4ff31dd..fcc8d5bc6c2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -451,23 +451,6 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v)); break; } - case BCH_JSET_ENTRY_dev_usage: { - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - unsigned nr_types = jset_entry_dev_usage_nr_types(u); - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev)); - if (ca) - for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } - rcu_read_unlock(); - - break; - } case BCH_JSET_ENTRY_blacklist: { struct jset_entry_blacklist *bl_entry = container_of(entry, struct jset_entry_blacklist, entry); diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 47f10ab57f40..57fd744b71e7 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -236,23 +236,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, "embedded variable length struct"); } - for_each_member_device(c, ca) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - 
struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(ca->dev_idx); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - percpu_up_read(&c->mark_lock); for (unsigned i = 0; i < 2; i++) { From 72a6bb098c5879ce6aa51f714730971737a0b3ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Jan 2024 21:23:07 -0500 Subject: [PATCH 034/120] bcachefs: Kill bch2_fs_usage_initialize() Deleting code for the old disk accounting scheme. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 29 ----------------------------- fs/bcachefs/buckets.h | 2 -- fs/bcachefs/recovery.c | 2 -- 3 files changed, 33 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 37347159ef4d..2fd7a55f6373 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -38,35 +38,6 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, : c->usage[journal_seq & JOURNAL_BUF_MASK]); } -void bch2_fs_usage_initialize(struct bch_fs *c) -{ - percpu_down_write(&c->mark_lock); - struct bch_fs_usage *usage = c->usage_base; - - for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->b.reserved += usage->persistent_reserved[i]; - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); - } - - for_each_member_device(c, ca) { - struct bch_dev_usage dev = bch2_dev_usage_read(ca); - - usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * - ca->mi.bucket_size; - } - - 
percpu_up_write(&c->mark_lock); -} - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { memset(usage, 0, sizeof(*usage)); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index b61942fc3090..dc99df654ac9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -318,8 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, struct bch_replicas_entry_v1 *, s64); -void bch2_fs_usage_initialize(struct bch_fs *); - int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, struct bkey_s_c, const struct bch_extent_ptr *, s64, enum bch_data_type, u8, u8, u32 *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fcc8d5bc6c2f..abdb26d45068 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -509,8 +509,6 @@ static int journal_replay_early(struct bch_fs *c, } } - bch2_fs_usage_initialize(c); - return 0; } From 6b39638b84b462e29e8008f5b5f6fad93e021544 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 20:29:25 -0500 Subject: [PATCH 035/120] bcachefs: Convert bch2_ioctl_fs_usage() to new accounting This converts bch2_ioctl_fs_usage() to read from the new disk accounting, via bch2_fs_replicas_usage_read(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 68 ++++++++++++------------------------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 77d7186b4ba3..57bb02996f8e 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -5,6 +5,7 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "disk_accounting.h" #include "journal.h" #include "move.h" #include "recovery_passes.h" @@ -516,11 +517,11 @@ static long bch2_ioctl_data(struct bch_fs *c, static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_fs_usage *arg = NULL; - struct bch_replicas_usage *dst_e, *dst_end; - struct bch_fs_usage_online *src; - u32 replica_entries_bytes; + struct bch_ioctl_fs_usage arg; + struct bch_fs_usage_online *src = NULL; + darray_char replicas = {}; unsigned i; + u32 replica_entries_bytes; int ret = 0; if (!test_bit(BCH_FS_started, &c->flags)) @@ -529,9 +530,16 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); - if (!arg) - return -ENOMEM; + ret = bch2_fs_replicas_usage_read(c, &replicas) ?: + (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); + if (ret) + goto err; + + arg.capacity = c->capacity; + arg.used = bch2_fs_sectors_used(c, src); + arg.online_reserved = src->online_reserved; + arg.replica_entries_bytes = replicas.nr; src = bch2_fs_usage_read(c); if (!src) { @@ -539,52 +547,14 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, goto err; } - arg->capacity = c->capacity; - arg->used = bch2_fs_sectors_used(c, src); - arg->online_reserved = src->online_reserved; - for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg->persistent_reserved[i] = src->u.persistent_reserved[i]; - - dst_e = arg->replicas; - dst_end = (void *) arg->replicas + replica_entries_bytes; - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *src_e = - cpu_replicas_entry(&c->replicas, i); - - /* check that we have enough space for one replicas entry */ - if (dst_e + 1 > dst_end) { - ret = -ERANGE; - break; - } - - dst_e->sectors = src->u.replicas[i]; - dst_e->r = *src_e; - - /* recheck after setting nr_devs: */ - if (replicas_usage_next(dst_e) > dst_end) { - ret = -ERANGE; - break; - } - - memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); - - dst_e = replicas_usage_next(dst_e); - } - - arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; - + arg.persistent_reserved[i] = src->u.persistent_reserved[i]; percpu_up_read(&c->mark_lock); - kfree(src); - if (ret) - goto err; - - ret = copy_to_user_errcode(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); err: - kfree(arg); + darray_exit(&replicas); + kfree(src); return ret; } From 3afb8dbf03408099e745f12e4d121f1412575618 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Jan 2024 23:36:23 -0500 Subject: [PATCH 036/120] bcachefs: kill bch2_fs_usage_read() With bch2_ioctl_fs_usage(), this is now dead code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ---- fs/bcachefs/buckets.c | 34 ---------------------------------- fs/bcachefs/buckets.h | 2 -- fs/bcachefs/chardev.c | 25 ++++++++++++------------- fs/bcachefs/disk_accounting.c | 2 +- fs/bcachefs/replicas.c | 7 ------- fs/bcachefs/super.c | 2 -- 7 files changed, 13 insertions(+), 63 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8ffb683c368a..89ffe38b6bfe 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -896,10 +896,6 @@ struct bch_fs { struct bch_fs_usage __percpu *usage_gc; u64 __percpu *online_reserved; - /* single element mempool: */ - struct mutex usage_scratch_lock; - struct bch_fs_usage_online *usage_scratch; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2fd7a55f6373..07cdb4299918 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -64,40 +64,6 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) return ret; } -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) -{ - struct bch_fs_usage_online *ret; - unsigned nr_replicas = READ_ONCE(c->replicas.nr); - unsigned seq, i; -retry: - ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); - if (unlikely(!ret)) - return NULL; - - percpu_down_read(&c->mark_lock); - - if (nr_replicas != c->replicas.nr) { - nr_replicas = c->replicas.nr; - percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; - } - - ret->online_reserved = percpu_u64_get(c->online_reserved); - - do { - seq = read_seqcount_begin(&c->usage_lock); - unsafe_memcpy(&ret->u, c->usage_base, - __fs_usage_u64s(nr_replicas) * sizeof(u64), - "embedded variable length struct"); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], - __fs_usage_u64s(nr_replicas)); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - void 
bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { unsigned u64s = fs_usage_u64s(c); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index dc99df654ac9..df73a47a4123 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -300,8 +300,6 @@ static inline unsigned dev_usage_u64s(void) u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); - void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); void bch2_fs_usage_to_text(struct printbuf *, diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 57bb02996f8e..0e76e06ab844 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -518,9 +518,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { struct bch_ioctl_fs_usage arg; - struct bch_fs_usage_online *src = NULL; darray_char replicas = {}; - unsigned i; u32 replica_entries_bytes; int ret = 0; @@ -536,25 +534,26 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (ret) goto err; + struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); arg.capacity = c->capacity; - arg.used = bch2_fs_sectors_used(c, src); - arg.online_reserved = src->online_reserved; + arg.used = u.used; + arg.online_reserved = percpu_u64_get(c->online_reserved); arg.replica_entries_bytes = replicas.nr; - src = bch2_fs_usage_read(c); - if (!src) { - ret = -ENOMEM; - goto err; - } + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = i, + }; - for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg.persistent_reserved[i] = src->u.persistent_reserved[i]; - percpu_up_read(&c->mark_lock); + bch2_accounting_mem_read(c, + disk_accounting_pos_to_bpos(&k), + &arg.persistent_reserved[i], 1); + } ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); err: darray_exit(&replicas); - kfree(src); return ret; } diff --git 
a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index ccd3f8d878d1..eadf4f6392bf 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -38,7 +38,7 @@ * replay). * * To do a disk accounting update: - * - initialize a disk_accounting_key, to specify which counter is being update + * - initialize a disk_accounting_pos, to specify which counter is being update * - initialize counter deltas, as an array of 1-3 s64s * - call bch2_disk_accounting_mod() * diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 9cf1d118f146..06783f357f8d 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -308,13 +308,10 @@ static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; - struct bch_fs_usage_online *new_scratch = NULL; struct bch_fs_usage __percpu *new_gc = NULL; struct bch_fs_usage *new_base = NULL; unsigned i, bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; - unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + - sizeof(u64) * new_r->nr; int ret = 0; memset(new_usage, 0, sizeof(new_usage)); @@ -325,7 +322,6 @@ static int replicas_table_update(struct bch_fs *c, goto err; if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || - !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || (c->usage_gc && !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; @@ -344,12 +340,10 @@ static int replicas_table_update(struct bch_fs *c, for (i = 0; i < ARRAY_SIZE(new_usage); i++) swap(c->usage[i], new_usage[i]); swap(c->usage_base, new_base); - swap(c->usage_scratch, new_scratch); swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); out: free_percpu(new_gc); - kfree(new_scratch); for (i = 0; i < ARRAY_SIZE(new_usage); i++) free_percpu(new_usage[i]); kfree(new_base); @@ -1028,7 +1022,6 @@ void bch2_fs_replicas_exit(struct bch_fs *c) { unsigned i; - kfree(c->usage_scratch); for (i = 0; i < 
ARRAY_SIZE(c->usage); i++) free_percpu(c->usage[i]); kfree(c->usage_base); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 597addbf2bca..e7a17179f741 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -791,8 +791,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); - mutex_init(&c->usage_scratch_lock); - mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); init_rwsem(&c->snapshot_create_lock); From 5b9bc272e6c37406fcc5dba720a477d00240477a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Dec 2023 22:30:15 -0500 Subject: [PATCH 037/120] bcachefs: Kill writing old accounting to journal More ripping out of the old disk space accounting. Note that the new disk space accounting is incompatible with the old, and writing out old style disk space accounting with the new code is infeasible. This means upgrading and downgrading past this version requires regenerating accounting. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-clean.c | 45 ------------------------------------------ 1 file changed, 45 deletions(-) diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 57fd744b71e7..c57d42bb8d1b 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -183,25 +183,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - percpu_down_read(&c->mark_lock); - - if (!journal_seq) { - for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->b.nr_inodes); - } - { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), @@ -212,32 
+193,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - percpu_up_read(&c->mark_lock); - for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), From 8bb8d683a4013f94953da3991da4bb2c38111063 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Dec 2023 22:09:25 -0500 Subject: [PATCH 038/120] bcachefs: Delete journal-buf-sharded old style accounting More deletion of dead code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/btree_gc.c | 9 ++--- fs/bcachefs/buckets.c | 61 ++++----------------------------- fs/bcachefs/buckets.h | 4 --- fs/bcachefs/disk_accounting.c | 2 +- fs/bcachefs/recovery.c | 20 +---------- fs/bcachefs/replicas.c | 63 +++-------------------------------- fs/bcachefs/replicas.h | 4 --- fs/bcachefs/super.c | 2 ++ 9 files changed, 21 insertions(+), 147 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 89ffe38b6bfe..50937f6aec0a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -891,8 +891,7 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; seqcount_t usage_lock; - struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_fs_usage_base __percpu *usage; struct bch_fs_usage __percpu *usage_gc; u64 __percpu *online_reserved; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index fe7293166e37..c79258e3e69c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -769,10 +769,8 @@ static int bch2_gc_done(struct bch_fs *c) #define copy_fs_field(_err, _f, _msg, ...) 
\ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - __for_each_member_device(c, ca) { + /* XXX */ struct bch_dev_usage *dst = this_cpu_ptr(ca->usage); struct bch_dev_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, @@ -789,8 +787,10 @@ static int bch2_gc_done(struct bch_fs *c) } { +#if 0 unsigned nr = fs_usage_u64s(c); - struct bch_fs_usage *dst = c->usage_base; + /* XX: */ + struct bch_fs_usage *dst = this_cpu_ptr(c->usage); struct bch_fs_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); @@ -823,6 +823,7 @@ static int bch2_gc_done(struct bch_fs *c) copy_fs_field(fs_usage_replicas_wrong, replicas[i], "%s", buf.buf); } +#endif } #undef copy_fs_field diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 07cdb4299918..240fc8185d4f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -26,61 +26,12 @@ #include -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { memset(usage, 0, sizeof(*usage)); acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } -u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -{ - ssize_t offset = v - (u64 *) c->usage_base; - unsigned i, seq; - u64 ret; - - BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); - percpu_rwsem_assert_held(&c->mark_lock); - - do { - seq = read_seqcount_begin(&c->usage_lock); - ret = *v; - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - -void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -{ - unsigned u64s = fs_usage_u64s(c); - - BUG_ON(idx >= ARRAY_SIZE(c->usage)); - - preempt_disable(); - write_seqcount_begin(&c->usage_lock); - - acc_u64s_percpu((u64 *) c->usage_base, - (u64 __percpu *) c->usage[idx], u64s); - percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); - - write_seqcount_end(&c->usage_lock); - preempt_enable(); -} - void bch2_fs_usage_to_text(struct printbuf *out, struct bch_fs *c, struct bch_fs_usage_online *fs_usage) @@ -142,17 +93,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); + percpu_u64_get(&c->usage->hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + - bch2_fs_usage_read_one(c, &c->usage_base->b.btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + + data = percpu_u64_get(&c->usage->data) + + percpu_u64_get(&c->usage->btree); + reserved = percpu_u64_get(&c->usage->reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); + 
ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); return ret; } @@ -673,7 +624,7 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) percpu_down_read(&c->mark_lock); preempt_disable(); - struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index df73a47a4123..711c85e24f6d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -298,10 +298,6 @@ static inline unsigned dev_usage_u64s(void) return sizeof(struct bch_dev_usage) / sizeof(u64); } -u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); - -void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); - void bch2_fs_usage_to_text(struct printbuf *, struct bch_fs *, struct bch_fs_usage_online *); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index eadf4f6392bf..f5b5d896979e 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -384,7 +384,7 @@ int bch2_accounting_read(struct bch_fs *c) percpu_down_read(&c->mark_lock); preempt_disable(); - struct bch_fs_usage_base *usage = &c->usage_base->b; + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); for (unsigned i = 0; i < acc->k.nr; i++) { struct disk_accounting_pos k; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index abdb26d45068..4006b8ec4fe8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -427,28 +427,10 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case BCH_FS_USAGE_reserved: - if (entry->level < BCH_REPLICAS_MAX) - c->usage_base->persistent_reserved[entry->level] = - le64_to_cpu(u->v); - break; - case BCH_FS_USAGE_inodes: - c->usage_base->b.nr_inodes = le64_to_cpu(u->v); - break; case 
BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, - le64_to_cpu(u->v)); + atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; } - - break; - } - case BCH_JSET_ENTRY_data_usage: { - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - ret = bch2_replicas_set_usage(c, &u->r, - le64_to_cpu(u->v)); break; } case BCH_JSET_ENTRY_blacklist: { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 06783f357f8d..05214ad2ad35 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -307,46 +307,23 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; struct bch_fs_usage __percpu *new_gc = NULL; - struct bch_fs_usage *new_base = NULL; - unsigned i, bytes = sizeof(struct bch_fs_usage) + + unsigned bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; int ret = 0; - memset(new_usage, 0, sizeof(new_usage)); - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_KERNEL))) - goto err; - - if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || - (c->usage_gc && + if ((c->usage_gc && !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (c->usage[i]) - __replicas_table_update_pcpu(new_usage[i], new_r, - c->usage[i], &c->replicas); - if (c->usage_base) - __replicas_table_update(new_base, new_r, - c->usage_base, &c->replicas); if (c->usage_gc) __replicas_table_update_pcpu(new_gc, new_r, c->usage_gc, &c->replicas); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - swap(c->usage[i], new_usage[i]); - swap(c->usage_base, new_base); swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); out: free_percpu(new_gc); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - free_percpu(new_usage[i]); - kfree(new_base); return 
ret; err: bch_err(c, "error updating replicas table: memory allocation failure"); @@ -537,6 +514,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) */ int bch2_replicas_gc2(struct bch_fs *c) { + return 0; +#if 0 struct bch_replicas_cpu new = { 0 }; unsigned i, nr; int ret = 0; @@ -591,34 +570,7 @@ retry: mutex_unlock(&c->sb_lock); return ret; -} - -int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry_v1 *r, - u64 sectors) -{ - int ret, idx = bch2_replicas_entry_idx(c, r); - - if (idx < 0) { - struct bch_replicas_cpu n; - - n = cpu_replicas_add_entry(c, &c->replicas, r); - if (!n.entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - ret = replicas_table_update(c, &n); - if (ret) - return ret; - - kfree(n.entries); - - idx = bch2_replicas_entry_idx(c, r); - BUG_ON(ret < 0); - } - - c->usage_base->replicas[idx] = sectors; - - return 0; +#endif } /* Replicas tracking - superblock: */ @@ -1020,11 +972,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_replicas_exit(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - free_percpu(c->usage[i]); - kfree(c->usage_base); kfree(c->replicas.entries); kfree(c->replicas_gc.entries); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 0a24ebcf71bd..eade75ed4839 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -53,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); -int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry_v1 *, - u64); - #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e7a17179f741..01f8c7dd45a5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -573,6 +573,7 @@ static void __bch2_fs_free(struct 
bch_fs *c) darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); + free_percpu(c->usage); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -898,6 +899,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || From fe5eddc0d02176d54609c9e38044c2870750bf1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 20:04:48 -0500 Subject: [PATCH 039/120] bcachefs: Kill bch2_fs_usage_to_text() Dead code. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 39 --------------------------------------- fs/bcachefs/buckets.h | 3 --- 2 files changed, 42 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 240fc8185d4f..a1257e958742 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -32,45 +32,6 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } -void bch2_fs_usage_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_fs_usage_online *fs_usage) -{ - unsigned i; - - prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); - - prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.b.hidden); - prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.b.data); - prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.b.cached); - prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.b.reserved); - prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.b.nr_inodes); - prt_printf(out, "online reserved:\t\t%llu\n", - fs_usage->online_reserved); - - for (i = 0; - i < ARRAY_SIZE(fs_usage->u.persistent_reserved); - i++) { - prt_printf(out, 
"%u replicas:\n", i + 1); - prt_printf(out, "\treserved:\t\t%llu\n", - fs_usage->u.persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - prt_printf(out, "\t"); - bch2_replicas_entry_to_text(out, e); - prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); - } -} - static u64 reserve_factor(u64 r) { return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 711c85e24f6d..e56fde47e453 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -298,9 +298,6 @@ static inline unsigned dev_usage_u64s(void) return sizeof(struct bch_dev_usage) / sizeof(u64); } -void bch2_fs_usage_to_text(struct printbuf *, - struct bch_fs *, struct bch_fs_usage_online *); - u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short From 66a57684c6afce8e20ac38b389caca078407ac29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jan 2024 00:15:16 -0500 Subject: [PATCH 040/120] bcachefs: Kill fs_usage_online More dead code deletion. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 10 ---------- fs/bcachefs/buckets.h | 12 ------------ fs/bcachefs/buckets_types.h | 5 ----- 3 files changed, 27 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a1257e958742..12faf2ffda1c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -37,16 +37,6 @@ static u64 reserve_factor(u64 r) return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -{ - return min(fs_usage->u.b.hidden + - fs_usage->u.b.btree + - fs_usage->u.b.data + - reserve_factor(fs_usage->u.b.reserved + - fs_usage->online_reserved), - c->capacity); -} - static struct bch_fs_usage_short __bch2_fs_usage_read_short(struct bch_fs *c) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e56fde47e453..42ff3e9df587 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -283,23 +283,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); } -static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_online_u64s(struct bch_fs *c) -{ - return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); -} - static inline unsigned dev_usage_u64s(void) { return sizeof(struct bch_dev_usage) / sizeof(u64); } -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); - struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a15e3f79b9cb..7ad15f809348 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -64,11 +64,6 @@ struct bch_fs_usage { u64 replicas[]; }; -struct bch_fs_usage_online { - u64 online_reserved; - struct bch_fs_usage u; -}; - struct bch_fs_usage_short { u64 capacity; u64 used; From 
4c4a7d48bd59380fa4fc75f2cd341e9de09adbf7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jan 2024 00:22:57 -0500 Subject: [PATCH 041/120] bcachefs: Kill replicas_journal_res More dead code deletion Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 -- fs/bcachefs/replicas.c | 34 ---------------------------------- fs/bcachefs/super.c | 21 --------------------- 3 files changed, 57 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 50937f6aec0a..b9f5327ab033 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -748,9 +748,7 @@ struct bch_fs { struct mutex replicas_gc_lock; struct journal_entry_res btree_root_journal_res; - struct journal_entry_res replicas_journal_res; struct journal_entry_res clock_journal_res; - struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 05214ad2ad35..5252d3ee8a2a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -331,32 +331,6 @@ err: goto out; } -static unsigned reserve_journal_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - unsigned journal_res_u64s = 0; - - /* nr_inodes: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* key_version: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* persistent_reserved: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * - BCH_REPLICAS_MAX; - - for_each_cpu_replicas_entry(r, e) - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + - e->nr_devs, sizeof(u64)); - return journal_res_u64s; -} - noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry_v1 *new_entry) @@ -390,10 +364,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); 
if (ret) goto err; - - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &new_r)); } if (!new_r.entries && @@ -978,9 +948,5 @@ void bch2_fs_replicas_exit(struct bch_fs *c) int bch2_fs_replicas_init(struct bch_fs *c) { - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &c->replicas)); - return replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 01f8c7dd45a5..baee01d7856f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -223,22 +223,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) return c; } -static void bch2_dev_usage_journal_reserve(struct bch_fs *c) -{ - unsigned nr = 0, u64s = - ((sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / - sizeof(u64); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) - nr++; - rcu_read_unlock(); - - bch2_journal_entry_res_resize(&c->journal, - &c->dev_usage_journal_res, u64s * nr); -} - /* Filesystem RO/RW: */ /* @@ -946,7 +930,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_dev_usage_journal_reserve(c); bch2_journal_entry_res_resize(&c->journal, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); @@ -1719,8 +1702,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - - bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && @@ -1849,8 +1830,6 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_dev_usage_journal_reserve(c); - ret = bch2_dev_usage_init(ca); if (ret) goto err_late; From fb23d57a6dfc4e521c003dc542799f07d22d269e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Feb 
2024 22:48:05 -0500 Subject: [PATCH 042/120] bcachefs: Convert gc to new accounting Rewrite fsck/gc for the new accounting scheme. This adds a second set of in-memory accounting counters for gc to use; like with other parts of gc we run all trigger in TRIGGER_GC mode, then compare what we calculated to existing in-memory accounting at the end. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 34 +----- fs/bcachefs/alloc_foreground.c | 16 ++- fs/bcachefs/bcachefs.h | 4 +- fs/bcachefs/btree_gc.c | 133 +++------------------ fs/bcachefs/btree_trans_commit.c | 4 +- fs/bcachefs/buckets.c | 185 +++++------------------------ fs/bcachefs/buckets.h | 16 --- fs/bcachefs/buckets_types.h | 7 -- fs/bcachefs/disk_accounting.c | 193 ++++++++++++++++++++++++++----- fs/bcachefs/disk_accounting.h | 86 ++++++++------ fs/bcachefs/ec.c | 97 +++++++--------- fs/bcachefs/inode.c | 43 +++---- fs/bcachefs/recovery.c | 3 +- fs/bcachefs/replicas.c | 86 +------------- fs/bcachefs/replicas.h | 1 - fs/bcachefs/super.c | 9 +- fs/bcachefs/util.h | 4 +- 17 files changed, 344 insertions(+), 577 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3df1099750af..9bb0dbe134d5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -774,7 +774,7 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3); + return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, @@ -894,7 +894,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors)); + -((s64) old_a->cached_sectors), + flags & BTREE_TRIGGER_gc); if (ret) goto err; 
} @@ -973,35 +974,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); } - - if ((flags & BTREE_TRIGGER_gc) && - (flags & BTREE_TRIGGER_bucket_invalidate)) { - struct bch_alloc_v4 new_a_convert; - const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); - - percpu_down_read(&c->mark_lock); - struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) { - percpu_up_read(&c->mark_lock); - goto invalid_bucket; - } - g->gen_valid = 1; - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - percpu_up_read(&c->mark_lock); - - bch2_dev_usage_update(c, ca, old_a, new_a); - } err: printbuf_exit(&buf); bch2_dev_put(ca); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 991e07a79064..cabf866c7956 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1708,15 +1708,13 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "capacity\t%llu\n", c->capacity); prt_printf(out, "reserved\t%llu\n", c->reserved); - percpu_down_read(&c->mark_lock); - prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); - prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); - prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data)); - prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached)); - prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved)); - prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); - prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, 
&c->usage_base->b.nr_inodes)); - percpu_up_read(&c->mark_lock); + prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); + prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); + prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); + prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); + prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); prt_newline(out); prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b9f5327ab033..33605fa8e70f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -546,7 +546,6 @@ struct bch_dev { struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage; - struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ u64 new_fs_bucket_idx; @@ -741,7 +740,7 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_accounting_mem accounting; + struct bch_accounting_mem accounting[2]; struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; @@ -890,7 +889,6 @@ struct bch_fs { seqcount_t usage_lock; struct bch_fs_usage_base __percpu *usage; - struct bch_fs_usage __percpu *usage_gc; u64 __percpu *online_reserved; struct io_clock io_clock[2]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c79258e3e69c..0fe869cff8be 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -20,6 +20,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -735,132 +736,25 @@ static int bch2_mark_superblocks(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { + bch2_accounting_free(&c->accounting[1]); + 
genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; - - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; -} - -static int bch2_gc_done(struct bch_fs *c) -{ - struct bch_dev *ca = NULL; - struct printbuf buf = PRINTBUF; - unsigned i; - int ret = 0; - - percpu_down_write(&c->mark_lock); - -#define copy_field(_err, _f, _msg, ...) \ - if (fsck_err_on(dst->_f != src->_f, c, _err, \ - _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \ - dst->_f, src->_f)) \ - dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) - - __for_each_member_device(c, ca) { - /* XXX */ - struct bch_dev_usage *dst = this_cpu_ptr(ca->usage); - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, - dev_usage_u64s()); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_type_str(i)); - copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_type_str(i)); - copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); - } - } - - { -#if 0 - unsigned nr = fs_usage_u64s(c); - /* XX: */ - struct bch_fs_usage *dst = this_cpu_ptr(c->usage); - struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); - - copy_fs_field(fs_usage_hidden_wrong, - b.hidden, "hidden"); - copy_fs_field(fs_usage_btree_wrong, - b.btree, "btree"); - - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - 
copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, e); - - copy_fs_field(fs_usage_replicas_wrong, - replicas[i], "%s", buf.buf); - } -#endif - } - -#undef copy_fs_field -#undef copy_dev_field -#undef copy_stripe_field -#undef copy_field -fsck_err: - bch2_dev_put(ca); - bch_err_fn(c, ret); - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - return ret; } static int bch2_gc_start(struct bch_fs *c) { - BUG_ON(c->usage_gc); - - c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!c->usage_gc) { - bch_err(c, "error allocating c->usage_gc"); - return -BCH_ERR_ENOMEM_gc_start; - } - for_each_member_device(c, ca) { - BUG_ON(ca->usage_gc); - - ca->usage_gc = alloc_percpu(struct bch_dev_usage); - if (!ca->usage_gc) { - bch_err(c, "error allocating ca->usage_gc"); + int ret = bch2_dev_usage_init(ca, true); + if (ret) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_gc_start; + return ret; } - - this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, - ca->mi.nbuckets - ca->mi.first_bucket); } return 0; @@ -908,6 +802,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } + percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -916,9 +811,11 @@ static int bch2_alloc_write_key(struct btree_trans *trans, alloc_data_type_set(&gc, gc.data_type); if (gc.data_type != old_gc.data_type || - gc.dirty_sectors != old_gc.dirty_sectors) - bch2_dev_usage_update(c, ca, &old_gc, &gc); - percpu_up_read(&c->mark_lock); + gc.dirty_sectors != old_gc.dirty_sectors) { + ret 
= bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); + if (ret) + return ret; + } gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); @@ -1235,7 +1132,9 @@ int bch2_check_allocations(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_start)); ret = bch2_mark_superblocks(c); - BUG_ON(ret); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto out; ret = bch2_gc_btrees(c); if (ret) @@ -1246,7 +1145,7 @@ int bch2_check_allocations(struct bch_fs *c) bch2_journal_block(&c->journal); out: ret = bch2_gc_alloc_done(c) ?: - bch2_gc_done(c) ?: + bch2_accounting_gc_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 92305c12cb75..30e24725eb12 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -724,7 +724,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod(trans, accounting_i_to_s_c(a)); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); if (ret) goto revert_fs_usage; } @@ -812,7 +812,7 @@ revert_fs_usage: struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod(trans, a.c); + bch2_accounting_mem_mod_locked(trans, a.c, false); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 12faf2ffda1c..e3bf7ed5c073 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -84,96 +84,6 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) } } -void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new) -{ - struct bch_fs_usage *fs_usage; - struct 
bch_dev_usage *u; - - preempt_disable(); - fs_usage = this_cpu_ptr(c->usage_gc); - - if (data_type_is_hidden(old->data_type)) - fs_usage->b.hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new->data_type)) - fs_usage->b.hidden += ca->mi.bucket_size; - - u = this_cpu_ptr(ca->usage_gc); - - u->d[old->data_type].buckets--; - u->d[new->data_type].buckets++; - - u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); - u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - - u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); - u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); - - u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - u->d[BCH_DATA_cached].sectors += new->cached_sectors; - - unsigned old_unstriped = bch2_bucket_sectors_unstriped(*old); - u->d[BCH_DATA_unstriped].buckets -= old_unstriped != 0; - u->d[BCH_DATA_unstriped].sectors -= old_unstriped; - - unsigned new_unstriped = bch2_bucket_sectors_unstriped(*new); - u->d[BCH_DATA_unstriped].buckets += new_unstriped != 0; - u->d[BCH_DATA_unstriped].sectors += new_unstriped; - - preempt_enable(); -} - -int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry_v1 *r, s64 sectors) -{ - struct bch_fs_usage *fs_usage; - int idx, ret = 0; - struct printbuf buf = PRINTBUF; - - percpu_down_read(&c->mark_lock); - - idx = bch2_replicas_entry_idx(c, r); - if (idx < 0 && - fsck_err(c, ptr_to_missing_replicas_entry, - "no replicas entry\n while marking %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, r); - percpu_down_read(&c->mark_lock); - - if (ret) - goto err; - idx = bch2_replicas_entry_idx(c, r); - } - if (idx < 0) { - ret = -1; - goto err; - } - - preempt_disable(); - fs_usage = this_cpu_ptr(c->usage_gc); - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - preempt_enable(); -err: 
-fsck_err: - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; -} - -static inline int update_cached_sectors(struct bch_fs *c, - struct bkey_s_c k, - unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - return bch2_update_replicas(c, k, &r.e, sectors); -} - static int bch2_check_fix_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, @@ -574,8 +484,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) bool warn = false; percpu_down_read(&c->mark_lock); - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; @@ -603,13 +511,9 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) this_cpu_sub(*c->online_reserved, added); } - dst->hidden += src->hidden; - dst->btree += src->btree; - dst->data += src->data; - dst->cached += src->cached; - dst->reserved += src->reserved; - dst->nr_inodes += src->nr_inodes; - + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); percpu_up_read(&c->mark_lock); @@ -691,13 +595,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); - if (!ret) { - alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new); - } + alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); + + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); @@ -742,7 +646,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, }; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = data_type; - ret = 
bch2_disk_accounting_mod(trans, &acc, §ors, 1); + ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -751,8 +655,6 @@ err: if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_gc)); - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", @@ -775,11 +677,16 @@ err: m->block_sectors[p.ec.block] += sectors; - struct bch_replicas_padded r = m->r; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); mutex_unlock(&c->ec_stripes_heap_lock); - r.e.data_type = data_type; - bch2_update_replicas(c, k, &r.e, sectors); + acc.replicas.data_type = data_type; + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); + if (ret) + return ret; } return 0; @@ -791,7 +698,6 @@ static int __trigger_extent(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; - struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -818,11 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, if (p.ptr.cached) { if (!stale) { - ret = !gc - ? bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors) - : update_cached_sectors(c, k, p.ptr.dev, disk_sectors); - bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors", - bch2_err_str(ret)); + ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); if (ret) return ret; } @@ -844,16 +746,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc.replicas.nr_devs) { - ret = !gc - ? 
bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1) - : bch2_update_replicas(c, k, &acc.replicas, replicas_sectors); - if (unlikely(ret && gc)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - } + ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); if (ret) return ret; } @@ -906,36 +799,18 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 sectors = k.k->size; - if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; + if (flags & BTREE_TRIGGER_overwrite) + sectors = -sectors; - if (flags & BTREE_TRIGGER_transactional) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = replicas, + .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, }; - return bch2_disk_accounting_mod(trans, &acc, §ors, 1); - } - - if (flags & BTREE_TRIGGER_gc) { - sectors *= replicas; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); - - replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->b.reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); } return 0; @@ -989,10 +864,13 @@ err: return ret; } -static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, +static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, u64 b, enum bch_data_type data_type, 
unsigned sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + int ret = 0; + percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", @@ -1019,9 +897,10 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bch2_dev_usage_update(c, ca, &old, &new); + bucket_unlock(g); percpu_up_read(&c->mark_lock); - return 0; + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + return ret; err: bucket_unlock(g); err_unlock: @@ -1045,7 +924,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return 0; if (flags & BTREE_TRIGGER_gc) - return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags); + return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); else if (flags & BTREE_TRIGGER_transactional) return commit_do(trans, NULL, NULL, 0, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 42ff3e9df587..fc6359f84e82 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -273,16 +273,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ -static inline unsigned __fs_usage_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_u64s(struct bch_fs *c) -{ - return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); -} - static inline unsigned dev_usage_u64s(void) { return sizeof(struct bch_dev_usage) / sizeof(u64); @@ -291,12 +281,6 @@ static inline unsigned dev_usage_u64s(void) struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); -void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct 
bch_alloc_v4 *); -int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, - struct bch_replicas_entry_v1 *, s64); - int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, struct bkey_s_c, const struct bch_extent_ptr *, s64, enum bch_data_type, u8, u8, u32 *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7ad15f809348..c9698cdf866f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -57,13 +57,6 @@ struct bch_fs_usage_base { u64 nr_inodes; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ - struct bch_fs_usage_base b; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - u64 replicas[]; -}; - struct bch_fs_usage_short { u64 capacity; u64 used; diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f5b5d896979e..e8dfd67eab8a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -66,9 +66,20 @@ static const char * const disk_accounting_type_strs[] = { NULL }; +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + struct bkey_i_accounting *acc = bkey_accounting_init(k); + + acc->k.p = disk_accounting_pos_to_bpos(pos); + set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + + memcpy_u64s_small(acc->v.d, d, nr); +} + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, - s64 *d, unsigned nr) + s64 *d, unsigned nr, bool gc) { /* Normalize: */ switch (k->type) { @@ -79,21 +90,18 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); - struct { - __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); - } k_i; - struct bkey_i_accounting *acc = bkey_accounting_init(&k_i.k); + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - acc->k.p = disk_accounting_pos_to_bpos(k); - set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + 
accounting_key_init(&k_i.k, k, d, nr); - memcpy_u64s_small(acc->v.d, d, nr); - - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &acc->k_i); + return likely(!gc) + ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) + : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, - unsigned dev, s64 sectors) + unsigned dev, s64 sectors, + bool gc) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_replicas, @@ -101,7 +109,7 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, bch2_replicas_entry_cached(&acc.replicas, dev); - return bch2_disk_accounting_mod(trans, &acc, §ors, 1); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); } int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -199,7 +207,7 @@ int bch2_accounting_update_sb(struct btree_trans *trans) return 0; } -static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a) +static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { struct bch_replicas_padded r; @@ -207,7 +215,7 @@ static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_ !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[gc]; unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), @@ -243,11 +251,11 @@ static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_ return 0; } -int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a) +int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); - int ret = 
__bch2_accounting_mem_mod_slowpath(c, a); + int ret = __bch2_accounting_mem_mod_slowpath(c, a, gc); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; @@ -263,7 +271,7 @@ int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accountin */ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[0]; int ret = 0; darray_init(usage); @@ -296,6 +304,129 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) return ret; } +/* Ensures all counters in @src exist in @dst: */ +static int copy_counters(struct bch_accounting_mem *dst, + struct bch_accounting_mem *src) +{ + unsigned orig_dst_k_nr = dst->k.nr; + unsigned dst_counters = dst->nr_counters; + + darray_for_each(src->k, i) + if (eytzinger0_find(dst->k.data, orig_dst_k_nr, sizeof(dst->k.data[0]), + accounting_pos_cmp, &i->pos) >= orig_dst_k_nr) { + if (darray_push(&dst->k, ((struct accounting_pos_offset) { + .pos = i->pos, + .offset = dst_counters, + .nr_counters = i->nr_counters }))) + goto err; + + dst_counters += i->nr_counters; + } + + if (dst->k.nr == orig_dst_k_nr) + return 0; + + u64 __percpu *new_counters = __alloc_percpu_gfp(dst_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!new_counters) + goto err; + + preempt_disable(); + memcpy(this_cpu_ptr(new_counters), + bch2_acc_percpu_u64s(dst->v, dst->nr_counters), + dst->nr_counters * sizeof(u64)); + preempt_enable(); + + free_percpu(dst->v); + dst->v = new_counters; + dst->nr_counters = dst_counters; + + eytzinger0_sort(dst->k.data, dst->k.nr, sizeof(dst->k.data[0]), accounting_pos_cmp, NULL); + + return 0; +err: + dst->k.nr = orig_dst_k_nr; + return -BCH_ERR_ENOMEM_disk_accounting; +} + +int bch2_accounting_gc_done(struct bch_fs *c) +{ + struct bch_accounting_mem *dst = &c->accounting[0]; + struct bch_accounting_mem *src = &c->accounting[1]; + struct btree_trans *trans = 
bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + int ret = 0; + + percpu_down_write(&c->mark_lock); + + ret = copy_counters(dst, src) ?: + copy_counters(src, dst); + if (ret) + goto err; + + BUG_ON(dst->k.nr != src->k.nr); + + for (unsigned i = 0; i < src->k.nr; i++) { + BUG_ON(src->k.data[i].nr_counters != dst->k.data[i].nr_counters); + BUG_ON(!bpos_eq(dst->k.data[i].pos, src->k.data[i].pos)); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, src->k.data[i].pos); + + unsigned nr = src->k.data[i].nr_counters; + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; + u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; + + bch2_accounting_mem_read_counters(c, i, dst_v, nr, false); + bch2_accounting_mem_read_counters(c, i, src_v, nr, true); + + if (memcmp(dst_v, src_v, nr * sizeof(u64))) { + printbuf_reset(&buf); + prt_str(&buf, "accounting mismatch for "); + bch2_accounting_key_to_text(&buf, &acc_k); + + prt_str(&buf, ": got"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", dst_v[j]); + + prt_str(&buf, " should be"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", src_v[j]); + + for (unsigned j = 0; j < nr; j++) + src_v[j] -= dst_v[j]; + + if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + if (ret) + goto err; + + if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, &acc_k, src_v, nr); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + struct bch_fs_usage_base *src = &trans->fs_usage_delta; + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); + preempt_enable(); + } + } + } + } +err: +fsck_err: + percpu_up_write(&c->mark_lock); + 
printbuf_exit(&buf); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + static int accounting_read_key(struct bch_fs *c, struct bkey_s_c k) { struct printbuf buf = PRINTBUF; @@ -304,7 +435,7 @@ static int accounting_read_key(struct bch_fs *c, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k)); + int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && @@ -331,7 +462,7 @@ fsck_err: */ int bch2_accounting_read(struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[0]; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, @@ -391,7 +522,7 @@ int bch2_accounting_read(struct bch_fs *c) bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v)); + bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: @@ -441,8 +572,9 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) bch2_btree_write_buffer_flush_sync(trans)); } -int bch2_dev_usage_init(struct bch_dev *ca) +int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { + struct bch_fs *c = ca->fs; struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, @@ -450,14 +582,21 @@ int bch2_dev_usage_init(struct bch_dev *ca) }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - return bch2_trans_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v))); + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc)); + bch_err_fn(c, ret); + return ret; +} + +void bch2_accounting_free(struct bch_accounting_mem *acc) +{ + darray_exit(&acc->k); + 
free_percpu(acc->v); + acc->v = NULL; + acc->nr_counters = 0; } void bch2_fs_accounting_exit(struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting; - - darray_exit(&acc->k); - free_percpu(acc->v); + bch2_accounting_free(&c->accounting[0]); } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 9f2078c970f3..76445ffd9172 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -78,11 +78,9 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos return ret; } -int bch2_disk_accounting_mod(struct btree_trans *, - struct disk_accounting_pos *, - s64 *, unsigned); -int bch2_mod_dev_cached_sectors(struct btree_trans *trans, - unsigned dev, s64 sectors); +int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, + s64 *, unsigned, bool); +int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags, struct printbuf *); @@ -106,15 +104,15 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) return bpos_cmp(*l, *r); } -int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting); +int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting, bool); -static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a) +static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[gc]; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p); if (unlikely(idx >= acc->k.nr)) - return bch2_accounting_mem_mod_slowpath(c, a); + return bch2_accounting_mem_mod_slowpath(c, a, gc); unsigned offset = acc->k.data[idx].offset; @@ -129,41 +127,51 @@ static inline int __bch2_accounting_mem_mod(struct 
bch_fs *c, struct bkey_s_c_ac * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod(struct btree_trans *trans, struct - bkey_s_c_accounting a) +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { struct bch_fs *c = trans->c; - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + if (!gc) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } + rcu_read_unlock(); + break; } - 
rcu_read_unlock(); - break; } - return __bch2_accounting_mem_mod(c, a); + + return __bch2_accounting_mem_mod(c, a, gc); } -static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, - unsigned idx, - u64 *v, unsigned nr) +static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +{ + percpu_down_read(&trans->c->mark_lock); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + percpu_up_read(&trans->c->mark_lock); + return ret; +} + +static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned idx, + u64 *v, unsigned nr, bool gc) { memset(v, 0, sizeof(*v) * nr); - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[0]; if (unlikely(idx >= acc->k.nr)) return; @@ -177,19 +185,23 @@ static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { - struct bch_accounting_mem *acc = &c->accounting; + struct bch_accounting_mem *acc = &c->accounting[0]; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); - bch2_accounting_mem_read_counters(c, idx, v, nr); + bch2_accounting_mem_read_counters(c, idx, v, nr, false); } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); +int bch2_accounting_gc_done(struct bch_fs *); + int bch2_accounting_read(struct bch_fs *); int bch2_dev_usage_remove(struct bch_fs *, unsigned); -int bch2_dev_usage_init(struct bch_dev *); +int bch2_dev_usage_init(struct bch_dev *, bool); + +void bch2_accounting_free(struct bch_accounting_mem *); void bch2_fs_accounting_exit(struct bch_fs *); #endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 819d61b9ab83..3c3a2a7e8389 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -301,13 +301,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, bucket_lock(g); 
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - if (!ret) { - alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new); - } + alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); @@ -369,7 +368,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_check_repair)) return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); - if (flags & BTREE_TRIGGER_transactional) { + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -380,9 +384,34 @@ int bch2_trigger_stripe(struct btree_trans *trans, new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); + struct gc_stripe *gc = NULL; + if (flags & BTREE_TRIGGER_gc) { + gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + if (!gc) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + * + * Also: when we bring back runtime gc, locking + */ + gc->alive = true; + gc->sectors = le16_to_cpu(new_s->sectors); + gc->nr_blocks = new_s->nr_blocks; + gc->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + gc->ptrs[i] = new_s->ptrs[i]; + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); + } if (new_s) { s64 sectors = 
(u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; @@ -391,9 +420,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_replicas, }; bch2_bkey_to_replicas(&acc.replicas, new); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); if (ret) return ret; + + if (gc) + memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); } if (old_s) { @@ -403,7 +435,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_replicas, }; bch2_bkey_to_replicas(&acc.replicas, old); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); if (ret) return ret; } @@ -452,51 +484,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_gc) { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - int ret = mark_stripe_buckets(trans, old, new, flags); - if (ret) - return ret; - - ret = bch2_update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - return 0; } diff --git 
a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4ad55ca15775..d556d7f28661 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -600,41 +600,26 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { - s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k); - - if (flags & BTREE_TRIGGER_transactional) { - if (nr) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_nr_inodes - }; - - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1); - if (ret) - return ret; - } - - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(new.s_c); - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, new_deleted); - if (ret) - return ret; - } - } - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; + int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } - percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->b.nr_inodes, nr); - percpu_up_read(&c->mark_lock); + int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - + (int) bkey_is_deleted_inode(old); + if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, deleted_delta > 0); + if (ret) + return ret; } return 0; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4006b8ec4fe8..514bff68d971 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ 
-1031,8 +1031,7 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; for_each_member_device(c, ca) { - ret = bch2_dev_usage_init(ca); - bch_err_msg(c, ret, "initializing device usage"); + ret = bch2_dev_usage_init(ca, false); if (ret) { bch2_dev_put(ca); goto err; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 5252d3ee8a2a..ac68ef5e453f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -264,73 +264,6 @@ bool bch2_replicas_marked(struct bch_fs *c, return ret; } -static void __replicas_table_update(struct bch_fs_usage *dst, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage *src, - struct bch_replicas_cpu *src_r) -{ - int src_idx, dst_idx; - - *dst = *src; - - for (src_idx = 0; src_idx < src_r->nr; src_idx++) { - if (!src->replicas[src_idx]) - continue; - - dst_idx = __replicas_entry_idx(dst_r, - cpu_replicas_entry(src_r, src_idx)); - BUG_ON(dst_idx < 0); - - dst->replicas[dst_idx] = src->replicas[src_idx]; - } -} - -static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage __percpu *src_p, - struct bch_replicas_cpu *src_r) -{ - unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; - struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); - - preempt_disable(); - dst = this_cpu_ptr(dst_p); - preempt_enable(); - - __replicas_table_update(dst, dst_r, src, src_r); -} - -/* - * Resize filesystem accounting: - */ -static int replicas_table_update(struct bch_fs *c, - struct bch_replicas_cpu *new_r) -{ - struct bch_fs_usage __percpu *new_gc = NULL; - unsigned bytes = sizeof(struct bch_fs_usage) + - sizeof(u64) * new_r->nr; - int ret = 0; - - if ((c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) - goto err; - - if (c->usage_gc) - __replicas_table_update_pcpu(new_gc, new_r, - c->usage_gc, &c->replicas); - - swap(c->usage_gc, new_gc); - swap(c->replicas, *new_r); 
-out: - free_percpu(new_gc); - return ret; -err: - bch_err(c, "error updating replicas table: memory allocation failure"); - ret = -BCH_ERR_ENOMEM_replicas_table; - goto out; -} - noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry_v1 *new_entry) @@ -378,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, /* don't update in memory replicas until changes are persistent */ percpu_down_write(&c->mark_lock); if (new_r.entries) - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); if (new_gc.entries) swap(new_gc, c->replicas_gc); percpu_up_write(&c->mark_lock); @@ -413,8 +346,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) percpu_down_write(&c->mark_lock); ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: - replicas_table_update(c, &c->replicas_gc); + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (!ret) + swap(c->replicas, c->replicas_gc); kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -628,8 +562,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); - - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); percpu_up_write(&c->mark_lock); kfree(new_r.entries); @@ -931,10 +864,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - unsigned ret; - mutex_lock(&c->sb_lock); - ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); mutex_unlock(&c->sb_lock); return ret; @@ -945,8 +876,3 @@ void bch2_fs_replicas_exit(struct bch_fs *c) kfree(c->replicas.entries); kfree(c->replicas_gc.entries); } - -int bch2_fs_replicas_init(struct bch_fs *c) -{ - return replicas_table_update(c, &c->replicas); -} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index eade75ed4839..622482559c3d 100644 --- 
a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -79,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; void bch2_fs_replicas_exit(struct bch_fs *); -int bch2_fs_replicas_init(struct bch_fs *); #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index baee01d7856f..af947f19e388 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -899,7 +899,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: - bch2_fs_replicas_init(c) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: @@ -1830,7 +1829,7 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); - ret = bch2_dev_usage_init(ca); + ret = bch2_dev_usage_init(ca, false); if (ret) goto err_late; @@ -2011,9 +2010,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets) ?: - bch2_trans_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v))); + ret = bch2_trans_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5b0533ec4c7e..cd09edd12d8a 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -719,9 +719,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) { - unsigned i; - - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) acc[i] += src[i]; } From 00839addfc771e5653bb9562c5a87cd78eea0eee Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Mon, 12 Feb 2024 15:21:10 -0500 Subject: [PATCH 043/120] bcachefs: Convert bch2_replicas_gc2() to new accounting bch2_replicas_gc2() is used for garbage collection superblock replicas entries that are empty - this converts it to the new accounting scheme. Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index ac68ef5e453f..06f6d48f74c0 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "buckets.h" +#include "disk_accounting.h" #include "journal.h" #include "replicas.h" #include "super-io.h" @@ -418,8 +419,6 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) */ int bch2_replicas_gc2(struct bch_fs *c) { - return 0; -#if 0 struct bch_replicas_cpu new = { 0 }; unsigned i, nr; int ret = 0; @@ -449,20 +448,26 @@ retry: struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_journal || - c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]) || - percpu_u64_get(&c->usage[2]->replicas[i]) || - percpu_u64_get(&c->usage[3]->replicas[i])) + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + memcpy(&k.replicas, e, replicas_entry_bytes(e)); + + u64 v = 0; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), &v, 1); + + if (e->data_type == BCH_DATA_journal || v) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } bch2_cpu_replicas_sort(&new); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: - replicas_table_update(c, &new); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + + if (!ret) + swap(c->replicas, new); kfree(new.entries); @@ -474,7 +479,6 @@ retry: mutex_unlock(&c->sb_lock); return ret; -#endif } /* Replicas tracking - superblock: */ From 
5668e5deec253dc4674aea00997716cc3a66aaac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Feb 2024 00:13:22 -0500 Subject: [PATCH 044/120] bcachefs: bch2_verify_accounting_clean() Verify that the in-memory accounting verifies the on-disk accounting after a clean shutdown. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 87 +++++++++++++++++++++++++++++++++++ fs/bcachefs/disk_accounting.h | 4 +- fs/bcachefs/super.c | 1 + 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e8dfd67eab8a..1d45d8280618 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -588,6 +588,93 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc) return ret; } +void bch2_verify_accounting_clean(struct bch_fs *c) +{ + bool mismatch = false; + struct bch_fs_usage_base base = {}, base_inmem = {}; + + bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); + unsigned nr = bch2_accounting_counters(k.k); + + bch2_accounting_mem_read(c, k.k->p, v, nr); + + if (memcmp(a.v->d, v, nr * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " !="); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, 
acc_k.dev_data_type.dev); + if (!ca) { + rcu_read_unlock(); + continue; + } + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); + rcu_read_unlock(); + + if (memcmp(a.v->d, v, 3 * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " in mem"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("dev accounting mismatch: %s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + } + } + + 0; + }))); + + acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); + +#define check(x) \ + if (base.x != base_inmem.x) { \ + pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ + mismatch = true; \ + } + + //check(hidden); + check(btree); + check(data); + check(cached); + check(reserved); + check(nr_inodes); + + WARN_ON(mismatch); +} + void bch2_accounting_free(struct bch_accounting_mem *acc) { darray_exit(&acc->k); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 76445ffd9172..5132b3dd1745 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -171,7 +171,7 @@ static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned { memset(v, 0, sizeof(*v) * nr); - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting[gc]; if (unlikely(idx >= acc->k.nr)) return; @@ -201,6 +201,8 @@ int bch2_accounting_read(struct bch_fs *); int bch2_dev_usage_remove(struct bch_fs *, unsigned); int bch2_dev_usage_init(struct bch_dev *, bool); +void bch2_verify_accounting_clean(struct bch_fs *c); + void bch2_accounting_free(struct bch_accounting_mem *); void bch2_fs_accounting_exit(struct bch_fs *); diff --git a/fs/bcachefs/super.c 
b/fs/bcachefs/super.c index af947f19e388..b9f851103297 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -361,6 +361,7 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); + bch2_verify_accounting_clean(c); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); From bfcaa9079d91cb843630403e7379bbb86d2f73b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 21:42:36 -0500 Subject: [PATCH 045/120] bcachefs: bch_acct_compression This adds per-compression-type accounting of compressed and uncompressed size as well as number of extents - meaning we can now see compression ratio (without walking the whole filesystem). Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 57 +++++++++++++++++++++++----- fs/bcachefs/disk_accounting.c | 4 ++ fs/bcachefs/disk_accounting_format.h | 8 +++- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e3bf7ed5c073..06e968d98018 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -707,13 +707,18 @@ static int __trigger_extent(struct btree_trans *trans, s64 replicas_sectors = 0; int ret = 0; - struct disk_accounting_pos acc = { + struct disk_accounting_pos acc_replicas_key = { .type = BCH_DISK_ACCOUNTING_replicas, .replicas.data_type = data_type, .replicas.nr_devs = 0, .replicas.nr_required = 1, }; + struct disk_accounting_pos acct_compression_key = { + .type = BCH_DISK_ACCOUNTING_compression, + }; + u64 compression_acct[3] = { 1, 0, 0 }; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); @@ -722,15 +727,16 @@ static int __trigger_extent(struct btree_trans *trans, bool stale = ret > 0; + if (p.ptr.cached && stale) + continue; + if (p.ptr.cached) { - if (!stale) { - ret = 
bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); - if (ret) - return ret; - } + ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); + if (ret) + return ret; } else if (!p.has_ec) { replicas_sectors += disk_sectors; - acc.replicas.devs[acc.replicas.nr_devs++] = p.ptr.dev; + acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -741,12 +747,43 @@ static int __trigger_extent(struct btree_trans *trans, * if so they're not required for mounting if we have an * erasure coded pointer in this extent: */ - acc.replicas.nr_required = 0; + acc_replicas_key.replicas.nr_required = 0; + } + + if (acct_compression_key.compression.type && + acct_compression_key.compression.type != p.crc.compression_type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); + + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); + if (ret) + return ret; + + compression_acct[0] = 1; + compression_acct[1] = 0; + compression_acct[2] = 0; + } + + acct_compression_key.compression.type = p.crc.compression_type; + if (p.crc.compression_type) { + compression_acct[1] += p.crc.uncompressed_size; + compression_acct[2] += p.crc.compressed_size; } } - if (acc.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); + if (acc_replicas_key.replicas.nr_devs) { + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acct_compression_key.compression.type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); + + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); if (ret) return ret; } diff --git a/fs/bcachefs/disk_accounting.c 
b/fs/bcachefs/disk_accounting.c index 1d45d8280618..3b76b25b5ca2 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -6,6 +6,7 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "compress.h" #include "disk_accounting.h" #include "error.h" #include "journal_io.h" @@ -142,6 +143,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); bch2_prt_data_type(out, k->dev_data_type.data_type); break; + case BCH_DISK_ACCOUNTING_compression: + bch2_prt_compression_type(out, k->compression.type); + break; } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index af5f5789fe5d..2ded8ae65001 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -99,7 +99,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(nr_inodes, 0) \ x(persistent_reserved, 1) \ x(replicas, 2) \ - x(dev_data_type, 3) + x(dev_data_type, 3) \ + x(compression, 4) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -124,6 +125,10 @@ struct bch_dev_stripe_buckets { __u8 dev; }; +struct bch_acct_compression { + __u8 type; +}; + struct disk_accounting_pos { union { struct { @@ -134,6 +139,7 @@ struct disk_accounting_pos { struct bch_replicas_entry_v1 replicas; struct bch_dev_data_type dev_data_type; struct bch_dev_stripe_buckets dev_stripe_buckets; + struct bch_acct_compression compression; }; }; struct bpos _pad; From 91f44781d59a00a3478240e0b907c70d972e8897 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 21:09:51 -0500 Subject: [PATCH 046/120] bcachefs: Convert bch2_compression_stats_to_text() to new accounting We no longer have to walk the whole btree to calculate compression stats. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 86 ++++++++++----------------------------------- 1 file changed, 19 insertions(+), 67 deletions(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 91f1516ada8f..f363863337ad 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -22,6 +22,7 @@ #include "buckets.h" #include "clock.h" #include "compress.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -255,91 +256,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans *trans; - enum btree_id id; - struct compression_type_stats { - u64 nr_extents; - u64 sectors_compressed; - u64 sectors_uncompressed; - } s[BCH_COMPRESSION_TYPE_NR]; - u64 compressed_incompressible = 0; - int ret = 0; - - memset(s, 0, sizeof(s)); - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - trans = bch2_trans_get(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) - continue; - - ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *entry; - bool compressed = false, incompressible = false; - - bkey_for_each_crc(k.k, ptrs, crc, entry) { - incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; - compressed |= crc_is_compressed(crc); - - if (crc_is_compressed(crc)) { - s[crc.compression_type].nr_extents++; - s[crc.compression_type].sectors_compressed += crc.compressed_size; - s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; - } - } - - compressed_incompressible += compressed && incompressible; - - if (!compressed) { - unsigned t = incompressible ? 
BCH_COMPRESSION_TYPE_incompressible : 0; - - s[t].nr_extents++; - s[t].sectors_compressed += k.k->size; - s[t].sectors_uncompressed += k.k->size; - } - 0; - })); - } - - bch2_trans_put(trans); - - if (ret) - return ret; - + prt_str(out, "type"); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 24); prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { + for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { + struct disk_accounting_pos a = { + .type = BCH_DISK_ACCOUNTING_compression, + .compression.type = i, + }; + struct bpos p = disk_accounting_pos_to_bpos(&a); + u64 v[3]; + bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); + + u64 nr_extents = v[0]; + u64 sectors_uncompressed = v[1]; + u64 sectors_compressed = v[2]; + bch2_prt_compression_type(out, i); prt_tab(out); - prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_human_readable_u64(out, sectors_compressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_human_readable_u64(out, sectors_uncompressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].nr_extents - ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + prt_human_readable_u64(out, nr_extents + ? div_u64(sectors_uncompressed << 9, nr_extents) : 0); prt_tab_rjust(out); prt_newline(out); } - if (compressed_incompressible) { - prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); - prt_newline(out); - } - return 0; } From f93bb76ba288ccc1e014037413b007b174e03f3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 19:58:07 -0500 Subject: [PATCH 047/120] bcachefs: bch2_fs_accounting_to_text() Helper to show raw accounting in sysfs, mainly for debugging. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 26 ++++++++++++++++++++++++++ fs/bcachefs/disk_accounting.h | 1 + fs/bcachefs/sysfs.c | 5 +++++ 3 files changed, 32 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 3b76b25b5ca2..8c6e76e50a9b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -308,6 +308,32 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) return ret; } +void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting[0]; + + percpu_down_read(&c->mark_lock); + out->atomic++; + + eytzinger0_for_each(i, acc->k.nr) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); + + bch2_accounting_key_to_text(out, &acc_k); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + + prt_str(out, ":"); + for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) + prt_printf(out, " %llu", v[j]); + prt_newline(out); + } + + --out->atomic; + percpu_up_read(&c->mark_lock); +} + /* Ensures all counters in @src exist in @dst: */ static int copy_counters(struct bch_accounting_mem *dst, struct bch_accounting_mem *src) diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5132b3dd1745..5164995f3139 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -193,6 +193,7 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); +void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_accounting_gc_done(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f363863337ad..88800f111d9e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -203,6 +203,7 @@ read_attribute(disk_groups); read_attribute(has_data); 
read_attribute(alloc_debug); +read_attribute(accounting); #define x(t, n, ...) read_attribute(t); BCH_PERSISTENT_COUNTERS() @@ -388,6 +389,9 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); + if (attr == &sysfs_accounting) + bch2_fs_accounting_to_text(out, c); + return 0; } @@ -601,6 +605,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, + &sysfs_accounting, NULL }; From 72c277878031a988ca472aa41370488c726d33b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Feb 2024 17:23:41 -0500 Subject: [PATCH 048/120] bcachefs: bch2_fs_usage_base_to_text() Helper to show raw accounting in sysfs, mainly for debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 88800f111d9e..1c0d1fb20276 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -204,6 +204,7 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); read_attribute(accounting); +read_attribute(usage_base); #define x(t, n, ...) 
read_attribute(t); BCH_PERSISTENT_COUNTERS() @@ -303,6 +304,20 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } +static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_usage_base b = {}; + + acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); + + prt_printf(out, "hidden:\t\t%llu\n", b.hidden); + prt_printf(out, "btree:\t\t%llu\n", b.btree); + prt_printf(out, "data:\t\t%llu\n", b.data); + prt_printf(out, "cached:\t%llu\n", b.cached); + prt_printf(out, "reserved:\t\t%llu\n", b.reserved); + prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -392,6 +407,9 @@ SHOW(bch2_fs) if (attr == &sysfs_accounting) bch2_fs_accounting_to_text(out, c); + if (attr == &sysfs_usage_base) + bch2_fs_usage_base_to_text(out, c); + return 0; } @@ -606,6 +624,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, &sysfs_accounting, + &sysfs_usage_base, NULL }; From 6675c37662341cf1a2e3b502a4dafbf7571978b2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 02:17:02 -0500 Subject: [PATCH 049/120] bcachefs: bch_acct_snapshot Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 10 ++++++++++ fs/bcachefs/disk_accounting.c | 3 +++ fs/bcachefs/disk_accounting_format.h | 8 +++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 06e968d98018..caa79e84f2cc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -778,6 +778,16 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } + if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { + struct disk_accounting_pos acc_snapshot_key = { + .type = BCH_DISK_ACCOUNTING_snapshot, + .snapshot.id = k.k->p.snapshot, + }; + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, 
&replicas_sectors, 1, gc); + if (ret) + return ret; + } + if (acct_compression_key.compression.type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 8c6e76e50a9b..f980cd5af41d 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -146,6 +146,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po case BCH_DISK_ACCOUNTING_compression: bch2_prt_compression_type(out, k->compression.type); break; + case BCH_DISK_ACCOUNTING_snapshot: + prt_printf(out, "id=%u", k->snapshot.id); + break; } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 2ded8ae65001..355b8545df88 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -100,7 +100,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(persistent_reserved, 1) \ x(replicas, 2) \ x(dev_data_type, 3) \ - x(compression, 4) + x(compression, 4) \ + x(snapshot, 5) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -129,6 +130,10 @@ struct bch_acct_compression { __u8 type; }; +struct bch_acct_snapshot { + __u32 id; +}; + struct disk_accounting_pos { union { struct { @@ -140,6 +145,7 @@ struct disk_accounting_pos { struct bch_dev_data_type dev_data_type; struct bch_dev_stripe_buckets dev_stripe_buckets; struct bch_acct_compression compression; + struct bch_acct_snapshot snapshot; }; }; struct bpos _pad; From 6af91147b6b90552cdf1dd400eeb9da66920f3ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 22:37:21 -0500 Subject: [PATCH 050/120] bcachefs: bch_acct_btree Add counters for how much disk space we're using per btree. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 10 ++++++++++ fs/bcachefs/disk_accounting.c | 4 ++++ fs/bcachefs/disk_accounting_format.h | 8 +++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index caa79e84f2cc..e4a9bd29e93c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -798,6 +798,16 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } + if (level) { + struct disk_accounting_pos acc_btree_key = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = btree_id, + }; + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + return 0; } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f980cd5af41d..ed3b368ea1af 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bcachefs_ioctl.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -149,6 +150,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po case BCH_DISK_ACCOUNTING_snapshot: prt_printf(out, "id=%u", k->snapshot.id); break; + case BCH_DISK_ACCOUNTING_btree: + prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + break; } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 355b8545df88..057d82c7e0b1 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -101,7 +101,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(replicas, 2) \ x(dev_data_type, 3) \ x(compression, 4) \ - x(snapshot, 5) + x(snapshot, 5) \ + x(btree, 6) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -134,6 +135,10 @@ struct bch_acct_snapshot { __u32 id; }; +struct bch_acct_btree { + __u32 id; +}; + struct disk_accounting_pos { union { 
struct { @@ -146,6 +151,7 @@ struct disk_accounting_pos { struct bch_dev_stripe_buckets dev_stripe_buckets; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; + struct bch_acct_btree btree; }; }; struct bpos _pad; From 20ac515a9cc73d48be1462d2a04cda75215a1867 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Mar 2024 00:04:52 -0400 Subject: [PATCH 051/120] bcachefs: bch_acct_rebalance_work Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 9 +++++++++ fs/bcachefs/disk_accounting_format.h | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e4a9bd29e93c..f89e2e9a6dd2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -808,6 +808,15 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } + if (bch2_bkey_rebalance_opts(k)) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + return 0; } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 057d82c7e0b1..cba417060b33 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -102,7 +102,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(dev_data_type, 3) \ x(compression, 4) \ x(snapshot, 5) \ - x(btree, 6) + x(btree, 6) \ + x(rebalance_work, 7) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, From b9efa9673e1d3fee530b582dbde1827d336513a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Dec 2023 11:33:21 -0500 Subject: [PATCH 052/120] bcachefs: Eytzinger accumulation for accounting keys The btree write buffer takes as input keys from the journal, sorts them, deduplicates them, and flushes them back to the btree in sorted order. 
The disk space accounting rewrite is moving accounting to normal btree keys, with update (in this case deltas) accumulated in the write buffer and then flushed to the btree; but this is going to increase the number of keys handled by the write buffer by perhaps as much as a factor of 3x-5x. The overhead from copying around and sorting this many keys would cause a significant performance regression, but: there is huge locality in updates to accounting keys that we can take advantage of. Instead of appending accounting keys to the list of keys to be sorted, this patch adds an eytzinger search tree of recently seen accounting keys. We look up the accounting key in the eytzinger search tree and apply the delta directly, adding it if it doesn't exist, and periodically prune the eytzinger tree of unused entries. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 54 +++++++++++++++++++++++++- fs/bcachefs/btree_write_buffer.h | 49 +++++++++++++++++++++-- fs/bcachefs/btree_write_buffer_types.h | 2 + fs/bcachefs/journal_io.c | 10 ++++- 4 files changed, 107 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index e9e36d8aded9..b9fe736820b4 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -610,6 +610,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } +static void wb_accounting_sort(struct btree_write_buffer *wb) +{ + eytzinger0_sort(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, NULL); +} + +int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, + struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key new = { .btree = btree }; + + bkey_copy(&new.k, &k->k_i); + + int ret = darray_push(&wb->accounting, new); + if (ret) + return ret; + + 
wb_accounting_sort(wb); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) @@ -679,11 +702,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); + + darray_for_each(wb->accounting, i) + memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); } -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) { struct btree_write_buffer *wb = &c->btree_write_buffer; + unsigned live_accounting_keys = 0; + int ret = 0; + + darray_for_each(wb->accounting, i) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { + i->journal_seq = dst->seq; + live_accounting_keys++; + ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); + if (ret) + break; + } + + if (live_accounting_keys * 2 < wb->accounting.nr) { + struct btree_write_buffered_key *dst = wb->accounting.data; + + darray_for_each(wb->accounting, src) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) + *dst++ = *src; + wb->accounting.nr = dst - wb->accounting.data; + wb_accounting_sort(wb); + } if (!dst->wb->keys.nr) bch2_journal_pin_drop(&c->journal, &dst->wb->pin); @@ -696,6 +743,8 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); mutex_unlock(&wb->inc.lock); + + return ret; } static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) @@ -719,7 +768,7 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu buf->need_flush_to_write_buffer = false; spin_unlock(&c->journal.lock); out: - bch2_journal_keys_to_write_buffer_end(c, &dst); + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; 
} @@ -751,6 +800,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && !bch2_journal_error(&c->journal)); + darray_exit(&wb->accounting); darray_exit(&wb->sorted); darray_exit(&wb->flushing.keys); darray_exit(&wb->inc.keys); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index dd5e64218b50..725e79654216 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_WRITE_BUFFER_H #include "bkey.h" +#include "disk_accounting.h" static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) { @@ -32,16 +33,45 @@ struct journal_keys_to_wb { u64 seq; }; +static inline int wb_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); +} + +int bch2_accounting_key_to_wb_slowpath(struct bch_fs *, + enum btree_id, struct bkey_i_accounting *); + +static inline int bch2_accounting_key_to_wb(struct bch_fs *c, + enum btree_id btree, struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key search; + search.btree = btree; + search.k.k.p = k->k.p; + + unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, &search); + + if (idx >= wb->accounting.nr) + return bch2_accounting_key_to_wb_slowpath(c, btree, k); + + struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); + bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *, struct journal_keys_to_wb *, enum btree_id, struct bkey_i *); -static inline int bch2_journal_key_to_wb(struct bch_fs *c, +static inline int __bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum 
btree_id btree, struct bkey_i *k) { - EBUG_ON(!dst->seq); - if (unlikely(!dst->room)) return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); @@ -54,8 +84,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, return 0; } +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + return k->k.type == KEY_TYPE_accounting + ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) + : __bch2_journal_key_to_wb(c, dst, btree, k); +} + void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 9b9433de9c36..e9e76e20f43b 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -52,6 +52,8 @@ struct btree_write_buffer { struct btree_write_buffer_keys inc; struct btree_write_buffer_keys flushing; struct work_struct flush_work; + + DARRAY(struct btree_write_buffered_key) accounting; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3fa63cacfd94..bf3433cc78be 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1862,8 +1862,14 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) } } - if (wb.wb) - bch2_journal_keys_to_write_buffer_end(c, &wb); + if (wb.wb) { + ret = bch2_journal_keys_to_write_buffer_end(c, &wb); + if (ret) { + bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", + bch2_err_str(ret)); + return ret; + } + } 
spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; From 25ee25e637a6f734ed549b79b1294b0bb96d112e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 13:38:06 -0400 Subject: [PATCH 053/120] bcachefs: Kill bch2_mount() Fold into bch2_fs_get_tree() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 6a92c0d434d9..7bcf79406103 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1888,25 +1888,24 @@ static int bch2_test_super(struct super_block *s, void *data) return true; } -static struct dentry *bch2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - struct bch2_opts_parse opts_parse) +static int bch2_fs_get_tree(struct fs_context *fc) { struct bch_fs *c; struct super_block *sb; struct inode *vinode; - struct bch_opts opts = opts_parse.opts; + struct bch2_opts_parse *opts_parse = fc->fs_private; + struct bch_opts opts = opts_parse->opts; int ret; - opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - if (!dev_name || strlen(dev_name) == 0) - return ERR_PTR(-EINVAL); + if (!fc->source || strlen(fc->source) == 0) + return -EINVAL; darray_str devs; - ret = bch2_split_devs(dev_name, &devs); + ret = bch2_split_devs(fc->source, &devs); if (ret) - return ERR_PTR(ret); + return ret; darray_fs devs_to_fs = {}; darray_for_each(devs, i) { @@ -1917,7 +1916,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, } } - sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); + sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; @@ -1928,7 +1927,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, } /* Some options can't be parsed until after the fs is started: */ - ret = 
bch2_parse_mount_opts(c, &opts, NULL, opts_parse.parse_later.buf); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); if (ret) { bch2_fs_stop(c); sb = ERR_PTR(ret); @@ -1937,7 +1936,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, bch2_opts_apply(&c->opts, opts); - sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); if (IS_ERR(sb)) bch2_fs_stop(c); got_sb: @@ -1952,7 +1951,7 @@ got_sb: c = sb->s_fs_info; if (sb->s_root) { - if ((flags ^ sb->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } @@ -2018,7 +2017,8 @@ got_sb: sb->s_flags |= SB_ACTIVE; out: - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; err_put_super: __bch2_fs_stop(c); @@ -2034,7 +2034,7 @@ err: */ if (bch2_err_matches(ret, EROFS) && ret != -EROFS) ret = -EIO; - return ERR_PTR(bch2_err_class(ret)); + return bch2_err_class(ret); } static void bch2_kill_sb(struct super_block *sb) @@ -2079,22 +2079,6 @@ static int bch2_fs_parse_param(struct fs_context *fc, return bch2_err_class(ret); } -static int bch2_fs_get_tree(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = fc->fs_private; - const char *dev_name = fc->source; - struct dentry *root; - - root = bch2_mount(fc->fs_type, fc->sb_flags, dev_name, *opts); - - if (IS_ERR(root)) - return PTR_ERR(root); - - fc->root = root; - - return 0; -} - static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; From 5645c32ccf3197ee900dd0c0fbf66a40ad39f79f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 13:55:49 -0400 Subject: [PATCH 054/120] bcachefs: bch2_fs_get_tree() cleanup - improve error paths - call bch2_fs_start() separately, after applying late-parsed options Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 59 ++++++++++++++++++++++++------------------------ 1 file changed, 29 
insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7bcf79406103..1e0e5a842243 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1895,25 +1895,24 @@ static int bch2_fs_get_tree(struct fs_context *fc) struct inode *vinode; struct bch2_opts_parse *opts_parse = fc->fs_private; struct bch_opts opts = opts_parse->opts; + darray_str devs; + darray_fs devs_to_fs = {}; int ret; opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + opt_set(opts, nostart, true); if (!fc->source || strlen(fc->source) == 0) return -EINVAL; - darray_str devs; ret = bch2_split_devs(fc->source, &devs); if (ret) return ret; - darray_fs devs_to_fs = {}; darray_for_each(devs, i) { ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) { - sb = ERR_PTR(ret); - goto got_sb; - } + if (ret) + goto err; } sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); @@ -1921,33 +1920,27 @@ static int bch2_fs_get_tree(struct fs_context *fc) goto got_sb; c = bch2_fs_open(devs.data, devs.nr, opts); - if (IS_ERR(c)) { - sb = ERR_CAST(c); - goto got_sb; - } + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; /* Some options can't be parsed until after the fs is started: */ + opts = bch2_opts_empty(); ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); - if (ret) { - bch2_fs_stop(c); - sb = ERR_PTR(ret); - goto got_sb; - } + if (ret) + goto err_stop_fs; bch2_opts_apply(&c->opts, opts); + ret = bch2_fs_start(c); + if (ret) + goto err_stop_fs; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); - if (IS_ERR(sb)) - bch2_fs_stop(c); + ret = PTR_ERR_OR_ZERO(sb); + if (ret) + goto err_stop_fs; got_sb: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - goto err; - } - c = sb->s_fs_info; if (sb->s_root) { @@ -2018,12 +2011,9 @@ got_sb: sb->s_flags |= SB_ACTIVE; out: fc->root = dget(sb->s_root); - return 0; - -err_put_super: - 
__bch2_fs_stop(c); - deactivate_locked_super(sb); err: + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); if (ret) pr_err("error: %s", bch2_err_str(ret)); /* @@ -2035,6 +2025,15 @@ err: if (bch2_err_matches(ret, EROFS) && ret != -EROFS) ret = -EIO; return bch2_err_class(ret); + +err_stop_fs: + bch2_fs_stop(c); + goto err; + +err_put_super: + __bch2_fs_stop(c); + deactivate_locked_super(sb); + goto err; } static void bch2_kill_sb(struct super_block *sb) From 86d46471d52bf33e8652b220741d3223338d1724 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 17:54:46 -0400 Subject: [PATCH 055/120] bcachefs: Don't block journal when finishing check_allocations() Blocking the journal was needed to finish checking old style accounting, but that code is gone and it's not needed in the alloc rewrite, mark_lock is sufficient for synchronization. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0fe869cff8be..cd5880c94edd 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1142,15 +1142,11 @@ int bch2_check_allocations(struct bch_fs *c) c->gc_count++; - bch2_journal_block(&c->journal); -out: ret = bch2_gc_alloc_done(c) ?: bch2_accounting_gc_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); - - bch2_journal_unblock(&c->journal); - +out: percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); From 9ab55df5991eb967f6b94f8e1b855ffacdc50668 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 18:53:48 -0400 Subject: [PATCH 056/120] bcachefs: Walk leaf to root in btree_gc Next change will move gc_alloc_start initialization into the alloc trigger, so we have to mark those first. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 37 +++++++++++++++++-------------------- fs/bcachefs/btree_gc.h | 12 +++++++----- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index cd5880c94edd..22771a861b29 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -635,13 +635,27 @@ fsck_err: static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1; + unsigned target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1; int ret = 0; /* We need to make sure every leaf node is readable before going RW */ if (initial) target_depth = 0; + for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { + struct btree *prev = NULL; + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch); + + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); + if (ret) + goto err; + } + /* root */ do { retry_root: @@ -663,28 +677,11 @@ retry_root: gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); struct bkey_s_c k = bkey_i_to_s_c(&b->key); ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); - level = b->c.level; err_root: bch2_trans_iter_exit(trans, &iter); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (ret) - return ret; - - for (; level >= target_depth; --level) { - struct btree *prev = NULL; - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, 
k, initial); - })); - if (ret) - break; - } - +err: + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 876d81e2017d..1bdf841dc44b 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -58,6 +58,8 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) static inline int gc_btree_order(enum btree_id btree) { + if (btree == BTREE_ID_alloc) + return -2; if (btree == BTREE_ID_stripes) return -1; return btree; @@ -65,11 +67,11 @@ static inline int gc_btree_order(enum btree_id btree) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - return cmp_int(l.phase, r.phase) ?: - cmp_int(gc_btree_order(l.btree), - gc_btree_order(r.btree)) ?: - -cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) From 38ad9dc8c6194413a468f1eb288836b6c2783c6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 18:54:39 -0400 Subject: [PATCH 057/120] bcachefs: Initialize gc buckets in alloc trigger Needed for online fsck; we need the trigger to initialize newly allocated buckets and generation number changes while gc is running. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 22 +++++++++++++++------- fs/bcachefs/btree_gc.c | 29 +++++------------------------ 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9bb0dbe134d5..831fd0e2a5bf 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -831,10 +831,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; if (flags & BTREE_TRIGGER_transactional) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { @@ -906,7 +905,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; @@ -935,11 +933,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) goto err; - } } if (new_a->gen != old_a->gen) { @@ -974,6 +970,18 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); } + + if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { + rcu_read_lock(); + struct bucket *g = gc_bucket(ca, new.k->p.offset); + if (unlikely(!g)) { + rcu_read_unlock(); + goto invalid_bucket; + } + g->gen_valid = 1; + g->gen = new_a->gen; + rcu_read_unlock(); + } err: 
printbuf_exit(&buf); bch2_dev_put(ca); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 22771a861b29..f651a70b2a83 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -624,7 +624,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_gc|flags); + BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); out: fsck_err: printbuf_exit(&buf); @@ -891,14 +891,16 @@ static int bch2_gc_alloc_done(struct bch_fs *c) static int bch2_gc_alloc_start(struct bch_fs *c) { + int ret = 0; + for_each_member_device(c, ca) { struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { bch2_dev_put(ca); - bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + break; } buckets->first_bucket = ca->mi.first_bucket; @@ -908,27 +910,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) rcu_assign_pointer(ca->buckets_gc, buckets); } - struct bch_dev *ca = NULL; - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (bucket_valid(ca, k.k->p.offset)) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - struct bucket *g = gc_bucket(ca, k.k->p.offset); - g->gen_valid = 1; - g->gen = a->gen; - } - 0; - }))); - bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } From 174722de552a8f0e3cffbe93d48420275141cc77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Apr 2024 23:58:01 -0400 Subject: [PATCH 058/120] bcachefs: Delete old assertion for online fsck the order in which btree_gc walks keys have changed, so we no longer have the 
sort of issues with online fsck this assertion was warning about. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 30e24725eb12..585c41a13f4d 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -603,20 +603,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - + trans_for_each_update(trans, i) if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } - } return 0; } From 38e3ca275cb00034d9bd5006573f6e32fc5e2a45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 19:37:29 -0400 Subject: [PATCH 059/120] bcachefs: btree_types bitmask cleanups Make things more consistent and ensure that we're using u64 bitfields - key types and btree ids are already around 32 bits. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_trans_commit.c | 50 ++++++++++++++------------------ fs/bcachefs/btree_types.h | 14 +++++++-- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f651a70b2a83..84f283cfe8a4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -635,7 +635,7 @@ fsck_err: static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 
0 : 1; + unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; int ret = 0; /* We need to make sure every leaf node is readable before going RW */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 585c41a13f4d..843558d96887 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -457,34 +457,36 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct btree_insert_entry *i, unsigned flags) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - verify_update_old_key(trans, i); if (unlikely(flags & BTREE_TRIGGER_norun)) return 0; - if (old_ops->trigger == new_ops->trigger) { - ret = bch2_key_trigger(trans, i->btree_id, i->level, + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + if (old_ops->trigger == new_ops->trigger) + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); - } else { - ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + else + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, + bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags); - } - - return ret; } static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, bool overwrite) { + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_norun) || + !btree_node_type_has_trans_triggers(i->bkey_type)) + return 0; + /* * Transactional triggers create new btree_insert_entries, so we can't * pass them a pointer to a btree_insert_entry, 
that memory is going to @@ -496,12 +498,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); unsigned flags = i->flags|BTREE_TRIGGER_transactional; - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_norun) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - if (!i->insert_trigger_run && !i->overwrite_trigger_run && old_ops->trigger == new_ops->trigger) { @@ -524,10 +520,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, unsigned btree_id_start) { - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 1; overwrite >= 0; --overwrite) { + for (int overwrite = 1; overwrite >= 0; --overwrite) { + bool trans_trigger_run; /* * Running triggers will append more updates to the list of updates as @@ -542,7 +536,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); + int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -595,7 +589,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + btree_node_type_has_trans_triggers(i->bkey_type) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif return 0; @@ -604,7 +598,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { trans_for_each_update(trans, i) - if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + if 
(btree_node_type_has_triggers(i->bkey_type) && gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) @@ -728,7 +722,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + if (btree_node_type_has_atomic_triggers(i->bkey_type)) { ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b0b5c46aec62..8d06ea56919c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -754,9 +754,19 @@ const char *bch2_btree_node_type_str(enum btree_node_type); (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) -static inline bool btree_node_type_needs_gc(enum btree_node_type type) +static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) { - return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; +} + +static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) +{ + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; +} + +static inline bool btree_node_type_has_triggers(enum btree_node_type type) +{ + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } static inline bool btree_node_type_is_extents(enum btree_node_type type) From a850bde6498b46d6e3143d8847f9aa9598491f9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 21:10:32 -0500 Subject: [PATCH 060/120] bcachefs: fsck_err() may now take a btree_trans fsck_err() now optionally takes a btree_trans; if the current thread has one, it is required that it be passed. The next patch will use this to unlock when waiting for user input. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 40 +++++++------- fs/bcachefs/backpointers.c | 12 ++--- fs/bcachefs/btree_gc.c | 54 ++++++++++--------- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 14 +++++ fs/bcachefs/btree_iter.h | 2 + fs/bcachefs/btree_update_interior.c | 10 ++-- fs/bcachefs/buckets.c | 40 +++++++------- fs/bcachefs/disk_accounting.c | 18 ++++--- fs/bcachefs/error.c | 9 +++- fs/bcachefs/error.h | 19 ++++--- fs/bcachefs/fsck.c | 81 +++++++++++++++-------------- fs/bcachefs/inode.c | 13 +++-- fs/bcachefs/lru.c | 8 +-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/snapshot.c | 24 +++++---- fs/bcachefs/subvolume.c | 14 ++--- 17 files changed, 200 insertions(+), 162 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 831fd0e2a5bf..23e4aa9baa3a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1114,7 +1114,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); if (fsck_err_on(!ca, - c, alloc_key_to_missing_dev_bucket, + trans, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) ret = bch2_btree_delete_at(trans, alloc_iter, 0); @@ -1134,7 +1134,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(k.k->type != discard_key_type, - c, need_discard_key_wrong, + trans, need_discard_key_wrong, "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], @@ -1164,7 +1164,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(k.k->type != freespace_key_type, - c, freespace_key_wrong, + trans, freespace_key_wrong, "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], @@ -1195,7 +1195,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(a->gen != 
alloc_gen(k, gens_offset), - c, bucket_gens_key_wrong, + trans, bucket_gens_key_wrong, "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, @@ -1236,7 +1236,6 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct bpos *end, struct btree_iter *freespace_iter) { - struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; @@ -1254,7 +1253,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); if (fsck_err_on(k.k->type != KEY_TYPE_set, - c, freespace_hole_missing, + trans, freespace_hole_missing, "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, @@ -1290,7 +1289,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct bpos *end, struct btree_iter *bucket_gens_iter) { - struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; unsigned i, gens_offset, gens_end_offset; @@ -1314,7 +1312,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, bkey_reassemble(&g.k_i, k); for (i = gens_offset; i < gens_end_offset; i++) { - if (fsck_err_on(g.v.gens[i], c, + if (fsck_err_on(g.v.gens[i], trans, bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, @@ -1372,8 +1370,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, - need_discard_freespace_key_to_invalid_dev_bucket, + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), + trans, need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; @@ -1382,8 +1380,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran if 
(fsck_err_on(a->data_type != state || (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), c, - need_discard_freespace_key_bad, + genbits != alloc_freespace_genbits(*a)), + trans, need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), @@ -1430,7 +1428,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); if (!ca) { - if (fsck_err(c, bucket_gens_to_invalid_dev, + if (fsck_err(trans, bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); @@ -1438,8 +1436,8 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, c, - bucket_gens_to_invalid_buckets, + start >= ca->mi.nbuckets, + trans, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1447,16 +1445,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & 
KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1636,8 +1634,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, + if (fsck_err_on(!a->io_time[READ], + trans, alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 018b19f7c346..3cc02479a982 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -395,7 +395,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { - if (fsck_err(c, backpointer_to_missing_device, + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, bp_iter, 0); @@ -407,8 +407,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, - backpointer_to_missing_alloc, + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, + trans, backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -505,7 +505,7 @@ found: struct nonce nonce = extent_nonce(extent.k->version, p.crc); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), - c, dup_backpointer_to_bad_csum_extent, + trans, dup_backpointer_to_bad_csum_extent, "%s", buf.buf)) ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; fsck_err: @@ -647,7 +647,7 @@ missing: prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); - if (fsck_err(c, 
ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; @@ -908,7 +908,7 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) goto out; - if (fsck_err(c, backpointer_to_missing_ptr, + if (fsck_err(trans, backpointer_to_missing_ptr, "backpointer for missing %s\n %s", bp.v->level ? "btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 84f283cfe8a4..88f7c7d64a1d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -175,10 +175,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, +static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, struct btree *prev, struct btree *cur, struct bpos *pulled_from_scan) { + struct bch_fs *c = trans->c; struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); @@ -216,29 +217,29 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, *pulled_from_scan = cur->data->min_key; ret = DID_FILL_FROM_SCAN; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } } else { /* overlap */ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? 
*/ - if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node%s", buf.buf)) ret = DROP_PREV_NODE; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); } } else { if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ - if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node%s", buf.buf)) ret = DROP_THIS_NODE; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } @@ -250,9 +251,10 @@ fsck_err: return ret; } -static int btree_repair_node_end(struct bch_fs *c, struct btree *b, +static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, struct btree *child, struct bpos *pulled_from_scan) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; @@ -266,7 +268,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b, prt_str(&buf, "\n child: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) { if (b->c.level == 1 && bpos_lt(*pulled_from_scan, b->key.k.p)) { @@ -325,8 +327,8 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, - btree_node_unreadable, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + trans, 
btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_id_str(b->c.btree_id), @@ -363,7 +365,7 @@ again: continue; } - ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -404,7 +406,7 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -462,8 +464,8 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - if (mustfix_fsck_err_on(!have_child, c, - btree_node_topology_interior_node_empty, + if (mustfix_fsck_err_on(!have_child, + trans, btree_node_topology_interior_node_empty, "empty interior btree node at btree %s level %u\n" " %s", bch2_btree_id_str(b->c.btree_id), @@ -510,7 +512,7 @@ reconstruct_root: r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { - mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { @@ -585,8 +587,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, k.k->version.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->version.lo > atomic64_read(&c->key_version), c, - bkey_version_in_future, + k.k->version.lo > atomic64_read(&c->key_version), + trans, bkey_version_in_future, "key version number higher than recorded %llu\n %s", atomic64_read(&c->key_version), (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -594,7 +596,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } if (mustfix_fsck_err_on(level && 
!bch2_dev_btree_bitmap_marked(c, k), - c, btree_bitmap_not_marked, + trans, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -710,7 +712,7 @@ static int bch2_gc_btrees(struct bch_fs *c) ret = bch2_gc_btree(trans, btree, true); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - c, btree_node_read_error, + trans, btree_node_read_error, "btree node read error for %s", bch2_btree_id_str(btree))) ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); @@ -816,8 +818,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); - if (fsck_err_on(new.data_type != gc.data_type, c, - alloc_key_data_type_wrong, + if (fsck_err_on(new.data_type != gc.data_type, + trans, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", iter->pos.inode, iter->pos.offset, @@ -827,7 +829,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, \ + trans, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ @@ -939,8 +942,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, return -EINVAL; } - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - reflink_v_refcount_wrong, + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" " %s\n" " should be %u", @@ -1038,7 +1041,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, if (bad) bch2_bkey_val_to_text(&buf, c, k); - if (fsck_err_on(bad, c, stripe_sector_count_wrong, + if (fsck_err_on(bad, + trans, stripe_sector_count_wrong, "%s", buf.buf)) { struct bkey_i_stripe 
*new; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 95a141c12e1d..e092f541c449 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -585,7 +585,7 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret = !silent - ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d6e63aa01940..755909f7b55d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3275,6 +3275,20 @@ void bch2_trans_put(struct btree_trans *trans) } } +bool bch2_current_has_btree_trans(struct bch_fs *c) +{ + seqmutex_lock(&c->btree_trans_lock); + struct btree_trans *trans; + bool ret = false; + list_for_each_entry(trans, &c->btree_trans_list, list) + if (trans->locking_wait.task == current) { + ret = true; + break; + } + seqmutex_unlock(&c->btree_trans_lock); + return ret; +} + static void __maybe_unused bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 699c1b8ef112..bdb3cd2ef98a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -883,6 +883,8 @@ void bch2_dump_trans_paths_updates(struct btree_trans *); struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); +bool bch2_current_has_btree_trans(struct bch_fs *); + extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6d340f36aacf..31ee50184be2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -61,7 +61,7 @@ int bch2_btree_node_check_topology(struct 
btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - need_fsck_err(c, btree_root_bad_min_key, + need_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); goto topology_repair; } @@ -69,7 +69,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->max_key, SPOS_MAX)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - need_fsck_err(c, btree_root_bad_max_key, + need_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); goto topology_repair; } @@ -105,7 +105,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n next "); bch2_bkey_val_to_text(&buf, c, k); - need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); goto topology_repair; } @@ -122,7 +122,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_btree_id_str(b->c.btree_id), b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { bch2_topology_error(c); @@ -135,7 +135,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); goto topology_repair; } out: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f89e2e9a6dd2..42fd77fe1fe8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -96,7 +96,7 @@ static int 
bch2_check_fix_ptr(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { - if (fsck_err(c, ptr_to_invalid_device, + if (fsck_err(trans, ptr_to_invalid_device, "pointer to missing device %u\n" "while marking %s", p.ptr.dev, @@ -108,7 +108,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); if (!g) { - if (fsck_err(c, ptr_to_invalid_device, + if (fsck_err(trans, ptr_to_invalid_device, "pointer to invalid bucket on device %u\n" "while marking %s", p.ptr.dev, @@ -121,7 +121,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, + trans, ptr_to_missing_alloc_key, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -138,7 +138,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, } if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, + trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -161,7 +161,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, + trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, @@ -172,7 +172,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, *do_update = true; if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, + trans, stale_dirty_ptr, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -186,7 +186,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, goto out; if 
(fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - c, ptr_bucket_data_type_mismatch, + trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, @@ -210,7 +210,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, - c, ptr_to_missing_stripe, + trans, ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -219,7 +219,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, *do_update = true; if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - c, ptr_to_incorrect_stripe, + trans, ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -387,8 +387,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -401,8 +401,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_too_stale, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -421,8 +421,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (b_gen != ptr->gen) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_stale_dirty_ptr, + bch2_fsck_err(trans, 
FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -437,8 +437,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -452,8 +452,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_sector_count_overflow, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -908,7 +908,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_metadata_type_mismatch, + bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index ed3b368ea1af..bc45f53efc27 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -433,7 +433,7 @@ int bch2_accounting_gc_done(struct bch_fs *c) for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; - if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { + if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { ret = commit_do(trans, NULL, NULL, 0, 
bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); if (ret) @@ -464,8 +464,9 @@ fsck_err: return ret; } -static int accounting_read_key(struct bch_fs *c, struct bkey_s_c k) +static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; if (k.k->type != KEY_TYPE_accounting) @@ -483,7 +484,7 @@ static int accounting_read_key(struct bch_fs *c, struct bkey_s_c k) bpos_to_disk_accounting_pos(&acc, k.k->p); if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas, - c, accounting_replicas_not_marked, + trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n %s", (bch2_accounting_key_to_text(&buf, &acc), buf.buf))) @@ -500,15 +501,15 @@ fsck_err: int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting[0]; + struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - accounting_read_key(c, k); - }))); + accounting_read_key(trans, k); + })); if (ret) goto err; @@ -541,7 +542,7 @@ int bch2_accounting_read(struct bch_fs *c) continue; } - ret = accounting_read_key(c, k); + ret = accounting_read_key(trans, k); if (ret) goto err; } @@ -588,6 +589,7 @@ int bch2_accounting_read(struct bch_fs *c) preempt_enable(); percpu_up_read(&c->mark_lock); err: + bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 46cd9dcb48fc..3a9d0a03fecf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_iter.h" #include "error.h" #include "journal.h" #include "recovery_passes.h" @@ -198,7 +199,8 @@ static 
const u8 fsck_flags_extra[] = { #undef x }; -int bch2_fsck_err(struct bch_fs *c, +int __bch2_fsck_err(struct bch_fs *c, + struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) @@ -215,6 +217,11 @@ int bch2_fsck_err(struct bch_fs *c, if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) flags |= fsck_flags_extra[err]; + if (!c) + c = trans->c; + + WARN_ON(!trans && bch2_current_has_btree_trans(c)); + if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index ba6a4f5257f4..995e6bba9bad 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -110,18 +110,21 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -__printf(4, 5) __cold -int bch2_fsck_err(struct bch_fs *, +__printf(5, 6) __cold +int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, enum bch_sb_error_id, const char *, ...); +#define bch2_fsck_err(c, _flags, _err_type, ...) \ + __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ + type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ + _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) + void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err(c, _flags, _err_type, ...) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ - __VA_ARGS__); \ - \ + int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -138,8 +141,12 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err_on(cond, c, _flags, _err_type, ...) \ ({ \ might_sleep(); \ + \ + if (type_is(c, struct bch_fs *)) \ + WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ + \ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ -}) \ +}) #define need_fsck_err_on(cond, c, _err_type, ...) 
\ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aeb59da74e52..cc4f0963c0c5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -835,8 +835,8 @@ static int hash_check_key(struct btree_trans *trans, break; if (fsck_err_on(k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k), c, - hash_table_key_duplicate, + !desc.cmp_bkey(k, hash_k), + trans, hash_table_key_duplicate, "duplicate hash table keys:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), @@ -855,7 +855,7 @@ out: printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, hash_table_key_wrong_offset, + if (fsck_err(trans, hash_table_key_wrong_offset, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), @@ -930,11 +930,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i return ret; if (fsck_err_on(ret, - c, inode_points_to_missing_dirent, + trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || fsck_err_on(!ret && !dirent_points_to_inode(d, inode), - c, inode_points_to_wrong_dirent, + trans, inode_points_to_wrong_dirent, "inode points to dirent that does not point back:\n%s", (bch2_bkey_val_to_text(&buf, c, inode_k), prt_newline(&buf), @@ -997,7 +997,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || inode_d_type(prev) != inode_d_type(&u), - c, inode_snapshot_mismatch, + trans, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); return -BCH_ERR_fsck_repair_unimplemented; @@ -1029,7 +1029,8 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, + 
fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; @@ -1037,7 +1038,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || - fsck_err(c, inode_unlinked_but_clean, + fsck_err(trans, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); @@ -1047,7 +1048,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_i_size_dirty && (!c->sb.clean || - fsck_err(c, inode_i_size_dirty_but_clean, + fsck_err(trans, inode_i_size_dirty_but_clean, "filesystem marked clean, but inode %llu has i_size dirty", u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); @@ -1077,7 +1078,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_i_sectors_dirty && (!c->sb.clean || - fsck_err(c, inode_i_sectors_dirty_but_clean, + fsck_err(trans, inode_i_sectors_dirty_but_clean, "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { s64 sectors; @@ -1112,7 +1113,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - c, inode_bi_parent_nonzero, + trans, inode_bi_parent_nonzero, "inode %llu:%u has subvol %u but nonzero parent subvol %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { u.bi_parent_subvol = 0; @@ -1132,13 +1133,13 @@ static int check_inode(struct btree_trans *trans, } if (fsck_err_on(ret, - c, inode_bi_subvol_missing, + trans, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol) || fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), k.k->p.snapshot), - c, inode_bi_subvol_wrong, + trans, inode_bi_subvol_wrong, 
"inode %llu:%u points to subvol %u, but subvol points to %llu:%u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, le64_to_cpu(s.inode), @@ -1221,14 +1222,15 @@ static int check_key_has_inode(struct btree_trans *trans, goto err; } - if (fsck_err_on(!i, c, key_in_missing_inode, + if (fsck_err_on(!i, + trans, key_in_missing_inode, "key in missing inode:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - c, key_in_wrong_inode_type, + trans, key_in_wrong_inode_type, "key for wrong inode mode %o:\n %s", i->inode.bi_mode, (printbuf_reset(&buf), @@ -1267,7 +1269,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - c, inode_i_sectors_wrong, + trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { @@ -1415,7 +1417,7 @@ static int overlapping_extents_found(struct btree_trans *trans, prt_printf(&buf, "\n overwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); - if (fsck_err(c, extent_overlapping, + if (fsck_err(trans, extent_overlapping, "overlapping extents%s", buf.buf)) { struct btree_iter *old_iter = &iter1; struct disk_reservation res = { 0 }; @@ -1590,7 +1592,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), - c, extent_past_end_of_inode, + trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1705,7 +1707,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ } if (fsck_err_on(i->inode.bi_nlink != i->count, - c, inode_dir_wrong_nlink, + trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; @@ -1741,7 +1743,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return 0; if (bch2_inode_should_have_bp(target) && - !fsck_err(c, inode_wrong_backpointer, + !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), prt_printf(&buf, "\n "), @@ -1767,7 +1769,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, ret = 0; if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, + trans, inode_wrong_backpointer, "inode %llu:%u has wrong backpointer:\n" "got %llu:%llu\n" "should be %llu:%llu", @@ -1790,7 +1792,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, if (fsck_err_on(backpointer_exists && (S_ISDIR(target->bi_mode) || target->bi_subvol), - c, inode_dir_multiple_links, + trans, inode_dir_multiple_links, "%s %llu:%u with multiple links\n%s", S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", target->bi_inum, target_snapshot, buf.buf)) { @@ -1804,7 +1806,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, * it up, it ignores inodes with nlink 0 */ if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, + trans, inode_multiple_links_but_nlink_0, "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { target->bi_nlink++; @@ -1840,7 +1842,7 @@ static int check_dirent_target(struct btree_trans *trans, goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), - c, dirent_d_type_wrong, + trans, dirent_d_type_wrong, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), @@ -1938,11 +1940,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * parent_snapshot = d.k->p.snapshot; } - if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + if (fsck_err_on(ret, + trans, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), - c, dirent_not_visible_in_parent_subvol, + trans, dirent_not_visible_in_parent_subvol, "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { @@ -1968,7 +1971,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * return ret; if (ret) { - if (fsck_err(c, dirent_to_missing_subvol, + if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) return __remove_dirent(trans, d.k->p); @@ -1977,7 +1980,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * } if 
(fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - c, subvol_fs_path_parent_wrong, + trans, subvol_fs_path_parent_wrong, "subvol with wrong fs_path_parent, should be be %u\n%s", parent_subvol, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -2005,7 +2008,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * } if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, - c, inode_bi_parent_wrong, + trans, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, subvol_root.bi_parent_subvol, parent_subvol)) { @@ -2098,7 +2101,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (fsck_err_on(!target->inodes.nr, - c, dirent_to_missing_inode, + trans, dirent_to_missing_inode, "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -2226,7 +2229,7 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, "root subvol missing")) { struct bkey_i_subvolume *root_subvol = bch2_trans_kmalloc(trans, sizeof(*root_subvol)); @@ -2252,10 +2255,11 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_dir_missing, + if (mustfix_fsck_err_on(ret, + trans, root_dir_missing, "root directory missing") || mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - c, root_inode_not_dir, + trans, root_inode_not_dir, "root inode not a directory")) { bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); @@ -2327,7 +2331,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, break; if (fsck_err_on(!ret, - c, subvol_unreachable, + trans, subvol_unreachable, "unreachable subvolume %s", 
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -2352,7 +2356,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, goto err; if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - c, subvol_unreachable, + trans, subvol_unreachable, "unreachable subvolume %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -2431,7 +2435,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (bch2_err_matches(ret, ENOENT)) { ret = 0; - if (fsck_err(c, inode_unreachable, + if (fsck_err(trans, inode_unreachable, "unreachable inode\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, inode_k), @@ -2477,7 +2481,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); - if (fsck_err(c, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "directory structure loop")) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) @@ -2683,7 +2687,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite struct nlink_table *links, size_t *idx, u64 range_end) { - struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct nlink *link = &links->d[*idx]; int ret = 0; @@ -2709,7 +2712,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite } if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - c, inode_wrong_nlink, + trans, inode_wrong_nlink, "inode %llu type %s has wrong i_nlink (%u, should be %u)", u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index d556d7f28661..1e20020eadd1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1087,8 +1087,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, return ret; ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), c, - deleted_inode_missing, + if (fsck_err_on(!bkey_is_inode(k.k), + trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1100,7 +1100,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), - c, deleted_inode_is_dir, + trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1108,15 +1108,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, - deleted_inode_not_unlinked, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; if (c->sb.clean && - !fsck_err(c, - deleted_inode_but_clean, + !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", pos.offset, pos.snapshot)) { ret = 0; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index b12894ef44f3..83b1586cb371 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -99,7 +99,7 @@ int bch2_lru_check_set(struct btree_trans *trans, if (ret) goto err; - if (fsck_err(c, alloc_key_to_missing_lru_entry, + if (fsck_err(trans, alloc_key_to_missing_lru_entry, "missing %s lru entry\n" " %s", bch2_lru_types[lru_type(lru_k)], @@ -133,8 +133,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, u64 idx; int ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, - lru_entry_to_invalid_bucket, + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), + trans, lru_entry_to_invalid_bucket, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) 
return bch2_btree_delete_at(trans, lru_iter, 0); @@ -164,7 +164,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (fsck_err(c, lru_entry_bad, + if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 9ac6cf21cfbf..5f92715e1525 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -171,7 +171,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); - if (fsck_err(c, reflink_p_to_missing_reflink_v, + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "pointer to missing indirect extent\n" " %s\n" " missing range %llu-%llu", diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 24023d6a9698..96744b1a76f5 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -552,7 +552,7 @@ static int check_snapshot_tree(struct btree_trans *trans, if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), - c, snapshot_tree_to_missing_snapshot, + trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -565,19 +565,19 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; if (fsck_err_on(ret, - c, snapshot_tree_to_missing_subvol, + trans, snapshot_tree_to_missing_subvol, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), - c, snapshot_tree_to_wrong_subvol, + trans, snapshot_tree_to_wrong_subvol, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || 
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - c, snapshot_tree_to_snapshot_subvol, + trans, snapshot_tree_to_snapshot_subvol, "snapshot tree points to snapshot subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { @@ -814,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans, } } else { if (fsck_err_on(s.subvol, - c, snapshot_should_not_have_subvol, + trans, snapshot_should_not_have_subvol, "snapshot should not point to subvol:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -831,7 +831,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + if (fsck_err_on(!ret, + trans, snapshot_to_bad_snapshot_tree, "snapshot points to missing/incorrect tree:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); @@ -843,7 +844,7 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - c, snapshot_bad_depth, + trans, snapshot_bad_depth, "snapshot with incorrect depth field, should be %u:\n %s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -859,7 +860,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + if (fsck_err_on(!ret, + trans, snapshot_bad_skiplist, "snapshot with bad skiplist field:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -1021,7 +1023,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) darray_for_each(*t, id) { if (fsck_err_on(!bch2_snapshot_equiv(c, *id), - c, snapshot_node_missing, + trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if 
(t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); @@ -1053,8 +1055,8 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index dfc9cf305756..488ca6eb06a7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -57,7 +57,7 @@ static int check_subvol(struct btree_trans *trans, if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && subvol.v->fs_path_parent, - c, subvol_root_fs_path_parent_nonzero, + trans, subvol_root_fs_path_parent_nonzero, "root subvolume has nonzero fs_path_parent\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct bkey_i_subvolume *n = @@ -80,7 +80,7 @@ static int check_subvol(struct btree_trans *trans, goto err; if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, - c, subvol_children_not_set, + trans, subvol_children_not_set, "subvolume not set in subvolume_children btree at %llu:%llu\n%s", pos.inode, pos.offset, (printbuf_reset(&buf), @@ -101,7 +101,8 @@ static int check_subvol(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, subvol_to_missing_root, + if (fsck_err_on(ret, + trans, subvol_to_missing_root, "subvolume %llu points to missing subvolume root %llu:%u", k.k->p.offset, le64_to_cpu(subvol.v->inode), le32_to_cpu(subvol.v->snapshot))) { @@ -111,7 +112,7 @@ static int check_subvol(struct btree_trans *trans, } if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - c, subvol_root_wrong_bi_subvol, + trans, subvol_root_wrong_bi_subvol, "subvol root %llu:%u has wrong bi_subvol field: got %u, 
should be %llu", inode.bi_inum, inode_iter.k.p.snapshot, inode.bi_subvol, subvol.k->p.offset)) { @@ -139,7 +140,7 @@ static int check_subvol(struct btree_trans *trans, return ret; if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - c, subvol_not_master_and_not_snapshot, + trans, subvol_not_master_and_not_snapshot, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = @@ -173,7 +174,6 @@ static int check_subvol_child(struct btree_trans *trans, struct btree_iter *child_iter, struct bkey_s_c child_k) { - struct bch_fs *c = trans->c; struct bch_subvolume s; int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), 0, subvolume, &s); @@ -182,7 +182,7 @@ static int check_subvol_child(struct btree_trans *trans, if (fsck_err_on(ret || le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, - c, subvol_children_bad, + trans, subvol_children_bad, "incorrect entry in subvolume_children btree %llu:%llu", child_k.k->p.inode, child_k.k->p.offset)) { ret = bch2_btree_delete_at(trans, child_iter, 0); From 36008d5d01ad155e14fd9df876d4356433613088 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 22:06:00 -0400 Subject: [PATCH 061/120] bcachefs: Plumb more logging through stdio redirect Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/super.c | 13 +++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 33605fa8e70f..e0e9afb08ef6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -267,6 +267,8 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +void bch2_print_str(struct bch_fs *, const char *); + __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 755909f7b55d..849a397bb919 100644 --- 
a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1465,7 +1465,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } @@ -1557,7 +1557,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b9f851103297..c22a8ef2d2e1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -89,6 +89,19 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_str(struct bch_fs *c, const char *str) +{ +#ifdef __KERNEL__ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + if (unlikely(stdio)) { + bch2_stdio_redirect_printf(stdio, true, "%s", str); + return; + } +#endif + bch2_print_string_as_lines(KERN_ERR, str); +} + __printf(2, 0) static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) { From 0c97c437e362fb825b7501bd5da801bac77981b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 20:34:48 -0400 Subject: [PATCH 062/120] bcachefs: twf: convert bch2_stdio_redirect_readline() to darray We now read the line from the buffer atomically, which means we have to allow the buffer to grow past STDIO_REDIRECT_BUFSIZE if we're waiting for a full line - this behaviour is necessary for stdio_redirect_readline_timeout() in the next patch. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 15 +++--- fs/bcachefs/thread_with_file.c | 70 +++++++++++++++++----------- fs/bcachefs/thread_with_file.h | 3 +- fs/bcachefs/thread_with_file_types.h | 1 + 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 3a9d0a03fecf..cfe791215915 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -109,18 +109,21 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) if (!stdio) return YN_NO; - char buf[100]; + darray_char line = {}; int ret; do { bch2_print(c, " (y,n, or Y,N for all errors of this type) "); - int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); - if (r < 0) - return YN_NO; - buf[r] = '\0'; - } while ((ret = parse_yn_response(buf)) < 0); + int r = bch2_stdio_redirect_readline(stdio, &line); + if (r < 0) { + ret = YN_NO; + break; + } + darray_last(line) = '\0'; + } while ((ret = parse_yn_response(line.data)) < 0); + darray_exit(&line); return ret; } #else diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index b1af7ac430f6..080afa7eff25 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -67,9 +67,14 @@ err: /* stdio_redirect */ +static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) +{ + return stdio->input.buf.nr > seen || stdio->done; +} + static bool stdio_redirect_has_input(struct stdio_redirect *stdio) { - return stdio->input.buf.nr || stdio->done; + return stdio_redirect_has_more_input(stdio, 0); } static bool stdio_redirect_has_output(struct stdio_redirect *stdio) @@ -181,9 +186,13 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu } spin_lock(&buf->lock); - if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) - darray_make_room_gfp(&buf->buf, - min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); + size_t makeroom = b; + if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) + 
makeroom = min_t(ssize_t, makeroom, + max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, + 0)); + darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); + b = min(len, darray_room(buf->buf)); if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { @@ -355,43 +364,48 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) { struct stdio_buf *buf = &stdio->input; - size_t copied = 0; - ssize_t ret = 0; + size_t seen = 0; again: - do { - wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), - sysctl_hung_task_timeout_secs * HZ / 2); - } while (!stdio_redirect_has_input(stdio)); + wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), + sysctl_hung_task_timeout_secs * HZ / 2); - if (stdio->done) { - ret = -1; - goto out; - } + if (stdio->done) + return -1; spin_lock(&buf->lock); - size_t b = min(len, buf->buf.nr); - char *n = memchr(buf->buf.data, '\n', b); - if (n) - b = min_t(size_t, b, n + 1 - buf->buf.data); + seen = buf->buf.nr; + char *n = memchr(buf->buf.data, '\n', seen); + if (!n) { + buf->waiting_for_line = true; + spin_unlock(&buf->lock); + goto again; + } + + size_t b = n + 1 - buf->buf.data; + if (b > line->size) { + spin_unlock(&buf->lock); + int ret = darray_resize(line, b); + if (ret) + return ret; + seen = 0; + goto again; + } + buf->buf.nr -= b; - memcpy(ubuf, buf->buf.data, b); + memcpy(line->data, buf->buf.data, b); memmove(buf->buf.data, buf->buf.data + b, buf->buf.nr); - ubuf += b; - len -= b; - copied += b; + line->nr = b; + + buf->waiting_for_line = false; spin_unlock(&buf->lock); wake_up(&buf->wait); - - if (!n && len) - goto again; -out: - return copied ?: ret; + return 0; } __printf(3, 0) diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 
1d63d14d7dca..e415dc2e2fb1 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -71,7 +71,8 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *, int bch2_run_thread_with_stdout(struct thread_with_stdio *, const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); -int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); __printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); __printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index e0daf4eec341..12668ff3d65d 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -8,6 +8,7 @@ struct stdio_buf { spinlock_t lock; wait_queue_head_t wait; darray_char buf; + bool waiting_for_line; }; struct stdio_redirect { From d37dd9b60487357cf831720383e335ee9baf5f25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 20:37:39 -0400 Subject: [PATCH 063/120] bcachefs: bch2_stdio_redirect_readline_timeout() Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 25 ++++++++++++++++++++++--- fs/bcachefs/thread_with_file.h | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 080afa7eff25..0807ce9b171a 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -364,13 +364,21 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, + darray_char *line, + unsigned long timeout) { + unsigned long until = 
jiffies + timeout, t; struct stdio_buf *buf = &stdio->input; size_t seen = 0; again: - wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), - sysctl_hung_task_timeout_secs * HZ / 2); + t = timeout != MAX_SCHEDULE_TIMEOUT + ? max_t(long, until - jiffies, 0) + : timeout; + + t = min(t, sysctl_hung_task_timeout_secs * HZ / 2); + + wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t); if (stdio->done) return -1; @@ -378,6 +386,12 @@ again: spin_lock(&buf->lock); seen = buf->buf.nr; char *n = memchr(buf->buf.data, '\n', seen); + + if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) { + spin_unlock(&buf->lock); + return -ETIME; + } + if (!n) { buf->waiting_for_line = true; spin_unlock(&buf->lock); @@ -408,6 +422,11 @@ again: return 0; } +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) +{ + return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT); +} + __printf(3, 0) static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) { diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index e415dc2e2fb1..72497b921911 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -72,6 +72,7 @@ int bch2_run_thread_with_stdout(struct thread_with_stdio *, const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long); int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); __printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); From 7ed122aea2e26686467c6ec63dc4a1c060e0ff44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 May 2024 15:54:08 -0400 Subject: [PATCH 064/120] bcachefs: twf: delete dead struct fields Signed-off-by: Kent Overstreet --- 
fs/bcachefs/thread_with_file_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 12668ff3d65d..f4d484d44f63 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -14,10 +14,6 @@ struct stdio_buf { struct stdio_redirect { struct stdio_buf input; struct stdio_buf output; - - spinlock_t input_lock; - wait_queue_head_t input_wait; - darray_char input_buf; bool done; }; From f8b0147364d5938c71c0bf315becd7a9de0d64b0 Mon Sep 17 00:00:00 2001 From: Ariel Miculas Date: Fri, 31 May 2024 00:13:58 +0300 Subject: [PATCH 065/120] bcachefs: bch2_dir_emit() - fix directory reads in the fuse driver Commit 0c0cbfdb84725e9933a24ecf47c61bdeeda06ba2 dropped the ctx->pos update before the call to dir_emit. This breaks the userspace implementation, causing the directory reads to be stuck in an infinite loop. This doesn't happen in the kernel because the vfs handles the updates to ctx->pos, but in the fuse implementation nobody updates it. Signed-off-by: Ariel Miculas Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index c67460d8205d..d743da89308e 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -534,6 +534,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) { struct qstr name = bch2_dirent_get_name(d); + /* + * Although not required by the kernel code, updating ctx->pos is needed + * for the bcachefs FUSE driver. Without this update, the FUSE + * implementation will be stuck in an infinite loop when reading + * directories (via the bcachefs_fuse_readdir callback). + * In kernel space, ctx->pos is updated by the VFS code. 
+ */ + ctx->pos = d.k->p.offset; bool ret = dir_emit(ctx, name.name, name.len, target.inum, From 8b0882505d012ef5f309a1b547fae53080a62f07 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 31 May 2024 10:31:15 +0800 Subject: [PATCH 066/120] bcachefs: track writeback errors using the generic tracking infrastructure We already using mapping_set_error() in bch2_writepage_io_done(), so all we need to do is to use file_check_and_advance_wb_err() when handling fsync() requests in bch2_fsync(). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ec7901265da5..1b734fd4acd2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -192,7 +192,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, err; ret = file_write_and_wait_range(file, start, end); if (ret) @@ -205,6 +205,11 @@ out: ret = bch2_err_class(ret); if (ret == -EROFS) ret = -EIO; + + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; + return ret; } From 747d1d6c7efdfb4e59a93a129eaf434cb9325eb8 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 31 May 2024 10:35:09 +0800 Subject: [PATCH 067/120] bcachefs: Add tracepoints for bch2_sync_fs() and bch2_fsync() Add trace_bch2_sync_fs() and trace_bch2_fsync() implementations. The output in trace is as follows: sync-29779 [000] ..... 193.700935: bch2_sync_fs: dev 254,16 wait 1 <...>-40027 [002] ..... 
342.535227: bch2_fsync: dev 254,32 ino 4099 parent 4096 datasync 1 Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 ++ fs/bcachefs/fs.c | 3 +++ fs/bcachefs/trace.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1b734fd4acd2..fead9ab10e4a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -194,6 +194,8 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret, err; + trace_bch2_fsync(file, datasync); + ret = file_write_and_wait_range(file, start, end); if (ret) goto out; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 1e0e5a842243..3dd60aa3c3de 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -26,6 +26,7 @@ #include "snapshot.h" #include "super.h" #include "xattr.h" +#include "trace.h" #include #include @@ -1697,6 +1698,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait) struct bch_fs *c = sb->s_fs_info; int ret; + trace_bch2_sync_fs(sb, wait); + if (c->opts.journal_flush_disabled) return 0; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 84fcf26e306e..d0e6b9deb6cb 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -200,6 +200,56 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* fs.c: */ +TRACE_EVENT(bch2_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +/* fs-io.c: */ +TRACE_EVENT(bch2_fsync, + TP_PROTO(struct file *file, int datasync), + + TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, 
datasync ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + + __entry->dev = dentry->d_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; + __entry->datasync = datasync; + ), + + TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) +); + /* super-io.c: */ TRACE_EVENT(write_super, TP_PROTO(struct bch_fs *c, unsigned long ip), From 889fb3dc5d6f8e0931649cb5f016dd38d50fd9fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 21:14:40 -0400 Subject: [PATCH 068/120] bcachefs: Unlock trans when waiting for user input in fsck We can't hold locks while waiting for user input, that's a deadlock. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index cfe791215915..a62b63108820 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -99,7 +99,7 @@ static enum ask_yn parse_yn_response(char *buf) } #ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { struct stdio_redirect *stdio = c->stdio; @@ -109,17 +109,33 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) if (!stdio) return YN_NO; + if (trans) + bch2_trans_unlock(trans); + + unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; darray_char line = {}; int ret; do { + unsigned long t; bch2_print(c, " (y,n, or Y,N for all errors of this type) "); +rewait: + t = unlock_long_at + ? 
max_t(long, unlock_long_at - jiffies, 0) + : MAX_SCHEDULE_TIMEOUT; + + int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); + if (r == -ETIME) { + bch2_trans_unlock_long(trans); + unlock_long_at = 0; + goto rewait; + } - int r = bch2_stdio_redirect_readline(stdio, &line); if (r < 0) { ret = YN_NO; break; } + darray_last(line) = '\0'; } while ((ret = parse_yn_response(line.data)) < 0); @@ -130,7 +146,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) #include "tools-util.h" -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { char *buf = NULL; size_t buflen = 0; @@ -326,7 +342,15 @@ int __bch2_fsck_err(struct bch_fs *c, bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - int ask = bch2_fsck_ask_yn(c); + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + ret = bch2_trans_relock(trans); + if (ret) { + mutex_unlock(&c->fsck_error_msgs_lock); + goto err; + } + } if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO From 8a4ef7e28abade861d894e25b167988c5b8977a7 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 3 Jun 2024 21:26:18 +0800 Subject: [PATCH 069/120] bcachefs: implement FS_IOC_GETVERSION to support lsattr In this patch we add the FS_IOC_GETVERSION ioctl for getting i_generation from inode, after that, users can list file's generation number by using "lsattr". 
Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 79a0c8732bce..add90172b475 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -272,6 +272,11 @@ err1: return ret; } +static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) +{ + return put_user(inode->v.i_generation, arg); +} + static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) { u32 flags; @@ -499,7 +504,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case FS_IOC_GETVERSION: - ret = -ENOTTY; + ret = bch2_ioc_getversion(inode, (u32 __user *) arg); break; case FS_IOC_SETVERSION: @@ -547,6 +552,9 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; default: return -ENOIOCTLCMD; } From 81bce3cf2b2b4adcba4eda58fb3ebc4082b13fb3 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 3 Jun 2024 21:26:19 +0800 Subject: [PATCH 070/120] bcachefs: support get fs label Implement support for FS_IOC_GETFSLABEL ioctl to read filesystem label. 
Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index add90172b475..c82c25e7f45a 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -277,6 +277,30 @@ static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) return put_user(inode->v.i_generation, arg); } +static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) +{ + int ret; + size_t len; + char label[BCH_SB_LABEL_SIZE]; + + BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); + + mutex_lock(&c->sb_lock); + memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + mutex_unlock(&c->sb_lock); + + len = strnlen(label, BCH_SB_LABEL_SIZE); + if (len == BCH_SB_LABEL_SIZE) { + bch_warn(c, + "label is too long, return the first %zu bytes", + --len); + } + + ret = copy_to_user(user_label, label, len); + + return ret ? -EFAULT : 0; +} + static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) { u32 flags; @@ -511,6 +535,10 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ret = -ENOTTY; break; + case FS_IOC_GETFSLABEL: + ret = bch2_ioc_getlabel(c, (void __user *) arg); + break; + case FS_IOC_GOINGDOWN: ret = bch2_ioc_goingdown(c, (u32 __user *) arg); break; @@ -555,6 +583,8 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case FS_IOC_GETFSLABEL: + break; default: return -ENOIOCTLCMD; } From 7a254053a59008913247606d0ce4a0a8b61fe6ee Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 3 Jun 2024 21:26:20 +0800 Subject: [PATCH 071/120] bcachefs: support FS_IOC_SETFSLABEL Implement support for FS_IOC_SETFSLABEL ioctl to set filesystem label. 
Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index c82c25e7f45a..aea8132d2c40 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -301,6 +301,41 @@ static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) return ret ? -EFAULT : 0; } +static int bch2_ioc_setlabel(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + const char __user *user_label) +{ + int ret; + char label[BCH_SB_LABEL_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, user_label, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { + bch_err(c, + "unable to set label with more than %d bytes", + BCH_SB_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&c->sb_lock); + strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); + mutex_unlock(&c->sb_lock); + + ret = bch2_write_super(c); + + mnt_drop_write_file(file); + return ret; +} + static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) { u32 flags; @@ -539,6 +574,10 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ret = bch2_ioc_getlabel(c, (void __user *) arg); break; + case FS_IOC_SETFSLABEL: + ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); + break; + case FS_IOC_GOINGDOWN: ret = bch2_ioc_goingdown(c, (u32 __user *) arg); break; @@ -584,6 +623,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) cmd = FS_IOC_GETVERSION; break; case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: break; default: return -ENOIOCTLCMD; From 7f3dc6c98b52204c0060fa7eee0fbced05958544 Mon Sep 17 00:00:00 2001 From: Reed Riley Date: Sat, 11 May 2024 00:20:12 +0000 Subject: [PATCH 072/120] bcachefs: support 
REMAP_FILE_DEDUP in bch2_remap_file_range By removing the early-exit when REMAP_FILE_DEDUP is set, we should be able to support the fideduperange ioctl, albeit less efficiently than if we handled some of the extent locking and comparison logic inside bcachefs. Extent comparison logic already exists inside of `__generic_remap_file_range_prep`. Signed-off-by: Reed Riley Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index fead9ab10e4a..77b85da30fb2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -867,9 +867,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) - return -EOPNOTSUPP; - if ((pos_src & (block_bytes(c) - 1)) || (pos_dst & (block_bytes(c) - 1))) return -EINVAL; @@ -902,7 +899,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - file_update_time(file_dst); + if (!(remap_flags & REMAP_FILE_DEDUP)) + file_update_time(file_dst); bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); From 8863d1e092005d5b31f7e712827a5605f8a7ba22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Mar 2024 18:43:39 -0500 Subject: [PATCH 073/120] bcachefs: BCH_IOCTL_QUERY_ACCOUNTING Add a new ioctl that can return the new accounting counter types; it takes as input a bitmask of accounting types to return. This will be used for returning e.g. compression accounting and rebalance_work accounting. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 29 ++++++++++++++++++++++++++ fs/bcachefs/chardev.c | 30 +++++++++++++++++++++++++++ fs/bcachefs/disk_accounting.c | 38 +++++++++++++++++++++++++++++++++++ fs/bcachefs/disk_accounting.h | 1 + 4 files changed, 98 insertions(+) diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 0b82a4dd099f..3c23bdf788ce 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -5,6 +5,7 @@ #include #include #include "bcachefs_format.h" +#include "bkey_types.h" /* * Flags common to multiple ioctls: @@ -85,6 +86,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) +#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -262,6 +264,7 @@ replicas_usage_next(struct bch_replicas_usage *u) return (void *) u + replicas_usage_bytes(u); } +/* Obsolete */ /* * BCH_IOCTL_FS_USAGE: query filesystem disk space usage * @@ -287,6 +290,7 @@ struct bch_ioctl_fs_usage { struct bch_replicas_usage replicas[]; }; +/* Obsolete */ /* * BCH_IOCTL_DEV_USAGE: query device disk space usage * @@ -311,6 +315,7 @@ struct bch_ioctl_dev_usage { } d[10]; }; +/* Obsolete */ struct bch_ioctl_dev_usage_v2 { __u64 dev; __u32 flags; @@ -414,4 +419,28 @@ struct bch_ioctl_fsck_online { __u64 opts; /* string */ }; +/* + * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. 
+ * + * Returns -ERANGE if @replica_entries_bytes was too small + */ +struct bch_ioctl_query_accounting { + __u64 capacity; + __u64 used; + __u64 online_reserved; + + __u32 accounting_u64s; /* input parameter */ + __u32 accounting_types_mask; /* input parameter */ + + struct bkey_i_accounting accounting[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 0e76e06ab844..cbadba4027c2 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -557,6 +557,34 @@ err: return ret; } +static long bch2_ioctl_query_accounting(struct bch_fs *c, + struct bch_ioctl_query_accounting __user *user_arg) +{ + struct bch_ioctl_query_accounting arg; + darray_char accounting = {}; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + + ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: + bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: + (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); + if (ret) + goto err; + + arg.capacity = c->capacity; + arg.used = bch2_fs_usage_read_short(c).used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.accounting_u64s = accounting.nr / sizeof(u64); + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +err: + darray_exit(&accounting); + return ret; +} + /* obsolete, didn't allow for new data types: */ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) @@ -910,6 +938,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); case BCH_IOCTL_FSCK_ONLINE: BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); + case BCH_IOCTL_QUERY_ACCOUNTING: + return bch2_ioctl_query_accounting(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 
bc45f53efc27..510ed683f0a0 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -315,6 +315,44 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) return ret; } +int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) +{ + + struct bch_accounting_mem *acc = &c->accounting[0]; + int ret = 0; + + darray_init(out_buf); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct disk_accounting_pos a_p; + bpos_to_disk_accounting_pos(&a_p, i->pos); + + if (!(accounting_types_mask & BIT(a_p.type))) + continue; + + ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + + sizeof(u64) * i->nr_counters); + if (ret) + break; + + struct bkey_i_accounting *a_out = + bkey_accounting_init((void *) &darray_top(*out_buf)); + set_bkey_val_u64s(&a_out->k, i->nr_counters); + a_out->k.p = i->pos; + bch2_accounting_mem_read(c, i->pos, a_out->v.d, i->nr_counters); + + if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) + out_buf->nr += bkey_bytes(&a_out->k); + } + + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(out_buf); + return ret; +} + void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting[0]; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5164995f3139..ab1f74cb97c7 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -193,6 +193,7 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); +int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_accounting_gc_done(struct bch_fs *); From 49858d869b62446e552bf9421077fb6c3125b13a Mon Sep 17 00:00:00 2001 From: Ariel Miculas Date: Mon, 3 Jun 2024 23:47:31 +0300 Subject: [PATCH 074/120] 
bcachefs: bch2_btree_insert() - add btree iter flags The commit 65bd44239727 ("bcachefs: bch2_btree_insert_trans() no longer specifies BTREE_ITER_cached") removes BTREE_ITER_cached from bch2_btree_insert_trans, which causes the update_inode function from bcachefs-tools to take a long time (~20s). Add an iter_flags parameter to bch2_btree_insert, so the users can specify iter update trigger flags, such as BTREE_ITER_cached. Signed-off-by: Ariel Miculas Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 6 ++++-- fs/bcachefs/btree_update.h | 5 +++-- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/subvolume.c | 6 +++--- fs/bcachefs/tests.c | 14 +++++++------- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index f3c645a43dcb..d6f6df10dcc3 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -656,14 +656,16 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, * @disk_res: must be non-NULL whenever inserting or potentially * splitting data extents * @flags: transaction commit flags + * @iter_flags: btree iter update trigger flags * * Returns: 0 on success, error code on failure */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags) + struct disk_reservation *disk_res, int flags, + enum btree_iter_update_trigger_flags iter_flags) { return bch2_trans_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, 0)); + bch2_btree_insert_trans(trans, id, k, iter_flags)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b907f4c1312b..60393e98084d 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -57,8 +57,9 @@ int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, int bch2_btree_insert_trans(struct btree_trans *, enum 
btree_id, struct bkey_i *, enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, int flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct + disk_reservation *, int flags, enum + btree_iter_update_trigger_flags iter_flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 514bff68d971..d89eb43c5ce9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1071,7 +1071,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); bch_err_msg(c, ret, "creating root directory"); if (ret) goto err; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 488ca6eb06a7..f56720b55862 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -630,9 +630,9 @@ int bch2_initialize_subvolumes(struct bch_fs *c) root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 68104b2056d9..01b768c9b767 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -121,7 +121,7 
@@ static int test_iterate(struct bch_fs *c, u64 nr) ck.k.p.offset = i; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ck.k.p.offset = i * 2; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -396,7 +396,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } @@ -481,7 +481,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; @@ -506,7 +506,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; 
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; From f295920bc4a0f34f7c4182f53c8f0c929e4358a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2024 12:35:48 -0400 Subject: [PATCH 075/120] bcachefs: Fix race in bch2_accounting_mem_insert() bch2_accounting_mem_insert() drops and retakes mark_lock; thus, we need to check if the entry in question has already been inserted. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 14 ++++++++------ fs/bcachefs/disk_accounting.h | 14 +++++++++----- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 510ed683f0a0..29bc4c816f95 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -218,7 +218,7 @@ int bch2_accounting_update_sb(struct btree_trans *trans) return 0; } -static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { struct bch_replicas_padded r; @@ -226,7 +226,12 @@ static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_ !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; + /* raced with another insert, already present: */ struct bch_accounting_mem *acc = &c->accounting[gc]; + if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p) < acc->k.nr) + return 0; + unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), @@ -256,17 +261,14 @@ static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_ free_percpu(acc->v); acc->v = new_counters; acc->nr_counters = new_nr_counters; - - for (unsigned i = 0; i < 
n.nr_counters; i++) - this_cpu_add(acc->v[n.offset + i], a.v->d[i]); return 0; } -int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_mod_slowpath(c, a, gc); + int ret = __bch2_accounting_mem_insert(c, a, gc); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index ab1f74cb97c7..4b37f0c24b4e 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -104,15 +104,19 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) return bpos_cmp(*l, *r); } -int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting, bool); +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { struct bch_accounting_mem *acc = &c->accounting[gc]; - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p); - if (unlikely(idx >= acc->k.nr)) - return bch2_accounting_mem_mod_slowpath(c, a, gc); + unsigned idx; + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } unsigned offset = acc->k.data[idx].offset; From b5597347a5c7182dba80f9fb561c404abb3cead8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 6 Jun 2024 09:58:26 -0400 Subject: [PATCH 076/120] bcachefs: fix smatch data leak warning in fs usage ioctl smatch warns that the copy of arg to userspace is a potential data leak by virtue of arg.pad not being checked or zeroed. 
This was introduced by the commit referenced below that switched arg from being a zeroed runtime allocation to living on the stack. Fix by simply zero initializing the structure. Fixes: cde738a61e65 ("bcachefs: Convert bch2_ioctl_fs_usage() to new accounting") Reported-by: Dan Carpenter Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbadba4027c2..72ade3664d7b 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -517,7 +517,7 @@ static long bch2_ioctl_data(struct bch_fs *c, static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_fs_usage arg; + struct bch_ioctl_fs_usage arg = {}; darray_char replicas = {}; u32 replica_entries_bytes; int ret = 0; From 2574e95a8b78ef853100d6889f154883fec989a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 13:25:28 -0400 Subject: [PATCH 077/120] bcachefs: Refactor disk accounting data structures Break up the percpu counter allocations into individual allocations for each disk accounting counter; this fixes an issue on large systems where we have too many replica entries for the percpu allocator's max practical size. Also, use just one eytzinger tree for the normal set of counters and the gc counters; this simplifies accounting_gc_done() where we need the same set of counters to be present in both tables. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 7 +- fs/bcachefs/disk_accounting.c | 216 +++++++++++++--------------- fs/bcachefs/disk_accounting.h | 31 ++-- fs/bcachefs/disk_accounting_types.h | 11 +- 5 files changed, 125 insertions(+), 142 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e0e9afb08ef6..ea4bf11fb8dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -742,7 +742,7 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_accounting_mem accounting[2]; + struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 88f7c7d64a1d..dfc842d6e822 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -735,7 +735,7 @@ static int bch2_mark_superblocks(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[1]); + bch2_accounting_gc_free(c); genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); @@ -1105,7 +1105,8 @@ int bch2_check_allocations(struct bch_fs *c) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c) ?: + ret = bch2_gc_accounting_start(c) ?: + bch2_gc_start(c) ?: bch2_gc_alloc_start(c) ?: bch2_gc_reflink_start(c); if (ret) @@ -1125,7 +1126,7 @@ int bch2_check_allocations(struct bch_fs *c) c->gc_count++; ret = bch2_gc_alloc_done(c) ?: - bch2_accounting_gc_done(c) ?: + bch2_gc_accounting_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); out: diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 29bc4c816f95..3327d465908d 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -218,7 +218,46 @@ int bch2_accounting_update_sb(struct btree_trans *trans) return 0; } -static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +static int 
__bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_accounting_mem *acc = &c->accounting; + + /* raced with another insert, already present: */ + if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p) < acc->k.nr) + return 0; + + struct accounting_mem_entry n = { + .pos = a.k->p, + .version = a.k->version, + .nr_counters = bch2_accounting_counters(a.k), + .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL), + }; + + if (!n.v[0]) + goto err; + + if (acc->gc_running) { + n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!n.v[1]) + goto err; + } + + if (darray_push(&acc->k, n)) + goto err; + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + return 0; +err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); + return -BCH_ERR_ENOMEM_disk_accounting; +} + +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { struct bch_replicas_padded r; @@ -226,49 +265,9 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; - /* raced with another insert, already present: */ - struct bch_accounting_mem *acc = &c->accounting[gc]; - if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p) < acc->k.nr) - return 0; - - unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); - - u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - return -BCH_ERR_ENOMEM_disk_accounting; - - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(acc->v, acc->nr_counters), - acc->nr_counters * sizeof(u64)); - preempt_enable(); - - struct accounting_pos_offset n = { - .pos = a.k->p, - .version = 
a.k->version, - .offset = acc->nr_counters, - .nr_counters = bch2_accounting_counters(a.k), - }; - if (darray_push(&acc->k, n)) { - free_percpu(new_counters); - return -BCH_ERR_ENOMEM_disk_accounting; - } - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); - - free_percpu(acc->v); - acc->v = new_counters; - acc->nr_counters = new_nr_counters; - return 0; -} - -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) -{ percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a, gc); + int ret = __bch2_accounting_mem_insert(c, a); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; @@ -284,7 +283,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, b */ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(usage); @@ -300,7 +299,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) continue; u64 sectors; - bch2_accounting_mem_read(c, i->pos, &sectors, 1); + bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false); u.r.sectors = sectors; ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); @@ -320,7 +319,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(out_buf); @@ -342,7 +341,8 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc bkey_accounting_init((void *) &darray_top(*out_buf)); set_bkey_val_u64s(&a_out->k, i->nr_counters); a_out->k.p = i->pos; - 
bch2_accounting_mem_read_counters(acc, i - acc->k.data, + a_out->v.d, i->nr_counters, false); if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) out_buf->nr += bkey_bytes(&a_out->k); @@ -357,7 +357,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; percpu_down_read(&c->mark_lock); out->atomic++; @@ -369,7 +369,7 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) bch2_accounting_key_to_text(out, &acc_k); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); prt_str(out, ":"); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) @@ -381,81 +381,56 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) percpu_up_read(&c->mark_lock); } -/* Ensures all counters in @src exist in @dst: */ -static int copy_counters(struct bch_accounting_mem *dst, - struct bch_accounting_mem *src) +static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { - unsigned orig_dst_k_nr = dst->k.nr; - unsigned dst_counters = dst->nr_counters; - - darray_for_each(src->k, i) - if (eytzinger0_find(dst->k.data, orig_dst_k_nr, sizeof(dst->k.data[0]), - accounting_pos_cmp, &i->pos) >= orig_dst_k_nr) { - if (darray_push(&dst->k, ((struct accounting_pos_offset) { - .pos = i->pos, - .offset = dst_counters, - .nr_counters = i->nr_counters }))) - goto err; - - dst_counters += i->nr_counters; - } - - if (dst->k.nr == orig_dst_k_nr) - return 0; - - u64 __percpu *new_counters = __alloc_percpu_gfp(dst_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - goto err; - - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(dst->v, dst->nr_counters), - 
dst->nr_counters * sizeof(u64)); - preempt_enable(); - - free_percpu(dst->v); - dst->v = new_counters; - dst->nr_counters = dst_counters; - - eytzinger0_sort(dst->k.data, dst->k.nr, sizeof(dst->k.data[0]), accounting_pos_cmp, NULL); - - return 0; -err: - dst->k.nr = orig_dst_k_nr; - return -BCH_ERR_ENOMEM_disk_accounting; + darray_for_each(acc->k, e) { + free_percpu(e->v[gc]); + e->v[gc] = NULL; + } } -int bch2_accounting_gc_done(struct bch_fs *c) +int bch2_gc_accounting_start(struct bch_fs *c) { - struct bch_accounting_mem *dst = &c->accounting[0]; - struct bch_accounting_mem *src = &c->accounting[1]; + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); + ret = -BCH_ERR_ENOMEM_disk_accounting; + break; + } + } + + acc->gc_running = !ret; + percpu_up_write(&c->mark_lock); + + return ret; +} + +int bch2_gc_accounting_done(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_write(&c->mark_lock); - - ret = copy_counters(dst, src) ?: - copy_counters(src, dst); - if (ret) - goto err; - - BUG_ON(dst->k.nr != src->k.nr); - - for (unsigned i = 0; i < src->k.nr; i++) { - BUG_ON(src->k.data[i].nr_counters != dst->k.data[i].nr_counters); - BUG_ON(!bpos_eq(dst->k.data[i].pos, src->k.data[i].pos)); + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, e) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, src->k.data[i].pos); + bpos_to_disk_accounting_pos(&acc_k, e->pos); - unsigned nr = src->k.data[i].nr_counters; u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, dst_v, nr, false); - bch2_accounting_mem_read_counters(c, i, 
src_v, nr, true); + unsigned idx = e - acc->k.data; + unsigned nr = e->nr_counters; + bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); @@ -497,7 +472,7 @@ int bch2_accounting_gc_done(struct bch_fs *c) } err: fsck_err: - percpu_up_write(&c->mark_lock); + percpu_up_read(&c->mark_lock); printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); @@ -540,7 +515,7 @@ fsck_err: */ int bch2_accounting_read(struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); int ret = for_each_btree_key(trans, iter, @@ -600,7 +575,7 @@ int bch2_accounting_read(struct bch_fs *c) bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: @@ -754,15 +729,20 @@ void bch2_verify_accounting_clean(struct bch_fs *c) WARN_ON(mismatch); } -void bch2_accounting_free(struct bch_accounting_mem *acc) +void bch2_accounting_gc_free(struct bch_fs *c) { - darray_exit(&acc->k); - free_percpu(acc->v); - acc->v = NULL; - acc->nr_counters = 0; + lockdep_assert_held(&c->mark_lock); + + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, true); + acc->gc_running = false; } void bch2_fs_accounting_exit(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[0]); + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, false); + darray_exit(&acc->k); } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 4b37f0c24b4e..81dab01d1eb8 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -108,9 +108,11 
@@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { - struct bch_accounting_mem *acc = &c->accounting[gc]; + struct bch_accounting_mem *acc = &c->accounting; unsigned idx; + EBUG_ON(gc && !acc->gc_running); + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { int ret = bch2_accounting_mem_insert(c, a, gc); @@ -118,12 +120,12 @@ static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_ac return ret; } - unsigned offset = acc->k.data[idx].offset; + struct accounting_mem_entry *e = &acc->k.data[idx]; - EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters); + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(acc->v[offset + i], a.v->d[i]); + this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } @@ -170,37 +172,38 @@ static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey return ret; } -static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned idx, - u64 *v, unsigned nr, bool gc) +static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, + unsigned idx, u64 *v, unsigned nr, bool gc) { memset(v, 0, sizeof(*v) * nr); - struct bch_accounting_mem *acc = &c->accounting[gc]; if (unlikely(idx >= acc->k.nr)) return; - unsigned offset = acc->k.data[idx].offset; - nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters); + struct accounting_mem_entry *e = &acc->k.data[idx]; + + nr = min_t(unsigned, nr, e->nr_counters); for (unsigned i = 0; i < nr; i++) - v[i] = percpu_u64_get(acc->v + offset + i); + v[i] = percpu_u64_get(e->v[gc] + i); } static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { - struct bch_accounting_mem *acc = &c->accounting[0]; 
+ struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); - bch2_accounting_mem_read_counters(c, idx, v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); -int bch2_accounting_gc_done(struct bch_fs *); +int bch2_gc_accounting_start(struct bch_fs *); +int bch2_gc_accounting_done(struct bch_fs *); int bch2_accounting_read(struct bch_fs *); @@ -209,7 +212,7 @@ int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); -void bch2_accounting_free(struct bch_accounting_mem *); +void bch2_accounting_gc_free(struct bch_fs *); void bch2_fs_accounting_exit(struct bch_fs *); #endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h index 5656ac540a10..1687a45177a7 100644 --- a/fs/bcachefs/disk_accounting_types.h +++ b/fs/bcachefs/disk_accounting_types.h @@ -4,17 +4,16 @@ #include "darray.h" -struct accounting_pos_offset { +struct accounting_mem_entry { struct bpos pos; struct bversion version; - u32 offset:24, - nr_counters:8; + unsigned nr_counters; + u64 __percpu *v[2]; }; struct bch_accounting_mem { - DARRAY(struct accounting_pos_offset) k; - u64 __percpu *v; - unsigned nr_counters; + DARRAY(struct accounting_mem_entry) k; + bool gc_running; }; #endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ From f73e6bb6d6c70b72aff021237b8c4722cc43a919 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 13:48:54 -0400 Subject: [PATCH 078/120] bcachefs: bch2_accounting_mem_gc() Add a new helper to free zeroed out accounting entries, and use it in bch2_replicas_gc2(); bch2_replicas_gc2() was killing superblock replicas entries if their 
corresponding accounting counters were nonzero, but that's incorrect - the superblock replicas entry needs to exist if the accounting entry exists, not if it's nonzero, because we check and create the replicas entry when creating the new accounting entry - we don't know when it's becoming nonzero. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 32 ++++++++++++++++++++++++++++++++ fs/bcachefs/disk_accounting.h | 1 + fs/bcachefs/replicas.c | 15 +++++++++------ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 3327d465908d..5b1546d1a23d 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -273,6 +273,38 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, b return ret; } +static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) +{ + for (unsigned i = 0; i < e->nr_counters; i++) + if (percpu_u64_get(e->v[0] + i) || + (e->v[1] && + percpu_u64_get(e->v[1] + i))) + return false; + return true; +} + +void bch2_accounting_mem_gc(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + percpu_down_write(&c->mark_lock); + struct accounting_mem_entry *dst = acc->k.data; + + darray_for_each(acc->k, src) { + if (accounting_mem_entry_is_zero(src)) { + free_percpu(src->v[0]); + free_percpu(src->v[1]); + } else { + *dst++ = *src; + } + } + + acc->k.nr = dst - acc->k.data; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + percpu_up_write(&c->mark_lock); +} + /* * Read out accounting keys for replicas entries, as an array of * bch_replicas_usage entries. 
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 81dab01d1eb8..3d3f25e08b69 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -105,6 +105,7 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) } int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); +void bch2_accounting_mem_gc(struct bch_fs *); static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 06f6d48f74c0..10c96cb2047a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -420,10 +420,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; - unsigned i, nr; + unsigned nr; int ret = 0; - bch2_journal_meta(&c->journal); + bch2_accounting_mem_gc(c); retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); @@ -444,7 +444,7 @@ retry: goto retry; } - for (i = 0; i < c->replicas.nr; i++) { + for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); @@ -454,10 +454,13 @@ retry: memcpy(&k.replicas, e, replicas_entry_bytes(e)); - u64 v = 0; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), &v, 1); + struct bpos p = disk_accounting_pos_to_bpos(&k); - if (e->data_type == BCH_DATA_journal || v) + struct bch_accounting_mem *acc = &c->accounting; + bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p) >= acc->k.nr; + + if (e->data_type == BCH_DATA_journal || !kill) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } From 820b9efeb142a45b2c55df0806feb34936025c2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 14:33:27 -0400 Subject: [PATCH 079/120] bcachefs: Fix bch2_gc_accounting_done() locking The transaction 
commit path takes mark_lock, so we shouldn't be holding it; use a bpos as an iterator so that we can drop and retake. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 18 ++++++++++++++---- fs/bcachefs/eytzinger.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 5b1546d1a23d..dcdd59249c23 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -448,18 +448,26 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; + struct bpos pos = POS_MIN; int ret = 0; - percpu_down_read(&c->mark_lock); + percpu_down_write(&c->mark_lock); + while (1) { + unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &pos); + + if (idx >= acc->k.nr) + break; + + struct accounting_mem_entry *e = acc->k.data + idx; + pos = bpos_successor(e->pos); - darray_for_each(acc->k, e) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - unsigned idx = e - acc->k.data; unsigned nr = e->nr_counters; bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); @@ -481,8 +489,10 @@ int bch2_gc_accounting_done(struct bch_fs *c) src_v[j] -= dst_v[j]; if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + percpu_down_write(&c->mark_lock); if (ret) goto err; @@ -504,7 +514,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) } err: fsck_err: - percpu_up_read(&c->mark_lock); + percpu_up_write(&c->mark_lock); printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); diff --git 
a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 795f4fc0bab1..0541192d7bc0 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -286,6 +286,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, return eytzinger0_next(idx, nr); } +static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + if (idx < nr && !cmp(base + idx * size, search)) + return idx; + + return eytzinger0_next(idx, nr); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ From ae4fb17e86701c55da6867ded662d1b7aef40f12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jun 2024 20:51:57 -0400 Subject: [PATCH 080/120] bcachefs: Kill gc_pos_btree_node() gc_pos is now based on keys, not nodes, for invariantness w.r.t. splits and merges Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.h | 9 --------- fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 1bdf841dc44b..7f8855420c19 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -47,15 +47,6 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, }; } -/* - * GC position of the pointers within a btree node: note, _not_ for &b->key - * itself, that lives in the parent node: - */ -static inline struct gc_pos gc_pos_btree_node(struct btree *b) -{ - return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); -} - static inline int gc_btree_order(enum btree_id btree) { if (btree == BTREE_ID_alloc) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 843558d96887..8ab85f212f60 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -599,7 +599,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { 
trans_for_each_update(trans, i) if (btree_node_type_has_triggers(i->bkey_type) && - gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; From 11169d9983d55f1550ba2d65e074128dd0373937 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jun 2024 18:19:39 -0400 Subject: [PATCH 081/120] bcachefs: bch2_btree_id_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 ++++++++ fs/bcachefs/btree_cache.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6a9b248217a0..f5d85b50b6f2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1257,6 +1257,14 @@ const char *bch2_btree_id_str(enum btree_id btree) return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; } +void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +{ + if (btree < BTREE_ID_NR) + prt_str(out, __bch2_btree_ids[btree]); + else + prt_printf(out, "(unknown btree %u)", btree); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { prt_printf(out, "%s level %u/%u\n ", diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index fed35de3e4de..c0eb87a057cc 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -132,6 +132,8 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) } const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_id_to_text(struct printbuf *, enum btree_id); + void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); From dd3995a6a4fcf041297d73fb7a12c6edde339986 Mon Sep 17 00:00:00 2001 
From: Kent Overstreet Date: Fri, 7 Jun 2024 20:53:02 -0400 Subject: [PATCH 082/120] bcachefs: bch2_gc_pos_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 16 ++++++++++++++++ fs/bcachefs/btree_gc.h | 2 ++ fs/bcachefs/btree_gc_types.h | 13 +++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dfc842d6e822..b712620a1703 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -45,6 +45,22 @@ #define DROP_PREV_NODE 11 #define DID_FILL_FROM_SCAN 12 +static const char * const bch2_gc_phase_strs[] = { +#define x(n) #n, + GC_PHASES() +#undef x + NULL +}; + +void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) +{ + prt_str(out, bch2_gc_phase_strs[p->phase]); + prt_char(out, ' '); + bch2_btree_id_to_text(out, p->btree); + prt_printf(out, " l=%u ", p->level); + bch2_bpos_to_text(out, p->pos); +} + static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { return (struct bkey_s) {{{ diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 7f8855420c19..8a47e8bd0791 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -78,6 +78,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } +void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); + int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); void bch2_fs_gc_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h index b82c24bcc088..c24dd6edf377 100644 --- a/fs/bcachefs/btree_gc_types.h +++ b/fs/bcachefs/btree_gc_types.h @@ -4,11 +4,16 @@ #include +#define GC_PHASES() \ + x(not_running) \ + x(start) \ + x(sb) \ + x(btree) + enum gc_phase { - GC_PHASE_not_running, - GC_PHASE_start, - GC_PHASE_sb, - GC_PHASE_btree, +#define x(n) GC_PHASE_##n, + GC_PHASES() +#undef x }; struct gc_pos { From 71fdc0b5a678766e874c5001e8bc83bde5931655 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: 
Sat, 8 Jun 2024 15:25:12 -0400 Subject: [PATCH 083/120] bcachefs: btree_node_unlock() assert we have a separate helper for releasing write locks Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 75a6274c7d27..8f5f1973c7d8 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -136,6 +136,7 @@ static inline void btree_node_unlock(struct btree_trans *trans, int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); From c30402e5483df785a5319ffe07127bfd7238d8d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 15:24:14 -0400 Subject: [PATCH 084/120] bcachefs: btree_path_cached_set() new helper - small refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 50 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 2d3c0d45c37f..8b2fd0ae6028 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -39,6 +39,15 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { .automatic_shrinking = true, }; +static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, + struct bkey_cached *ck, + enum btree_node_locked_type lock_held) +{ + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; + mark_btree_node_locked(trans, path, 0, lock_held); +} + __flatten inline struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) @@ -259,9 +268,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ERR_PTR(ret); } - path->l[0].b = (void *) ck; - 
path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -489,7 +496,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -507,12 +514,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; fill: path->uptodate = BTREE_ITER_UPTODATE; @@ -559,30 +562,25 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { + if (!ck) return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (ret) - return ret; + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } + if (ret) + return ret; - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } - 
path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); fill: if (!ck->valid) return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); From 385f0c05d670a252943299506750ed90a4843861 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 16:46:58 -0400 Subject: [PATCH 085/120] bcachefs: kill key cache arg to bch2_assert_pos_locked() this is an internal implementation detail - and we're improving key cache coherency Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 27 +++++++++++++-------------- fs/bcachefs/btree_iter.h | 5 ++--- fs/bcachefs/fs.c | 4 +--- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 849a397bb919..756a438f46a9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -325,7 +325,7 @@ out: } void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) + struct bpos pos) { bch2_trans_verify_not_unlocked(trans); @@ -336,19 +336,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, iter) { - int cmp = cmp_int(path->btree_id, id) ?: - cmp_int(path->cached, key_cache); - - if (cmp > 0) - break; - if (cmp < 0) - continue; - - if (!btree_node_locked(path, 0) || + if (path->btree_id != id || + !btree_node_locked(path, 0) || !path->should_be_locked) continue; - if (!key_cache) { + if (!path->cached) { if (bkey_ge(pos, path->l[0].b->data->min_key) && bkey_le(pos, path->l[0].b->key.k.p)) return; @@ -361,9 +354,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_dump_trans_paths_updates(trans); bch2_bpos_to_text(&buf, pos); - panic("not locked: %s %s%s\n", - bch2_btree_id_str(id), buf.buf, - key_cache ? 
" cached" : ""); + panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } #else @@ -1482,6 +1473,14 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra path->level); bch2_bpos_to_text(out, path->pos); + if (!path->cached && btree_node_locked(path, path->level)) { + prt_char(out, ' '); + struct btree *b = path_l(path)->b; + bch2_bpos_to_text(out, b->data->min_key); + prt_char(out, '-'); + bch2_bpos_to_text(out, b->key.k.p); + } + #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bdb3cd2ef98a..c7725865309c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -268,12 +268,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, - struct bpos, bool); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) {} + struct bpos pos) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3dd60aa3c3de..3483d34c3be1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -58,9 +58,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, BUG_ON(bi->bi_inum != inode->v.i_ino); - bch2_assert_pos_locked(trans, BTREE_ID_inodes, - POS(0, bi->bi_inum), - c->opts.inodes_use_key_cache); + bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); From 63567f643a3e2cdc54d0e190a7a14c01c523b71c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 10 
Jun 2024 08:26:39 -0400 Subject: [PATCH 086/120] MAINTAINERS: remove Brian Foster as a reviewer for bcachefs Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 958e935449e5..294501c6a451 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3649,7 +3649,6 @@ F: drivers/md/bcache/ BCACHEFS M: Kent Overstreet -R: Brian Foster L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache From 132e1a2380d06c31a17c773aac6c676658b9686a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jun 2024 14:11:48 -0400 Subject: [PATCH 087/120] bcachefs: per_cpu_sum() Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index cd09edd12d8a..2def4f761ca6 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -698,14 +698,19 @@ do { \ } \ } while (0) +#define per_cpu_sum(_p) \ +({ \ + typeof(*_p) _ret = 0; \ + \ + int cpu; \ + for_each_possible_cpu(cpu) \ + _ret += *per_cpu_ptr(_p, cpu); \ + _ret; \ +}) + static inline u64 percpu_u64_get(u64 __percpu *src) { - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(src, cpu); - return ret; + return per_cpu_sum(src); } static inline void percpu_u64_set(u64 __percpu *dst, u64 src) From b0d3ab531f07b6fc22ed9b84e7b9a5ff9be90df9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jun 2024 17:07:36 -0400 Subject: [PATCH 088/120] bcachefs: Reduce the scope of gc_lock gc_lock is now only for synchronization between check_alloc_info and interior btree updates - nothing else Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/btree_gc.c | 7 ++++--- fs/bcachefs/buckets.c | 2 -- fs/bcachefs/buckets.h | 4 ++-- fs/bcachefs/recovery_passes.c | 4 ++++ 6 files changed, 12 insertions(+), 12 deletions(-) diff 
--git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 23e4aa9baa3a..54e066ee8dca 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -596,8 +596,6 @@ int bch2_alloc_read(struct bch_fs *c) struct bch_dev *ca = NULL; int ret; - down_read(&c->gc_lock); - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, ({ @@ -646,7 +644,6 @@ int bch2_alloc_read(struct bch_fs *c) bch2_dev_put(ca); bch2_trans_put(trans); - up_read(&c->gc_lock); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ea4bf11fb8dd..91361a167dcd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -538,8 +538,8 @@ struct bch_dev { /* * Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for dev_ptr_stale(): + * gc_gens_lock, for device resize - holding any is sufficient for + * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b712620a1703..6cbf2aa6a947 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1240,7 +1240,7 @@ int bch2_gc_gens(struct bch_fs *c) int ret; /* - * Ideally we would be using state_lock and not gc_lock here, but that + * Ideally we would be using state_lock and not gc_gens_lock here, but that * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ @@ -1248,7 +1248,8 @@ int bch2_gc_gens(struct bch_fs *c) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->gc_lock); + + down_read(&c->state_lock); for_each_member_device(c, ca) { struct bucket_gens *gens = 
bucket_gens(ca); @@ -1317,7 +1318,7 @@ err: ca->oldest_gen = NULL; } - up_read(&c->gc_lock); + up_read(&c->state_lock); mutex_unlock(&c->gc_gens_lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 42fd77fe1fe8..95e27995875a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1217,7 +1217,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { - down_write(&c->gc_lock); down_write(&ca->bucket_lock); percpu_down_write(&c->mark_lock); } @@ -1240,7 +1239,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { percpu_up_write(&c->mark_lock); up_write(&ca->bucket_lock); - up_write(&c->gc_lock); } ret = 0; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index fc6359f84e82..2d35eeb24a2d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) return rcu_dereference_check(ca->buckets_gc, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } @@ -103,7 +103,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) return rcu_dereference_check(ca->bucket_gens, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 4a59f52f8d56..73339a0a3111 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -193,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; + down_read(&c->state_lock); + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { struct recovery_pass_fn *p 
= recovery_pass_fns + i; @@ -208,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) break; } + up_read(&c->state_lock); + return ret; } From 2b02b9552c7839840e4ce05244d20738515ab05c Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 14 Jun 2024 10:50:30 +0000 Subject: [PATCH 089/120] bcachefs: use FGP_WRITEBEGIN instead of combining individual flags Use FGP_WRITEBEGIN to avoid repeating the individual FGP flags before starting a buffered write. Signed-off-by: Pankaj Raghav Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 54873ecc635c..865691dd0173 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -677,9 +677,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); if (IS_ERR_OR_NULL(folio)) goto err_unlock; @@ -820,9 +819,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN, + mapping_gfp_mask(mapping), &fs); if (ret) goto out; From febc33cb352afb8c8dc87286635c35cc644fbdb9 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 14 Jun 2024 10:50:31 +0000 Subject: [PATCH 090/120] bcachefs: set fgf order hint before starting a buffered write Set the preferred folio order in the fgp_flags by calling fgf_set_order(). Page cache will try to allocate large folio of the preferred order whenever possible instead of allocating multiple 0 order folios. 
This improves the buffered write performance up to 1.25x with default mount options and up to 1.57x when mounted with no_data_io option with the following fio workload: fio --name=bcachefs --filename=/mnt/test --size=100G \ --ioengine=io_uring --iodepth=16 --rw=write --bs=128k Signed-off-by: Pankaj Raghav Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 865691dd0173..1355d618f988 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -677,7 +677,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); if (IS_ERR_OR_NULL(folio)) goto err_unlock; @@ -819,7 +820,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_WRITEBEGIN, + FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping), &fs); if (ret) goto out; From 789566da258f27ec5bffd4ae685306c71d7b4809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 15:20:53 -0400 Subject: [PATCH 091/120] bcachefs: bch2_btree_key_cache_drop() now evicts As part of improving btree key cache coherency, the bkey_cached.valid flag is going away. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 7 ++++++- fs/bcachefs/btree_trans_commit.c | 11 +++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 8b2fd0ae6028..88ccf8dbb5a9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -792,6 +792,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; BUG_ON(!ck->valid); @@ -806,7 +807,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bch2_journal_pin_drop(&c->journal, &ck->journal); } - ck->valid = false; + bkey_cached_evict(bc, ck); + bkey_cached_free_fast(bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 8ab85f212f60..cca336fe46e9 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -137,7 +137,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_node_locked_type(trans->paths + i->path, i->level) == + BTREE_NODE_WRITE_LOCKED) bch2_btree_node_unlock_write_inlined(trans, trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; @@ -777,14 +778,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; - if (!i->cached) { + if (!i->cached) bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - } else if (!i->key_cache_already_flushed) + else if 
(!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); - else { + else bch2_btree_key_cache_drop(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - } } return 0; From 7aa7183e00d92539c61b3a01a7ebf676b0ecb91e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 09:00:11 -0400 Subject: [PATCH 092/120] bcachefs: split out lru_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 13 +------------ fs/bcachefs/lru.h | 12 ------------ fs/bcachefs/lru_format.h | 25 +++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) create mode 100644 fs/bcachefs/lru_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8d8444fa0ec2..74a60b1a4ddf 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -468,18 +468,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -/* LRU btree: */ - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define LRU_ID_STRIPES (1U << 16) - -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -516,6 +504,7 @@ struct bch_sb_field { #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "lru_format.h" #include "quota_format.h" #include "reflink_format.h" #include "replicas_format.h" diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index ed75bcf59d47..5bd8974a7f11 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -24,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) - static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = 
l.k->p.inode >> 48; diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h new file mode 100644 index 000000000000..f372cb3b8cda --- /dev/null +++ b/fs/bcachefs/lru_format.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_FORMAT_H +#define _BCACHEFS_LRU_FORMAT_H + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +#endif /* _BCACHEFS_LRU_FORMAT_H */ From 95924420b038a0d025c4d16c75be2a858e7c09df Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 20 Jun 2024 21:21:12 +0800 Subject: [PATCH 093/120] bcachefs: support STATX_DIOALIGN for statx file Add support for STATX_DIOALIGN to bcachefs, so that direct I/O alignment restrictions are exposed to userspace in a generic way. 
[Before] ``` ./statx_test /mnt/bcachefs/test statx(/mnt/bcachefs/test) = 0 dio mem align:0 dio offset align:0 ``` [After] ``` ./statx_test /mnt/bcachefs/test statx(/mnt/bcachefs/test) = 0 dio mem align:1 dio offset align:512 ``` Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3483d34c3be1..b734d91c4446 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -895,6 +895,16 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->subvol = inode->ei_subvol; stat->result_mask |= STATX_SUBVOL; + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + /* + * this is incorrect; we should be tracking this in superblock, + * and checking the alignment of open devices + */ + stat->dio_mem_align = SECTOR_SIZE; + stat->dio_offset_align = block_bytes(c); + } + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); From 7554a8bb6ddeeca87fa8abd1d9766111477a6643 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 17:29:39 -0400 Subject: [PATCH 094/120] bcachefs: Ensure buffered writes write as much as they can This adds a new helper, bch2_folio_reservation_get_partial(), which reserves as many blocks as possible and may return partial success. __bch2_buffered_write() is switched to the new helper - this fixes fstests generic/275, the write until -ENOSPC test. generic/230 now fails: this appears to be a test bug, where xfs_io isn't looping after a partial write to get the error code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 30 +++++++++++++++------------- fs/bcachefs/fs-io-pagecache.c | 37 +++++++++++++++++++++++++++-------- fs/bcachefs/fs-io-pagecache.h | 7 ++++++- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 1355d618f988..cc33d763f722 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -863,24 +863,26 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { + ssize_t f_reserved; + f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; + f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&fs, fi); - if (!fs.nr) - goto out; + if (unlikely(f_reserved != f_len)) { + if (f_reserved < 0) { + if (f == darray_first(fs)) { + ret = f_reserved; + goto out; + } + + folios_trunc(&fs, fi); + end = min(end, folio_end_pos(darray_last(fs))); + } else { + folios_trunc(&fs, fi + 1); + end = f_pos + f_reserved; + } - end = min(end, folio_end_pos(darray_last(fs))); break; } diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 872283e5bd1e..a9cc5cad9cc9 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -423,7 +423,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - unsigned offset, unsigned len) + size_t offset, size_t len) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, 
disk_sectors = 0, quota_sectors = 0; @@ -437,8 +437,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); quota_sectors += s->s[i].state == SECTOR_unallocated; } @@ -449,12 +448,9 @@ int bch2_folio_reservation_get(struct bch_fs *c, } if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); + ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; + struct disk_reservation tmp = { .sectors = disk_sectors }; bch2_disk_reservation_put(c, &tmp); res->disk.sectors -= disk_sectors; @@ -465,6 +461,31 @@ int bch2_folio_reservation_get(struct bch_fs *c, return 0; } +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + size_t l, reserved = 0; + int ret; + + while ((l = len - reserved)) { + while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) { + if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c)) + return reserved ?: ret; + + len = reserved + l; + l /= 2; + } + + offset += l; + reserved += l; + } + + return reserved; +} + static void bch2_clear_folio_bits(struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 828c3d7c8f19..fd7d692c087e 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -153,7 +153,12 @@ int bch2_folio_reservation_get(struct bch_fs *, struct bch_inode_info *, struct folio *, struct bch2_folio_reservation *, - unsigned, unsigned); + size_t, size_t); +ssize_t 
bch2_folio_reservation_get_partial(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + size_t, size_t); void bch2_set_folio_dirty(struct bch_fs *, struct bch_inode_info *, From e0d5bc6a66182dd68e117638c04a6794f2604331 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 18:48:22 -0400 Subject: [PATCH 095/120] bcachefs: Fix missing BTREE_TRIGGER_bucket_invalidate flag This fixes an accounting mismatch for cached data. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 7 ++++--- fs/bcachefs/alloc_background.h | 3 ++- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/ec.c | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54e066ee8dca..d9c5a92fa708 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -477,7 +477,8 @@ err: } __flatten -struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos) +struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); @@ -485,7 +486,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, flags); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? 
ERR_PTR(ret) : a; } @@ -2006,7 +2007,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket); + a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index dd7d14363a68..8d2b62c9588e 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -206,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct bpos); +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, + enum btree_iter_update_trigger_flags); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 95e27995875a..2650a0d24663 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -569,7 +569,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, *sectors = insert ? 
bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); if (ret) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 3c3a2a7e8389..86948d110f6b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -283,7 +283,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, bucket); + bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); } From 39d5d8290cd4ae709b5c9625f991a2b028234315 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 02:13:44 -0400 Subject: [PATCH 096/120] bcachefs: Improve "unable to allocate journal write" message Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bf3433cc78be..7a833a3f1c63 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2034,8 +2034,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), - bch2_err_str(ret)); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + le64_to_cpu(w->data->seq), + bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); From d2cb6b219d37284d78deeac1be8eb9d7670eebd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 17:49:11 -0400 Subject: [PATCH 097/120] bcachefs: Simplify btree key cache fill path Don't allocate the new 
bkey_cached until after we've done the btree lookup; this means we can kill bkey_cached.valid. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +- fs/bcachefs/btree_key_cache.c | 323 +++++++++++++--------------------- fs/bcachefs/btree_types.h | 1 - 3 files changed, 125 insertions(+), 208 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 756a438f46a9..9485208b6758 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1800,13 +1800,12 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; - - EBUG_ON(ck && - (path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos))); - if (!ck || !ck->valid) + if (!ck) return bkey_s_c_null; + EBUG_ON(path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos)); + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 88ccf8dbb5a9..f2f2e525460b 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -205,9 +205,22 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) +{ + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); + if (unlikely(!ck)) + return NULL; + ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + if (unlikely(!ck->k)) { + kmem_cache_free(bch2_key_cache, ck); + return NULL; + } + ck->u64s = key_u64s; + return ck; +} + static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, - bool *was_new) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; @@ -281,8 +294,10 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } ck = 
allocate_dropping_locks(trans, ret, - kmem_cache_zalloc(bch2_key_cache, _gfp)); + __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { + if (ck) + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); } @@ -296,7 +311,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; return ck; } @@ -326,71 +340,102 @@ out: return ck; } -static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck; - bool was_new = false; - ck = bkey_cached_alloc(trans, path, &was_new); - if (IS_ERR(ck)) - return ck; + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + unsigned key_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + + struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(path->btree_id)); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + return -BCH_ERR_ENOMEM_btree_key_cache_create; } - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; ck->c.btree_id = path->btree_id; ck->key.btree_id = path->btree_id; ck->key.pos = path->pos; - ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if 
(unlikely(rhashtable_lookup_insert_fast(&bc->table, - &ck->hash, - bch2_btree_key_cache_params))) { - /* We raced with another fill: */ + if (unlikely(key_u64s > ck->u64s)) { + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - if (likely(was_new)) { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - kfree(ck); - } else { - bkey_cached_free_fast(bc, ck); + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); + if (unlikely(!new_k)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + } else if (ret) { + kfree(new_k); + goto err; } - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - return NULL; + kfree(ck->k); + ck->k = new_k; + ck->u64s = key_u64s; } - atomic_long_inc(&bc->nr_keys); + bkey_reassemble(ck->k, k); + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + if (unlikely(ret)) /* raced with another fill? 
*/ + goto err; + + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - return ck; + enum six_lock_type lock_want = __btree_lock_want(path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; +err: + bkey_cached_free_fast(bc, ck); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + + return ret; } -static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_cached *ck) +static noinline int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + unsigned flags) { + if (flags & BTREE_ITER_cached_nofill) { + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; + } + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned new_u64s = 0; - struct bkey_i *new_k = NULL; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -399,70 +444,15 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + /* Recheck after btree lookup, before allocating: */ + ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; + if (unlikely(ret)) + goto out; + + ret = btree_key_cache_create(trans, ck_path, k); + if (ret) goto err; - } - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - new_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - new_u64s = min(256U, (new_u64s * 3) / 2); - - if (new_u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (!new_k) { - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); - if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), new_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } - - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - } - } - - ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); - if (ret) { - kfree(new_k); - goto err; - } - - if (new_k) { - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - } - - bkey_reassemble(ck->k, k); - ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); - +out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); err: @@ -470,107 +460,19 @@ err: return ret; } -static noinline int -bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, + struct btree_path *path) { struct bch_fs *c = 
trans->c; struct bkey_cached *ck; - int ret = 0; - - BUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - ck = btree_key_cache_create(trans, path); - ret = PTR_ERR_OR_ZERO(ck); - if (ret) - goto err; - if (!ck) - goto retry; - - btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); - path->locks_want = 1; - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - BUG_ON(ret); - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - } -fill: - path->uptodate = BTREE_ITER_UPTODATE; - - if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { - ret = bch2_btree_path_upgrade(trans, path, 1) ?: - btree_key_cache_fill(trans, path, ck) ?: - bch2_btree_path_relock(trans, path, _THIS_IP_); - if (ret) - goto err; - - path->uptodate = BTREE_ITER_UPTODATE; - } - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - BUG_ON(path->uptodate); - - return ret; -err: - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - return ret; -} - -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - int ret = 0; - - EBUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 
0)) { - ck = (void *) path->l[0].b; - goto fill; - } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + return -ENOENT; enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - + int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); if (ret) return ret; @@ -580,18 +482,40 @@ retry: goto retry; } - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); -fill: - if (!ck->valid) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); path->uptodate = BTREE_ITER_UPTODATE; - EBUG_ON(!ck->valid); - EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + return 0; +} +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + path->uptodate = BTREE_ITER_UPTODATE; + return 0; + } + + int ret; + do { + ret = btree_path_traverse_cached_fast(trans, path); + if (unlikely(ret == -ENOENT)) + ret = btree_key_cache_fill(trans, path, flags); + } while (ret == -EEXIST); + + if (unlikely(ret)) { + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); + } + } return ret; } @@ -630,8 +554,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; } - BUG_ON(!ck->valid); - if (journal_seq && ck->journal.seq != journal_seq) goto out; @@ -753,7 +675,6 @@ bool bch2_btree_insert_key_cached(struct 
btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); bkey_copy(ck->k, insert); - ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -795,8 +716,6 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; - BUG_ON(!ck->valid); - /* * We just did an update to the btree, bypassing the key cache: the key * cache key is now stale and must be dropped, even if dirty: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 8d06ea56919c..4fe77d7f7242 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -388,7 +388,6 @@ struct bkey_cached { unsigned long flags; unsigned long btree_trans_barrier_seq; u16 u64s; - bool valid; struct bkey_cached_key key; struct rhash_head hash; From 0f3372dcee6212ad49a8c32ea3ca80b91f36b9d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 13:36:00 -0400 Subject: [PATCH 098/120] bcachefs: spelling fix Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index c6197e6aa0b8..335e66222869 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1080,7 +1080,7 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + bch_err(c, "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", op->flags & BCH_WRITE_MOVE ? 
"move" : "user"); ret = -EIO; err: From 5e3c208325189df6d0c08307cbadb2a03b2fbe2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 13:51:38 -0400 Subject: [PATCH 099/120] bcachefs: Ratelimit checksum error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 5 +++-- fs/bcachefs/io_write.c | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3bd3aba90d8f..e7208bf1974e 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -436,7 +437,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type ", + " expected %0llx:%0llx got %0llx:%0llx (old type ", __func__, crc_old.csum.hi, crc_old.csum.lo, @@ -446,7 +447,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, " new type "); bch2_prt_csum_type(&buf, new_csum_type); prt_str(&buf, ")"); - bch_err(c, "%s", buf.buf); + WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 335e66222869..b3b05e9392ae 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1080,7 +1080,10 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", op->flags & BCH_WRITE_MOVE ? 
"move" : "user"); ret = -EIO; err: From 9d9d212e26399c04c567c232f500179cbdc8dc7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 16:25:39 -0400 Subject: [PATCH 100/120] bcachefs: bch2_extent_crc_unpacked_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 21 ++++++++++++++------- fs/bcachefs/extents.h | 2 ++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 410b8bd81b5a..057df38fccf8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1034,6 +1034,18 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc --out->atomic; } +void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) +{ + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", + crc->compressed_size, + crc->uncompressed_size, + crc->offset, crc->nonce); + bch2_prt_csum_type(out, crc->csum_type); + prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); + prt_str(out, " compress "); + bch2_prt_compression_type(out, crc->compression_type); +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1059,13 +1071,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce); - bch2_prt_csum_type(out, crc.csum_type); - prt_str(out, " compress "); - bch2_prt_compression_type(out, crc.compression_type); + bch2_extent_crc_unpacked_to_text(out, &crc); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1096,6 +1102,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } + static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags, diff --git a/fs/bcachefs/extents.h 
b/fs/bcachefs/extents.h index 1ade959652b2..530686aa6fd9 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -212,6 +212,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); } +void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { From b1d63b06e8398eb048dcc455acc628e6655d7499 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 18:10:47 -0400 Subject: [PATCH 101/120] bcachefs: Make read_only a mount option again, but hidden fsck passes read_only as a mount option, and it's required for nochanges, which it also uses. Usually read_only is handled by the VFS, but we need to be able to handle it too; we just don't want to print it out twice, so mark it as a hidden option. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 13 ------------- fs/bcachefs/fs.c | 3 ++- fs/bcachefs/opts.h | 3 ++- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 72ade3664d7b..ef1f74866e23 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -214,19 +214,6 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - char *ro, *rest; - - /* - * If passed a "read_only" mount option, remove it because it is - * no longer a valid mount option, and the filesystem will be - * set "read_only" regardless. 
- */ - ro = strstr(optstr, "read_only"); - if (ro) { - rest = ro + strlen("read_only"); - memmove(ro, rest, strlen(rest) + 1); - } - ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); if (!IS_ERR(optstr)) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b734d91c4446..24fba256eb8b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1800,7 +1800,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->flags & OPT_MOUNT)) + if ((opt->flags & OPT_HIDDEN) || + !(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 840dfd756760..60b93018501f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -63,6 +63,7 @@ enum opt_flags { OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ + OPT_HIDDEN = (1 << 10), }; enum opt_type { @@ -406,7 +407,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS, \ + OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ From a2cb8a6236daafbea5e3d9d720f5e55ba692817b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 13:28:30 -0400 Subject: [PATCH 102/120] bcachefs: Self healing on read IO error This repurposes the promote path, which already knows how to call data_update() after a read: we now automatically rewrite bad data when we get a read error and then successfully retry from a different replica. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++--- fs/bcachefs/extents.h | 2 ++ fs/bcachefs/io_read.c | 69 +++++++++++++++++++++++++++++-------------- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 057df38fccf8..07973198e35f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -37,8 +37,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); -static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, - unsigned dev) +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, + unsigned dev) { struct bch_dev_io_failures *i; @@ -52,7 +52,7 @@ static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, void bch2_mark_io_failure(struct bch_io_failures *failed, struct extent_ptr_decoded *p) { - struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); if (!f) { BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); @@ -140,7 +140,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; if (f) p.idx = f->nr_failed < f->nr_retries ? 
f->idx diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 530686aa6fd9..facdb8a86eec 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -399,6 +399,8 @@ out: \ /* utility code common to all keys with pointers: */ +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, + unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index ebf39ef72fb2..8b484c75757c 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -93,21 +93,24 @@ static const struct rhashtable_params bch_promote_params = { static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, - unsigned flags) + unsigned flags, + struct bch_io_failures *failed) { - BUG_ON(!opts.promote_target); + if (!failed) { + BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) - return -BCH_ERR_nopromote_may_not; + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; - if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) @@ -164,7 +167,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned sectors, - struct bch_read_bio **rbio) + struct bch_read_bio **rbio, + struct bch_io_failures 
*failed) { struct bch_fs *c = trans->c; struct promote_op *op = NULL; @@ -217,14 +221,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + struct data_update_opts update_opts = {}; + + if (!failed) { + update_opts.target = opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; + } else { + update_opts.target = opts.foreground_target; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev)) + update_opts.rewrite_ptrs |= BIT(i); + i++; + } + } + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, + update_opts, btree_id, k); /* * possible errors: -BCH_ERR_nocow_lock_blocked, @@ -258,10 +276,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, unsigned flags, struct bch_read_bio **rbio, bool *bounce, - bool *read_full) + bool *read_full, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* + * if failed != NULL we're not actually doing a promote, we're + * recovering from an io/checksum error + */ + bool promote_full = (failed || + *read_full || + READ_ONCE(c->promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? 
max(pick->crc.compressed_size, pick->crc.live_size) @@ -272,7 +297,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags); + ret = should_promote(c, k, pos, opts, flags, failed); if (ret) goto nopromote; @@ -280,7 +305,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); + k, pos, pick, opts, sectors, rbio, failed); ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -910,9 +935,9 @@ retry_pick: bounce = true; } - if (orig->opts.promote_target) + if (orig->opts.promote_target)// || failed) promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + &rbio, &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); From ec8bf491a9008b8db97076ba7a6905edb4537bb9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 11:43:23 -0400 Subject: [PATCH 103/120] bcachefs: Improve startup message We're not always mounting when we start the filesystem Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c22a8ef2d2e1..0455a1001fec 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -968,7 +968,7 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; - prt_str(&p, "mounting version "); + prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { From 27d033df35154256e0063450d651000bd1b36d72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 18:08:20 -0400 Subject: [PATCH 104/120] bcachefs: Convert clock code to u64s Eliminate possible integer truncation bugs on 32 bit Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 65 
+++++++++++++++++---------------------- fs/bcachefs/clock.h | 9 +++--- fs/bcachefs/clock_types.h | 3 +- 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 0f40b585ce2b..df3763c18c0e 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -15,18 +15,15 @@ static inline long io_timer_cmp(io_timer_heap *h, void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic64_read(&clock->now), - timer->expire)) { + if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) goto out; @@ -37,11 +34,9 @@ out: void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { heap_del(&clock->timers, i, io_timer_cmp, NULL); break; @@ -75,33 +70,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer) wake_up_process(wait->task); } -void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = (void *) _RET_IP_, + .task = current, + }; - /* XXX: calculate sleep time rigorously */ - wait.io_timer.expire = until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, - unsigned long io_until, - unsigned long cpu_timeout) 
+ u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = io_until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = (void *) _RET_IP_, + .task = current, + }; - wait.io_timer.expire = io_until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); @@ -127,21 +120,20 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_del(clock, &wait.io_timer); } -static struct io_timer *get_expired_timer(struct io_clock *clock, - unsigned long now) +static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; if (clock->timers.used && - time_after_eq(now, clock->timers.data[0]->expire)) + time_after_eq64(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); return ret; } -void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) +void __bch2_increment_clock(struct io_clock *clock, u64 sectors) { struct io_timer *timer; - unsigned long now = atomic64_add_return(sectors, &clock->now); + u64 now = atomic64_add_return(sectors, &clock->now); spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) @@ -151,17 +143,18 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - unsigned long now; - unsigned i; - out->atomic++; spin_lock(&clock->timer_lock); - now = atomic64_read(&clock->now); + u64 now = atomic64_read(&clock->now); - for (i = 0; i < clock->timers.used; i++) - prt_printf(out, "%ps:\t%li\n", + printbuf_tabstop_push(out, 40); + prt_printf(out, "current time:\t%llu\n", now); + + for (unsigned i = 0; i < clock->timers.used; i++) + prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, - 
clock->timers.data[i]->expire - now); + clock->timers.data[i]->fn2, + clock->timers.data[i]->expire); spin_unlock(&clock->timer_lock); --out->atomic; } diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 70a0f7436c84..85c975dfbcfe 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,12 +4,11 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); -void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, - unsigned long); +void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); -void __bch2_increment_clock(struct io_clock *, unsigned); +void __bch2_increment_clock(struct io_clock *, u64); -static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, +static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, int rw) { struct io_clock *clock = &c->io_clock[rw]; @@ -19,7 +18,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } -void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); +void bch2_io_clock_schedule_timeout(struct io_clock *, u64); #define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ({ \ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 5fae0012d808..9c25d0fcf294 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -17,7 +17,8 @@ typedef void (*io_timer_fn)(struct io_timer *); struct io_timer { io_timer_fn fn; - unsigned long expire; + void *fn2; + u64 expire; }; /* Amount to buffer up on a percpu counter */ From 8f523d425e0255a68d2f47f9b21db4f684c185c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 16:04:40 -0400 Subject: [PATCH 105/120] bcachefs: Improve copygc_wait_to_text() printing the raw values can occasionally be very useful Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 11 ++++++++--- 1 file 
changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index eb49dd045eff..deef4f024d20 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -290,18 +290,23 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "Currently waiting for: "); + printbuf_tabstop_push(out, 32); + prt_printf(out, "running:\t%u\n", c->copygc_running); + prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); + prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); + + prt_printf(out, "Currently waiting for:\t"); prt_human_readable_u64(out, max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); prt_newline(out); - prt_printf(out, "Currently waiting since: "); + prt_printf(out, "Currently waiting since:\t"); prt_human_readable_u64(out, max(0LL, atomic64_read(&c->io_clock[WRITE].now) - c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait: "); + prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); } From 1a616c2fe96b357894b74b41787d4ea6987f6199 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Dec 2023 20:34:17 -0500 Subject: [PATCH 106/120] lockdep: lockdep_set_notrack_class() Add a new helper to disable lockdep tracking entirely for a given class. This is needed for bcachefs, which takes too many btree node locks for lockdep to track. Instead, we have a single lockdep_map for "btree_trans has any btree nodes locked", which makes more since given that we have centralized lock management and a cycle detector. 
Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 2 +- include/linux/lockdep.h | 4 ++++ include/linux/lockdep_types.h | 1 + kernel/locking/lockdep.c | 9 ++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c51826fd557f..8bdfe573e95b 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -10,7 +10,7 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *b, enum six_lock_init_flags flags) { __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); - lockdep_set_novalidate_class(&b->lock); + lockdep_set_notrack_class(&b->lock); } #ifdef CONFIG_LOCKDEP diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 08b0d1d9d78b..b76f1bcd2f7f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -181,6 +181,9 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) +#define lockdep_set_notrack_class(lock) \ + lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) + /* * Compare locking classes */ @@ -338,6 +341,7 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) +#define lockdep_set_notrack_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 70d30d40ea4a..9f361d3ab9d9 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -80,6 +80,7 @@ struct lock_class_key { }; extern struct lock_class_key __lockdep_no_validate__; +extern struct lock_class_key __lockdep_no_track__; struct 
lock_trace; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 151bd3de5936..b6bb9fcd992a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4918,6 +4918,9 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); +struct lock_class_key __lockdep_no_track__; +EXPORT_SYMBOL_GPL(__lockdep_no_track__); + #ifdef CONFIG_PROVE_LOCKING void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, lock_print_fn print_fn) @@ -5002,6 +5005,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; + if (unlikely(lock->key == &__lockdep_no_track__)) + return 0; + if (!prove_locking || lock->key == &__lockdep_no_validate__) check = 0; @@ -5764,7 +5770,8 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) trace_lock_release(lock, ip); - if (unlikely(!lockdep_enabled())) + if (unlikely(!lockdep_enabled() || + lock->key == &__lockdep_no_track__)) return; raw_local_irq_save(flags); From 375476c41405ff6fc379cdbf1ad1df35c737500c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Dec 2023 18:54:09 -0500 Subject: [PATCH 107/120] bcachefs: Add lockdep support for btree node locks This adds lockdep tracking for held btree locks with a single dep_map in btree_trans, i.e. tracking all held btree locks as one object. This is more practical and more useful than having lockdep track held btree locks individually, because - we can take more locks than lockdep can track (unbounded, now that we have dynamically resizable btree paths) - there's no lock ordering between btree locks for lockdep to track (we do cycle detection) - and this makes it easy to teach lockdep that btree locks are not safe to hold while invoking memory reclaim. 
The last rule is one that lockdep would never learn, because we only do trylock() from within shrinkers - but we very much do not want to be invoking memory reclaim while holding btree node locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 24 +++++++++++++++++++++--- fs/bcachefs/btree_locking.h | 2 ++ fs/bcachefs/btree_types.h | 3 +++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9485208b6758..803cc58ff577 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3171,6 +3171,9 @@ got_trans: trans->paths_allocated[0] = 1; + static struct lock_class_key lockdep_key; + lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0); + if (fn_idx < BCH_TRANSACTIONS_NR) { trans->fn = bch2_btree_transaction_fns[fn_idx]; @@ -3440,7 +3443,22 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); - if (!ret) - c->btree_trans_barrier_initialized = true; - return ret; + if (ret) + return ret; + + /* + * static annotation (hackily done) for lock ordering of reclaim vs. 
+ * btree node locks: + */ +#ifdef CONFIG_LOCKDEP + fs_reclaim_acquire(GFP_KERNEL); + struct btree_trans *trans = bch2_trans_get(c); + trans_set_locked(trans); + bch2_trans_put(trans); + fs_reclaim_release(GFP_KERNEL); +#endif + + c->btree_trans_barrier_initialized = true; + return 0; + } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 8f5f1973c7d8..8dbceec8ec25 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -197,6 +197,7 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); static inline void trans_set_locked(struct btree_trans *trans) { if (!trans->locked) { + lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); trans->locked = true; trans->last_unlock_ip = 0; @@ -208,6 +209,7 @@ static inline void trans_set_locked(struct btree_trans *trans) static inline void trans_set_unlocked(struct btree_trans *trans) { if (trans->locked) { + lock_release(&trans->dep_map, _THIS_IP_); trans->locked = false; trans->last_unlock_ip = _RET_IP_; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4fe77d7f7242..79898f687772 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -523,6 +523,9 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* Entries before this are zeroed out on every bch2_trans_get() call */ struct list_head list; From 6ec8623f7cdd7e32a560fbef1e742bd8bc23925e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Jul 2024 14:16:01 -0400 Subject: [PATCH 108/120] bcachefs: btree node scan: fall back to comparing by journal seq highly damaged filesystems, or filesystems that have been damaged and repair and damaged again, may have sequence numbers we can't fully trust - which in itself is something we need to debug. Add a journal_seq fallback so that repair doesn't get stuck. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++ fs/bcachefs/btree_node_scan.c | 51 ++++++++++++++++++++--------- fs/bcachefs/btree_node_scan_types.h | 1 + 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e092f541c449..db700caf8afa 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1006,6 +1006,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; u64 start_time = local_clock(); @@ -1181,6 +1182,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); + + max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); } if (ptr_written) { @@ -1217,6 +1220,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; + b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); BUG_ON(b->nr.live_u64s != u64s); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 2cb0442f6cc9..001107226377 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -22,7 +22,9 @@ struct find_btree_nodes_worker { static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) { - prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); + prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", + bch2_btree_id_str(n->btree_id), n->level, n->seq, + n->journal_seq, n->cookie); bch2_bpos_to_text(out, n->min_key); prt_str(out, "-"); bch2_bpos_to_text(out, n->max_key); @@ -63,19 +65,37 @@ static void 
found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); } +static inline u64 bkey_journal_seq(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); + default: + return 0; + } +} + static bool found_btree_node_is_readable(struct btree_trans *trans, struct found_btree_node *f) { - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - found_btree_node_to_key(&k.k, f); + found_btree_node_to_key(&tmp.k, f); - struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); + struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); if (!ret) return ret; f->sectors_written = b->written; + f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); + + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) + f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); + six_unlock_read(&b->c.lock); /* @@ -84,7 +104,7 @@ static bool found_btree_node_is_readable(struct btree_trans *trans, * this node */ if (b != btree_node_root(trans->c, b)) - bch2_btree_node_evict(trans, &k.k); + bch2_btree_node_evict(trans, &tmp.k); return ret; } @@ -105,7 +125,8 @@ static int found_btree_node_cmp_cookie(const void *_l, const void *_r) static int found_btree_node_cmp_time(const struct found_btree_node *l, const struct found_btree_node *r) { - return cmp_int(l->seq, r->seq); + return cmp_int(l->seq, r->seq) ?: + cmp_int(l->journal_seq, r->journal_seq); } static int found_btree_node_cmp_pos(const void *_l, const void *_r) @@ -309,15 +330,15 @@ again: } else if (n->level) { n->overwritten = true; } else { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "overlapping btree nodes with same seq! 
halting\n "); - found_btree_node_to_text(&buf, c, start); - prt_str(&buf, "\n "); - found_btree_node_to_text(&buf, c, n); - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_fsck_repair_unimplemented; + if (bpos_cmp(start->max_key, n->max_key) >= 0) + n->overwritten = true; + else { + n->range_updated = true; + n->min_key = bpos_successor(start->max_key); + n->range_updated = true; + bubble_up(n, end); + goto again; + } } } diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h index 5cfaeb5ac831..b6c36c45d0be 100644 --- a/fs/bcachefs/btree_node_scan_types.h +++ b/fs/bcachefs/btree_node_scan_types.h @@ -11,6 +11,7 @@ struct found_btree_node { u8 level; unsigned sectors_written; u32 seq; + u64 journal_seq; u64 cookie; struct bpos min_key; From 7b6dda7282b1f26094aac21e1862690c79152e51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Jul 2024 14:35:46 -0400 Subject: [PATCH 109/120] bcachefs: drop packed, aligned from bkey_inode_buf Unnecessary here, and this broke the rust bindings: error[E0588]: packed type cannot transitively contain a `#[repr(align)]` type --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:29025:1 | 29025 | pub struct bkey_i_inode_v3 { | ^^^^^^^^^^^^^^^^^^^^^^^^^^ | note: `bch_inode_v3` has a `#[repr(align)]` attribute --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:8949:1 | 8949 | pub struct bch_inode_v3 { | ^^^^^^^^^^^^^^^^^^^^^^^ error[E0588]: packed type cannot transitively contain a `#[repr(align)]` type --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:32826:1 | 32826 | pub struct bkey_inode_buf { | ^^^^^^^^^^^^^^^^^^^^^^^^^ | note: `bch_inode_v3` has a `#[repr(align)]` attribute --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:8949:1 | 8949 | pub struct bch_inode_v3 { | ^^^^^^^^^^^^^^^^^^^^^^^ note: `bkey_inode_buf` contains a field of type 
`bkey_i_inode_v3` --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:32827:9 | 32827 | pub inode: bkey_i_inode_v3, | ^^^^^ note: ...which contains a field of type `bch_inode_v3` --> /build/source/target/release/build/bch_bindgen-9445b24c90aca2a3/out/bcachefs.rs:29027:9 | 29027 | pub v: bch_inode_v3, | ^ Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 679f5f5e5d15..da0e4a745099 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -88,7 +88,7 @@ struct bkey_inode_buf { #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS_v3()]; #undef x -} __packed __aligned(8); +}; void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); From 114f530e1e600245b0f9c114aa75c1ecc4376c67 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 12 Jul 2024 15:09:25 +0800 Subject: [PATCH 110/120] bcachefs: show none if label is not set If label is not set, the Label tag in superblock info show '(none)'. 
``` [Before] Device index: 0 Label: Version: 1.4: member_seq [After] Device index: 0 Label: (none) Version: 1.4: member_seq ``` Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b156fc85b8a3..8bc819832790 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1312,7 +1312,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Device index:\t%u\n", sb->dev_idx); prt_printf(out, "Label:\t"); - prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); + if (!strlen(sb->label)) + prt_printf(out, "(none)"); + else + prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); prt_printf(out, "Version:\t"); From 1d18b5cabc89f9c0ea0a3a1d738d5a02d478117f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Jul 2024 16:32:11 -0400 Subject: [PATCH 111/120] bcachefs: __bch2_read(): call trans_begin() on every loop iter perusal of /sys/kernel/debug/bcachefs/*/btree_transaction_stats shows that the read path has been acculumalating unneeded paths on the reflink btree, which we don't want. The solution is to call bch2_trans_begin(), which drops paths not used on previous loop iteration. 
bch2_readahead: Max mem used: 0 Transaction duration: count: 194235 since mount recent duration of events min: 150 ns max: 9 ms total: 838 ms mean: 4 us 6 us stddev: 34 us 7 us time between events min: 10 ns max: 15 h mean: 2 s 12 s stddev: 2 s 3 ms Maximum allocated btree paths (193): path: idx 2 ref 0:0 P btree=extents l=0 pos 270943112:392:U32_MAX locks 0 path: idx 3 ref 1:0 S btree=extents l=0 pos 270943112:24578:U32_MAX locks 1 path: idx 4 ref 0:0 P btree=reflink l=0 pos 0:24773509:0 locks 0 path: idx 5 ref 0:0 P S btree=reflink l=0 pos 0:24773631:0 locks 1 path: idx 6 ref 0:0 P S btree=reflink l=0 pos 0:24773759:0 locks 1 path: idx 7 ref 0:0 P S btree=reflink l=0 pos 0:24773887:0 locks 1 path: idx 8 ref 0:0 P S btree=reflink l=0 pos 0:24774015:0 locks 1 path: idx 9 ref 0:0 P S btree=reflink l=0 pos 0:24774143:0 locks 1 path: idx 10 ref 0:0 P S btree=reflink l=0 pos 0:24774271:0 locks 1 Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 45 ++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 8b484c75757c..4531c9ab3e12 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1147,34 +1147,27 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; - u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + POS(inum.inum, bvec_iter.bi_sector), BTREE_ITER_slots); + while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to 
check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, bvec_iter.bi_sector)); @@ -1182,7 +1175,7 @@ retry: k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -1193,7 +1186,7 @@ retry: ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); @@ -1213,7 +1206,7 @@ retry: data_btree, k, offset_into_extent, failed, flags); if (ret) - break; + goto err; if (flags & BCH_READ_LAST_FRAGMENT) break; @@ -1223,16 +1216,16 @@ retry: ret = btree_trans_too_many_iters(trans); if (ret) + goto err; +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + ret != READ_RETRY && + ret != READ_RETRY_AVOID) break; } -err: + bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); From ae469056313f7d8a63646de8ef53eebfefc2d9c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Aug 2023 16:13:18 -0400 Subject: [PATCH 112/120] bcachefs: Rename BCH_WRITE_DONE -> BCH_WRITE_SUBMITTED Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 24 ++++++++++++------------ fs/bcachefs/io_write.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index b3b05e9392ae..d31c8d006d97 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -554,7 +554,7 @@ out: err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; goto out; } @@ -589,7 +589,7 @@ static 
CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_DONE) && + if ((op->flags & BCH_WRITE_SUBMITTED) && (op->flags & BCH_WRITE_MOVE)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); @@ -634,7 +634,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) + if (!(op->flags & BCH_WRITE_SUBMITTED)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -1318,7 +1318,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; } op->pos.offset += bio_sectors(bio); @@ -1332,7 +1332,7 @@ retry: op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_DONE) + if (op->flags & BCH_WRITE_SUBMITTED) break; bch2_btree_iter_advance(&iter); } @@ -1347,14 +1347,14 @@ err: op->pos.inode, op->pos.offset << 9, "%s: btree lookup error %s", __func__, bch2_err_str(ret)); op->error = ret; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; } bch2_trans_put(trans); darray_exit(&buckets); /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_DONE)) { + if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; @@ -1410,7 +1410,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_DONE) + if (op->flags & BCH_WRITE_SUBMITTED) goto out_nofs_restore; } again: @@ -1465,7 +1465,7 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; if (ret < 0) { if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) @@ -1501,7 +1501,7 @@ err: * once, as that signals backpressure to the caller. 
*/ if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_DONE) && + (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { if (closure_sync_timeout(&op->cl, HZ * 10)) { bch2_print_allocator_stuck(c); @@ -1510,7 +1510,7 @@ err: __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) + if (!(op->flags & BCH_WRITE_SUBMITTED)) goto again; bch2_write_done(&op->cl); } else { @@ -1532,7 +1532,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 6c276a48f95d..5400ce94ee57 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -33,7 +33,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, x(SYNC) \ x(MOVE) \ x(IN_WORKER) \ - x(DONE) \ + x(SUBMITTED) \ x(IO_ERROR) \ x(CONVERT_UNWRITTEN) From efb2018e4d238cc205690ac62c0917d60d291e66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Jul 2024 19:51:01 -0400 Subject: [PATCH 113/120] bcachefs: Kill bch2_assert_btree_nodes_not_locked() We no longer track individual btree node locks with lockdep, so this will never be enabled. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 6 ------ fs/bcachefs/btree_locking.c | 10 ---------- fs/bcachefs/btree_locking.h | 6 ------ 3 files changed, 22 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index db700caf8afa..2c424435ca4a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -46,8 +46,6 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -66,16 +64,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); } void bch2_btree_node_wait_on_write(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 8bdfe573e95b..efe2a007b482 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -13,16 +13,6 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *b, lockdep_set_notrack_class(&b->lock); } -#ifdef CONFIG_LOCKDEP -void bch2_assert_btree_nodes_not_locked(void) -{ -#if 0 - //Re-enable when lock_class_is_held() is merged: - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); -#endif -} -#endif - /* Btree node locking: */ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 8dbceec8ec25..11a64ead8685 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -15,12 +15,6 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); -#ifdef CONFIG_LOCKDEP -void bch2_assert_btree_nodes_not_locked(void); -#else -static inline 
void bch2_assert_btree_nodes_not_locked(void) {} -#endif - void bch2_trans_unlock_noassert(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) From 2e118ba36d56acf78084518dfb7cb53b1d417da0 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Fri, 21 Jun 2024 16:29:32 -0400 Subject: [PATCH 114/120] bcachefs: darray: Don't pass NULL to memcpy() memcpy's second parameter must not be NULL, even if size is zero. Signed-off-by: Tavian Barnes Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index ac35b8b705ae..b7d223f85873 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -13,7 +13,8 @@ int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, g if (!data) return -ENOMEM; - memcpy(data, d->data, d->size * element_size); + if (d->size) + memcpy(data, d->data, d->size * element_size); if (d->data != d->preallocated) kvfree(d->data); d->data = data; From ee1b8dc17ac367f3fbea18fee4f7825eb11eb757 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Fri, 21 Jun 2024 16:39:58 -0400 Subject: [PATCH 115/120] bcachefs: varint: Avoid left-shift of a negative value Shifting a negative value left is undefined. 
Signed-off-by: Tavian Barnes Signed-off-by: Kent Overstreet --- fs/bcachefs/varint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index cb4f33ed9ab3..a9ebcd82c602 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -85,7 +85,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v) if (likely(bytes < 9)) { v <<= bytes; - v |= ~(~0 << (bytes - 1)); + v |= ~(~0U << (bytes - 1)); } else { *out++ = 255; bytes = 9; From f12410bb7dddc64b58cbd6fca224b82ff40c5807 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jul 2024 16:30:44 -0400 Subject: [PATCH 116/120] bcachefs: Add an error message for insufficient rw journal devs This causes us to go read-only - need an error message saying why. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 79be0eaddfa0..d8a630742887 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -205,6 +205,17 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + rcu_read_unlock(); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); ret = JOURNAL_ERR_insufficient_devices; goto out; } From 2c4c17fefc49e895e322b3ab0f49d946f384f71b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jul 2024 19:03:17 -0400 Subject: [PATCH 117/120] bcachefs: Fix fsck warning about btree_trans not passed to fsck error If a btree_trans is in use it's supposed to be passed to fsck_err so that it can be unlocked if we're 
waiting on userspace input; but the btree IO paths do call fsck errors where a btree_trans exists on the stack but it's not passed through. But it's ok, because it's unlocked while doing IO. Fixes: a850bde6498b ("bcachefs: fsck_err() may now take a btree_trans") Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 803cc58ff577..36872207f09b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3282,7 +3282,8 @@ bool bch2_current_has_btree_trans(struct bch_fs *c) struct btree_trans *trans; bool ret = false; list_for_each_entry(trans, &c->btree_trans_list, list) - if (trans->locking_wait.task == current) { + if (trans->locking_wait.task == current && + trans->locked) { ret = true; break; } From f05a0b9c73bc1728b130ac8d1d76b7bbf3f0241d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jul 2024 16:43:59 -0400 Subject: [PATCH 118/120] bcachefs: silence silly kdoc warning Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index b9fe736820b4..3f56b584f8ec 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -560,7 +560,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } -/** +/* * In check and repair code, when checking references to write buffer btrees we * need to issue a flush before we have a definitive error: this issues a flush * if this is a key we haven't yet checked. From 6f719cbe0c8b3b8a14b403b9e60fdb565fd829fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jul 2024 20:20:21 -0400 Subject: [PATCH 119/120] bcachefs: Fix integer overflow on trans->nr_updates We can't have more updates than paths, so btree_path_idx_t is the correct type to use. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 79898f687772..b256b2a20a4f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -477,8 +477,8 @@ struct btree_trans { btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; btree_path_idx_t nr_paths_max; + btree_path_idx_t nr_updates; u8 fn_idx; - u8 nr_updates; u8 lock_must_abort; bool lock_may_not_fail:1; bool srcu_held:1; From a97b43fac5b9b3ffca71b8a917a249789902fce9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jul 2024 17:17:10 -0400 Subject: [PATCH 120/120] lockdep: Add comments for lockdep_set_no{validate,track}_class() Cc: Waiman Long Signed-off-by: Kent Overstreet --- include/linux/lockdep.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b76f1bcd2f7f..0759e30df392 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -178,9 +178,24 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) +/** + * lockdep_set_novalidate_class: disable checking of lock ordering on a given + * lock + * @lock: Lock to mark + * + * Lockdep will still record that this lock has been taken, and print held + * instances when dumping locks + */ #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) +/** + * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely + * @lock: Lock to mark + * + * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs, + * which takes more locks than lockdep is able to track (48). + */ #define lockdep_set_notrack_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock)