bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet
2017-03-16 22:18:50 -08:00
committed by Kent Overstreet
parent 0d29a833b7
commit 1c6fdbd8f2
122 changed files with 57147 additions and 0 deletions

View File

@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK

View File

@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/

52
fs/bcachefs/Kconfig Normal file
View File

@@ -0,0 +1,52 @@
config BCACHEFS_FS
tristate "bcachefs filesystem support"
depends on BLOCK
select EXPORTFS
select CLOSURES
select LIBCRC32C
select FS_POSIX_ACL
select LZ4_COMPRESS
select LZ4_DECOMPRESS
select ZLIB_DEFLATE
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
select CRYPTO_SHA256
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
select KEYS
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
config BCACHEFS_QUOTA
bool "bcachefs quota support"
depends on BCACHEFS_FS
select QUOTACTL
config BCACHEFS_POSIX_ACL
bool "bcachefs POSIX ACL support"
depends on BCACHEFS_FS
select FS_POSIX_ACL
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
help
Enables many extra debugging checks and assertions.
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
help
Include some unit and performance tests for the core btree code
config BCACHEFS_NO_LATENCY_ACCT
bool "disable latency accounting and time stats"
depends on BCACHEFS_FS
help
This disables device latency tracking and time stats; only enable it for performance testing.

53
fs/bcachefs/Makefile Normal file
View File

@@ -0,0 +1,53 @@
obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
acl.o \
alloc.o \
bkey.o \
bkey_methods.o \
bset.o \
btree_cache.o \
btree_gc.o \
btree_io.o \
btree_iter.o \
btree_update_interior.o \
btree_update_leaf.o \
buckets.o \
chardev.o \
checksum.o \
clock.o \
compress.o \
debug.o \
dirent.o \
disk_groups.o \
error.o \
extents.o \
fs.o \
fs-ioctl.o \
fs-io.o \
fsck.o \
inode.o \
io.o \
journal.o \
journal_io.o \
journal_reclaim.o \
journal_seq_blacklist.o \
keylist.o \
migrate.o \
move.o \
movinggc.o \
opts.o \
quota.o \
rebalance.o \
recovery.o \
replicas.o \
siphash.o \
six.o \
super.o \
super-io.o \
sysfs.o \
tests.o \
trace.o \
util.o \
xattr.o

387
fs/bcachefs/acl.c Normal file
View File

@@ -0,0 +1,387 @@
// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#include "bcachefs.h"
#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include "acl.h"
#include "fs.h"
#include "xattr.h"
/*
 * On-disk size of an encoded ACL with the given number of short
 * (no uid/gid) and long (with uid/gid) entries.
 */
static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
	size_t bytes = sizeof(bch_acl_header);

	bytes += nr_short * sizeof(bch_acl_entry_short);
	bytes += nr_long * sizeof(bch_acl_entry);

	return bytes;
}
/*
 * Map a POSIX ACL type to the xattr index it is stored under.
 * Any other type is a caller bug.
 */
static inline int acl_to_xattr_type(int type)
{
	if (type == ACL_TYPE_ACCESS)
		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
	if (type == ACL_TYPE_DEFAULT)
		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
	BUG();
}
/*
 * Convert from filesystem to in-memory representation.
 *
 * Walks the on-disk xattr value twice: a first pass validates entry
 * tags/sizes and counts entries, a second pass fills out the allocated
 * posix_acl.
 *
 * Returns NULL for an absent or empty ACL, ERR_PTR(-EINVAL) on
 * malformed input, ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
	const void *p, *end = value + size;
	struct posix_acl *acl;
	struct posix_acl_entry *out;
	unsigned count = 0;

	if (!value)
		return NULL;
	if (size < sizeof(bch_acl_header))
		goto invalid;
	if (((bch_acl_header *)value)->a_version !=
	    cpu_to_le32(BCH_ACL_VERSION))
		goto invalid;

	/* First pass: validate tags and count entries: */
	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *entry = p;

		/* every entry is at least the short form */
		if (p + sizeof(bch_acl_entry_short) > end)
			goto invalid;

		switch (le16_to_cpu(entry->e_tag)) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
		case ACL_GROUP:
			p += sizeof(bch_acl_entry);
			break;
		default:
			goto invalid;
		}

		count++;
	}

	/* a long entry may have run past the end of the buffer: */
	if (p > end)
		goto invalid;

	if (!count)
		return NULL;

	acl = posix_acl_alloc(count, GFP_KERNEL);
	if (!acl)
		return ERR_PTR(-ENOMEM);

	/* Second pass: fill out the in-memory entries: */
	out = acl->a_entries;

	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *in = p;

		out->e_tag = le16_to_cpu(in->e_tag);
		out->e_perm = le16_to_cpu(in->e_perm);

		switch (out->e_tag) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
			out->e_uid = make_kuid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			out->e_gid = make_kgid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		}

		out++;
	}

	BUG_ON(out != acl->a_entries + acl->a_count);

	return acl;
invalid:
	pr_err("invalid acl entry");
	return ERR_PTR(-EINVAL);
}
#define acl_for_each_entry(acl, acl_e) \
for (acl_e = acl->a_entries; \
acl_e < acl->a_entries + acl->a_count; \
acl_e++)
/*
 * Convert from in-memory to filesystem representation: build a bkey_i_xattr
 * holding the encoded ACL, allocated from the transaction's memory pool.
 *
 * Returns ERR_PTR(-EINVAL) for an unknown entry tag, ERR_PTR(-E2BIG) if
 * the encoded ACL wouldn't fit in a bkey, or the allocation error from
 * bch2_trans_kmalloc().
 */
static struct bkey_i_xattr *
bch2_acl_to_xattr(struct btree_trans *trans,
		  const struct posix_acl *acl,
		  int type)
{
	struct bkey_i_xattr *xattr;
	bch_acl_header *acl_header;
	const struct posix_acl_entry *acl_e;
	void *outptr;
	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;

	/* count short/long entries to size the encoded ACL: */
	acl_for_each_entry(acl, acl_e) {
		switch (acl_e->e_tag) {
		case ACL_USER:
		case ACL_GROUP:
			nr_long++;
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			nr_short++;
			break;
		default:
			return ERR_PTR(-EINVAL);
		}
	}

	acl_len = bch2_acl_size(nr_short, nr_long);
	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);

	/* key u64s field is a u8: */
	if (u64s > U8_MAX)
		return ERR_PTR(-E2BIG);

	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(xattr))
		return xattr;

	bkey_xattr_init(&xattr->k_i);
	xattr->k.u64s		= u64s;
	xattr->v.x_type		= acl_to_xattr_type(type);
	xattr->v.x_name_len	= 0;	/* fix: was "= 0," - stray comma operator */
	xattr->v.x_val_len	= cpu_to_le16(acl_len);

	acl_header = xattr_val(&xattr->v);
	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);

	outptr = (void *) acl_header + sizeof(*acl_header);

	acl_for_each_entry(acl, acl_e) {
		bch_acl_entry *entry = outptr;

		entry->e_tag = cpu_to_le16(acl_e->e_tag);
		entry->e_perm = cpu_to_le16(acl_e->e_perm);
		switch (acl_e->e_tag) {
		case ACL_USER:
			entry->e_id = cpu_to_le32(
				from_kuid(&init_user_ns, acl_e->e_uid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			entry->e_id = cpu_to_le32(
				from_kgid(&init_user_ns, acl_e->e_gid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			outptr += sizeof(bch_acl_entry_short);
			break;
		}
	}

	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);

	return xattr;
}
/*
 * ->get_acl() inode operation: look up the xattr holding the ACL of the
 * given type, decode it, and cache it on the VFS inode.
 *
 * Returns NULL if no ACL of this type is present (-ENOENT from the hash
 * lookup maps to NULL), or an ERR_PTR on lookup/decode failure.
 */
struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct posix_acl *acl = NULL;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(acl_to_xattr_type(type), "", 0),
			0);
	if (IS_ERR(iter)) {
		/* -EINTR: transaction restart - retry the lookup */
		if (PTR_ERR(iter) == -EINTR)
			goto retry;

		if (PTR_ERR(iter) != -ENOENT)
			acl = ERR_CAST(iter);
		goto out;
	}

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));

	/* only cache successfully decoded ACLs (NULL included): */
	if (!IS_ERR(acl))
		set_cached_acl(&inode->v, type, acl);
out:
	bch2_trans_exit(&trans);
	return acl;
}
/*
 * Set or remove an ACL xattr within the caller's btree transaction.
 *
 * A NULL @acl deletes the xattr of the given type. A default ACL on a
 * non-directory is rejected with -EACCES (or treated as a no-op when
 * @acl is NULL).
 *
 * -ENOENT from the delete path is squashed to 0: removing an ACL that
 * isn't there is not an error.
 */
int bch2_set_acl_trans(struct btree_trans *trans,
		       struct bch_inode_unpacked *inode_u,
		       const struct bch_hash_info *hash_info,
		       struct posix_acl *acl, int type)
{
	int ret;

	if (type == ACL_TYPE_DEFAULT &&
	    !S_ISDIR(inode_u->bi_mode))
		return acl ? -EACCES : 0;

	if (acl) {
		struct bkey_i_xattr *xattr =
			bch2_acl_to_xattr(trans, acl, type);
		if (IS_ERR(xattr))
			return PTR_ERR(xattr);

		ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
				      inode_u->bi_inum, &xattr->k_i, 0);
	} else {
		struct xattr_search_key search =
			X_SEARCH(acl_to_xattr_type(type), "", 0);

		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
				       inode_u->bi_inum, &search);
	}

	return ret == -ENOENT ? 0 : ret;
}
/*
 * bch2_write_inode_trans() callback: stamp ctime and apply the
 * (possibly ACL-adjusted) mode passed via @p.
 */
static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
				       struct bch_inode_unpacked *bi,
				       void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	umode_t new_mode = (unsigned long) p;

	bi->bi_mode = new_mode;
	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));

	return 0;
}
/*
 * ->set_acl() inode operation: update the ACL xattr and the inode's mode
 * atomically in one btree transaction, retrying on -EINTR (transaction
 * restart).
 *
 * For an access ACL, posix_acl_update_mode() may adjust @mode and/or
 * drop @acl to NULL before we commit.
 */
int bch2_set_acl(struct mnt_idmap *idmap,
		 struct dentry *dentry,
		 struct posix_acl *acl, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct bch_inode_unpacked inode_u;
	umode_t mode = inode->v.i_mode;
	int ret;

	if (type == ACL_TYPE_ACCESS && acl) {
		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
		if (ret)
			return ret;
	}

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	/* xattr update, inode update, and commit - all or nothing: */
	ret = bch2_set_acl_trans(&trans,
			&inode->ei_inode,
			&inode->ei_str_hash,
			acl, type) ?:
	      bch2_write_inode_trans(&trans, inode, &inode_u,
			inode_update_for_set_acl_fn,
			(void *)(unsigned long) mode) ?:
	      bch2_trans_commit(&trans, NULL, NULL,
			&inode->ei_journal_seq,
			BTREE_INSERT_ATOMIC|
			BTREE_INSERT_NOUNLOCK);
	if (ret == -EINTR)
		goto retry;
	if (unlikely(ret))
		goto err;

	/* propagate the committed ctime/mode back to the VFS inode: */
	bch2_inode_update_after_write(c, inode, &inode_u,
				      ATTR_CTIME|ATTR_MODE);

	set_cached_acl(&inode->v, type, acl);
err:
	bch2_trans_exit(&trans);

	return ret;
}
/*
 * After a chmod, rewrite the access ACL so it agrees with the new @mode,
 * within the caller's transaction.
 *
 * No access ACL present is not an error (returns 0). On success,
 * *new_acl holds the updated ACL for the caller to cache.
 */
int bch2_acl_chmod(struct btree_trans *trans,
		   struct bch_inode_info *inode,
		   umode_t mode,
		   struct posix_acl **new_acl)
{
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct bkey_i_xattr *new;
	struct posix_acl *acl;
	int ret = 0;

	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
			BTREE_ITER_INTENT);
	if (IS_ERR(iter))
		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));
	/* NULL (empty ACL) also returns 0 here - nothing to rewrite */
	if (IS_ERR_OR_NULL(acl))
		return PTR_ERR(acl);

	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
	if (ret)
		goto err;

	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto err;
	}

	bch2_trans_update(trans, iter, &new->k_i, 0);
	/* hand ownership of acl to the caller; NULL so err path won't free it */
	*new_acl = acl;
	acl = NULL;
err:
	kfree(acl);
	return ret;
}
#endif /* CONFIG_BCACHEFS_POSIX_ACL */

59
fs/bcachefs/acl.h Normal file
View File

@@ -0,0 +1,59 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H

struct bch_inode_unpacked;
struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;

#ifdef CONFIG_BCACHEFS_POSIX_ACL

#define BCH_ACL_VERSION	0x0001

/*
 * On-disk ACL formats (little endian): a header followed by a mix of
 * long entries (with uid/gid id) and short entries (without).
 * NOTE(review): presumably mirrors the classic ext2/4 xattr ACL layout -
 * confirm before relying on cross-compatibility.
 */

/* long entry: ACL_USER / ACL_GROUP, carries a uid/gid */
typedef struct {
	__le16		e_tag;
	__le16		e_perm;
	__le32		e_id;
} bch_acl_entry;

/* short entry: USER_OBJ/GROUP_OBJ/MASK/OTHER, no id needed */
typedef struct {
	__le16		e_tag;
	__le16		e_perm;
} bch_acl_entry_short;

typedef struct {
	__le32		a_version;
} bch_acl_header;

struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);

int bch2_set_acl_trans(struct btree_trans *,
		       struct bch_inode_unpacked *,
		       const struct bch_hash_info *,
		       struct posix_acl *, int);
int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
		   umode_t, struct posix_acl **);

#else

/* No-op stubs when ACL support is compiled out: */

static inline int bch2_set_acl_trans(struct btree_trans *trans,
				     struct bch_inode_unpacked *inode_u,
				     const struct bch_hash_info *hash_info,
				     struct posix_acl *acl, int type)
{
	return 0;
}

static inline int bch2_acl_chmod(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 umode_t mode,
				 struct posix_acl **new_acl)
{
	return 0;
}

#endif /* CONFIG_BCACHEFS_POSIX_ACL */

#endif /* _BCACHEFS_ACL_H */

2205
fs/bcachefs/alloc.c Normal file

File diff suppressed because it is too large Load Diff

141
fs/bcachefs/alloc.h Normal file
View File

@@ -0,0 +1,141 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H

#include "bcachefs.h"
#include "alloc_types.h"

struct bkey;
struct bch_dev;
struct bch_fs;
struct bch_devs_list;	/* fix: was misspelled "struct bch_devs_List" */

const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);

#define bch2_bkey_alloc_ops (struct bkey_ops) {		\
	.key_invalid	= bch2_alloc_invalid,		\
	.val_to_text	= bch2_alloc_to_text,		\
}

struct dev_alloc_list {
	unsigned	nr;
	u8		devs[BCH_SB_MEMBERS_MAX];
};

struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
					 struct write_point *,
					 struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
		     struct write_point *);

int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);

enum bucket_alloc_ret {
	ALLOC_SUCCESS		= 0,
	OPEN_BUCKETS_EMPTY	= -1,
	FREELIST_EMPTY		= -2,	/* Allocator thread not keeping up */
	NO_DEVICES		= -3,	/* -EROFS */
};

long bch2_bucket_alloc_new_fs(struct bch_dev *);

int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
		      struct closure *);

#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
	for ((_i) = (_start);						\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
	__writepoint_for_each_ptr(_wp, _ob, _i, 0)

/*
 * fix: previously expanded to "wp->first_ptr", silently requiring the
 * caller's write point variable to be named "wp"; use the macro argument.
 */
#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	__writepoint_for_each_ptr(_wp, _ob, _i, (_wp)->first_ptr)

void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);

/* Drop a pin on an open bucket, freeing it when the last pin is dropped: */
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	if (atomic_dec_and_test(&ob->pin))
		__bch2_open_bucket_put(c, ob);
}

/* Drop *nr pins recorded as indices into c->open_buckets, resetting *nr: */
static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
	unsigned i;

	for (i = 0; i < *nr; i++)
		bch2_open_bucket_put(c, c->open_buckets + refs[i]);

	*nr = 0;
}

/* Pin each bucket the write point references, recording indices in @refs: */
static inline void bch2_open_bucket_get(struct bch_fs *c,
					struct write_point *wp,
					u8 *nr, u8 *refs)
{
	struct open_bucket *ob;
	unsigned i;

	writepoint_for_each_ptr(wp, ob, i) {
		atomic_inc(&ob->pin);
		refs[(*nr)++] = ob - c->open_buckets;
	}
}

struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
					     unsigned,
					     struct write_point_specifier,
					     struct bch_devs_list *,
					     unsigned, unsigned,
					     enum alloc_reserve,
					     unsigned,
					     struct closure *);

void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
				    struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);

/* Wake the device's allocator thread, if it's running: */
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(ca->alloc_thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
	/* low bit set distinguishes hashed specifiers from pointers: */
	return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
	return (struct write_point_specifier) { .v = (unsigned long) wp };
}

void bch2_recalc_capacity(struct bch_fs *);

void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);

void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->type = type;
}

int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);

#endif /* _BCACHEFS_ALLOC_H */

90
fs/bcachefs/alloc_types.h Normal file
View File

@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H

#include <linux/mutex.h>
#include <linux/spinlock.h>

#include "clock_types.h"
#include "fifo.h"

/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
	/*
	 * "now" in (read/write) IO time - incremented whenever we do X amount
	 * of reads or writes.
	 *
	 * Goes with the bucket read/write prios: when we read or write to a
	 * bucket we reset the bucket's prio to the current hand; thus hand -
	 * prio = time since bucket was last read/written.
	 *
	 * The units are some amount (bytes/sectors) of data read/written, and
	 * the units can change on the fly if we need to rescale to fit
	 * everything in a u16 - your only guarantee is that the units are
	 * consistent.
	 */
	u16			hand;
	u16			max_last_io;

	int			rw;	/* NOTE(review): presumably READ/WRITE - confirm in alloc.c */

	struct io_timer		rescale;
	struct mutex		lock;
};

/* There is one reserve for each type of btree, one for prios and gens
 * and one for moving GC */
enum alloc_reserve {
	RESERVE_ALLOC		= -1,
	RESERVE_BTREE		= 0,
	RESERVE_MOVINGGC	= 1,
	RESERVE_NONE		= 2,
	RESERVE_NR		= 3,
};

typedef FIFO(long)	alloc_fifo;

/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT	256
#define WRITE_POINT_COUNT	32

struct open_bucket {
	spinlock_t		lock;
	atomic_t		pin;	/* refcount; see bch2_open_bucket_put() */
	u8			freelist;
	bool			valid;
	bool			on_partial_list;
	unsigned		sectors_free;
	struct bch_extent_ptr	ptr;
};

struct write_point {
	struct hlist_node	node;
	struct mutex		lock;
	u64			last_used;
	unsigned long		write_point;
	enum bch_data_type	type;

	/* open buckets in ptrs[]; iteration starts at first_ptr: */
	u8			nr_ptrs;
	u8			first_ptr;

	/* calculated based on how many pointers we're actually going to use: */
	unsigned		sectors_free;

	struct open_bucket	*ptrs[BCH_REPLICAS_MAX * 2];
	u64			next_alloc[BCH_SB_MEMBERS_MAX];
};

/* Opaque handle identifying a write point (hashed value or pointer): */
struct write_point_specifier {
	unsigned long		v;
};

struct alloc_heap_entry {
	size_t			bucket;
	size_t			nr;
	unsigned long		key;
};

typedef HEAP(struct alloc_heap_entry) alloc_heap;

#endif /* _BCACHEFS_ALLOC_TYPES_H */

785
fs/bcachefs/bcachefs.h Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,310 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IOCTL_H
#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
/*
* Flags common to multiple ioctls:
*/
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
/*
* If cleared, ioctl that refer to a device pass it as a pointer to a pathname
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
* filesystem:
*/
#define BCH_BY_INDEX (1 << 4)
/*
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
* wide superblock:
*/
#define BCH_READ_DEV (1 << 5)
/* global control dev: */
/* These are currently broken, and probably unnecessary: */
#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
struct bch_ioctl_assemble {
__u32 flags;
__u32 nr_devs;
__u64 pad;
__u64 devs[];
};
struct bch_ioctl_incremental {
__u32 flags;
__u64 pad;
__u64 dev;
};
#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc,	1,  struct bch_ioctl_query_uuid)

/* These only make sense when we also have incremental assembly */
#if 0
#define BCH_IOCTL_START		_IOW(0xbc,	2,  struct bch_ioctl_start)
#define BCH_IOCTL_STOP		_IO(0xbc,	3)
#endif

#define BCH_IOCTL_DISK_ADD	_IOW(0xbc,	4,  struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc,	5,  struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc,	6,  struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc,	7,  struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,	8,  struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DATA		_IOW(0xbc,	10, struct bch_ioctl_data)
#define BCH_IOCTL_USAGE		_IOWR(0xbc,	11, struct bch_ioctl_usage)
#define BCH_IOCTL_READ_SUPER	_IOW(0xbc,	12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc,	13, struct bch_ioctl_disk_get_idx)
/* fix: was also nr 13, colliding with BCH_IOCTL_DISK_GET_IDX: */
#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc,	14, struct bch_ioctl_disk_resize)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
* Returns user visible UUID, not internal UUID (which may not ever be changed);
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
* this UUID.
*/
struct bch_ioctl_query_uuid {
__uuid_t uuid;
};
#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
* The specified device must not be open or in use. On success, the new device
* will be an online member of the filesystem just like any other member.
*
* The device must first be prepared by userspace by formatting with a bcachefs
* superblock, which is only used for passing in superblock options/parameters
* for that device (in struct bch_member). The new device's superblock should
* not claim to be a member of any existing filesystem - UUIDs on it will be
* ignored.
*/
/*
 * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
 *
 * Any data present on @dev will be permanently deleted, and @dev will be
 * removed from its slot in the filesystem's list of member devices. The device
 * may be either online or offline.
 *
 * Will fail if removing @dev would leave us with insufficient read write
 * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
 * flags are set.
 */
/*
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
* but is not open (e.g. because we started in degraded mode), bring it online
*
* all existing data on @dev will be available once the device is online,
* exactly as if @dev was present when the filesystem was first mounted
*/
/*
 * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
 * block device, without removing it from the filesystem (so it can be brought
 * back online later)
 *
 * Data present on @dev will be unavailable while @dev is offline (unless
 * replicated), but will still be intact and untouched if @dev is brought back
 * online
 *
 * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
 * leave us with insufficient read write devices or degraded/unavailable data,
 * unless the appropriate BCH_FORCE_IF_* flags are set.
 */
struct bch_ioctl_disk {
__u32 flags;
__u32 pad;
__u64 dev;
};
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
* @new_state - one of the bch_member_state states (rw, ro, failed,
* spare)
*
* Will refuse to change member state if we would then have insufficient devices
* to write to, or if it would result in degraded data (when @new_state is
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
__u8 pad[3];
__u64 dev;
};
enum bch_data_ops {
BCH_DATA_OP_SCRUB = 0,
BCH_DATA_OP_REREPLICATE = 1,
BCH_DATA_OP_MIGRATE = 2,
BCH_DATA_OP_NR = 3,
};
/*
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
* scrub, rereplicate, migrate).
*
* This ioctl kicks off a job in the background, and returns a file descriptor.
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
* indicating current progress, and closing the file descriptor will stop the
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
__u32 op;
__u32 flags;
struct bpos start;
struct bpos end;
union {
struct {
__u32 dev;
__u32 pad;
} migrate;
struct {
__u64 pad[8];
};
};
} __attribute__((packed, aligned(8)));
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
/* XXX: add an event for reporting errors */
BCH_DATA_EVENT_NR = 1,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
__u8 pad[2];
struct bpos pos;
__u64 sectors_done;
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
struct bch_ioctl_data_event {
__u8 type;
__u8 pad[7];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
} __attribute__((packed, aligned(8)));
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
__u8 pad[6];
__u32 dev;
__u32 bucket_size;
__u64 nr_buckets;
__u64 buckets[BCH_DATA_NR];
__u64 sectors[BCH_DATA_NR];
};
struct bch_ioctl_fs_usage {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u64 persistent_reserved[BCH_REPLICAS_MAX];
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
/*
* BCH_IOCTL_USAGE: query filesystem disk space usage
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @nr_devices - number of devices userspace allocated space for in @devs
*
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
* will indicate if a device was present in that slot
*
* Returns -ERANGE if @nr_devices was too small
*/
struct bch_ioctl_usage {
__u16 nr_devices;
__u16 pad[3];
struct bch_ioctl_fs_usage fs;
struct bch_ioctl_dev_usage devs[0];
};
/*
* BCH_IOCTL_READ_SUPER: read filesystem superblock
*
* Equivalent to reading the superblock directly from the block device, except
* avoids racing with the kernel writing the superblock or having to figure out
* which block device to read
*
* @sb - buffer to read into
* @size - size of userspace allocated buffer
* @dev - device to read superblock for, if BCH_READ_DEV flag is
* specified
*
* Returns -ERANGE if buffer provided is too small
*/
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 size;
__u64 sb;
};
/*
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
* determine if disk is a (online) member - if so, returns device's index
*
* Returns -ENOENT if not found
*/
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
*
* @dev - member to resize
* @nbuckets - new number of buckets
*/
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 nbuckets;
};
#endif /* _BCACHEFS_IOCTL_H */

1164
fs/bcachefs/bkey.c Normal file

File diff suppressed because it is too large Load Diff

627
fs/bcachefs/bkey.h Normal file

File diff suppressed because it is too large Load Diff

192
fs/bcachefs/bkey_methods.c Normal file
View File

@@ -0,0 +1,192 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "quota.h"
#include "xattr.h"
/*
 * Table of per-key-type operations, indexed by bkey_type: one entry per
 * btree id, plus one for interior btree node keys.
 */
const struct bkey_ops bch2_bkey_ops[] = {
	[BKEY_TYPE_EXTENTS]	= bch2_bkey_extent_ops,
	[BKEY_TYPE_INODES]	= bch2_bkey_inode_ops,
	[BKEY_TYPE_DIRENTS]	= bch2_bkey_dirent_ops,
	[BKEY_TYPE_XATTRS]	= bch2_bkey_xattr_ops,
	[BKEY_TYPE_ALLOC]	= bch2_bkey_alloc_ops,
	[BKEY_TYPE_QUOTAS]	= bch2_bkey_quota_ops,
	[BKEY_TYPE_BTREE]	= bch2_bkey_btree_ops,
};
/*
 * Validate a key's value: generic key types (deleted/discard/error/
 * cookie) are checked here; anything else is delegated to the
 * per-btree ops' key_invalid hook.
 *
 * Returns a human-readable reason string if invalid, NULL if ok.
 */
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
				  struct bkey_s_c k)
{
	const struct bkey_ops *ops = &bch2_bkey_ops[type];

	switch (k.k->type) {
	case KEY_TYPE_DELETED:
	case KEY_TYPE_DISCARD:
		return NULL;

	case KEY_TYPE_ERROR:
		return bkey_val_bytes(k.k) != 0
			? "value size should be zero"
			: NULL;

	case KEY_TYPE_COOKIE:
		return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
			? "incorrect value size"
			: NULL;

	default:
		if (k.k->type < KEY_TYPE_GENERIC_NR)
			return "invalid type";

		return ops->key_invalid(c, k);
	}
}
/*
 * Type-independent key checks: u64s within bounds, size field consistent
 * with whether this btree holds extents, no snapshot field, and no key at
 * POS_MAX (reserved) on leaf btrees.
 *
 * Returns a reason string if invalid, NULL if ok.
 */
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
				struct bkey_s_c k)
{
	const struct bkey_ops *ops = &bch2_bkey_ops[type];

	if (k.k->u64s < BKEY_U64s)
		return "u64s too small";

	/*
	 * Non-extent keys never have a size; extent keys have a zero size
	 * iff they're deleted. (This check also subsumes a former separate
	 * "zero size field" check for live zero-size extents, which was
	 * unreachable.)
	 */
	if (!ops->is_extents) {
		if (k.k->size)
			return "nonzero size field";
	} else {
		if ((k.k->size == 0) != bkey_deleted(k.k))
			return "bad size field";
	}

	if (k.k->p.snapshot)
		return "nonzero snapshot";

	if (type != BKEY_TYPE_BTREE &&
	    !bkey_cmp(k.k->p, POS_MAX))
		return "POS_MAX key";

	return NULL;
}
/*
 * Full key validation: generic checks first, then value checks.
 * Returns the first failure reason, or NULL if the key is valid.
 */
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
			      struct bkey_s_c k)
{
	const char *reason = __bch2_bkey_invalid(c, type, k);

	return reason ?: bch2_bkey_val_invalid(c, type, k);
}
/*
 * Check that a key lies within its btree node's [min_key, max_key] range.
 * Returns a reason string if out of range, NULL if ok.
 */
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
	if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
		return "key before start of btree node";

	return bkey_cmp(k.k->p, b->data->max_key) > 0
		? "key past end of btree node"
		: NULL;
}
/*
 * Debug-mode validation of a key in a btree node: report via
 * bch2_fs_bug() if the key is invalid or outside the node's range,
 * then run the type-specific debugcheck hook if one exists.
 */
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
	enum bkey_type type = btree_node_type(b);
	const struct bkey_ops *ops = &bch2_bkey_ops[type];
	const char *invalid;

	BUG_ON(!k.k->u64s);

	invalid = bch2_bkey_invalid(c, type, k) ?:
		bch2_bkey_in_btree_node(b, k);
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
		bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
		return;
	}

	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
	    ops->key_debugcheck)
		ops->key_debugcheck(c, b, k);
}
/* Append formatted output to out, tracking remaining space up to end: */
#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))

/*
 * Format a bkey (not its value) into @buf.
 * Returns the number of characters written (scnprintf semantics:
 * truncated at @size).
 */
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
	char *out = buf, *end = buf + size;

	p("u64s %u type %u ", k->u64s, k->type);

	if (bkey_cmp(k->p, POS_MAX))
		p("%llu:%llu", k->p.inode, k->p.offset);
	else
		p("POS_MAX");

	p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);

	return out - buf;
}
/*
 * Format a key's value into @buf; generic value types are handled here,
 * anything else goes to the per-type val_to_text hook.
 *
 * NOTE(review): in the default case the hook writes into @buf directly
 * and `out` is never advanced, so the returned length is 0 even though
 * the buffer was written - callers appear to use the buffer contents
 * rather than the length; confirm before relying on the return value.
 */
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
		     char *buf, size_t size, struct bkey_s_c k)
{
	const struct bkey_ops *ops = &bch2_bkey_ops[type];
	char *out = buf, *end = buf + size;

	switch (k.k->type) {
	case KEY_TYPE_DELETED:
		p(" deleted");
		break;
	case KEY_TYPE_DISCARD:
		p(" discard");
		break;
	case KEY_TYPE_ERROR:
		p(" error");
		break;
	case KEY_TYPE_COOKIE:
		p(" cookie");
		break;
	default:
		if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
			ops->val_to_text(c, buf, size, k);
		break;
	}

	return out - buf;
}
/*
 * Format a key and its value ("<key>: <value>") into @buf; returns
 * the number of characters written.
 */
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
			  char *buf, size_t size, struct bkey_s_c k)
{
	char *out = buf, *end = buf + size;

	out += bch2_bkey_to_text(out, end - out, k.k);
	out += scnprintf(out, end - out, ": ");
	out += bch2_val_to_text(c, type, out, end - out, k);

	return out - buf;
}
/*
 * Byte-swap a packed key: the key itself, then the value via the
 * per-type swab hook if one is defined.
 */
void bch2_bkey_swab(enum bkey_type type,
		    const struct bkey_format *f,
		    struct bkey_packed *k)
{
	void (*val_swab)(const struct bkey_format *, struct bkey_packed *) =
		bch2_bkey_ops[type].swab;

	bch2_bkey_swab_key(f, k);

	if (val_swab)
		val_swab(f, k);
}

View File

@@ -0,0 +1,87 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_METHODS_H
#define _BCACHEFS_BKEY_METHODS_H

#include "bkey.h"

/*
 * One bkey_type per btree id (expanded from DEFINE_BCH_BTREE_IDS()),
 * plus BKEY_TYPE_BTREE for interior btree node keys:
 */
#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,

enum bkey_type {
	DEFINE_BCH_BTREE_IDS()
	BKEY_TYPE_BTREE,
};

#undef DEF_BTREE_ID

/* Type of a key in btree @id at level @level: */
static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
{
	return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
}

/* Do keys of this type point at data (extents, btree node pointers)? */
static inline bool btree_type_has_ptrs(enum bkey_type type)
{
	switch (type) {
	case BKEY_TYPE_BTREE:
	case BKEY_TYPE_EXTENTS:
		return true;
	default:
		return false;
	}
}

struct bch_fs;
struct btree;
struct bkey;

enum merge_result {
	BCH_MERGE_NOMERGE,

	/*
	 * The keys were mergeable, but would have overflowed size - so instead
	 * l was changed to the maximum size, and both keys were modified:
	 */
	BCH_MERGE_PARTIAL,
	BCH_MERGE_MERGE,
};

typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
			      struct bkey_s);
typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
					  struct btree *,
					  struct bkey_i *, struct bkey_i *);

/* Per key-type operations; see bch2_bkey_ops[] in bkey_methods.c: */
struct bkey_ops {
	/* Returns reason for being invalid if invalid, else NULL: */
	const char *	(*key_invalid)(const struct bch_fs *,
				       struct bkey_s_c);
	void		(*key_debugcheck)(struct bch_fs *, struct btree *,
					  struct bkey_s_c);
	void		(*val_to_text)(struct bch_fs *, char *,
				       size_t, struct bkey_s_c);
	void		(*swab)(const struct bkey_format *, struct bkey_packed *);
	key_filter_fn	key_normalize;
	key_merge_fn	key_merge;
	bool		is_extents;
};

const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
				  struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);

void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);

int bch2_bkey_to_text(char *, size_t, const struct bkey *);
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
		     char *, size_t, struct bkey_s_c);
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
			  char *, size_t, struct bkey_s_c);

void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
		    struct bkey_packed *);

extern const struct bkey_ops bch2_bkey_ops[];

#endif /* _BCACHEFS_BKEY_METHODS_H */

1849
fs/bcachefs/bset.c Normal file

File diff suppressed because it is too large Load Diff

668
fs/bcachefs/bset.h Normal file

File diff suppressed because it is too large Load Diff

941
fs/bcachefs/btree_cache.c Normal file

File diff suppressed because it is too large Load Diff

91
fs/bcachefs/btree_cache.h Normal file
View File

@@ -0,0 +1,91 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_CACHE_H
#define _BCACHEFS_BTREE_CACHE_H

#include "bcachefs.h"
#include "btree_types.h"
#include "extents.h"

struct btree_iter;

extern const char * const bch2_btree_ids[];

void bch2_recalc_btree_reserve(struct bch_fs *);

void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
				unsigned, enum btree_id);

void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);

struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);

struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
				  const struct bkey_i *, unsigned,
				  enum six_lock_type, bool);

struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
					  struct btree *, bool,
					  enum btree_node_sibling);

void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
			      const struct bkey_i *, unsigned);

void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_fs_btree_cache_init_early(struct btree_cache *);

/* First u64 of the node key's extent value - nonzero iff node is hashed: */
#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])

/* is btree node in hash table? */
static inline bool btree_node_hashed(struct btree *b)
{
	return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
}

#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
	for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,	\
					  &(_c)->btree_cache.table),	\
	     _iter = 0;	_iter < (_tbl)->size; _iter++)			\
		rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)

/* btree node size in bytes (btree_node_size option is in 512-byte sectors): */
static inline size_t btree_bytes(struct bch_fs *c)
{
	return c->opts.btree_node_size << 9;
}

static inline size_t btree_max_u64s(struct bch_fs *c)
{
	return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}

static inline size_t btree_page_order(struct bch_fs *c)
{
	return get_order(btree_bytes(c));
}

static inline size_t btree_pages(struct bch_fs *c)
{
	return 1 << btree_page_order(c);
}

static inline unsigned btree_blocks(struct bch_fs *c)
{
	return c->opts.btree_node_size >> c->block_bits;
}

#define BTREE_SPLIT_THRESHOLD(c)		(btree_blocks(c) * 3 / 4)

#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
	(BTREE_FOREGROUND_MERGE_THRESHOLD(c) +			\
	 (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))

#define btree_node_root(_c, _b)	((_c)->btree_roots[(_b)->btree_id].b)

int bch2_print_btree_node(struct bch_fs *, struct btree *,
			  char *, size_t);

#endif /* _BCACHEFS_BTREE_CACHE_H */

Some files were not shown because too many files have changed in this diff Show More