Files
linux-apfs-rw/snapshot.c
Ernesto A. Fernández c87d7aa77f Fix build for RHEL 9.6
The build has broken again for RHEL:

  https://github.com/linux-apfs/linux-apfs-rw/issues/95

Fix it.

Signed-off-by: Ernesto A. Fernández <ernesto@corellium.com>
2025-05-14 20:43:28 -03:00

688 lines
18 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2022 Ernesto A. Fernández <ernesto@corellium.com>
*/
#include <linux/mount.h>
#include <linux/slab.h>
#include "apfs.h"
/**
 * apfs_create_superblock_snapshot - Take a snapshot of the volume superblock
 * @sb:  superblock structure
 * @bno: on return, the block number for the new superblock copy
 *
 * Allocates a block, fills it with a copy of the current volume superblock,
 * joins it to the transaction, and then rewrites the object header and the
 * tree oids of the copy for its use inside a snapshot.
 *
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_create_superblock_snapshot(struct super_block *sb, u64 *bno)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_superblock *vsb_raw = sbi->s_vsb_raw;
	struct buffer_head *copy_bh = NULL;
	struct apfs_superblock *copy = NULL;
	int err;

	err = apfs_spaceman_allocate_block(sb, bno, true /* backwards */);
	if (err) {
		apfs_err(sb, "block allocation failed");
		return err;
	}

	copy_bh = apfs_getblk(sb, *bno);
	if (!copy_bh) {
		apfs_err(sb, "failed to map block for volume snap (0x%llx)", *bno);
		return -EIO;
	}

	/* Start out from an exact duplicate of the current volume superblock */
	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	memcpy(copy_bh->b_data, sbi->s_vobject.o_bh->b_data, sb->s_blocksize);

	err = apfs_transaction_join(sb, copy_bh);
	if (err)
		goto out;
	set_buffer_csum(copy_bh);

	copy = (struct apfs_superblock *)copy_bh->b_data;

	/* Inside a snapshot, the volume superblock is a physical object */
	copy->apfs_o.o_oid = cpu_to_le64p(bno);
	copy->apfs_o.o_type = cpu_to_le32(APFS_OBJ_PHYSICAL | APFS_OBJECT_TYPE_FS);

	/* The omap is shared with the current volume, so none is set here */
	copy->apfs_omap_oid = 0;

	/* The snapshot metadata is what provides the extent reference tree */
	copy->apfs_extentref_tree_oid = 0;

	/* Snapshots can't have snapshots of their own */
	copy->apfs_snap_meta_tree_oid = 0;

out:
	brelse(copy_bh);
	return err;
}
/**
 * apfs_create_snap_metadata_rec - Insert the metadata record of a new snapshot
 * @mntpoint:   inode of the mount point; source of the superblock and the time
 * @snap_root:  root node of the tree that will receive the record
 * @name:       label for the snapshot
 * @name_len:   length of @name, not counting the null byte
 * @sblock_oid: value to store as the superblock oid of the snapshot
 *
 * The record is keyed by the current transaction id; finding a record already
 * in place for that id is reported as corruption. One object id is taken from
 * the volume superblock to be used as the inode number of the record.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_snap_metadata_rec(struct inode *mntpoint, struct apfs_node *snap_root, const char *name, int name_len, u64 sblock_oid)
{
	struct super_block *sb = mntpoint->i_sb;
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	u64 xid = APFS_NXI(sb)->nx_xid;
	struct apfs_snap_metadata_key raw_key;
	struct apfs_snap_metadata_val *raw_val = NULL;
	struct apfs_query *query = NULL;
	struct timespec64 now;
	int val_len;
	int err;

	query = apfs_alloc_query(snap_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_snap_metadata_key(xid, &query->key);
	query->flags |= APFS_QUERY_SNAP_META | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	switch (err) {
	case -ENODATA:
		/* No record for this xid yet, which is the only way to go on */
		break;
	case 0:
		apfs_err(sb, "record exists for xid 0x%llx", xid);
		err = -EFSCORRUPTED;
		goto out;
	default:
		apfs_err(sb, "query failed for xid 0x%llx", xid);
		goto out;
	}

	/* The name is stored right after the fixed part of the value */
	val_len = sizeof(*raw_val) + name_len + 1;
	raw_val = kzalloc(val_len, GFP_KERNEL);
	if (!raw_val) {
		err = -ENOMEM;
		goto out;
	}

	raw_val->extentref_tree_oid = vsb_raw->apfs_extentref_tree_oid;
	raw_val->extentref_tree_type = vsb_raw->apfs_extentref_tree_type;
	raw_val->sblock_oid = cpu_to_le64(sblock_oid);
	raw_val->flags = 0;

	now = current_time(mntpoint);
	raw_val->create_time = cpu_to_le64(timespec64_to_ns(&now));
	raw_val->change_time = raw_val->create_time;

	raw_val->name_len = cpu_to_le16(name_len + 1); /* Count the null byte */
	strscpy(raw_val->name, name, name_len + 1);

	/* The record's inode number uses up the next available object id */
	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	raw_val->inum = vsb_raw->apfs_next_obj_id;
	le64_add_cpu(&vsb_raw->apfs_next_obj_id, 1);

	apfs_key_set_hdr(APFS_TYPE_SNAP_METADATA, xid, &raw_key);
	err = apfs_btree_insert(query, &raw_key, sizeof(raw_key), raw_val, val_len);
	if (err)
		apfs_err(sb, "insertion failed for xid 0x%llx", xid);

out:
	kfree(raw_val);
	apfs_free_query(query);
	return err;
}
/**
 * apfs_create_snap_name_rec - Insert the name record of a new snapshot
 * @snap_root: root node of the tree that will receive the record; also the
 *             source of the superblock
 * @name:      label for the snapshot
 * @name_len:  length of @name, not counting the null byte
 *
 * The record maps @name to the current transaction id. Finding a record
 * already in place for @name is reported as corruption.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_snap_name_rec(struct apfs_node *snap_root, const char *name, int name_len)
{
	struct super_block *sb = snap_root->object.sb;
	struct apfs_snap_name_key *raw_key = NULL;
	struct apfs_snap_name_val raw_val;
	struct apfs_query *query = NULL;
	int key_len;
	int err;

	query = apfs_alloc_query(snap_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_snap_name_key(name, &query->key);
	query->flags |= APFS_QUERY_SNAP_META | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	switch (err) {
	case -ENODATA:
		/* The name is not in use, so the insertion can go ahead */
		break;
	case 0:
		/* This should never happen here, the caller has checked */
		apfs_alert(sb, "snapshot name record already exists");
		err = -EFSCORRUPTED;
		goto out;
	default:
		apfs_err(sb, "query failed (%s)", name);
		goto out;
	}

	/* The name is stored right after the fixed part of the key */
	key_len = sizeof(*raw_key) + name_len + 1;
	raw_key = kzalloc(key_len, GFP_KERNEL);
	if (!raw_key) {
		err = -ENOMEM;
		goto out;
	}
	apfs_key_set_hdr(APFS_TYPE_SNAP_NAME, APFS_SNAP_NAME_OBJ_ID, raw_key);
	raw_key->name_len = cpu_to_le16(name_len + 1); /* Count the null byte */
	strscpy(raw_key->name, name, name_len + 1);

	raw_val.snap_xid = cpu_to_le64(APFS_NXI(sb)->nx_xid);

	err = apfs_btree_insert(query, raw_key, key_len, &raw_val, sizeof(raw_val));
	if (err)
		apfs_err(sb, "insertion failed (%s)", name);

out:
	kfree(raw_key);
	apfs_free_query(query);
	return err;
}
/**
 * apfs_create_snap_meta_records - Create the metadata records for a snapshot
 * @mntpoint:   inode of the mount point to snapshot
 * @name:       label for the snapshot
 * @name_len:   length of @name
 * @sblock_oid: oid to record for the snapshot's volume superblock
 *
 * Reads the snapshot metadata tree root for writing, stores the oid of the
 * resulting node back in the volume superblock, and then inserts the metadata
 * record followed by the name record.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_snap_meta_records(struct inode *mntpoint, const char *name, int name_len, u64 sblock_oid)
{
	struct super_block *sb = mntpoint->i_sb;
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	u64 root_oid = le64_to_cpu(vsb_raw->apfs_snap_meta_tree_oid);
	struct apfs_node *snap_root = NULL;
	int err;

	snap_root = apfs_read_node(sb, root_oid, APFS_OBJ_PHYSICAL, true /* write */);
	if (IS_ERR(snap_root)) {
		apfs_err(sb, "failed to read snap meta root 0x%llx", root_oid);
		return PTR_ERR(snap_root);
	}

	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	vsb_raw->apfs_snap_meta_tree_oid = cpu_to_le64(snap_root->object.oid);

	err = apfs_create_snap_metadata_rec(mntpoint, snap_root, name, name_len, sblock_oid);
	if (err) {
		apfs_err(sb, "meta rec creation failed");
	} else {
		err = apfs_create_snap_name_rec(snap_root, name, name_len);
		if (err)
			apfs_err(sb, "name rec creation failed");
	}

	apfs_node_free(snap_root);
	return err;
}
/**
 * apfs_create_new_extentref_tree - Point the volume to a new extentref tree
 * @sb: filesystem superblock
 *
 * Makes an empty btree root of type APFS_OBJECT_TYPE_BLOCKREFTREE and sets it
 * as the extent reference tree in the volume superblock.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_new_extentref_tree(struct super_block *sb)
{
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	u64 new_root_oid;
	int err;

	err = apfs_make_empty_btree_root(sb, APFS_OBJECT_TYPE_BLOCKREFTREE, &new_root_oid);
	if (err) {
		apfs_err(sb, "failed to make empty root");
		return err;
	}

	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	vsb_raw->apfs_extentref_tree_oid = cpu_to_le64(new_root_oid);
	return 0;
}
/**
 * apfs_update_omap_snap_tree - Add the current xid to the omap's snapshot tree
 * @sb:    filesystem superblock
 * @oid_p: pointer to the on-disk block number for the root node
 *
 * If @oid_p holds zero, an empty root is made for the tree first. Once the
 * insertion has been attempted, @oid_p is set to the block number of the root
 * node that was read.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_update_omap_snap_tree(struct super_block *sb, __le64 *oid_p)
{
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct apfs_omap_snapshot raw_val = {0};
	struct apfs_query *query = NULL;
	struct apfs_node *root = NULL;
	u64 root_oid = le64_to_cpup(oid_p);
	__le64 raw_key;
	u64 xid;
	int err;

	/* An empty snapshots tree may not even have a root yet */
	if (!root_oid) {
		err = apfs_make_empty_btree_root(sb, APFS_OBJECT_TYPE_OMAP_SNAPSHOT, &root_oid);
		if (err) {
			apfs_err(sb, "failed to make empty root");
			return err;
		}
	}

	root = apfs_read_node(sb, root_oid, APFS_OBJ_PHYSICAL, true /* write */);
	if (IS_ERR(root)) {
		apfs_err(sb, "failed to read omap snap root 0x%llx", root_oid);
		return PTR_ERR(root);
	}

	query = apfs_alloc_query(root, NULL /* parent */);
	if (!query) {
		err = -ENOMEM;
		goto out;
	}
	xid = nxi->nx_xid;
	apfs_init_omap_snap_key(xid, &query->key);
	query->flags = APFS_QUERY_OMAP_SNAP | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	switch (err) {
	case -ENODATA:
		/* No record for this xid yet, which is the only way to go on */
		break;
	case 0:
		apfs_err(sb, "record exists for xid 0x%llx", xid);
		err = -EFSCORRUPTED;
		goto out;
	default:
		apfs_err(sb, "query failed for xid 0x%llx", xid);
		goto out;
	}

	raw_key = cpu_to_le64(xid);
	err = apfs_btree_insert(query, &raw_key, sizeof(raw_key), &raw_val, sizeof(raw_val));
	if (err)
		apfs_err(sb, "insertion failed for xid 0x%llx", xid);

	/* This gets written whether or not the insertion has succeeded */
	*oid_p = cpu_to_le64(root->object.block_nr);

out:
	apfs_free_query(query);
	apfs_node_free(root);
	return err;
}
/**
 * apfs_update_omap_snapshots - Add the current xid to the omap's snapshots
 * @sb: filesystem superblock
 *
 * Reads the volume's omap block for writing, increases its snapshot count,
 * sets the current xid as its most recent snapshot, and adds that xid to its
 * snapshot tree.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_update_omap_snapshots(struct super_block *sb)
{
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	u64 xid = APFS_NXI(sb)->nx_xid;
	u64 omap_blk = le64_to_cpu(vsb_raw->apfs_omap_oid);
	struct apfs_omap_phys *omap_raw = NULL;
	struct buffer_head *omap_bh = NULL;
	int err;

	omap_bh = apfs_read_object_block(sb, omap_blk, true /* write */, false /* preserve */);
	if (IS_ERR(omap_bh)) {
		apfs_err(sb, "CoW failed for bno 0x%llx", omap_blk);
		return PTR_ERR(omap_bh);
	}
	omap_raw = (struct apfs_omap_phys *)omap_bh->b_data;

	apfs_assert_in_transaction(sb, &omap_raw->om_o);
	le32_add_cpu(&omap_raw->om_snap_count, 1);
	omap_raw->om_most_recent_snap = cpu_to_le64(xid);

	/* The tree update writes the root's location straight into the omap */
	err = apfs_update_omap_snap_tree(sb, &omap_raw->om_snapshot_tree_oid);
	if (err)
		apfs_err(sb, "omap snap tree update failed");

	brelse(omap_bh);
	return err;
}
/**
 * apfs_snapshot_name_check - Check if a snapshot with the given name exists
 * @sb:       filesystem superblock
 * @name:     name to check
 * @name_len: length of @name (currently not used by the lookup)
 * @eexist:   on return, whether the name exists or not
 *
 * The snapshot metadata tree is only read here, never modified. @eexist is
 * left untouched if the function fails.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_snapshot_name_check(struct super_block *sb, const char *name, int name_len, bool *eexist)
{
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	u64 root_oid = le64_to_cpu(vsb_raw->apfs_snap_meta_tree_oid);
	struct apfs_node *snap_root = NULL;
	struct apfs_query *query = NULL;
	int err;

	snap_root = apfs_read_node(sb, root_oid, APFS_OBJ_PHYSICAL, false /* write */);
	if (IS_ERR(snap_root)) {
		apfs_err(sb, "failed to read snap meta root 0x%llx", root_oid);
		return PTR_ERR(snap_root);
	}

	query = apfs_alloc_query(snap_root, NULL /* parent */);
	if (!query) {
		err = -ENOMEM;
		goto out;
	}
	apfs_init_snap_name_key(name, &query->key);
	query->flags |= APFS_QUERY_SNAP_META | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	switch (err) {
	case 0:
		*eexist = true;
		break;
	case -ENODATA:
		/* A missing record is a valid answer, not a failure */
		*eexist = false;
		err = 0;
		break;
	default:
		apfs_err(sb, "query failed (%s)", name);
		break;
	}

out:
	apfs_free_query(query);
	apfs_node_free(snap_root);
	return err;
}
/**
 * apfs_do_ioc_take_snapshot - Actual work for apfs_ioc_take_snapshot()
 * @mntpoint: inode of the mount point to snapshot
 * @name: label for the snapshot
 * @name_len: length of @name
 *
 * All the work is done inside a single transaction. If a snapshot called @name
 * already exists, the transaction gets committed without further changes and
 * -EEXIST is returned. Any other failure aborts the transaction.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_do_ioc_take_snapshot(struct inode *mntpoint, const char *name, int name_len)
{
struct super_block *sb = mntpoint->i_sb;
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_superblock *vsb_raw = NULL;
struct apfs_omap *omap = sbi->s_omap;
u64 sblock_oid;
bool eexist = false;
int err;
err = apfs_transaction_start(sb, APFS_TRANS_REG);
if (err)
return err;
/*
 * This check can fail in normal operation, so run it before making any
 * changes and exit the transaction cleanly if necessary. The name
 * lookup will have to be repeated later, but it's ok because I don't
 * expect snapshot creation to be a bottleneck for anyone.
 */
err = apfs_snapshot_name_check(sb, name, name_len, &eexist);
if (err) {
apfs_err(sb, "snapshot name existence check failed");
goto fail;
}
if (eexist) {
/* Nothing has been changed yet, so a regular commit is enough */
err = apfs_transaction_commit(sb);
if (err)
goto fail;
apfs_warn(sb, "snapshot name already in use (%s)", name);
return -EEXIST;
}
/*
 * Flush the extent caches to the extentref tree before it gets moved to
 * the snapshot. It seems safer in general to avoid big unpredictable
 * changes to the layout after the snapshot is set up.
 */
err = apfs_transaction_flush_all_inodes(sb);
if (err) {
apfs_err(sb, "failed to flush all inodes");
goto fail;
}
/* Copy the volume superblock; sblock_oid gets the block of the copy */
err = apfs_create_superblock_snapshot(sb, &sblock_oid);
if (err) {
apfs_err(sb, "failed to snapshot superblock");
goto fail;
}
/* Insert the metadata and name records for the new snapshot */
err = apfs_create_snap_meta_records(mntpoint, name, name_len, sblock_oid);
if (err) {
apfs_err(sb, "failed to create snap meta records");
goto fail;
}
/*
 * The metadata record has already saved the oid of the old extentref
 * tree, so the volume can now switch to a new empty one.
 */
err = apfs_create_new_extentref_tree(sb);
if (err) {
apfs_err(sb, "failed to create new extref tree");
goto fail;
}
err = apfs_update_omap_snapshots(sb);
if (err) {
apfs_err(sb, "failed to update omap snapshots");
goto fail;
}
/*
 * The official reference allows old implementations to ignore extended
 * snapshot metadata, so I don't see any reason why we can't do the
 * same for now.
 */
/* The volume superblock pointer is only fetched at this late point */
vsb_raw = sbi->s_vsb_raw;
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_num_snapshots, 1);
/* Record the current xid as the latest snapshot for the omap structure */
omap->omap_latest_snap = APFS_NXI(sb)->nx_xid;
/* NOTE(review): presumably makes the commit below go to disk right away - confirm */
sbi->s_nxi->nx_transaction.t_state |= APFS_NX_TRANS_FORCE_COMMIT;
err = apfs_transaction_commit(sb);
if (err)
goto fail;
return 0;
fail:
/* Also reached when one of the commits above has failed */
apfs_transaction_abort(sb);
return err;
}
/**
 * apfs_ioc_take_snapshot - Ioctl handler for APFS_IOC_CREATE_SNAPSHOT
 * @file: affected file
 * @user_arg: ioctl argument, a userspace pointer to a struct apfs_ioctl_snap_name
 *
 * The request is only accepted on the root directory inode (-ENOTTY for any
 * other), and only from a caller that passes inode_owner_or_capable() (-EPERM
 * otherwise). The snapshot name must be null-terminated inside the name field
 * of the argument, or -EINVAL is returned.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
int apfs_ioc_take_snapshot(struct file *file, void __user *user_arg)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct apfs_ioctl_snap_name *arg = NULL;
size_t name_len;
int err;
if (apfs_ino(inode) != APFS_ROOT_DIR_INO_NUM) {
apfs_info(sb, "snapshot must be requested on mountpoint");
return -ENOTTY;
}
/*
 * The arguments of inode_owner_or_capable() depend on the kernel version,
 * and RHEL 9.6 or later takes the same form as kernel 6.3. Each branch
 * supplies the condition for the single return statement that follows.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
if (!inode_owner_or_capable(inode))
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
if (!inode_owner_or_capable(&init_user_ns, inode))
#else
if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
#endif
return -EPERM;
/* Paired with the mnt_drop_write_file() call under the fail label */
err = mnt_want_write_file(file);
if (err)
return err;
/* Work on a kernel copy of the whole ioctl argument */
arg = kzalloc(sizeof(*arg), GFP_KERNEL);
if (!arg) {
err = -ENOMEM;
goto fail;
}
if (copy_from_user(arg, user_arg, sizeof(*arg))) {
err = -EFAULT;
goto fail;
}
/* No null byte anywhere in the field means the name is unterminated */
name_len = strnlen(arg->name, sizeof(arg->name));
if (name_len == sizeof(arg->name)) {
apfs_warn(sb, "snapshot name is too long (%d)", (int)name_len);
err = -EINVAL;
goto fail;
}
err = apfs_do_ioc_take_snapshot(inode, arg->name, name_len);
/* The success path falls through to this label for the cleanup as well */
fail:
kfree(arg);
arg = NULL;
mnt_drop_write_file(file);
return err;
}
/**
 * apfs_snap_xid_from_query - Read the xid from the value found by a query
 * @query: query that has located the record
 * @xid:   on return, the xid stored in the value
 *
 * The value must be exactly the size of a single __le64.
 *
 * Returns 0 on success, or -EFSCORRUPTED if the value has the wrong length.
 */
static int apfs_snap_xid_from_query(struct apfs_query *query, u64 *xid)
{
	struct apfs_node *node = query->node;
	char *node_data = node->object.data;

	if (query->len == sizeof(__le64)) {
		*xid = le64_to_cpup((__le64 *)(node_data + query->off));
		return 0;
	}

	apfs_err(node->object.sb, "bad value length (%d)", query->len);
	return -EFSCORRUPTED;
}
/**
 * apfs_snapshot_name_to_xid - Look up the xid recorded for a snapshot name
 * @snap_root: root node of the tree to search; also the source of the
 *             superblock
 * @name:      name of the snapshot
 * @xid:       on return, the xid found in the name record
 *
 * Returns 0 on success, or a negative error code in case of failure. The
 * -ENODATA result of the query is passed on without logging an error.
 */
static int apfs_snapshot_name_to_xid(struct apfs_node *snap_root, const char *name, u64 *xid)
{
	struct super_block *sb = snap_root->object.sb;
	struct apfs_query *query = NULL;
	int err;

	query = apfs_alloc_query(snap_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_snap_name_key(name, &query->key);
	query->flags |= APFS_QUERY_SNAP_META | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	if (err == -ENODATA)
		goto out;
	if (err) {
		apfs_err(sb, "query failed (%s)", name);
		goto out;
	}

	err = apfs_snap_xid_from_query(query, xid);
	if (err)
		apfs_err(sb, "bad snap name record (%s)", name);

out:
	apfs_free_query(query);
	return err;
}
/**
 * apfs_snap_sblock_from_query - Read the superblock oid found by a query
 * @query:      query that has located the snapshot metadata record
 * @sblock_oid: on return, the superblock oid stored in the value
 *
 * Returns 0 on success, -EFSCORRUPTED if the value is too short to hold a
 * struct apfs_snap_metadata_val, or -EOPNOTSUPP if the record has the
 * APFS_SNAP_META_PENDING_DATALESS flag set.
 */
static int apfs_snap_sblock_from_query(struct apfs_query *query, u64 *sblock_oid)
{
	struct apfs_node *node = query->node;
	struct super_block *sb = node->object.sb;
	char *node_data = node->object.data;
	struct apfs_snap_metadata_val *meta = NULL;
	u32 flags;

	if (query->len < sizeof(*meta)) {
		apfs_err(sb, "bad value length (%d)", query->len);
		return -EFSCORRUPTED;
	}
	meta = (struct apfs_snap_metadata_val *)(node_data + query->off);

	/* I don't know anything about these snapshots, can they be mounted? */
	flags = le32_to_cpu(meta->flags);
	if (flags & APFS_SNAP_META_PENDING_DATALESS) {
		apfs_warn(sb, "the requested snapshot is dataless");
		return -EOPNOTSUPP;
	}

	*sblock_oid = le64_to_cpu(meta->sblock_oid);
	return 0;
}
/**
 * apfs_snapshot_xid_to_sblock - Look up the superblock oid for a snapshot xid
 * @snap_root:  root node of the tree to search; also the source of the
 *              superblock
 * @xid:        transaction id of the snapshot
 * @sblock_oid: on return, the superblock oid found in the metadata record
 *
 * Returns 0 on success, or a negative error code in case of failure. Every
 * query failure gets logged as an error here, a missing record included.
 */
static int apfs_snapshot_xid_to_sblock(struct apfs_node *snap_root, u64 xid, u64 *sblock_oid)
{
	struct super_block *sb = snap_root->object.sb;
	struct apfs_query *query = NULL;
	int err;

	query = apfs_alloc_query(snap_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_snap_metadata_key(xid, &query->key);
	query->flags |= APFS_QUERY_SNAP_META | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	if (err) {
		apfs_err(sb, "query failed for xid 0x%llx", xid);
	} else {
		err = apfs_snap_sblock_from_query(query, sblock_oid);
		if (err)
			apfs_err(sb, "bad snap meta record for xid 0x%llx", xid);
	}

	apfs_free_query(query);
	return err;
}
/**
 * apfs_switch_to_snapshot - Start working with the snapshot volume superblock
 * @sb: superblock structure
 *
 * Maps the volume superblock from the snapshot specified in the mount options.
 * The snapshot name is resolved to an xid, which gets saved in the superblock
 * info, and the xid is then resolved to the block of the superblock to map.
 * The filesystem is asserted to be read-only.
 *
 * Returns 0 on success or a negative error code in case of failure.
 */
int apfs_switch_to_snapshot(struct super_block *sb)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	const char *name = sbi->s_snap_name;
	struct apfs_node *snap_root = NULL;
	u64 snap_meta_oid;
	u64 sblock_oid = 0;
	u64 xid = 0;
	int err;

	ASSERT(sb->s_flags & SB_RDONLY);

	if (strlen(name) > APFS_SNAP_MAX_NAMELEN) {
		apfs_warn(sb, "snapshot name is too long");
		return -EINVAL;
	}

	/*
	 * Don't hold on to a pointer to the current volume superblock: it is
	 * going to get unmapped further down.
	 */
	snap_meta_oid = le64_to_cpu(sbi->s_vsb_raw->apfs_snap_meta_tree_oid);
	snap_root = apfs_read_node(sb, snap_meta_oid, APFS_OBJ_PHYSICAL, false /* write */);
	if (IS_ERR(snap_root)) {
		apfs_err(sb, "failed to read snap meta root 0x%llx", snap_meta_oid);
		return PTR_ERR(snap_root);
	}

	err = apfs_snapshot_name_to_xid(snap_root, name, &xid);
	if (err == -ENODATA)
		apfs_info(sb, "no snapshot under that name (%s)", name);
	if (err)
		goto out;
	sbi->s_snap_xid = xid;

	err = apfs_snapshot_xid_to_sblock(snap_root, xid, &sblock_oid);
	if (err)
		goto out;

	/* Replace the current volume superblock with the snapshot's copy */
	apfs_unmap_volume_super(sb);
	err = apfs_map_volume_super_bno(sb, sblock_oid, nxi->nx_flags & APFS_CHECK_NODES);
	if (err)
		apfs_err(sb, "failed to map volume block 0x%llx", sblock_oid);

out:
	apfs_node_free(snap_root);
	return err;
}