Files

2070 lines
60 KiB
C
Raw Permalink Normal View History

2021-11-16 18:36:02 -03:00
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
*/
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include "apfs.h"
/**
* apfs_node_is_valid - Check basic sanity of the node index
* @sb: filesystem superblock
* @node: node to check
*
* Verifies that the node index fits in a single block, and that the number
* of records fits in the index. Without this check a crafted filesystem could
* pretend to have too many records, and calls to apfs_node_locate_key() and
2024-12-04 20:32:26 -03:00
* apfs_node_locate_value() would read beyond the limits of the node.
*/
static bool apfs_node_is_valid(struct super_block *sb,
struct apfs_node *node)
{
2022-02-03 18:17:11 -03:00
u32 records = node->records;
int index_size = node->key - sizeof(struct apfs_btree_node_phys);
int entry_size;
if (node->key > sb->s_blocksize)
return false;
entry_size = (apfs_node_has_fixed_kv_size(node)) ?
sizeof(struct apfs_kvoff) : sizeof(struct apfs_kvloc);
2022-02-03 18:17:11 -03:00
/* Coarse bound to prevent multiplication overflow in final check */
if (records > 1 << 16)
return false;
return records * entry_size <= index_size;
}
2022-02-25 21:09:26 -03:00
void apfs_node_free(struct apfs_node *node)
{
2023-12-29 23:48:30 -03:00
struct apfs_object *obj = NULL;
if (!node)
return;
obj = &node->object;
if (obj->o_bh) {
brelse(obj->o_bh);
obj->o_bh = NULL;
2024-02-22 20:28:20 -03:00
} else if (!obj->ephemeral) {
/* Ephemeral data always remains in memory */
kfree(obj->data);
}
2024-02-22 20:28:20 -03:00
obj->data = NULL;
kfree(node);
}
/**
* apfs_read_node - Read a node header from disk
* @sb: filesystem superblock
* @oid: object id for the node
* @storage: storage type for the node object
* @write: request write access?
*
* Returns ERR_PTR in case of failure, otherwise return a pointer to the
* resulting apfs_node structure with the initial reference taken.
*
* For now we assume the node has not been read before.
*/
struct apfs_node *apfs_read_node(struct super_block *sb, u64 oid, u32 storage,
bool write)
{
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct buffer_head *bh = NULL;
2024-02-22 20:28:20 -03:00
struct apfs_ephemeral_object_info *eph_info = NULL;
struct apfs_btree_node_phys *raw = NULL;
struct apfs_node *node = NULL;
struct apfs_nloc *free_head = NULL;
u64 bno;
int err;
switch (storage) {
case APFS_OBJ_VIRTUAL:
/* All virtual nodes are inside a volume, at least for now */
err = apfs_omap_lookup_block(sb, sbi->s_omap, oid, &bno, write);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "omap lookup failed for oid 0x%llx", oid);
return ERR_PTR(err);
2023-03-24 22:07:47 -03:00
}
/* CoW has already been done, don't worry about snapshots */
bh = apfs_read_object_block(sb, bno, write, false /* preserve */);
2023-03-24 22:07:47 -03:00
if (IS_ERR(bh)) {
apfs_err(sb, "object read failed for bno 0x%llx", bno);
return (void *)bh;
2023-03-24 22:07:47 -03:00
}
2024-02-22 20:28:20 -03:00
bno = bh->b_blocknr;
raw = (struct apfs_btree_node_phys *)bh->b_data;
break;
case APFS_OBJ_PHYSICAL:
bh = apfs_read_object_block(sb, oid, write, false /* preserve */);
2023-03-24 22:07:47 -03:00
if (IS_ERR(bh)) {
apfs_err(sb, "object read failed for bno 0x%llx", oid);
return (void *)bh;
2023-03-24 22:07:47 -03:00
}
2024-02-22 20:28:20 -03:00
bno = oid = bh->b_blocknr;
raw = (struct apfs_btree_node_phys *)bh->b_data;
break;
case APFS_OBJ_EPHEMERAL:
2024-02-22 20:28:20 -03:00
/* Ephemeral objects are already in memory */
eph_info = apfs_ephemeral_object_lookup(sb, oid);
if (IS_ERR(eph_info)) {
apfs_err(sb, "no ephemeral node for oid 0x%llx", oid);
return (void *)eph_info;
2023-03-24 22:07:47 -03:00
}
2024-02-22 20:28:20 -03:00
if (eph_info->size != sb->s_blocksize) {
apfs_err(sb, "unsupported size for ephemeral node (%u)", eph_info->size);
return ERR_PTR(-EOPNOTSUPP);
}
bno = 0; /* In memory, so meaningless */
raw = eph_info->object;
/* Only for consistency, will happen again on commit */
if (write)
raw->btn_o.o_xid = cpu_to_le64(nxi->nx_xid);
break;
default:
apfs_alert(sb, "invalid storage type %u - bug!", storage);
return ERR_PTR(-EINVAL);
}
node = kmalloc(sizeof(*node), GFP_KERNEL);
if (!node) {
brelse(bh);
return ERR_PTR(-ENOMEM);
}
node->tree_type = le32_to_cpu(raw->btn_o.o_subtype);
node->flags = le16_to_cpu(raw->btn_flags);
node->records = le32_to_cpu(raw->btn_nkeys);
node->key = sizeof(*raw) + le16_to_cpu(raw->btn_table_space.off)
+ le16_to_cpu(raw->btn_table_space.len);
node->free = node->key + le16_to_cpu(raw->btn_free_space.off);
2024-12-04 20:32:26 -03:00
node->val = node->free + le16_to_cpu(raw->btn_free_space.len);
free_head = &raw->btn_key_free_list;
node->key_free_list_len = le16_to_cpu(free_head->len);
free_head = &raw->btn_val_free_list;
node->val_free_list_len = le16_to_cpu(free_head->len);
node->object.sb = sb;
2024-02-22 20:28:20 -03:00
node->object.block_nr = bno;
node->object.oid = oid;
node->object.o_bh = bh;
2024-02-22 20:28:20 -03:00
node->object.data = (char *)raw;
node->object.ephemeral = !bh;
2024-02-22 20:28:20 -03:00
/* Ephemeral objects already got checked on mount */
if (!node->object.ephemeral && nxi->nx_flags & APFS_CHECK_NODES && !apfs_obj_verify_csum(sb, bh)) {
/* TODO: don't check this twice for virtual/physical objects */
2024-02-22 20:28:20 -03:00
apfs_err(sb, "bad checksum for node in block 0x%llx", (unsigned long long)bno);
2022-02-25 21:09:26 -03:00
apfs_node_free(node);
return ERR_PTR(-EFSBADCRC);
}
if (!apfs_node_is_valid(sb, node)) {
2024-02-22 20:28:20 -03:00
apfs_err(sb, "bad node in block 0x%llx", (unsigned long long)bno);
2022-02-25 21:09:26 -03:00
apfs_node_free(node);
return ERR_PTR(-EFSCORRUPTED);
}
return node;
}
/**
2022-09-26 22:24:32 -03:00
* apfs_node_min_table_size - Return the minimum size for a node's toc
* @sb: superblock structure
* @type: tree type for the node
* @flags: flags for the node
*/
static int apfs_node_min_table_size(struct super_block *sb, u32 type, u16 flags)
{
bool leaf = flags & APFS_BTNODE_LEAF;
int key_size, val_size, toc_size;
int space, count;
/* Preallocate the whole table for trees with fixed key/value sizes */
switch (type) {
case APFS_OBJECT_TYPE_OMAP:
key_size = sizeof(struct apfs_omap_key);
val_size = leaf ? sizeof(struct apfs_omap_val) : sizeof(__le64);
toc_size = sizeof(struct apfs_kvoff);
break;
case APFS_OBJECT_TYPE_SPACEMAN_FREE_QUEUE:
key_size = sizeof(struct apfs_spaceman_free_queue_key);
val_size = sizeof(__le64); /* We assume no ghosts here */
toc_size = sizeof(struct apfs_kvoff);
break;
case APFS_OBJECT_TYPE_OMAP_SNAPSHOT:
key_size = sizeof(__le64);
val_size = leaf ? sizeof(struct apfs_omap_snapshot) : sizeof(__le64);
toc_size = sizeof(struct apfs_kvoff);
break;
case APFS_OBJECT_TYPE_FEXT_TREE:
key_size = sizeof(struct apfs_fext_tree_key);
val_size = leaf ? sizeof(struct apfs_fext_tree_val) : sizeof(__le64);
toc_size = sizeof(struct apfs_kvoff);
break;
default:
/* Make room for one record at least */
toc_size = sizeof(struct apfs_kvloc);
return APFS_BTREE_TOC_ENTRY_INCREMENT * toc_size;
}
/* The footer of root nodes is ignored for some reason */
space = sb->s_blocksize - sizeof(struct apfs_btree_node_phys);
count = space / (key_size + val_size + toc_size);
return count * toc_size;
}
2022-09-26 22:24:32 -03:00
/**
* apfs_set_empty_btree_info - Set the info footer for an empty b-tree node
* @sb: filesystem superblock
* @info: pointer to the on-disk info footer
* @subtype: subtype of the root node, i.e., tree type
*
* For now only supports the extent reference tree.
*/
static void apfs_set_empty_btree_info(struct super_block *sb, struct apfs_btree_info *info, u32 subtype)
{
u32 flags;
ASSERT(subtype == APFS_OBJECT_TYPE_BLOCKREFTREE || subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT);
memset(info, 0, sizeof(*info));
flags = APFS_BTREE_PHYSICAL;
if (subtype == APFS_OBJECT_TYPE_BLOCKREFTREE)
flags |= APFS_BTREE_KV_NONALIGNED;
info->bt_fixed.bt_flags = cpu_to_le32(flags);
info->bt_fixed.bt_node_size = cpu_to_le32(sb->s_blocksize);
info->bt_key_count = 0;
info->bt_node_count = cpu_to_le64(1); /* Only one node: the root */
if (subtype == APFS_OBJECT_TYPE_BLOCKREFTREE)
return;
info->bt_fixed.bt_key_size = cpu_to_le32(8);
info->bt_longest_key = info->bt_fixed.bt_key_size;
info->bt_fixed.bt_val_size = cpu_to_le32(sizeof(struct apfs_omap_snapshot));
info->bt_longest_val = info->bt_fixed.bt_val_size;
}
/**
* apfs_make_empty_btree_root - Make an empty root for a b-tree
* @sb: filesystem superblock
* @subtype: subtype of the root node, i.e., tree type
* @oid: on return, the root's object id
*
* For now only supports the extent reference tree and an omap's snapshot tree.
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_make_empty_btree_root(struct super_block *sb, u32 subtype, u64 *oid)
{
struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
struct apfs_btree_node_phys *root = NULL;
struct buffer_head *bh = NULL;
u64 bno;
u16 flags;
int toc_len, free_len, head_len, info_len;
int err;
ASSERT(subtype == APFS_OBJECT_TYPE_BLOCKREFTREE || subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT);
err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "block allocation failed");
2022-09-26 22:24:32 -03:00
return err;
2023-03-24 22:07:47 -03:00
}
2022-09-26 22:24:32 -03:00
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);
2022-09-26 22:24:32 -03:00
bh = apfs_getblk(sb, bno);
if (!bh)
return -EIO;
root = (void *)bh->b_data;
err = apfs_transaction_join(sb, bh);
if (err)
goto fail;
set_buffer_csum(bh);
flags = APFS_BTNODE_ROOT | APFS_BTNODE_LEAF;
if (subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT)
flags |= APFS_BTNODE_FIXED_KV_SIZE;
root->btn_flags = cpu_to_le16(flags);
toc_len = apfs_node_min_table_size(sb, subtype, flags);
head_len = sizeof(*root);
info_len = sizeof(struct apfs_btree_info);
free_len = sb->s_blocksize - head_len - toc_len - info_len;
root->btn_level = 0; /* Root */
/* No keys and no values, so this is straightforward */
root->btn_nkeys = 0;
root->btn_table_space.off = 0;
root->btn_table_space.len = cpu_to_le16(toc_len);
root->btn_free_space.off = 0;
root->btn_free_space.len = cpu_to_le16(free_len);
/* No fragmentation */
root->btn_key_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
root->btn_key_free_list.len = 0;
root->btn_val_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
root->btn_val_free_list.len = 0;
apfs_set_empty_btree_info(sb, (void *)root + sb->s_blocksize - info_len, subtype);
root->btn_o.o_oid = cpu_to_le64(bno);
root->btn_o.o_xid = cpu_to_le64(APFS_NXI(sb)->nx_xid);
root->btn_o.o_type = cpu_to_le32(APFS_OBJECT_TYPE_BTREE | APFS_OBJ_PHYSICAL);
root->btn_o.o_subtype = cpu_to_le32(subtype);
*oid = bno;
err = 0;
fail:
root = NULL;
brelse(bh);
bh = NULL;
return err;
}
/**
* apfs_create_node - Allocates a new nonroot b-tree node on disk
* @sb: filesystem superblock
* @storage: storage type for the node object
*
* On success returns a pointer to the new in-memory node structure; the object
* header is initialized, and the node fields are given reasonable defaults.
* On failure, returns an error pointer.
*/
static struct apfs_node *apfs_create_node(struct super_block *sb, u32 storage)
{
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_nx_superblock *msb_raw = nxi->nx_raw;
struct apfs_superblock *vsb_raw = sbi->s_vsb_raw;
2024-02-22 20:28:20 -03:00
struct apfs_ephemeral_object_info *eph_info = NULL;
struct apfs_node *node = NULL;
struct buffer_head *bh = NULL;
struct apfs_btree_node_phys *raw = NULL;
u64 bno, oid;
int err;
switch (storage) {
case APFS_OBJ_VIRTUAL:
err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "block allocation failed");
return ERR_PTR(err);
2023-03-24 22:07:47 -03:00
}
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);
oid = le64_to_cpu(msb_raw->nx_next_oid);
le64_add_cpu(&msb_raw->nx_next_oid, 1);
err = apfs_create_omap_rec(sb, oid, bno);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "omap rec creation failed (0x%llx-0x%llx)", oid, bno);
return ERR_PTR(err);
2023-03-24 22:07:47 -03:00
}
break;
case APFS_OBJ_PHYSICAL:
err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "block allocation failed");
return ERR_PTR(err);
2023-03-24 22:07:47 -03:00
}
/* We don't write to the container's omap */
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);
oid = bno;
break;
case APFS_OBJ_EPHEMERAL:
2024-02-22 20:28:20 -03:00
if (nxi->nx_eph_count >= APFS_EPHEMERAL_LIST_LIMIT) {
apfs_err(sb, "creating too many ephemeral objects?");
return ERR_PTR(-EOPNOTSUPP);
2023-03-24 22:07:47 -03:00
}
2024-02-22 20:28:20 -03:00
eph_info = &nxi->nx_eph_list[nxi->nx_eph_count++];
eph_info->object = kzalloc(sb->s_blocksize, GFP_KERNEL);
if (!eph_info->object)
return ERR_PTR(-ENOMEM);
eph_info->size = sb->s_blocksize;
oid = eph_info->oid = le64_to_cpu(msb_raw->nx_next_oid);
le64_add_cpu(&msb_raw->nx_next_oid, 1);
break;
default:
apfs_alert(sb, "invalid storage type %u - bug!", storage);
return ERR_PTR(-EINVAL);
}
2024-02-22 20:28:20 -03:00
if (storage == APFS_OBJ_EPHEMERAL) {
bh = NULL;
bno = 0;
raw = eph_info->object;
} else {
bh = apfs_getblk(sb, bno);
if (!bh)
return ERR_PTR(-EIO);
bno = bh->b_blocknr;
raw = (void *)bh->b_data;
err = apfs_transaction_join(sb, bh);
if (err)
goto fail;
set_buffer_csum(bh);
}
/* Set most of the object header, but the subtype is up to the caller */
raw->btn_o.o_oid = cpu_to_le64(oid);
raw->btn_o.o_xid = cpu_to_le64(nxi->nx_xid);
raw->btn_o.o_type = cpu_to_le32(storage | APFS_OBJECT_TYPE_BTREE_NODE);
raw->btn_o.o_subtype = 0;
/* The caller is expected to change most node fields */
raw->btn_flags = 0;
raw->btn_level = 0;
raw->btn_nkeys = 0;
raw->btn_table_space.off = 0; /* Put the toc right after the header */
raw->btn_table_space.len = 0;
raw->btn_free_space.off = 0;
raw->btn_free_space.len = cpu_to_le16(sb->s_blocksize - sizeof(*raw));
raw->btn_key_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
raw->btn_key_free_list.len = 0;
raw->btn_val_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
raw->btn_val_free_list.len = 0;
node = kmalloc(sizeof(*node), GFP_KERNEL);
if (!node) {
err = -ENOMEM;
goto fail;
}
node->object.sb = sb;
2024-02-22 20:28:20 -03:00
node->object.block_nr = bno;
node->object.oid = oid;
node->object.o_bh = bh;
2024-02-22 20:28:20 -03:00
node->object.data = (char *)raw;
node->object.ephemeral = !bh;
return node;
fail:
2024-02-22 20:28:20 -03:00
if (storage == APFS_OBJ_EPHEMERAL)
kfree(raw);
else
brelse(bh);
raw = NULL;
bh = NULL;
return ERR_PTR(err);
}
/**
* apfs_delete_node - Deletes a nonroot node from disk
* @node: node to delete
* @type: tree type for the query that found the node
*
* Does nothing to the in-memory node structure. Returns 0 on success, or a
* negative error code in case of failure.
*/
int apfs_delete_node(struct apfs_node *node, int type)
{
struct super_block *sb = node->object.sb;
2024-02-22 20:28:20 -03:00
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_superblock *vsb_raw;
u64 oid = node->object.oid;
u64 bno = node->object.block_nr;
2024-02-22 20:28:20 -03:00
struct apfs_ephemeral_object_info *eph_info = NULL, *eph_info_end = NULL;
int err;
switch (type) {
case APFS_QUERY_CAT:
err = apfs_free_queue_insert(sb, bno, 1);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "free queue insertion failed for 0x%llx", bno);
return err;
2023-03-24 22:07:47 -03:00
}
err = apfs_delete_omap_rec(sb, oid);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "omap rec deletion failed (0x%llx)", oid);
return err;
2023-03-24 22:07:47 -03:00
}
vsb_raw = APFS_SB(sb)->s_vsb_raw;
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, -1);
le64_add_cpu(&vsb_raw->apfs_total_blocks_freed, 1);
2021-04-10 01:21:51 -03:00
return 0;
case APFS_QUERY_OMAP:
case APFS_QUERY_EXTENTREF:
2022-09-26 22:24:32 -03:00
case APFS_QUERY_SNAP_META:
err = apfs_free_queue_insert(sb, bno, 1);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "free queue insertion failed for 0x%llx", bno);
return err;
2023-03-24 22:07:47 -03:00
}
/* We don't write to the container's omap */
vsb_raw = APFS_SB(sb)->s_vsb_raw;
apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, -1);
le64_add_cpu(&vsb_raw->apfs_total_blocks_freed, 1);
2021-04-10 01:21:51 -03:00
return 0;
case APFS_QUERY_FREE_QUEUE:
2024-02-22 20:28:20 -03:00
eph_info_end = &nxi->nx_eph_list[nxi->nx_eph_count];
eph_info = apfs_ephemeral_object_lookup(sb, node->object.oid);
if (IS_ERR(eph_info)) {
apfs_alert(sb, "can't find ephemeral object to delete");
return PTR_ERR(eph_info);
2023-03-24 22:07:47 -03:00
}
2024-02-22 20:28:20 -03:00
kfree(eph_info->object);
eph_info->object = NULL;
memmove(eph_info, eph_info + 1, (char *)eph_info_end - (char *)(eph_info + 1));
eph_info_end->object = NULL;
--nxi->nx_eph_count;
2021-04-10 01:21:51 -03:00
return 0;
default:
apfs_alert(sb, "new query type must implement node deletion (%d)", type);
return -EOPNOTSUPP;
}
}
/**
* apfs_update_node - Update an existing node header
* @node: the modified in-memory node
*/
void apfs_update_node(struct apfs_node *node)
{
struct super_block *sb = node->object.sb;
struct buffer_head *bh = node->object.o_bh;
struct apfs_btree_node_phys *raw = (void *)node->object.data;
struct apfs_nloc *free_head;
u32 tflags, type;
int toc_off;
apfs_assert_in_transaction(sb, &raw->btn_o);
raw->btn_o.o_oid = cpu_to_le64(node->object.oid);
/* The node may no longer be a root, so update the object type */
tflags = le32_to_cpu(raw->btn_o.o_type) & APFS_OBJECT_TYPE_FLAGS_MASK;
type = (node->flags & APFS_BTNODE_ROOT) ? APFS_OBJECT_TYPE_BTREE :
APFS_OBJECT_TYPE_BTREE_NODE;
raw->btn_o.o_type = cpu_to_le32(type | tflags);
raw->btn_o.o_subtype = cpu_to_le32(node->tree_type);
raw->btn_flags = cpu_to_le16(node->flags);
raw->btn_nkeys = cpu_to_le32(node->records);
toc_off = sizeof(*raw) + le16_to_cpu(raw->btn_table_space.off);
raw->btn_table_space.len = cpu_to_le16(node->key - toc_off);
raw->btn_free_space.off = cpu_to_le16(node->free - node->key);
2024-12-04 20:32:26 -03:00
raw->btn_free_space.len = cpu_to_le16(node->val - node->free);
2021-04-14 18:30:53 -03:00
/* Reset the lists on zero length, a defragmentation is taking place */
free_head = &raw->btn_key_free_list;
free_head->len = cpu_to_le16(node->key_free_list_len);
2021-04-14 18:30:53 -03:00
if (!free_head->len)
free_head->off = cpu_to_le16(APFS_BTOFF_INVALID);
free_head = &raw->btn_val_free_list;
free_head->len = cpu_to_le16(node->val_free_list_len);
2021-04-14 18:30:53 -03:00
if (!free_head->len)
free_head->off = cpu_to_le16(APFS_BTOFF_INVALID);
2024-02-22 20:28:20 -03:00
if (bh) {
ASSERT(buffer_trans(bh));
ASSERT(buffer_csum(bh));
}
}
/**
* apfs_node_locate_key - Locate the key of a node record
* @node: node to be searched
* @index: number of the entry to locate
* @off: on return will hold the offset in the block
*
* Returns the length of the key, or 0 in case of failure. The function checks
2024-12-04 20:32:26 -03:00
* that this length fits within the block; callers must use it to make sure
* they never operate outside its bounds.
*/
int apfs_node_locate_key(struct apfs_node *node, int index, int *off)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *raw;
int len;
2023-03-24 22:07:47 -03:00
if (index >= node->records) {
apfs_err(sb, "index out of bounds (%d of %d)", index, node->records);
return 0;
2023-03-24 22:07:47 -03:00
}
raw = (struct apfs_btree_node_phys *)node->object.data;
if (apfs_node_has_fixed_kv_size(node)) {
struct apfs_kvoff *entry;
entry = (struct apfs_kvoff *)raw->btn_data + index;
/* TODO: it would be cleaner to read this stuff from disk */
if (node->tree_type == APFS_OBJECT_TYPE_OMAP_SNAPSHOT)
len = 8;
else
len = 16;
/* Translate offset in key area to offset in block */
*off = node->key + le16_to_cpu(entry->k);
} else {
2024-12-04 20:32:26 -03:00
/* These node types have variable length keys and values */
struct apfs_kvloc *entry;
entry = (struct apfs_kvloc *)raw->btn_data + index;
len = le16_to_cpu(entry->k.len);
/* Translate offset in key area to offset in block */
*off = node->key + le16_to_cpu(entry->k.off);
}
if (*off + len > sb->s_blocksize) {
2023-03-24 22:07:47 -03:00
apfs_err(sb, "key out of bounds (%d-%d)", *off, len);
return 0;
}
return len;
}
/**
2024-12-04 20:32:26 -03:00
* apfs_node_locate_value - Locate the value of a node record
* @node: node to be searched
* @index: number of the entry to locate
* @off: on return will hold the offset in the block
*
2024-12-04 20:32:26 -03:00
* Returns the length of the value, which may be 0 in case of corruption or if
* the record is a ghost. The function checks that this length fits within the
2024-12-04 20:32:26 -03:00
* block; callers must use it to make sure they never operate outside its
* bounds.
*/
2024-12-04 20:32:26 -03:00
static int apfs_node_locate_value(struct apfs_node *node, int index, int *off)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *raw;
int len;
2023-03-24 22:07:47 -03:00
if (index >= node->records) {
apfs_err(sb, "index out of bounds (%d of %d)", index, node->records);
return 0;
2023-03-24 22:07:47 -03:00
}
raw = (struct apfs_btree_node_phys *)node->object.data;
if (apfs_node_has_fixed_kv_size(node)) {
2024-12-04 20:32:26 -03:00
/* These node types have fixed length keys and values */
struct apfs_kvoff *entry;
entry = (struct apfs_kvoff *)raw->btn_data + index;
if (node->tree_type == APFS_OBJECT_TYPE_SPACEMAN_FREE_QUEUE) {
/* A free-space queue record may have no value */
if (le16_to_cpu(entry->v) == APFS_BTOFF_INVALID) {
*off = 0;
return 0;
}
len = 8;
} else {
/* This is an omap or omap snapshots node */
len = apfs_node_is_leaf(node) ? 16 : 8;
}
/*
2024-12-04 20:32:26 -03:00
* Value offsets are counted backwards from the end of the
* block, or from the beginning of the footer when it exists
*/
if (apfs_node_is_root(node)) /* has footer */
*off = sb->s_blocksize - sizeof(struct apfs_btree_info)
- le16_to_cpu(entry->v);
else
*off = sb->s_blocksize - le16_to_cpu(entry->v);
} else {
2024-12-04 20:32:26 -03:00
/* These node types have variable length keys and values */
struct apfs_kvloc *entry;
entry = (struct apfs_kvloc *)raw->btn_data + index;
len = le16_to_cpu(entry->v.len);
/*
2024-12-04 20:32:26 -03:00
* Value offsets are counted backwards from the end of the
* block, or from the beginning of the footer when it exists
*/
if (apfs_node_is_root(node)) /* has footer */
*off = sb->s_blocksize - sizeof(struct apfs_btree_info)
- le16_to_cpu(entry->v.off);
else
*off = sb->s_blocksize - le16_to_cpu(entry->v.off);
}
if (*off < 0 || *off + len > sb->s_blocksize) {
2023-03-24 22:07:47 -03:00
apfs_err(sb, "value out of bounds (%d-%d)", *off, len);
return 0;
}
return len;
}
/**
* apfs_create_toc_entry - Create the table-of-contents entry for a record
* @query: query pointing to the record
*
* Creates a toc entry for the record at index @query->index and increases
* @node->records. The caller must ensure enough space in the table.
*/
2021-04-14 18:30:53 -03:00
static void apfs_create_toc_entry(struct apfs_query *query)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *raw = (void *)node->object.data;
int value_end;
int recs = node->records;
int index = query->index;
value_end = sb->s_blocksize;
if (apfs_node_is_root(node))
value_end -= sizeof(struct apfs_btree_info);
if (apfs_node_has_fixed_kv_size(node)) {
struct apfs_kvoff *kvoff;
kvoff = (struct apfs_kvoff *)raw->btn_data + query->index;
memmove(kvoff + 1, kvoff, (recs - index) * sizeof(*kvoff));
if (!query->len) /* Ghost record */
kvoff->v = cpu_to_le16(APFS_BTOFF_INVALID);
else
kvoff->v = cpu_to_le16(value_end - query->off);
kvoff->k = cpu_to_le16(query->key_off - node->key);
} else {
struct apfs_kvloc *kvloc;
kvloc = (struct apfs_kvloc *)raw->btn_data + query->index;
memmove(kvloc + 1, kvloc, (recs - index) * sizeof(*kvloc));
kvloc->v.off = cpu_to_le16(value_end - query->off);
kvloc->v.len = cpu_to_le16(query->len);
kvloc->k.off = cpu_to_le16(query->key_off - node->key);
kvloc->k.len = cpu_to_le16(query->key_len);
}
node->records++;
}
/**
* apfs_key_from_query - Read the current key from a query structure
* @query: the query, with @query->key_off and @query->key_len already set
* @key: return parameter for the key
*
* Reads the key into @key and performs some basic sanity checks as a
* protection against crafted filesystems. Returns 0 on success or a
* negative error code otherwise.
*/
static int apfs_key_from_query(struct apfs_query *query, struct apfs_key *key)
{
struct super_block *sb = query->node->object.sb;
char *raw = query->node->object.data;
void *raw_key = (void *)(raw + query->key_off);
bool hashed;
int err = 0;
switch (query->flags & APFS_QUERY_TREE_MASK) {
case APFS_QUERY_CAT:
hashed = apfs_is_normalization_insensitive(sb);
err = apfs_read_cat_key(raw_key, query->key_len, key, hashed);
break;
case APFS_QUERY_OMAP:
err = apfs_read_omap_key(raw_key, query->key_len, key);
break;
case APFS_QUERY_FREE_QUEUE:
err = apfs_read_free_queue_key(raw_key, query->key_len, key);
break;
case APFS_QUERY_EXTENTREF:
err = apfs_read_extentref_key(raw_key, query->key_len, key);
break;
2023-01-13 23:12:19 -03:00
case APFS_QUERY_FEXT:
err = apfs_read_fext_key(raw_key, query->key_len, key);
break;
2022-09-26 22:24:32 -03:00
case APFS_QUERY_SNAP_META:
err = apfs_read_snap_meta_key(raw_key, query->key_len, key);
break;
case APFS_QUERY_OMAP_SNAP:
err = apfs_read_omap_snap_key(raw_key, query->key_len, key);
break;
default:
2023-03-24 22:07:47 -03:00
apfs_alert(sb, "new query type must implement key reads (%d)", query->flags & APFS_QUERY_TREE_MASK);
err = -EOPNOTSUPP;
break;
}
2023-03-24 22:07:47 -03:00
if (err)
apfs_err(sb, "bad node key in block 0x%llx", query->node->object.block_nr);
/* A multiple query must ignore some of these fields */
if (query->flags & APFS_QUERY_ANY_NAME)
key->name = NULL;
if (query->flags & APFS_QUERY_ANY_NUMBER)
key->number = 0;
return err;
}
/**
* apfs_node_prev - Find the previous record in the current node
* @sb: filesystem superblock
* @query: query in execution
*
* Returns 0 on success, -EAGAIN if the previous record is in another node,
* -ENODATA if no more records exist, or another negative error code in case
* of failure.
*
* The meaning of "next" and "previous" is reverted here, because regular
* multiple always start with the final record, and then they go backwards.
* TODO: consider renaming this for clarity.
*/
static int apfs_node_prev(struct super_block *sb, struct apfs_query *query)
{
struct apfs_node *node = query->node;
if (query->index + 1 == node->records) {
/* The next record may be in another node */
return -EAGAIN;
}
++query->index;
query->key_len = apfs_node_locate_key(node, query->index, &query->key_off);
if (query->key_len == 0) {
apfs_err(sb, "bad key for index %d", query->index);
return -EFSCORRUPTED;
}
2024-12-04 20:32:26 -03:00
query->len = apfs_node_locate_value(node, query->index, &query->off);
if (query->len == 0) {
apfs_err(sb, "bad value for index %d", query->index);
return -EFSCORRUPTED;
}
return 0;
}
/**
* apfs_node_next - Find the next matching record in the current node
* @sb: filesystem superblock
* @query: multiple query in execution
*
* Returns 0 on success, -EAGAIN if the next record is in another node,
* -ENODATA if no more matching records exist, or another negative error
* code in case of failure.
*/
static int apfs_node_next(struct super_block *sb, struct apfs_query *query)
{
struct apfs_node *node = query->node;
struct apfs_key curr_key;
int cmp, err;
if (query->flags & APFS_QUERY_DONE)
/* Nothing left to search; the query failed */
return -ENODATA;
if (!query->index) /* The next record may be in another node */
return -EAGAIN;
--query->index;
query->key_len = apfs_node_locate_key(node, query->index,
&query->key_off);
err = apfs_key_from_query(query, &curr_key);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "bad key for index %d", query->index);
return err;
2023-03-24 22:07:47 -03:00
}
2023-12-20 20:14:49 -03:00
cmp = apfs_keycmp(&curr_key, &query->key);
2023-03-24 22:07:47 -03:00
if (cmp > 0) {
apfs_err(sb, "records are out of order");
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
if (cmp != 0 && apfs_node_is_leaf(node) &&
query->flags & APFS_QUERY_EXACT)
return -ENODATA;
2024-12-04 20:32:26 -03:00
query->len = apfs_node_locate_value(node, query->index, &query->off);
2023-03-24 22:07:47 -03:00
if (query->len == 0) {
apfs_err(sb, "bad value for index %d", query->index);
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
if (cmp != 0) {
/*
* This is the last entry that can be relevant in this node.
* Keep searching the children, but don't return to this level.
*/
query->flags |= APFS_QUERY_DONE;
}
return 0;
}
/**
* apfs_node_query - Execute a query on a single node
* @sb: filesystem superblock
* @query: the query to execute
*
* The search will start at index @query->index, looking for the key that comes
* right before @query->key, according to the order given by apfs_keycmp().
*
* The @query->index will be updated to the last index checked. This is
* important when searching for multiple entries, since the query may need
* to remember where it was on this level. If we are done with this node, the
* query will be flagged as APFS_QUERY_DONE, and the search will end in failure
* as soon as we return to this level. The function may also return -EAGAIN,
* to signal that the search should go on in a different branch.
*
2024-12-04 20:32:26 -03:00
* On success returns 0; the offset of the value within the block will be saved
* in @query->off, and its length in @query->len. The function checks that this
* length fits within the block; callers must use the returned value to make
* sure they never operate outside its bounds.
*
* -ENODATA will be returned if no appropriate entry was found, -EFSCORRUPTED
* in case of corruption.
*/
int apfs_node_query(struct super_block *sb, struct apfs_query *query)
{
struct apfs_node *node = query->node;
int left, right;
int cmp;
int err;
if (query->flags & APFS_QUERY_PREV)
return apfs_node_prev(sb, query);
if (query->flags & APFS_QUERY_NEXT)
return apfs_node_next(sb, query);
/* Search by bisection */
cmp = 1;
left = 0;
do {
struct apfs_key curr_key;
2023-10-26 20:57:37 -03:00
if (cmp > 0) {
right = query->index - 1;
if (right < left) {
query->index = -1;
return -ENODATA;
}
query->index = (left + right) / 2;
} else {
left = query->index;
query->index = DIV_ROUND_UP(left + right, 2);
}
query->key_len = apfs_node_locate_key(node, query->index,
&query->key_off);
err = apfs_key_from_query(query, &curr_key);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "bad key for index %d", query->index);
return err;
2023-03-24 22:07:47 -03:00
}
2023-12-20 20:14:49 -03:00
cmp = apfs_keycmp(&curr_key, &query->key);
if (cmp == 0 && !(query->flags & APFS_QUERY_MULTIPLE))
break;
} while (left != right);
if (cmp > 0) {
query->index = -1;
return -ENODATA;
}
if (cmp != 0 && apfs_node_is_leaf(query->node) &&
query->flags & APFS_QUERY_EXACT)
return -ENODATA;
if (query->flags & APFS_QUERY_MULTIPLE) {
if (cmp != 0) /* Last relevant entry in level */
query->flags |= APFS_QUERY_DONE;
query->flags |= APFS_QUERY_NEXT;
}
2024-12-04 20:32:26 -03:00
query->len = apfs_node_locate_value(node, query->index, &query->off);
return 0;
}
/**
* apfs_node_query_first - Find the first record in a node
* @query: on return this query points to the record
*/
void apfs_node_query_first(struct apfs_query *query)
{
struct apfs_node *node = query->node;
query->index = 0;
query->key_len = apfs_node_locate_key(node, query->index, &query->key_off);
2024-12-04 20:32:26 -03:00
query->len = apfs_node_locate_value(node, query->index, &query->off);
}
/**
2022-09-30 17:40:33 -03:00
* apfs_omap_map_from_query - Read the mapping found by a successful omap query
* @query: the query that found the record
2022-09-30 17:40:33 -03:00
* @map: Return parameter. The mapping found.
*
* Returns -EOPNOTSUPP if the object doesn't fit in one block, and -EFSCORRUPTED
2022-09-30 17:40:33 -03:00
* if the filesystem appears to be malicious. Otherwise, reads the mapping info
* in the omap record into @map and returns 0.
*/
2022-09-30 17:40:33 -03:00
int apfs_omap_map_from_query(struct apfs_query *query, struct apfs_omap_map *map)
{
struct super_block *sb = query->node->object.sb;
2022-09-30 17:40:33 -03:00
struct apfs_omap_key *key = NULL;
struct apfs_omap_val *val = NULL;
char *raw = query->node->object.data;
2023-03-24 22:07:47 -03:00
if (query->len != sizeof(*val) || query->key_len != sizeof(*key)) {
apfs_err(sb, "bad length of key (%d) or value (%d)", query->key_len, query->len);
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
2022-09-30 17:40:33 -03:00
key = (struct apfs_omap_key *)(raw + query->key_off);
val = (struct apfs_omap_val *)(raw + query->off);
/* TODO: support objects with multiple blocks */
2022-09-30 17:40:33 -03:00
if (le32_to_cpu(val->ov_size) != sb->s_blocksize) {
apfs_err(sb, "object size doesn't match block size");
return -EOPNOTSUPP;
}
2022-09-30 17:40:33 -03:00
map->xid = le64_to_cpu(key->ok_xid);
map->bno = le64_to_cpu(val->ov_paddr);
map->flags = le32_to_cpu(val->ov_flags);
return 0;
}
/**
* apfs_btree_inc_height - Increase the height of a b-tree
* @query: query pointing to the root node
*
* On success returns 0, and @query is left pointing to the same record.
* Returns a negative error code in case of failure.
*/
static int apfs_btree_inc_height(struct apfs_query *query)
{
struct apfs_query *root_query;
struct apfs_node *root = query->node;
struct apfs_node *new_node;
struct super_block *sb = root->object.sb;
struct apfs_btree_node_phys *root_raw;
struct apfs_btree_node_phys *new_raw;
struct apfs_btree_info *info;
__le64 *raw_oid;
u32 storage = apfs_query_storage(query);
root_raw = (void *)root->object.data;
apfs_assert_in_transaction(sb, &root_raw->btn_o);
2023-03-24 22:07:47 -03:00
if (query->parent || query->depth) {
apfs_err(sb, "invalid root query");
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
/* Create a new child node */
new_node = apfs_create_node(sb, storage);
2023-03-24 22:07:47 -03:00
if (IS_ERR(new_node)) {
apfs_err(sb, "node creation failed");
return PTR_ERR(new_node);
2023-03-24 22:07:47 -03:00
}
new_node->flags = root->flags & ~APFS_BTNODE_ROOT;
new_node->tree_type = root->tree_type;
/* Move all records into the child node; get rid of the info footer */
new_node->records = root->records;
new_node->key = root->key;
new_node->free = root->free;
2024-12-04 20:32:26 -03:00
new_node->val = root->val + sizeof(*info);
new_node->key_free_list_len = root->key_free_list_len;
new_node->val_free_list_len = root->val_free_list_len;
new_raw = (void *)new_node->object.data;
/* Don't copy the object header, already set by apfs_create_node() */
memcpy((void *)new_raw + sizeof(new_raw->btn_o),
(void *)root_raw + sizeof(root_raw->btn_o),
root->free - sizeof(new_raw->btn_o));
2024-12-04 20:32:26 -03:00
memcpy((void *)new_raw + new_node->val,
(void *)root_raw + root->val,
sb->s_blocksize - new_node->val);
query->off += sizeof(*info);
apfs_update_node(new_node);
/* Add a new level to the query chain */
root_query = query->parent = apfs_alloc_query(root, NULL /* parent */);
if (!query->parent) {
2022-02-25 21:09:26 -03:00
apfs_node_free(new_node);
return -ENOMEM;
}
root_query->key = query->key;
root_query->flags = query->flags;
query->node = new_node;
2022-02-25 21:09:26 -03:00
query->depth = 1;
/* Now assemble the new root with only the first key */
root_query->key_len = apfs_node_locate_key(root, 0 /* index */,
&root_query->key_off);
2023-03-24 22:07:47 -03:00
if (!root_query->key_len) {
apfs_err(sb, "bad key for index %d", 0);
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
root->key = sizeof(*root_raw) +
apfs_node_min_table_size(sb, root->tree_type, root->flags & ~APFS_BTNODE_LEAF);
memmove((void *)root_raw + root->key,
(void *)root_raw + root_query->key_off, root_query->key_len);
root_query->key_off = root->key;
root->free = root->key + root_query->key_len;
/* The new root is a nonleaf node; the record value is the child id */
root->flags &= ~APFS_BTNODE_LEAF;
2024-12-04 20:32:26 -03:00
root->val = sb->s_blocksize - sizeof(*info) - sizeof(*raw_oid);
raw_oid = (void *)root_raw + root->val;
*raw_oid = cpu_to_le64(new_node->object.oid);
2024-12-04 20:32:26 -03:00
root_query->off = root->val;
root_query->len = sizeof(*raw_oid);
/* With the key and value in place, set the table-of-contents */
root->records = 0;
root_query->index = 0;
apfs_create_toc_entry(root_query);
/* There is no internal fragmentation */
root->key_free_list_len = 0;
root->val_free_list_len = 0;
/* Finally, update the node count in the info footer */
apfs_btree_change_node_count(root_query, 1 /* change */);
le16_add_cpu(&root_raw->btn_level, 1); /* TODO: move to update_node() */
apfs_update_node(root);
return 0;
}
/**
* apfs_copy_record_range - Copy a range of records to an empty node
* @dest_node: destination node
* @src_node: source node
* @start: index of first record in range
* @end: index of first record after the range
*
* Doesn't modify the info footer of root nodes. Returns 0 on success or a
* negative error code in case of failure.
*/
static int apfs_copy_record_range(struct apfs_node *dest_node,
struct apfs_node *src_node,
int start, int end)
{
struct super_block *sb = dest_node->object.sb;
struct apfs_btree_node_phys *dest_raw;
struct apfs_btree_node_phys *src_raw;
struct apfs_query *query = NULL;
int toc_size, toc_entry_size;
int err;
int i;
dest_raw = (void *)dest_node->object.data;
src_raw = (void *)src_node->object.data;
ASSERT(!dest_node->records);
apfs_assert_in_transaction(sb, &dest_raw->btn_o);
/* Resize the table of contents so that all the records fit */
if (apfs_node_has_fixed_kv_size(src_node))
toc_entry_size = sizeof(struct apfs_kvoff);
else
toc_entry_size = sizeof(struct apfs_kvloc);
toc_size = apfs_node_min_table_size(sb, src_node->tree_type, src_node->flags);
if (toc_size < toc_entry_size * (end - start))
toc_size = toc_entry_size * round_up(end - start, APFS_BTREE_TOC_ENTRY_INCREMENT);
dest_node->key = sizeof(*dest_raw) + toc_size;
dest_node->free = dest_node->key;
2024-12-04 20:32:26 -03:00
dest_node->val = sb->s_blocksize;
if (apfs_node_is_root(dest_node))
2024-12-04 20:32:26 -03:00
dest_node->val -= sizeof(struct apfs_btree_info);
/* We'll use a temporary query structure to move the records around */
query = apfs_alloc_query(dest_node, NULL /* parent */);
if (!query) {
err = -ENOMEM;
goto fail;
}
err = -EFSCORRUPTED;
for (i = start; i < end; ++i) {
int len, off;
len = apfs_node_locate_key(src_node, i, &off);
2023-03-24 22:07:47 -03:00
if (dest_node->free + len > sb->s_blocksize) {
apfs_err(sb, "key of length %d doesn't fit", len);
goto fail;
2023-03-24 22:07:47 -03:00
}
memcpy((char *)dest_raw + dest_node->free,
(char *)src_raw + off, len);
query->key_off = dest_node->free;
query->key_len = len;
dest_node->free += len;
2024-12-04 20:32:26 -03:00
len = apfs_node_locate_value(src_node, i, &off);
dest_node->val -= len;
if (dest_node->val < 0) {
2023-03-24 22:07:47 -03:00
apfs_err(sb, "value of length %d doesn't fit", len);
goto fail;
2023-03-24 22:07:47 -03:00
}
2024-12-04 20:32:26 -03:00
memcpy((char *)dest_raw + dest_node->val,
(char *)src_raw + off, len);
2024-12-04 20:32:26 -03:00
query->off = dest_node->val;
query->len = len;
query->index = i - start;
apfs_create_toc_entry(query);
}
err = 0;
fail:
apfs_free_query(query);
return err;
}
/**
* apfs_attach_child - Attach a new node to its parent
* @query: query pointing to the previous record in the parent
* @child: the new child node to attach
*
* Returns 0 on success or a negative error code in case of failure (which may
* be -EAGAIN if a node split has happened and the caller must refresh and
* retry).
*/
static int apfs_attach_child(struct apfs_query *query, struct apfs_node *child)
{
struct apfs_object *object = &child->object;
struct apfs_btree_node_phys *raw = (void *)object->data;
int key_len, key_off;
__le64 raw_oid = cpu_to_le64(object->oid);
key_len = apfs_node_locate_key(child, 0, &key_off);
2023-03-24 22:07:47 -03:00
if (!key_len) {
/* This should never happen: @child was made by us */
apfs_alert(object->sb, "bad key for index %d", 0);
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
return __apfs_btree_insert(query, (void *)raw + key_off, key_len, &raw_oid, sizeof(raw_oid));
}
2022-02-25 12:31:16 -03:00
/**
* apfs_node_temp_dup - Make an in-memory duplicate of a node
* @original: node to duplicate
* @duplicate: on success, the duplicate node
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_node_temp_dup(const struct apfs_node *original, struct apfs_node **duplicate)
{
struct super_block *sb = original->object.sb;
struct apfs_node *dup = NULL;
char *buffer = NULL;
dup = kmalloc(sizeof(*dup), GFP_KERNEL);
if (!dup)
return -ENOMEM;
*dup = *original;
dup->object.o_bh = NULL;
dup->object.data = NULL;
2024-02-22 20:28:20 -03:00
dup->object.ephemeral = false;
2022-02-25 12:31:16 -03:00
buffer = kmalloc(sb->s_blocksize, GFP_KERNEL);
if (!buffer) {
kfree(dup);
return -ENOMEM;
}
memcpy(buffer, original->object.data, sb->s_blocksize);
dup->object.data = buffer;
*duplicate = dup;
return 0;
}
/**
* apfs_node_split - Split a b-tree node in two
* @query: query pointing to the node
*
* On success returns 0, and @query is left pointing to the same record on the
* tip; to simplify the implementation, @query->parent is set to NULL. Returns
* a negative error code in case of failure, which may be -EAGAIN if a node
* split has happened and the caller must refresh and retry.
*/
int apfs_node_split(struct apfs_query *query)
{
struct super_block *sb = query->node->object.sb;
struct apfs_node *old_node = NULL, *new_node = NULL, *tmp_node = NULL;
struct apfs_btree_node_phys *new_raw = NULL, *old_raw = NULL;
u32 storage = apfs_query_storage(query);
2021-07-22 19:18:49 -03:00
int record_count, new_rec_count, old_rec_count;
int err;
apfs_assert_query_is_valid(query);
if (apfs_node_is_root(query->node)) {
err = apfs_btree_inc_height(query);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "failed to increase tree height");
return err;
2023-03-24 22:07:47 -03:00
}
2021-07-26 21:32:18 -03:00
} else if (!query->parent) {
2023-03-24 22:07:47 -03:00
apfs_err(sb, "nonroot node with no parent");
2021-07-26 21:32:18 -03:00
return -EFSCORRUPTED;
}
old_node = query->node;
old_raw = (void *)old_node->object.data;
apfs_assert_in_transaction(sb, &old_raw->btn_o);
/*
2022-02-25 12:31:16 -03:00
* To defragment the original node, we put all records in a temporary
* in-memory node before dealing them out.
*/
2022-02-25 12:31:16 -03:00
err = apfs_node_temp_dup(old_node, &tmp_node);
if (err)
return err;
2021-07-22 19:18:49 -03:00
record_count = old_node->records;
if (record_count == 1) {
apfs_alert(sb, "splitting node with a single record");
err = -EFSCORRUPTED;
goto out;
}
2021-07-22 19:18:49 -03:00
new_rec_count = record_count / 2;
old_rec_count = record_count - new_rec_count;
/*
* The second half of the records go into a new node. This is done
* before the first half to avoid committing to any actual changes
* until we know for sure that no ancestor splits are expected.
*/
new_node = apfs_create_node(sb, storage);
if (IS_ERR(new_node)) {
apfs_err(sb, "node creation failed");
err = PTR_ERR(new_node);
new_node = NULL;
goto out;
}
new_node->tree_type = old_node->tree_type;
new_node->flags = old_node->flags;
new_node->records = 0;
new_node->key_free_list_len = 0;
new_node->val_free_list_len = 0;
err = apfs_copy_record_range(new_node, tmp_node, old_rec_count, record_count);
if (err) {
apfs_err(sb, "record copy failed");
goto out;
}
new_raw = (void *)new_node->object.data;
apfs_assert_in_transaction(sb, &new_raw->btn_o);
new_raw->btn_level = old_raw->btn_level;
apfs_update_node(new_node);
err = apfs_attach_child(query->parent, new_node);
if (err) {
if (err != -EAGAIN) {
apfs_err(sb, "child attachment failed");
goto out;
}
err = apfs_delete_node(new_node, query->flags & APFS_QUERY_TREE_MASK);
if (err) {
apfs_err(sb, "node cleanup failed for query retry");
goto out;
}
err = -EAGAIN;
goto out;
}
apfs_assert_query_is_valid(query->parent);
apfs_btree_change_node_count(query->parent, 1 /* change */);
/*
* No more risk of ancestor splits, now actual changes can be made. The
* first half of the records go into the original node.
*/
old_node->records = 0;
old_node->key_free_list_len = 0;
old_node->val_free_list_len = 0;
2022-02-25 12:31:16 -03:00
err = apfs_copy_record_range(old_node, tmp_node, 0, old_rec_count);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "record copy failed");
2021-07-22 19:18:49 -03:00
goto out;
2023-03-24 22:07:47 -03:00
}
apfs_update_node(old_node);
/* Point the query back to the original record */
if (query->index >= old_rec_count) {
/* The record got moved to the new node */
apfs_node_free(query->node);
query->node = new_node;
new_node = NULL;
query->index -= old_rec_count;
}
/*
* This could be avoided in most cases, and queries could get refreshed
* only when really orphaned. But refreshing queries is probably not a
* bottleneck, and trying to be clever with this stuff has caused me a
* lot of trouble already.
*/
apfs_free_query(query->parent);
query->parent = NULL; /* The caller only gets the leaf */
2021-07-22 19:18:49 -03:00
out:
apfs_node_free(new_node);
2022-02-25 21:09:26 -03:00
apfs_node_free(tmp_node);
return err;
}
2021-04-14 18:30:53 -03:00
/* TODO: the following 4 functions could be reused elsewhere */
/**
* apfs_off_to_val_off - Translate offset in node to offset in value area
* @node: the node
* @off: offset in the node
*/
static u16 apfs_off_to_val_off(struct apfs_node *node, u16 off)
{
struct super_block *sb = node->object.sb;
u16 val_end;
val_end = sb->s_blocksize;
if (apfs_node_is_root(node)) /* has footer */
val_end -= sizeof(struct apfs_btree_info);
return val_end - off;
}
/**
* apfs_val_off_to_off - Translate offset in value area to offset in node
* @node: the node
* @off: offset in the value area
*/
static u16 apfs_val_off_to_off(struct apfs_node *node, u16 off)
{
return apfs_off_to_val_off(node, off);
}
/**
* apfs_off_to_key_off - Translate offset in node to offset in key area
* @node: the node
* @off: offset in the node
*/
static u16 apfs_off_to_key_off(struct apfs_node *node, u16 off)
{
return off - node->key;
}
/**
* apfs_key_off_to_off - Translate offset in key area to offset in node
* @node: the node
* @off: offset in the key area
*/
static u16 apfs_key_off_to_off(struct apfs_node *node, u16 off)
{
return off + node->key;
}
/* The type of the previous four functions, used for node offset calculations */
typedef u16 (*offcalc)(struct apfs_node *, u16);
/**
* apfs_node_free_list_add - Add a free node segment to the proper free list
* @node: node for the segment
* @off: offset of the segment to add
* @len: length of the segment to add
*
* The caller must ensure that the freed segment fits in the node.
*/
static void apfs_node_free_list_add(struct apfs_node *node, u16 off, u16 len)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
2021-04-14 18:30:53 -03:00
struct apfs_nloc *head, *new;
offcalc off_to_rel;
apfs_assert_in_transaction(sb, &node_raw->btn_o);
2024-12-04 20:32:26 -03:00
if (off >= node->val) { /* Value area */
2021-04-14 18:30:53 -03:00
off_to_rel = apfs_off_to_val_off;
head = &node_raw->btn_val_free_list;
node->val_free_list_len += len;
} else { /* Key area */
off_to_rel = apfs_off_to_key_off;
head = &node_raw->btn_key_free_list;
node->key_free_list_len += len;
}
/* Very small segments are leaked until defragmentation */
if (len < sizeof(*new))
return;
/* The free list doesn't seem to be kept in any particular order */
new = (void *)node_raw + off;
new->off = head->off;
new->len = cpu_to_le16(len);
head->off = cpu_to_le16(off_to_rel(node, off));
}
/**
* apfs_node_free_range - Free space from a node's key or value areas
* @node: the node
* @off: offset to free
* @len: length to free
*
* Returns 0 on success or a negative error code in case of failure.
*/
void apfs_node_free_range(struct apfs_node *node, u16 off, u16 len)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *raw = (void *)node->object.data;
2021-04-14 18:30:53 -03:00
apfs_assert_in_transaction(sb, &raw->btn_o);
2024-12-04 20:32:26 -03:00
if (off == node->val)
node->val += len;
2021-04-14 18:30:53 -03:00
else if (off + len == node->free)
node->free -= len;
else
apfs_node_free_list_add(node, off, len);
}
/**
* apfs_node_free_list_unlink - Unlink an entry from a node's free list
* @prev: previous entry
* @curr: entry to unlink
*/
static void apfs_node_free_list_unlink(struct apfs_nloc *prev, struct apfs_nloc *curr)
{
prev->off = curr->off;
}
/**
* apfs_node_free_list_alloc - Allocate a free segment from a free list
* @node: the node
* @len: length to allocate
* @value: true to allocate in the value area, false for the key area
*
* Returns the offset in the node on success, or a negative error code in case
* of failure, which may be -ENOSPC if the node seems full.
2021-04-14 18:30:53 -03:00
*/
static int apfs_node_free_list_alloc(struct apfs_node *node, u16 len, bool value)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
2021-04-14 18:30:53 -03:00
struct apfs_nloc *head, *curr, *prev;
offcalc rel_to_off;
int *list_len;
int bound = sb->s_blocksize;
apfs_assert_in_transaction(sb, &node_raw->btn_o);
if (value) { /* Value area */
rel_to_off = apfs_val_off_to_off;
head = &node_raw->btn_val_free_list;
list_len = &node->val_free_list_len;
} else { /* Key area */
rel_to_off = apfs_key_off_to_off;
head = &node_raw->btn_key_free_list;
list_len = &node->key_free_list_len;
}
if (*list_len < len)
return -ENOSPC;
prev = head;
while (bound--) {
u16 curr_off = le16_to_cpu(prev->off);
u16 abs_off = rel_to_off(node, curr_off);
u16 curr_len;
if (curr_off == APFS_BTOFF_INVALID)
return -ENOSPC;
2023-03-24 22:07:47 -03:00
if (abs_off + sizeof(*curr) > sb->s_blocksize) {
apfs_err(sb, "nloc out of bounds (%d-%d)", abs_off, (int)sizeof(*curr));
2021-04-14 18:30:53 -03:00
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
2021-04-14 18:30:53 -03:00
curr = (void *)node_raw + abs_off;
curr_len = le16_to_cpu(curr->len);
if (curr_len >= len) {
2023-03-24 22:07:47 -03:00
if (abs_off + curr_len > sb->s_blocksize) {
apfs_err(sb, "entry out of bounds (%d-%d)", abs_off, curr_len);
2021-04-14 18:30:53 -03:00
return -EFSCORRUPTED;
2023-03-24 22:07:47 -03:00
}
*list_len -= curr_len;
2021-04-14 18:30:53 -03:00
apfs_node_free_list_unlink(prev, curr);
apfs_node_free_list_add(node, abs_off + len, curr_len - len);
return abs_off;
}
prev = curr;
}
/* Don't loop forever if the free list is corrupted and doesn't end */
2023-03-24 22:07:47 -03:00
apfs_err(sb, "free list never ends");
2021-04-14 18:30:53 -03:00
return -EFSCORRUPTED;
}
/**
* apfs_node_alloc_key - Allocated free space for a new key
* @node: the node to search
* @len: wanted key length
*
* Returns the offset in the node on success, or a negative error code in case
* of failure, which may be -ENOSPC if the node seems full.
2021-04-14 18:30:53 -03:00
*/
static int apfs_node_alloc_key(struct apfs_node *node, u16 len)
{
int off;
2024-12-04 20:32:26 -03:00
if (node->free + len <= node->val) {
2021-04-14 18:30:53 -03:00
off = node->free;
node->free += len;
return off;
}
return apfs_node_free_list_alloc(node, len, false /* value */);
}
/**
* apfs_node_alloc_val - Allocated free space for a new value
* @node: the node to search
* @len: wanted value length
*
* Returns the offset in the node on success, or a negative error code in case
* of failure, which may be -ENOSPC if the node seems full.
2021-04-14 18:30:53 -03:00
*/
static int apfs_node_alloc_val(struct apfs_node *node, u16 len)
{
int off;
2024-12-04 20:32:26 -03:00
if (node->free + len <= node->val) {
off = node->val - len;
node->val -= len;
2021-04-14 18:30:53 -03:00
return off;
}
return apfs_node_free_list_alloc(node, len, true /* value */);
}
2023-12-29 19:56:01 -03:00
/**
* apfs_node_total_room - Total free space in a node
* @node: the node
*/
static int apfs_node_total_room(struct apfs_node *node)
{
2024-12-04 20:32:26 -03:00
return node->val - node->free + node->key_free_list_len + node->val_free_list_len;
2023-12-29 19:56:01 -03:00
}
/**
* apfs_node_has_room - Check if a node has room for insertion or replacement
* @node: node to check
* @length: length of the needed space (may be negative on replace)
* @replace: are we replacing a record?
*/
bool apfs_node_has_room(struct apfs_node *node, int length, bool replace)
{
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
int toc_entry_size, needed_room;
if (apfs_node_has_fixed_kv_size(node))
toc_entry_size = sizeof(struct apfs_kvoff);
else
toc_entry_size = sizeof(struct apfs_kvloc);
needed_room = length;
if (!replace) {
if (sizeof(*node_raw) + (node->records + 1) * toc_entry_size > node->key)
needed_room += APFS_BTREE_TOC_ENTRY_INCREMENT * toc_entry_size;
}
return apfs_node_total_room(node) >= needed_room;
}
2023-12-29 19:56:01 -03:00
/**
* apfs_defragment_node - Make all free space in a node contiguous
* @node: node to defragment
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_defragment_node(struct apfs_node *node)
{
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
struct apfs_node *tmp_node = NULL;
int record_count, err;
apfs_assert_in_transaction(sb, &node_raw->btn_o);
/* Put all records in a temporary in-memory node and deal them out */
err = apfs_node_temp_dup(node, &tmp_node);
if (err)
return err;
record_count = node->records;
node->records = 0;
node->key_free_list_len = 0;
node->val_free_list_len = 0;
err = apfs_copy_record_range(node, tmp_node, 0, record_count);
if (err) {
apfs_err(sb, "record copy failed");
goto fail;
}
apfs_update_node(node);
fail:
apfs_node_free(tmp_node);
return err;
}
2023-12-29 19:40:24 -03:00
/**
* apfs_node_update_toc_entry - Update a table of contents entry in place
* @query: query pointing to the toc entry
*
* The toc entry gets updated with the length and offset for the key/value
* provided by @query. Don't call this function for nodes with fixed length
* key/values, those never need to update their toc entries.
*/
static void apfs_node_update_toc_entry(struct apfs_query *query)
{
struct super_block *sb = NULL;
struct apfs_node *node = NULL;
struct apfs_btree_node_phys *node_raw = NULL;
struct apfs_kvloc *kvloc = NULL;
int value_end;
node = query->node;
ASSERT(!apfs_node_has_fixed_kv_size(node));
sb = node->object.sb;
node_raw = (void *)node->object.data;
value_end = sb->s_blocksize;
if (apfs_node_is_root(node))
value_end -= sizeof(struct apfs_btree_info);
kvloc = (struct apfs_kvloc *)node_raw->btn_data + query->index;
kvloc->v.off = cpu_to_le16(value_end - query->off);
kvloc->v.len = cpu_to_le16(query->len);
kvloc->k.off = cpu_to_le16(query->key_off - node->key);
kvloc->k.len = cpu_to_le16(query->key_len);
}
2021-04-14 18:30:53 -03:00
/**
* apfs_node_replace - Replace a record in a node that has enough room
2021-04-14 18:30:53 -03:00
* @query: exact query that found the record
* @key: new on-disk record key (NULL if unchanged)
* @key_len: length of @key
* @val: new on-disk record value (NULL if unchanged)
* @val_len: length of @val
*
* Returns 0 on success, and @query is left pointing to the same record. Returns
* a negative error code in case of failure.
2021-04-14 18:30:53 -03:00
*/
int apfs_node_replace(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
2021-04-14 18:30:53 -03:00
int key_off = 0, val_off = 0, err = 0;
2023-12-29 19:56:01 -03:00
bool defragged = false;
int qtree = query->flags & APFS_QUERY_TREE_MASK;
2021-04-14 18:30:53 -03:00
apfs_assert_in_transaction(sb, &node_raw->btn_o);
/*
* Free queues are weird because their tables of contents don't report
* record lengths, as if they were fixed, but some of the leaf values
* are actually "ghosts", that is, zero-length. Supporting replace of
* such records would require some changes, and so far I've had no need
* for it.
*/
(void)qtree;
ASSERT(!(qtree == APFS_QUERY_FREE_QUEUE && apfs_node_is_leaf(node)));
2023-12-29 19:56:01 -03:00
retry:
2021-04-14 18:30:53 -03:00
if (key) {
if (key_len <= query->key_len) {
u16 end = query->key_off + key_len;
u16 diff = query->key_len - key_len;
apfs_node_free_range(node, end, diff);
key_off = query->key_off;
} else {
apfs_node_free_range(node, query->key_off, query->key_len);
key_off = apfs_node_alloc_key(node, key_len);
if (key_off < 0) {
2023-12-29 19:56:01 -03:00
if (key_off == -ENOSPC)
goto defrag;
return key_off;
2021-04-14 18:30:53 -03:00
}
}
}
if (val) {
if (val_len <= query->len) {
u16 end = query->off + val_len;
u16 diff = query->len - val_len;
apfs_node_free_range(node, end, diff);
val_off = query->off;
} else {
apfs_node_free_range(node, query->off, query->len);
val_off = apfs_node_alloc_val(node, val_len);
if (val_off < 0) {
2023-12-29 19:56:01 -03:00
if (val_off == -ENOSPC)
goto defrag;
return val_off;
2021-04-14 18:30:53 -03:00
}
}
}
if (key) {
query->key_off = key_off;
query->key_len = key_len;
memcpy((void *)node_raw + key_off, key, key_len);
}
if (val) {
query->off = val_off;
query->len = val_len;
memcpy((void *)node_raw + val_off, val, val_len);
}
/* If the key or value were resized, update the table of contents */
2023-12-29 19:40:24 -03:00
if (!apfs_node_has_fixed_kv_size(node))
apfs_node_update_toc_entry(query);
2021-04-14 18:30:53 -03:00
apfs_update_node(node);
return 0;
2023-12-29 19:56:01 -03:00
defrag:
if (defragged) {
apfs_alert(sb, "no room in defragged node");
return -EFSCORRUPTED;
2023-12-29 19:56:01 -03:00
}
/* Crush the replaced entry, so that defragmentation is complete */
if (apfs_node_has_fixed_kv_size(node)) {
apfs_alert(sb, "failed to replace a fixed size record");
return -EFSCORRUPTED;
}
if (key)
query->key_len = 0;
if (val)
query->len = 0;
apfs_node_update_toc_entry(query);
2023-12-29 19:56:01 -03:00
err = apfs_defragment_node(node);
if (err) {
apfs_err(sb, "failed to defragment node");
return err;
}
defragged = true;
/* The record to replace probably moved around */
2024-12-04 20:32:26 -03:00
query->len = apfs_node_locate_value(query->node, query->index, &query->off);
2023-12-29 19:56:01 -03:00
query->key_len = apfs_node_locate_key(query->node, query->index, &query->key_off);
goto retry;
2021-04-14 18:30:53 -03:00
}
/**
* apfs_node_insert - Insert a new record in a node that has enough room
2021-04-14 18:30:53 -03:00
* @query: query run to search for the record
* @key: on-disk record key
* @key_len: length of @key
* @val: on-disk record value (NULL for ghost records)
* @val_len: length of @val (0 for ghost records)
*
* The new record is placed right after the one found by @query. On success,
* returns 0 and sets @query to the new record. In case of failure, returns a
* negative error code and leaves @query pointing to the same record.
2021-04-14 18:30:53 -03:00
*/
int apfs_node_insert(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
2021-04-14 18:30:53 -03:00
int toc_entry_size;
int key_off, val_off, err;
bool defragged = false;
2021-04-14 18:30:53 -03:00
apfs_assert_in_transaction(sb, &node_raw->btn_o);
retry:
2021-04-14 18:30:53 -03:00
if (apfs_node_has_fixed_kv_size(node))
toc_entry_size = sizeof(struct apfs_kvoff);
else
toc_entry_size = sizeof(struct apfs_kvloc);
/* Expand the table of contents if necessary */
if (sizeof(*node_raw) + (node->records + 1) * toc_entry_size > node->key) {
2021-04-14 18:30:53 -03:00
int new_key_base = node->key;
int new_free_base = node->free;
int inc;
inc = APFS_BTREE_TOC_ENTRY_INCREMENT * toc_entry_size;
new_key_base += inc;
new_free_base += inc;
2024-12-04 20:32:26 -03:00
if (new_free_base > node->val)
goto defrag;
2021-04-14 18:30:53 -03:00
memmove((void *)node_raw + new_key_base,
(void *)node_raw + node->key, node->free - node->key);
node->key = new_key_base;
node->free = new_free_base;
query->key_off += inc;
2021-04-14 18:30:53 -03:00
}
key_off = apfs_node_alloc_key(node, key_len);
if (key_off < 0) {
if (key_off == -ENOSPC)
goto defrag;
return key_off;
2021-04-14 18:30:53 -03:00
}
if (val) {
val_off = apfs_node_alloc_val(node, val_len);
if (val_off < 0) {
if (val_off == -ENOSPC) {
/*
* There is no need for an update of the on-disk
* node before the defrag, since only in-memory
* data should be used there...
*/
goto defrag;
}
return val_off;
2021-04-14 18:30:53 -03:00
}
}
query->key_len = key_len;
query->key_off = key_off;
memcpy((void *)node_raw + key_off, key, key_len);
query->len = val_len;
if (val) {
query->off = val_off;
memcpy((void *)node_raw + val_off, val, val_len);
} else {
query->off = 0;
2021-04-14 18:30:53 -03:00
}
query->index++; /* The query returned the record right before @key */
/* Add the new entry to the table of contents */
apfs_create_toc_entry(query);
apfs_update_node(node);
return 0;
defrag:
if (defragged) {
apfs_err(sb, "node reports incorrect free space");
return -EFSCORRUPTED;
}
err = apfs_defragment_node(node);
if (err) {
apfs_err(sb, "failed to defragment node");
return err;
}
defragged = true;
goto retry;
2021-04-14 18:30:53 -03:00
}
/**
* apfs_create_single_rec_node - Creates a new node with a single record
* @query: query run to search for the record
* @key: on-disk record key
* @key_len: length of @key
* @val: on-disk record value
* @val_len: length of @val
*
* The new node is placed right after the one found by @query, which must have
* a single record. On success, returns 0 and sets @query to the new record;
* returns a negative error code in case of failure, which may be -EAGAIN if a
* node split has happened and the caller must refresh and retry.
*/
int apfs_create_single_rec_node(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct super_block *sb = NULL;
struct apfs_node *new_node = NULL, *prev_node = NULL;
struct apfs_btree_node_phys *prev_raw = NULL;
struct apfs_btree_node_phys *new_raw = NULL;
int err;
prev_node = query->node;
sb = prev_node->object.sb;
ASSERT(query->parent);
ASSERT(prev_node->records == 1);
ASSERT(val && val_len);
/* This function should only be needed for huge catalog records */
if (prev_node->tree_type != APFS_OBJECT_TYPE_FSTREE) {
2023-03-24 22:07:47 -03:00
apfs_err(sb, "huge node records in the wrong tree");
return -EFSCORRUPTED;
}
/*
* This will only be called for leaf nodes because it's the values that
* can get huge, not the keys. It will also never be called for root,
* because the catalog always has more than a single record.
*/
if (apfs_node_is_root(prev_node) || !apfs_node_is_leaf(prev_node)) {
apfs_err(sb, "huge record in index node");
return -EFSCORRUPTED;
}
new_node = apfs_create_node(sb, apfs_query_storage(query));
2023-03-24 22:07:47 -03:00
if (IS_ERR(new_node)) {
apfs_err(sb, "node creation failed");
return PTR_ERR(new_node);
2023-03-24 22:07:47 -03:00
}
new_node->tree_type = prev_node->tree_type;
new_node->flags = prev_node->flags;
new_node->records = 0;
new_node->key_free_list_len = 0;
new_node->val_free_list_len = 0;
new_node->key = new_node->free = sizeof(*new_raw);
2024-12-04 20:32:26 -03:00
new_node->val = sb->s_blocksize; /* Nonroot */
prev_raw = (void *)prev_node->object.data;
new_raw = (void *)new_node->object.data;
apfs_assert_in_transaction(sb, &new_raw->btn_o);
new_raw->btn_level = prev_raw->btn_level;
apfs_update_node(new_node);
query->node = new_node;
new_node = NULL;
query->index = -1;
err = apfs_node_insert(query, key, key_len, val, val_len);
2023-03-24 22:07:47 -03:00
if (err) {
apfs_err(sb, "node record insertion failed");
goto fail;
2023-03-24 22:07:47 -03:00
}
err = apfs_attach_child(query->parent, query->node);
if (err) {
if (err != -EAGAIN) {
apfs_err(sb, "child attachment failed");
goto fail;
}
err = apfs_delete_node(query->node, query->flags & APFS_QUERY_TREE_MASK);
if (err) {
apfs_err(sb, "node cleanup failed for query retry");
goto fail;
}
/*
* The query must remain pointing to the original node for the
* refresh to take place. The index will not matter though.
*/
new_node = query->node;
query->node = prev_node;
prev_node = NULL;
err = -EAGAIN;
goto fail;
}
apfs_btree_change_node_count(query->parent, 1 /* change */);
fail:
apfs_node_free(prev_node);
apfs_node_free(new_node);
return err;
}