Files
linux-apfs-rw/node.c
T
Ernesto A. Fernández 8b99af84d1 Try to avoid failing insertion on fragmented nodes
I'm getting somewhat regular failures on generic/112 because the ip free
queue for the test image has a one-node limit, and it sometimes gets
full. The thing is, the node actually has plenty of room, but it's all
in the key free list, so I can't allocate values. From now on, if an
insertion fails with ENOSPC, check the available room and defragment
the node if it should be enough. Then retry the insertion instead of
jumping straight to a split.

Signed-off-by: Ernesto A. Fernández <ernesto@corellium.com>
2023-04-17 20:05:26 -03:00

1899 lines
55 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
*/
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include "apfs.h"
/**
* apfs_node_is_valid - Check basic sanity of the node index
* @sb: filesystem superblock
* @node: node to check
*
* Verifies that the node index fits in a single block, and that the number
* of records fits in the index. Without this check a crafted filesystem could
* pretend to have too many records, and calls to apfs_node_locate_key() and
* apfs_node_locate_data() would read beyond the limits of the node.
*/
/*
 * Sanity-check the node's record index: it must fit inside one block, and
 * the declared record count must fit inside the table of contents. Returns
 * true if the node looks usable, false otherwise.
 */
static bool apfs_node_is_valid(struct super_block *sb,
			       struct apfs_node *node)
{
	u32 nrecs = node->records;
	int toc_room = node->key - sizeof(struct apfs_btree_node_phys);
	int per_entry;

	/* The toc ends where the key area begins, so this bounds the toc too */
	if (node->key > sb->s_blocksize)
		return false;

	if (apfs_node_has_fixed_kv_size(node))
		per_entry = sizeof(struct apfs_kvoff);
	else
		per_entry = sizeof(struct apfs_kvloc);

	/* Coarse bound to prevent multiplication overflow in final check */
	if (nrecs > 1 << 16)
		return false;

	return nrecs * per_entry <= toc_room;
}
/*
 * Release an in-memory node structure. A node backed by a buffer head does
 * not own its data pointer (it aliases the buffer), so only the buffer
 * reference is dropped; otherwise the data buffer itself is freed.
 */
void apfs_node_free(struct apfs_node *node)
{
	struct apfs_object *obj = &node->object;

	if (!obj->o_bh) {
		/* The node owns its data buffer outright */
		kfree(obj->data);
	} else {
		/* Data aliases the buffer head's memory; just drop the ref */
		brelse(obj->o_bh);
		obj->o_bh = NULL;
	}
	obj->data = NULL;
	kfree(node);
}
/**
* apfs_read_node - Read a node header from disk
* @sb: filesystem superblock
* @oid: object id for the node
* @storage: storage type for the node object
* @write: request write access?
*
* Returns ERR_PTR in case of failure, otherwise return a pointer to the
* resulting apfs_node structure with the initial reference taken.
*
* For now we assume the node has not been read before.
*/
struct apfs_node *apfs_read_node(struct super_block *sb, u64 oid, u32 storage,
				 bool write)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct buffer_head *bh = NULL;
	struct apfs_btree_node_phys *raw;
	struct apfs_node *node;
	struct apfs_nloc *free_head;
	u64 bno;
	int err;

	/* Each storage class resolves the oid to a buffer in its own way */
	switch (storage) {
	case APFS_OBJ_VIRTUAL:
		/* All virtual nodes are inside a volume, at least for now */
		err = apfs_omap_lookup_block(sb, sbi->s_omap, oid, &bno, write);
		if (err) {
			apfs_err(sb, "omap lookup failed for oid 0x%llx", oid);
			return ERR_PTR(err);
		}
		/* CoW has already been done, don't worry about snapshots */
		bh = apfs_read_object_block(sb, bno, write, false /* preserve */);
		if (IS_ERR(bh)) {
			apfs_err(sb, "object read failed for bno 0x%llx", bno);
			return (void *)bh;
		}
		break;
	case APFS_OBJ_PHYSICAL:
		bh = apfs_read_object_block(sb, oid, write, false /* preserve */);
		if (IS_ERR(bh)) {
			apfs_err(sb, "object read failed for bno 0x%llx", oid);
			return (void *)bh;
		}
		/* Physical oids are block numbers, which may have changed */
		oid = bh->b_blocknr;
		break;
	case APFS_OBJ_EPHEMERAL:
		/* Ephemeral objects are checkpoint data, so ignore 'write' */
		bh = apfs_read_ephemeral_object(sb, oid);
		if (IS_ERR(bh)) {
			apfs_err(sb, "ephemeral read failed for oid 0x%llx", oid);
			return (void *)bh;
		}
		break;
	}

	raw = (struct apfs_btree_node_phys *) bh->b_data;
	node = kmalloc(sizeof(*node), GFP_KERNEL);
	if (!node) {
		brelse(bh);
		return ERR_PTR(-ENOMEM);
	}

	/* Cache the node layout from the on-disk header */
	node->tree_type = le32_to_cpu(raw->btn_o.o_subtype);
	node->flags = le16_to_cpu(raw->btn_flags);
	node->records = le32_to_cpu(raw->btn_nkeys);
	/* The key area begins after the header and the table of contents */
	node->key = sizeof(*raw) + le16_to_cpu(raw->btn_table_space.off)
				+ le16_to_cpu(raw->btn_table_space.len);
	/* Free space separates the key area from the value area */
	node->free = node->key + le16_to_cpu(raw->btn_free_space.off);
	node->data = node->free + le16_to_cpu(raw->btn_free_space.len);

	/* Only the total lengths of the freelists are cached in memory */
	free_head = &raw->btn_key_free_list;
	node->key_free_list_len = le16_to_cpu(free_head->len);
	free_head = &raw->btn_val_free_list;
	node->val_free_list_len = le16_to_cpu(free_head->len);

	node->object.sb = sb;
	node->object.block_nr = bh->b_blocknr;
	node->object.oid = oid;
	node->object.o_bh = bh;
	node->object.data = bh->b_data;

	if (nxi->nx_flags & APFS_CHECK_NODES && !apfs_obj_verify_csum(sb, bh)) {
		/* TODO: don't check this twice for virtual/physical objects */
		apfs_err(sb, "bad checksum for node in block 0x%llx", bh->b_blocknr);
		apfs_node_free(node);
		return ERR_PTR(-EFSBADCRC);
	}
	if (!apfs_node_is_valid(sb, node)) {
		apfs_err(sb, "bad node in block 0x%llx", bh->b_blocknr);
		apfs_node_free(node);
		return ERR_PTR(-EFSCORRUPTED);
	}
	return node;
}
/**
* apfs_node_min_table_size - Return the minimum size for a node's toc
* @sb: superblock structure
* @type: tree type for the node
* @flags: flags for the node
*/
/**
 * apfs_node_min_table_size - Return the minimum size for a node's toc
 * @sb: superblock structure
 * @type: tree type for the node
 * @flags: flags for the node
 *
 * Trees with fixed key/value sizes get the whole table preallocated; the
 * rest get room for a single increment of entries.
 */
static int apfs_node_min_table_size(struct super_block *sb, u32 type, u16 flags)
{
	bool is_leaf = flags & APFS_BTNODE_LEAF;
	int klen, vlen, entry_len;
	int avail, nrecs;

	switch (type) {
	case APFS_OBJECT_TYPE_OMAP:
		klen = sizeof(struct apfs_omap_key);
		vlen = is_leaf ? sizeof(struct apfs_omap_val) : sizeof(__le64);
		break;
	case APFS_OBJECT_TYPE_SPACEMAN_FREE_QUEUE:
		klen = sizeof(struct apfs_spaceman_free_queue_key);
		vlen = sizeof(__le64); /* We assume no ghosts here */
		break;
	case APFS_OBJECT_TYPE_OMAP_SNAPSHOT:
		klen = sizeof(__le64);
		vlen = is_leaf ? sizeof(struct apfs_omap_snapshot) : sizeof(__le64);
		break;
	case APFS_OBJECT_TYPE_FEXT_TREE:
		klen = sizeof(struct apfs_fext_tree_key);
		vlen = is_leaf ? sizeof(struct apfs_fext_tree_val) : sizeof(__le64);
		break;
	default:
		/* Variable-sized records: make room for one record at least */
		return APFS_BTREE_TOC_ENTRY_INCREMENT * sizeof(struct apfs_kvloc);
	}

	/* Fixed key/value sizes always use the compact toc entries */
	entry_len = sizeof(struct apfs_kvoff);
	/* The footer of root nodes is ignored for some reason */
	avail = sb->s_blocksize - sizeof(struct apfs_btree_node_phys);
	nrecs = avail / (klen + vlen + entry_len);
	return nrecs * entry_len;
}
/**
* apfs_set_empty_btree_info - Set the info footer for an empty b-tree node
* @sb: filesystem superblock
* @info: pointer to the on-disk info footer
* @subtype: subtype of the root node, i.e., tree type
*
* For now only supports the extent reference tree.
*/
/**
 * apfs_set_empty_btree_info - Set the info footer for an empty b-tree node
 * @sb: filesystem superblock
 * @info: pointer to the on-disk info footer
 * @subtype: subtype of the root node, i.e., tree type
 *
 * For now only supports the extent reference tree and the omap snapshot tree.
 */
static void apfs_set_empty_btree_info(struct super_block *sb, struct apfs_btree_info *info, u32 subtype)
{
	u32 bt_flags = APFS_BTREE_PHYSICAL;

	ASSERT(subtype == APFS_OBJECT_TYPE_BLOCKREFTREE || subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT);

	memset(info, 0, sizeof(*info));

	if (subtype == APFS_OBJECT_TYPE_BLOCKREFTREE)
		bt_flags |= APFS_BTREE_KV_NONALIGNED;
	info->bt_fixed.bt_flags = cpu_to_le32(bt_flags);
	info->bt_fixed.bt_node_size = cpu_to_le32(sb->s_blocksize);
	info->bt_key_count = 0;
	info->bt_node_count = cpu_to_le64(1); /* Only one node: the root */

	/* Only the snapshot tree records its fixed key/value sizes */
	if (subtype == APFS_OBJECT_TYPE_BLOCKREFTREE)
		return;
	info->bt_fixed.bt_key_size = cpu_to_le32(8);
	info->bt_fixed.bt_val_size = cpu_to_le32(sizeof(struct apfs_omap_snapshot));
	info->bt_longest_key = info->bt_fixed.bt_key_size;
	info->bt_longest_val = info->bt_fixed.bt_val_size;
}
/**
* apfs_make_empty_btree_root - Make an empty root for a b-tree
* @sb: filesystem superblock
* @subtype: subtype of the root node, i.e., tree type
* @oid: on return, the root's object id
*
* For now only supports the extent reference tree and an omap's snapshot tree.
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_make_empty_btree_root(struct super_block *sb, u32 subtype, u64 *oid)
{
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	struct apfs_btree_node_phys *root = NULL;
	struct buffer_head *bh = NULL;
	u64 bno;
	u16 flags;
	int toc_len, free_len, head_len, info_len;
	int err;

	ASSERT(subtype == APFS_OBJECT_TYPE_BLOCKREFTREE || subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT);

	err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
	if (err) {
		apfs_err(sb, "block allocation failed");
		return err;
	}
	/* Account the new block against the volume superblock */
	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
	le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);

	bh = apfs_getblk(sb, bno);
	if (!bh)
		return -EIO;
	root = (void *)bh->b_data;
	err = apfs_transaction_join(sb, bh);
	if (err)
		goto fail;
	set_buffer_csum(bh);

	flags = APFS_BTNODE_ROOT | APFS_BTNODE_LEAF;
	if (subtype == APFS_OBJECT_TYPE_OMAP_SNAPSHOT)
		flags |= APFS_BTNODE_FIXED_KV_SIZE;
	root->btn_flags = cpu_to_le16(flags);

	/* Carve the block into header, toc, free space and info footer */
	toc_len = apfs_node_min_table_size(sb, subtype, flags);
	head_len = sizeof(*root);
	info_len = sizeof(struct apfs_btree_info);
	free_len = sb->s_blocksize - head_len - toc_len - info_len;

	root->btn_level = 0; /* Root */

	/* No keys and no values, so this is straightforward */
	root->btn_nkeys = 0;
	root->btn_table_space.off = 0;
	root->btn_table_space.len = cpu_to_le16(toc_len);
	root->btn_free_space.off = 0;
	root->btn_free_space.len = cpu_to_le16(free_len);

	/* No fragmentation */
	root->btn_key_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
	root->btn_key_free_list.len = 0;
	root->btn_val_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
	root->btn_val_free_list.len = 0;

	apfs_set_empty_btree_info(sb, (void *)root + sb->s_blocksize - info_len, subtype);

	/* An empty tree's root is its own only node, stored physically */
	root->btn_o.o_oid = cpu_to_le64(bno);
	root->btn_o.o_xid = cpu_to_le64(APFS_NXI(sb)->nx_xid);
	root->btn_o.o_type = cpu_to_le32(APFS_OBJECT_TYPE_BTREE | APFS_OBJ_PHYSICAL);
	root->btn_o.o_subtype = cpu_to_le32(subtype);

	*oid = bno;
	err = 0;
fail:
	/* The success path lands here too: the buffer is released either way */
	root = NULL;
	brelse(bh);
	bh = NULL;
	return err;
}
/**
* apfs_create_node - Allocates a new nonroot b-tree node on disk
* @sb: filesystem superblock
* @storage: storage type for the node object
*
* On success returns a pointer to the new in-memory node structure; the object
* header is initialized, and the node fields are given reasonable defaults.
* On failure, returns an error pointer.
*/
static struct apfs_node *apfs_create_node(struct super_block *sb, u32 storage)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct apfs_nx_superblock *msb_raw = nxi->nx_raw;
	struct apfs_superblock *vsb_raw = sbi->s_vsb_raw;
	struct apfs_node *node;
	struct buffer_head *bh;
	struct apfs_btree_node_phys *raw;
	u64 bno, oid;
	int err;

	/* Pick a block and an oid as appropriate for the storage class */
	switch (storage) {
	case APFS_OBJ_VIRTUAL:
		err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
		if (err) {
			apfs_err(sb, "block allocation failed");
			return ERR_PTR(err);
		}
		/* Account the new block against the volume superblock */
		apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
		le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
		le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);
		/* Virtual oids come from the container and map via the omap */
		oid = le64_to_cpu(msb_raw->nx_next_oid);
		le64_add_cpu(&msb_raw->nx_next_oid, 1);
		err = apfs_create_omap_rec(sb, oid, bno);
		if (err) {
			apfs_err(sb, "omap rec creation failed (0x%llx-0x%llx)", oid, bno);
			return ERR_PTR(err);
		}
		break;
	case APFS_OBJ_PHYSICAL:
		err = apfs_spaceman_allocate_block(sb, &bno, true /* backwards */);
		if (err) {
			apfs_err(sb, "block allocation failed");
			return ERR_PTR(err);
		}
		/* We don't write to the container's omap */
		apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
		le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, 1);
		le64_add_cpu(&vsb_raw->apfs_total_blocks_alloced, 1);
		oid = bno; /* Physical oids are just block numbers */
		break;
	case APFS_OBJ_EPHEMERAL:
		/* Checkpoint blocks come from their own allocator */
		apfs_cpoint_data_allocate(sb, &bno);
		oid = le64_to_cpu(msb_raw->nx_next_oid);
		le64_add_cpu(&msb_raw->nx_next_oid, 1);
		err = apfs_create_cpoint_map(sb, oid, bno);
		if (err) {
			apfs_err(sb, "checkpoint map creation failed (0x%llx-0x%llx)", oid, bno);
			return ERR_PTR(err);
		}
		break;
	default:
		ASSERT(false);
		return ERR_PTR(-EINVAL); /* Prevent compiler warnings */
	}

	bh = apfs_getblk(sb, bno);
	if (!bh)
		return ERR_PTR(-EIO);
	raw = (void *)bh->b_data;
	err = apfs_transaction_join(sb, bh);
	if (err)
		goto fail;
	set_buffer_csum(bh);

	/* Set most of the object header, but the subtype is up to the caller */
	raw->btn_o.o_oid = cpu_to_le64(oid);
	raw->btn_o.o_xid = cpu_to_le64(nxi->nx_xid);
	raw->btn_o.o_type = cpu_to_le32(storage | APFS_OBJECT_TYPE_BTREE_NODE);
	raw->btn_o.o_subtype = 0;

	/* The caller is expected to change most node fields */
	raw->btn_flags = 0;
	raw->btn_level = 0;
	raw->btn_nkeys = 0;
	raw->btn_table_space.off = 0; /* Put the toc right after the header */
	raw->btn_table_space.len = 0;
	raw->btn_free_space.off = 0;
	raw->btn_free_space.len = cpu_to_le16(sb->s_blocksize - sizeof(*raw));
	raw->btn_key_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
	raw->btn_key_free_list.len = 0;
	raw->btn_val_free_list.off = cpu_to_le16(APFS_BTOFF_INVALID);
	raw->btn_val_free_list.len = 0;

	node = kmalloc(sizeof(*node), GFP_KERNEL);
	if (!node) {
		err = -ENOMEM;
		goto fail;
	}
	node->object.sb = sb;
	node->object.block_nr = bh->b_blocknr;
	node->object.oid = oid;
	node->object.o_bh = bh;
	node->object.data = bh->b_data;
	return node;

fail:
	brelse(bh);
	return ERR_PTR(err);
}
/**
* apfs_delete_node - Deletes a nonroot node from disk
* @query: query pointing to the node
*
* Does nothing to the in-memory node structure. Returns 0 on success, or a
* negative error code in case of failure.
*/
/**
 * apfs_delete_node - Deletes a nonroot node from disk
 * @query: query pointing to the node
 *
 * Removes the parent's index for the node, then frees its block through the
 * mechanism appropriate for the query's tree type. Does nothing to the
 * in-memory node structure. Returns 0 on success, or a negative error code
 * in case of failure.
 */
int apfs_delete_node(struct apfs_query *query)
{
	struct super_block *sb = query->node->object.sb;
	struct apfs_superblock *vsb_raw;
	struct apfs_node *node = query->node;
	u64 oid = node->object.oid;
	u64 bno = node->object.block_nr;
	int err;

	ASSERT(query->parent);

	/*
	 * For ephemeral nodes, it's important to do this before actually
	 * deleting the node, because that involves moving blocks around.
	 */
	err = apfs_btree_remove(query->parent);
	if (err) {
		/* Fixed: the oid was logged as "x%llx", missing the 0 prefix */
		apfs_err(sb, "parent index removal failed for oid 0x%llx", oid);
		return err;
	}

	switch (query->flags & APFS_QUERY_TREE_MASK) {
	case APFS_QUERY_CAT:
		/* Catalog nodes also have an omap record to clean up */
		err = apfs_free_queue_insert(sb, bno, 1);
		if (err) {
			apfs_err(sb, "free queue insertion failed for 0x%llx", bno);
			return err;
		}
		err = apfs_delete_omap_rec(sb, oid);
		if (err) {
			apfs_err(sb, "omap rec deletion failed (0x%llx)", oid);
			return err;
		}
		/* Account the freed block against the volume superblock */
		vsb_raw = APFS_SB(sb)->s_vsb_raw;
		apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
		le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, -1);
		le64_add_cpu(&vsb_raw->apfs_total_blocks_freed, 1);
		return 0;
	case APFS_QUERY_OMAP:
	case APFS_QUERY_EXTENTREF:
	case APFS_QUERY_SNAP_META:
		err = apfs_free_queue_insert(sb, bno, 1);
		if (err) {
			apfs_err(sb, "free queue insertion failed for 0x%llx", bno);
			return err;
		}
		/* We don't write to the container's omap */
		vsb_raw = APFS_SB(sb)->s_vsb_raw;
		apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
		le64_add_cpu(&vsb_raw->apfs_fs_alloc_count, -1);
		le64_add_cpu(&vsb_raw->apfs_total_blocks_freed, 1);
		return 0;
	case APFS_QUERY_FREE_QUEUE:
		/* Free queue nodes are checkpoint data with their own maps */
		err = apfs_cpoint_data_free(sb, bno);
		if (err) {
			apfs_err(sb, "failed to free checkpoint block 0x%llx", bno);
			return err;
		}
		err = apfs_remove_cpoint_map(sb, bno);
		if (err) {
			apfs_err(sb, "checkpoint map removal failed (0x%llx)", bno);
			return err;
		}
		return 0;
	default:
		apfs_alert(sb, "new query type must implement node deletion (%d)", query->flags & APFS_QUERY_TREE_MASK);
		return -EOPNOTSUPP;
	}
}
/**
* apfs_update_node - Update an existing node header
* @node: the modified in-memory node
*/
void apfs_update_node(struct apfs_node *node)
{
	struct super_block *sb = node->object.sb;
	struct buffer_head *bh = node->object.o_bh;
	struct apfs_btree_node_phys *raw = (void *)node->object.data;
	struct apfs_nloc *free_head;
	u32 tflags, type;
	int toc_off;

	apfs_assert_in_transaction(sb, &raw->btn_o);

	raw->btn_o.o_oid = cpu_to_le64(node->object.oid);

	/* The node may no longer be a root, so update the object type */
	tflags = le32_to_cpu(raw->btn_o.o_type) & APFS_OBJECT_TYPE_FLAGS_MASK;
	type = (node->flags & APFS_BTNODE_ROOT) ? APFS_OBJECT_TYPE_BTREE :
						  APFS_OBJECT_TYPE_BTREE_NODE;
	raw->btn_o.o_type = cpu_to_le32(type | tflags);
	raw->btn_o.o_subtype = cpu_to_le32(node->tree_type);

	raw->btn_flags = cpu_to_le16(node->flags);
	raw->btn_nkeys = cpu_to_le32(node->records);

	/* Rebuild the on-disk area boundaries from the cached offsets */
	toc_off = sizeof(*raw) + le16_to_cpu(raw->btn_table_space.off);
	raw->btn_table_space.len = cpu_to_le16(node->key - toc_off);
	raw->btn_free_space.off = cpu_to_le16(node->free - node->key);
	raw->btn_free_space.len = cpu_to_le16(node->data - node->free);

	/* Reset the lists on zero length, a defragmentation is taking place */
	free_head = &raw->btn_key_free_list;
	free_head->len = cpu_to_le16(node->key_free_list_len);
	if (!free_head->len)
		free_head->off = cpu_to_le16(APFS_BTOFF_INVALID);
	free_head = &raw->btn_val_free_list;
	free_head->len = cpu_to_le16(node->val_free_list_len);
	if (!free_head->len)
		free_head->off = cpu_to_le16(APFS_BTOFF_INVALID);

	/* Quiet the compiler in case the ASSERTs below compile to nothing */
	(void)bh;
	ASSERT(buffer_trans(bh));
	ASSERT(buffer_csum(bh));
}
/**
* apfs_node_locate_key - Locate the key of a node record
* @node: node to be searched
* @index: number of the entry to locate
* @off: on return will hold the offset in the block
*
* Returns the length of the key, or 0 in case of failure. The function checks
* that this length fits within the block; callers must use the returned value
* to make sure they never operate outside its bounds.
*/
int apfs_node_locate_key(struct apfs_node *node, int index, int *off)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *raw;
	int len;

	if (index >= node->records) {
		apfs_err(sb, "index out of bounds (%d of %d)", index, node->records);
		return 0;
	}

	raw = (struct apfs_btree_node_phys *)node->object.data;
	if (apfs_node_has_fixed_kv_size(node)) {
		/* The compact toc entries only store the key's offset */
		struct apfs_kvoff *entry;

		entry = (struct apfs_kvoff *)raw->btn_data + index;
		/* TODO: it would be cleaner to read this stuff from disk */
		if (node->tree_type == APFS_OBJECT_TYPE_OMAP_SNAPSHOT)
			len = 8;
		else
			len = 16;
		/* Translate offset in key area to offset in block */
		*off = node->key + le16_to_cpu(entry->k);
	} else {
		/* These node types have variable length keys and data */
		struct apfs_kvloc *entry;

		entry = (struct apfs_kvloc *)raw->btn_data + index;
		len = le16_to_cpu(entry->k.len);
		/* Translate offset in key area to offset in block */
		*off = node->key + le16_to_cpu(entry->k.off);
	}

	/* Never let the caller read past the end of the block */
	if (*off + len > sb->s_blocksize) {
		apfs_err(sb, "key out of bounds (%d-%d)", *off, len);
		return 0;
	}
	return len;
}
/**
* apfs_node_locate_data - Locate the data of a node record
* @node: node to be searched
* @index: number of the entry to locate
* @off: on return will hold the offset in the block
*
* Returns the length of the data, which may be 0 in case of corruption or if
* the record is a ghost. The function checks that this length fits within the
* block; callers must use the returned value to make sure they never operate
* outside its bounds.
*/
static int apfs_node_locate_data(struct apfs_node *node, int index, int *off)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *raw;
	int len;

	if (index >= node->records) {
		apfs_err(sb, "index out of bounds (%d of %d)", index, node->records);
		return 0;
	}

	raw = (struct apfs_btree_node_phys *)node->object.data;
	if (apfs_node_has_fixed_kv_size(node)) {
		/* These node types have fixed length keys and data */
		struct apfs_kvoff *entry;

		entry = (struct apfs_kvoff *)raw->btn_data + index;
		if (node->tree_type == APFS_OBJECT_TYPE_SPACEMAN_FREE_QUEUE) {
			/* A free-space queue record may have no value */
			if (le16_to_cpu(entry->v) == APFS_BTOFF_INVALID)
				return 0;
			len = 8;
		} else {
			/* This is an omap or omap snapshots node */
			len = apfs_node_is_leaf(node) ? 16 : 8;
		}
		/*
		 * Data offsets are counted backwards from the end of the
		 * block, or from the beginning of the footer when it exists
		 */
		if (apfs_node_is_root(node)) /* has footer */
			*off = sb->s_blocksize - sizeof(struct apfs_btree_info)
					- le16_to_cpu(entry->v);
		else
			*off = sb->s_blocksize - le16_to_cpu(entry->v);
	} else {
		/* These node types have variable length keys and data */
		struct apfs_kvloc *entry;

		entry = (struct apfs_kvloc *)raw->btn_data + index;
		len = le16_to_cpu(entry->v.len);
		/*
		 * Data offsets are counted backwards from the end of the
		 * block, or from the beginning of the footer when it exists
		 */
		if (apfs_node_is_root(node)) /* has footer */
			*off = sb->s_blocksize - sizeof(struct apfs_btree_info)
					- le16_to_cpu(entry->v.off);
		else
			*off = sb->s_blocksize - le16_to_cpu(entry->v.off);
	}

	/* Never let the caller read outside the limits of the block */
	if (*off < 0 || *off + len > sb->s_blocksize) {
		apfs_err(sb, "value out of bounds (%d-%d)", *off, len);
		return 0;
	}
	return len;
}
/**
* apfs_create_toc_entry - Create the table-of-contents entry for a record
* @query: query pointing to the record
*
* Creates a toc entry for the record at index @query->index and increases
* @node->records. The caller must ensure enough space in the table.
*/
static void apfs_create_toc_entry(struct apfs_query *query)
{
	struct apfs_node *node = query->node;
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *raw = (void *)node->object.data;
	int value_end;
	int recs = node->records;
	int index = query->index;

	/* On-disk value offsets are counted backwards from this point */
	value_end = sb->s_blocksize;
	if (apfs_node_is_root(node))
		value_end -= sizeof(struct apfs_btree_info);

	if (apfs_node_has_fixed_kv_size(node)) {
		struct apfs_kvoff *kvoff;

		kvoff = (struct apfs_kvoff *)raw->btn_data + query->index;
		/* Shift the tail of the toc to open a slot at @index */
		memmove(kvoff + 1, kvoff, (recs - index) * sizeof(*kvoff));
		if (!query->len) /* Ghost record */
			kvoff->v = cpu_to_le16(APFS_BTOFF_INVALID);
		else
			kvoff->v = cpu_to_le16(value_end - query->off);
		kvoff->k = cpu_to_le16(query->key_off - node->key);
	} else {
		struct apfs_kvloc *kvloc;

		kvloc = (struct apfs_kvloc *)raw->btn_data + query->index;
		/* Shift the tail of the toc to open a slot at @index */
		memmove(kvloc + 1, kvloc, (recs - index) * sizeof(*kvloc));
		kvloc->v.off = cpu_to_le16(value_end - query->off);
		kvloc->v.len = cpu_to_le16(query->len);
		kvloc->k.off = cpu_to_le16(query->key_off - node->key);
		kvloc->k.len = cpu_to_le16(query->key_len);
	}
	node->records++;
}
/**
* apfs_key_from_query - Read the current key from a query structure
* @query: the query, with @query->key_off and @query->key_len already set
* @key: return parameter for the key
*
* Reads the key into @key and performs some basic sanity checks as a
* protection against crafted filesystems. Returns 0 on success or a
* negative error code otherwise.
*/
static int apfs_key_from_query(struct apfs_query *query, struct apfs_key *key)
{
	struct super_block *sb = query->node->object.sb;
	char *raw = query->node->object.data;
	void *raw_key = (void *)(raw + query->key_off);
	bool hashed;
	int err = 0;

	/* Each tree type has its own key format and parser */
	switch (query->flags & APFS_QUERY_TREE_MASK) {
	case APFS_QUERY_CAT:
		hashed = apfs_is_normalization_insensitive(sb);
		err = apfs_read_cat_key(raw_key, query->key_len, key, hashed);
		break;
	case APFS_QUERY_OMAP:
		err = apfs_read_omap_key(raw_key, query->key_len, key);
		break;
	case APFS_QUERY_FREE_QUEUE:
		err = apfs_read_free_queue_key(raw_key, query->key_len, key);
		break;
	case APFS_QUERY_EXTENTREF:
		err = apfs_read_extentref_key(raw_key, query->key_len, key);
		break;
	case APFS_QUERY_FEXT:
		err = apfs_read_fext_key(raw_key, query->key_len, key);
		break;
	case APFS_QUERY_SNAP_META:
		err = apfs_read_snap_meta_key(raw_key, query->key_len, key);
		break;
	case APFS_QUERY_OMAP_SNAP:
		err = apfs_read_omap_snap_key(raw_key, query->key_len, key);
		break;
	default:
		apfs_alert(sb, "new query type must implement key reads (%d)", query->flags & APFS_QUERY_TREE_MASK);
		err = -EOPNOTSUPP;
		break;
	}
	if (err)
		apfs_err(sb, "bad node key in block 0x%llx", query->node->object.block_nr);

	/* A multiple query must ignore some of these fields */
	if (query->flags & APFS_QUERY_ANY_NAME)
		key->name = NULL;
	if (query->flags & APFS_QUERY_ANY_NUMBER)
		key->number = 0;
	return err;
}
/**
* apfs_node_next - Find the next matching record in the current node
* @sb: filesystem superblock
* @query: multiple query in execution
*
* Returns 0 on success, -EAGAIN if the next record is in another node,
* -ENODATA if no more matching records exist, or another negative error
* code in case of failure.
*/
static int apfs_node_next(struct super_block *sb, struct apfs_query *query)
{
	struct apfs_node *node = query->node;
	struct apfs_key curr_key;
	int cmp, err;

	if (query->flags & APFS_QUERY_DONE)
		/* Nothing left to search; the query failed */
		return -ENODATA;

	if (!query->index) /* The next record may be in another node */
		return -EAGAIN;
	/* Multiple queries walk the records in decreasing index order */
	--query->index;

	query->key_len = apfs_node_locate_key(node, query->index,
					      &query->key_off);
	err = apfs_key_from_query(query, &curr_key);
	if (err) {
		apfs_err(sb, "bad key for index %d", query->index);
		return err;
	}

	cmp = apfs_keycmp(&curr_key, query->key);
	if (cmp > 0) {
		/* A key before this one compared bigger: broken ordering */
		apfs_err(sb, "records are out of order");
		return -EFSCORRUPTED;
	}
	if (cmp != 0 && apfs_node_is_leaf(node) &&
	    query->flags & APFS_QUERY_EXACT)
		return -ENODATA;

	query->len = apfs_node_locate_data(node, query->index, &query->off);
	if (query->len == 0) {
		apfs_err(sb, "bad value for index %d", query->index);
		return -EFSCORRUPTED;
	}

	if (cmp != 0) {
		/*
		 * This is the last entry that can be relevant in this node.
		 * Keep searching the children, but don't return to this level.
		 */
		query->flags |= APFS_QUERY_DONE;
	}
	return 0;
}
/**
* apfs_node_query - Execute a query on a single node
* @sb: filesystem superblock
* @query: the query to execute
*
* The search will start at index @query->index, looking for the key that comes
* right before @query->key, according to the order given by apfs_keycmp().
*
* The @query->index will be updated to the last index checked. This is
* important when searching for multiple entries, since the query may need
* to remember where it was on this level. If we are done with this node, the
* query will be flagged as APFS_QUERY_DONE, and the search will end in failure
* as soon as we return to this level. The function may also return -EAGAIN,
* to signal that the search should go on in a different branch.
*
* On success returns 0; the offset of the data within the block will be saved
* in @query->off, and its length in @query->len. The function checks that this
* length fits within the block; callers must use the returned value to make
* sure they never operate outside its bounds.
*
* -ENODATA will be returned if no appropriate entry was found, -EFSCORRUPTED
* in case of corruption.
*/
int apfs_node_query(struct super_block *sb, struct apfs_query *query)
{
	struct apfs_node *node = query->node;
	int left, right;
	int cmp;
	int err;

	if (query->flags & APFS_QUERY_NEXT)
		return apfs_node_next(sb, query);

	/* Search by bisection */
	cmp = 1;
	left = 0;
	do {
		struct apfs_key curr_key;
		if (cmp > 0) {
			/* Discard @query->index and everything above it */
			right = query->index - 1;
			if (right < left) {
				query->index = -1;
				return -ENODATA;
			}
			query->index = (left + right) / 2;
		} else {
			/* The probed index may still be the answer: keep it */
			left = query->index;
			query->index = DIV_ROUND_UP(left + right, 2);
		}

		query->key_len = apfs_node_locate_key(node, query->index,
						      &query->key_off);
		err = apfs_key_from_query(query, &curr_key);
		if (err) {
			apfs_err(sb, "bad key for index %d", query->index);
			return err;
		}

		cmp = apfs_keycmp(&curr_key, query->key);
		if (cmp == 0 && !(query->flags & APFS_QUERY_MULTIPLE))
			break;
	} while (left != right);

	/* Every record compared bigger than the search key */
	if (cmp > 0) {
		query->index = -1;
		return -ENODATA;
	}
	if (cmp != 0 && apfs_node_is_leaf(query->node) &&
	    query->flags & APFS_QUERY_EXACT)
		return -ENODATA;

	if (query->flags & APFS_QUERY_MULTIPLE) {
		if (cmp != 0) /* Last relevant entry in level */
			query->flags |= APFS_QUERY_DONE;
		query->flags |= APFS_QUERY_NEXT;
	}
	query->len = apfs_node_locate_data(node, query->index, &query->off);
	return 0;
}
/**
* apfs_node_query_first - Find the first record in a node
* @query: on return this query points to the record
*/
void apfs_node_query_first(struct apfs_query *query)
{
	struct apfs_node *node = query->node;

	query->index = 0;
	/*
	 * NOTE(review): the locate calls return 0 on corruption; callers are
	 * presumably expected to check key_len/len afterwards -- confirm.
	 */
	query->key_len = apfs_node_locate_key(node, query->index, &query->key_off);
	query->len = apfs_node_locate_data(node, query->index, &query->off);
}
/**
* apfs_omap_map_from_query - Read the mapping found by a successful omap query
* @query: the query that found the record
* @map: Return parameter. The mapping found.
*
* Returns -EOPNOTSUPP if the object doesn't fit in one block, and -EFSCORRUPTED
* if the filesystem appears to be malicious. Otherwise, reads the mapping info
* in the omap record into @map and returns 0.
*/
/**
 * apfs_omap_map_from_query - Read the mapping found by a successful omap query
 * @query: the query that found the record
 * @map: Return parameter. The mapping found.
 *
 * Returns -EOPNOTSUPP if the object doesn't fit in one block, and -EFSCORRUPTED
 * if the filesystem appears to be malicious. Otherwise, reads the mapping info
 * in the omap record into @map and returns 0.
 */
int apfs_omap_map_from_query(struct apfs_query *query, struct apfs_omap_map *map)
{
	struct super_block *sb = query->node->object.sb;
	char *raw = query->node->object.data;
	struct apfs_omap_key *key;
	struct apfs_omap_val *val;

	/* Both records have a fixed size; anything else is corruption */
	if (query->key_len != sizeof(*key) || query->len != sizeof(*val)) {
		apfs_err(sb, "bad length of key (%d) or value (%d)", query->key_len, query->len);
		return -EFSCORRUPTED;
	}
	key = (void *)(raw + query->key_off);
	val = (void *)(raw + query->off);

	/* TODO: support objects with multiple blocks */
	if (le32_to_cpu(val->ov_size) != sb->s_blocksize) {
		apfs_err(sb, "object size doesn't match block size");
		return -EOPNOTSUPP;
	}

	map->xid = le64_to_cpu(key->ok_xid);
	map->bno = le64_to_cpu(val->ov_paddr);
	map->flags = le32_to_cpu(val->ov_flags);
	return 0;
}
/**
* apfs_btree_inc_height - Increase the height of a b-tree
* @query: query pointing to the root node
*
* On success returns 0, and @query is left pointing to the same record.
* Returns a negative error code in case of failure.
*/
static int apfs_btree_inc_height(struct apfs_query *query)
{
	struct apfs_query *root_query;
	struct apfs_node *root = query->node;
	struct apfs_node *new_node;
	struct super_block *sb = root->object.sb;
	struct apfs_btree_node_phys *root_raw;
	struct apfs_btree_node_phys *new_raw;
	struct apfs_btree_info *info;
	__le64 *raw_oid;
	u32 storage = apfs_query_storage(query);

	root_raw = (void *)root->object.data;
	apfs_assert_in_transaction(sb, &root_raw->btn_o);

	if (query->parent || query->depth) {
		apfs_err(sb, "invalid root query");
		return -EFSCORRUPTED;
	}

	/* Create a new child node */
	new_node = apfs_create_node(sb, storage);
	if (IS_ERR(new_node)) {
		apfs_err(sb, "node creation failed");
		return PTR_ERR(new_node);
	}
	new_node->flags = root->flags & ~APFS_BTNODE_ROOT;
	new_node->tree_type = root->tree_type;

	/* Move all records into the child node; get rid of the info footer */
	new_node->records = root->records;
	new_node->key = root->key;
	new_node->free = root->free;
	/* Values move up in the block since the child has no footer */
	new_node->data = root->data + sizeof(*info);
	new_node->key_free_list_len = root->key_free_list_len;
	new_node->val_free_list_len = root->val_free_list_len;
	new_raw = (void *)new_node->object.data;
	/* Don't copy the object header, already set by apfs_create_node() */
	memcpy((void *)new_raw + sizeof(new_raw->btn_o),
	       (void *)root_raw + sizeof(root_raw->btn_o),
	       root->free - sizeof(new_raw->btn_o));
	memcpy((void *)new_raw + new_node->data,
	       (void *)root_raw + root->data,
	       sb->s_blocksize - new_node->data);
	/* Keep the query's value offset in sync with the moved data */
	query->off += sizeof(*info);
	apfs_update_node(new_node);

	/* Add a new level to the query chain */
	root_query = query->parent = apfs_alloc_query(root, NULL /* parent */);
	if (!query->parent) {
		apfs_node_free(new_node);
		return -ENOMEM;
	}
	root_query->key = query->key;
	root_query->flags = query->flags;
	query->node = new_node;
	query->depth = 1;

	/* Now assemble the new root with only the first key */
	root_query->key_len = apfs_node_locate_key(root, 0 /* index */,
						   &root_query->key_off);
	if (!root_query->key_len) {
		apfs_err(sb, "bad key for index %d", 0);
		return -EFSCORRUPTED;
	}
	/* The key area restarts right after the (resized) toc */
	root->key = sizeof(*root_raw) +
		    apfs_node_min_table_size(sb, root->tree_type, root->flags & ~APFS_BTNODE_LEAF);
	memmove((void *)root_raw + root->key,
		(void *)root_raw + root_query->key_off, root_query->key_len);
	root_query->key_off = root->key;
	root->free = root->key + root_query->key_len;

	/* The new root is a nonleaf node; the record value is the child id */
	root->flags &= ~APFS_BTNODE_LEAF;
	root->data = sb->s_blocksize - sizeof(*info) - sizeof(*raw_oid);
	raw_oid = (void *)root_raw + root->data;
	*raw_oid = cpu_to_le64(new_node->object.oid);
	root_query->off = root->data;
	root_query->len = sizeof(*raw_oid);

	/* With the key and value in place, set the table-of-contents */
	root->records = 0;
	root_query->index = 0;
	apfs_create_toc_entry(root_query);

	/* There is no internal fragmentation */
	root->key_free_list_len = 0;
	root->val_free_list_len = 0;

	/* Finally, update the node count in the info footer */
	apfs_btree_change_node_count(root_query, 1 /* change */);

	le16_add_cpu(&root_raw->btn_level, 1); /* TODO: move to update_node() */
	apfs_update_node(root);
	return 0;
}
/**
 * apfs_copy_record_range - Copy a range of records to an empty node
 * @dest_node: destination node
 * @src_node: source node
 * @start: index of first record in range
 * @end: index of first record after the range
 *
 * Both key and value areas of @dest_node are rebuilt from scratch, so the
 * copy doubles as a defragmentation. Doesn't modify the info footer of root
 * nodes. Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_copy_record_range(struct apfs_node *dest_node,
				  struct apfs_node *src_node,
				  int start, int end)
{
	struct super_block *sb = dest_node->object.sb;
	struct apfs_btree_node_phys *dest_raw;
	struct apfs_btree_node_phys *src_raw;
	struct apfs_query *query = NULL;
	int toc_size, toc_entry_size;
	int err;
	int i;

	dest_raw = (void *)dest_node->object.data;
	src_raw = (void *)src_node->object.data;

	/* The caller must have emptied the destination already */
	ASSERT(!dest_node->records);
	apfs_assert_in_transaction(sb, &dest_raw->btn_o);

	/* Resize the table of contents so that all the records fit */
	if (apfs_node_has_fixed_kv_size(src_node))
		toc_entry_size = sizeof(struct apfs_kvoff);
	else
		toc_entry_size = sizeof(struct apfs_kvloc);
	toc_size = apfs_node_min_table_size(sb, src_node->tree_type, src_node->flags);
	if (toc_size < toc_entry_size * (end - start))
		toc_size = toc_entry_size * round_up(end - start, APFS_BTREE_TOC_ENTRY_INCREMENT);
	dest_node->key = sizeof(*dest_raw) + toc_size;
	dest_node->free = dest_node->key;
	dest_node->data = sb->s_blocksize;
	if (dest_node->data < 0) { /* never true: kept for symmetry with checks below */
		;
	}
	if (apfs_node_is_root(dest_node))
		dest_node->data -= sizeof(struct apfs_btree_info);

	/* We'll use a temporary query structure to move the records around */
	query = apfs_alloc_query(dest_node, NULL /* parent */);
	if (!query) {
		err = -ENOMEM;
		goto fail;
	}

	err = -EFSCORRUPTED;
	for (i = start; i < end; ++i) {
		int len, off;

		/* Keys grow upward from the end of the table of contents */
		len = apfs_node_locate_key(src_node, i, &off);
		if (dest_node->free + len > sb->s_blocksize) {
			apfs_err(sb, "key of length %d doesn't fit", len);
			goto fail;
		}
		memcpy((char *)dest_raw + dest_node->free,
		       (char *)src_raw + off, len);
		query->key_off = dest_node->free;
		query->key_len = len;
		dest_node->free += len;

		/* Values grow downward from the end of the block (or footer) */
		len = apfs_node_locate_data(src_node, i, &off);
		dest_node->data -= len;
		if (dest_node->data < 0) {
			apfs_err(sb, "value of length %d doesn't fit", len);
			goto fail;
		}
		memcpy((char *)dest_raw + dest_node->data,
		       (char *)src_raw + off, len);
		query->off = dest_node->data;
		query->len = len;

		query->index = i - start;
		apfs_create_toc_entry(query);
	}
	err = 0;

fail:
	apfs_free_query(query);
	return err;
}
/**
 * apfs_attach_child - Attach a new node to its parent
 * @query: query pointing to the previous record in the parent
 * @child: the new child node to attach
 *
 * Inserts a record in the parent mapping the child's first key to the child's
 * object id. Returns 0 on success or a negative error code in case of
 * failure; @query is rendered invalid either way and must be freed by the
 * caller.
 */
static int apfs_attach_child(struct apfs_query *query, struct apfs_node *child)
{
	struct apfs_btree_node_phys *child_raw = (void *)child->object.data;
	__le64 child_oid = cpu_to_le64(child->object.oid);
	int first_key_len, first_key_off;

	first_key_len = apfs_node_locate_key(child, 0, &first_key_off);
	if (first_key_len == 0) {
		/* This should never happen: @child was made by us */
		apfs_alert(child->object.sb, "bad key for index %d", 0);
		return -EFSCORRUPTED;
	}

	return apfs_btree_insert(query, (void *)child_raw + first_key_off,
				 first_key_len, &child_oid, sizeof(child_oid));
}
/**
 * apfs_node_temp_dup - Make an in-memory duplicate of a node
 * @original: node to duplicate
 * @duplicate: on success, the duplicate node
 *
 * The duplicate owns a private copy of the raw block, detached from the
 * buffer cache. Returns 0 on success or a negative error code in case of
 * failure.
 */
static int apfs_node_temp_dup(const struct apfs_node *original, struct apfs_node **duplicate)
{
	struct super_block *sb = original->object.sb;
	struct apfs_node *dup;
	char *raw_copy;

	dup = kmalloc(sizeof(*dup), GFP_KERNEL);
	if (!dup)
		return -ENOMEM;
	raw_copy = kmalloc(sb->s_blocksize, GFP_KERNEL);
	if (!raw_copy) {
		kfree(dup);
		return -ENOMEM;
	}

	*dup = *original;
	/* No buffer head: apfs_node_free() will just kfree() the raw copy */
	dup->object.o_bh = NULL;
	memcpy(raw_copy, original->object.data, sb->s_blocksize);
	dup->object.data = raw_copy;

	*duplicate = dup;
	return 0;
}
/**
 * apfs_node_split - Split a b-tree node in two
 * @query: query pointing to the node
 *
 * On success returns 0, and @query is left pointing to the same record on the
 * leaf; to simplify the implementation, @query->parent is set to NULL. Returns
 * a negative error code in case of failure.
 */
int apfs_node_split(struct apfs_query *query)
{
	struct super_block *sb = query->node->object.sb;
	struct apfs_node *old_node;
	struct apfs_btree_node_phys *old_raw;
	struct apfs_node *tmp_node = NULL;
	u32 storage = apfs_query_storage(query);
	int record_count, new_rec_count, old_rec_count;
	int err;

	/*
	 * The root can't be split in place, so grow the tree first and split
	 * the resulting child instead. Nonroot nodes need a parent, because
	 * the new sibling will be attached to it.
	 */
	if (apfs_node_is_root(query->node)) {
		err = apfs_btree_inc_height(query);
		if (err) {
			apfs_err(sb, "failed to increase tree height");
			return err;
		}
	} else if (!query->parent) {
		apfs_err(sb, "nonroot node with no parent");
		return -EFSCORRUPTED;
	}
	old_node = query->node;

	old_raw = (void *)old_node->object.data;
	apfs_assert_in_transaction(sb, &old_raw->btn_o);

	/*
	 * To defragment the original node, we put all records in a temporary
	 * in-memory node before dealing them out.
	 */
	err = apfs_node_temp_dup(old_node, &tmp_node);
	if (err)
		return err;

	/*
	 * The first half of the records go in the original node. If there's
	 * only one record, just defragment the node and don't split anything.
	 */
	record_count = old_node->records;
	new_rec_count = record_count / 2;
	old_rec_count = record_count - new_rec_count;

	/* Empty the original node and refill it from the duplicate */
	old_node->records = 0;
	old_node->key_free_list_len = 0;
	old_node->val_free_list_len = 0;
	err = apfs_copy_record_range(old_node, tmp_node, 0, old_rec_count);
	if (err) {
		apfs_err(sb, "record copy failed");
		goto out;
	}
	apfs_update_node(old_node);

	/* The second half is copied to a new node */
	if (new_rec_count != 0) {
		struct apfs_node *new_node;
		struct apfs_btree_node_phys *new_raw;

		apfs_btree_change_node_count(query->parent, 1 /* change */);

		new_node = apfs_create_node(sb, storage);
		if (IS_ERR(new_node)) {
			apfs_err(sb, "node creation failed");
			err = PTR_ERR(new_node);
			goto out;
		}
		new_node->tree_type = old_node->tree_type;
		new_node->flags = old_node->flags;
		new_node->records = 0;
		new_node->key_free_list_len = 0;
		new_node->val_free_list_len = 0;
		err = apfs_copy_record_range(new_node, tmp_node, old_rec_count, record_count);
		if (err) {
			apfs_err(sb, "record copy failed");
			apfs_node_free(new_node);
			goto out;
		}
		new_raw = (void *)new_node->object.data;
		apfs_assert_in_transaction(sb, &new_raw->btn_o);
		/* The sibling sits at the same level as the node it split off */
		new_raw->btn_level = old_raw->btn_level;
		apfs_update_node(new_node);

		err = apfs_attach_child(query->parent, new_node);
		if (err) {
			apfs_err(sb, "child attachment failed");
			apfs_node_free(new_node);
			goto out;
		}

		/* Point the query back to the original record */
		if (query->index >= old_rec_count) {
			/* The record got moved to the new node */
			apfs_node_free(query->node);
			query->node = new_node;
			query->index -= old_rec_count;
		} else {
			apfs_node_free(new_node);
		}
	}

	apfs_free_query(query->parent);
	query->parent = NULL; /* The caller only gets the leaf */

	/* Updating these fields isn't really necessary, but it's cleaner */
	query->len = apfs_node_locate_data(query->node, query->index,
					   &query->off);
	query->key_len = apfs_node_locate_key(query->node, query->index,
					      &query->key_off);

out:
	apfs_node_free(tmp_node);
	return err;
}
/* TODO: the following 4 functions could be reused elsewhere */

/**
 * apfs_off_to_val_off - Translate offset in node to offset in value area
 * @node: the node
 * @off: offset in the node
 *
 * Value-area offsets are measured backwards from the end of the area.
 */
static u16 apfs_off_to_val_off(struct apfs_node *node, u16 off)
{
	u16 area_end = node->object.sb->s_blocksize;

	/* Root nodes keep the b-tree info footer at the end of the block */
	if (apfs_node_is_root(node))
		area_end -= sizeof(struct apfs_btree_info);
	return area_end - off;
}
/**
 * apfs_val_off_to_off - Translate offset in value area to offset in node
 * @node: the node
 * @off: offset in the value area
 */
static u16 apfs_val_off_to_off(struct apfs_node *node, u16 off)
{
	/*
	 * Value offsets are relative to the end of the value area, so the
	 * translation is its own inverse and can be reused in both directions.
	 */
	return apfs_off_to_val_off(node, off);
}
/**
 * apfs_off_to_key_off - Translate offset in node to offset in key area
 * @node: the node
 * @off: offset in the node
 */
static u16 apfs_off_to_key_off(struct apfs_node *node, u16 off)
{
	/* The key area begins right after the table of contents */
	return off - node->key;
}
/**
 * apfs_key_off_to_off - Translate offset in key area to offset in node
 * @node: the node
 * @off: offset in the key area
 */
static u16 apfs_key_off_to_off(struct apfs_node *node, u16 off)
{
	/* Inverse of apfs_off_to_key_off() */
	return off + node->key;
}
/* The type of the previous four functions, used for node offset calculations */
typedef u16 (*offcalc)(struct apfs_node *, u16);
/**
 * apfs_node_free_list_add - Add a free node segment to the proper free list
 * @node: node for the segment
 * @off: offset of the segment to add
 * @len: length of the segment to add
 *
 * The caller must ensure that the freed segment fits in the node.
 */
static void apfs_node_free_list_add(struct apfs_node *node, u16 off, u16 len)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
	struct apfs_nloc *head, *new;
	offcalc off_to_rel;

	apfs_assert_in_transaction(sb, &node_raw->btn_o);

	if (off >= node->data) { /* Value area */
		off_to_rel = apfs_off_to_val_off;
		head = &node_raw->btn_val_free_list;
		node->val_free_list_len += len;
	} else { /* Key area */
		off_to_rel = apfs_off_to_key_off;
		head = &node_raw->btn_key_free_list;
		node->key_free_list_len += len;
	}

	/*
	 * Very small segments are leaked until defragmentation. They still
	 * count toward the free list length set above, because that total is
	 * what apfs_node_total_room() uses to decide if defragmentation would
	 * recover enough space.
	 */
	if (len < sizeof(*new))
		return;

	/* The free list doesn't seem to be kept in any particular order */
	new = (void *)node_raw + off;
	new->off = head->off;
	new->len = cpu_to_le16(len);
	head->off = cpu_to_le16(off_to_rel(node, off));
}
/**
 * apfs_node_free_range - Free space from a node's key or value areas
 * @node: the node
 * @off: offset to free
 * @len: length to free
 *
 * Segments that border the central gap between the key and value areas are
 * merged back into it; all others go to the appropriate free list.
 */
void apfs_node_free_range(struct apfs_node *node, u16 off, u16 len)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *raw = (void *)node->object.data;

	apfs_assert_in_transaction(sb, &raw->btn_o);

	if (off == node->data)
		/* Bottom edge of the value area: enlarge the gap */
		node->data += len;
	else if (off + len == node->free)
		/* Top edge of the key area: enlarge the gap */
		node->free -= len;
	else
		apfs_node_free_list_add(node, off, len);
}
/**
 * apfs_node_free_list_unlink - Unlink an entry from a node's free list
 * @prev: previous entry
 * @curr: entry to unlink
 *
 * The list is singly linked, so it's enough to redirect @prev's offset to
 * whatever @curr was pointing at.
 */
static void apfs_node_free_list_unlink(struct apfs_nloc *prev, struct apfs_nloc *curr)
{
	prev->off = curr->off;
}
/**
 * apfs_node_free_list_alloc - Allocate a free segment from a free list
 * @node: the node
 * @len: length to allocate
 * @value: true to allocate in the value area, false for the key area
 *
 * First-fit walk of the chosen free list. Returns the offset in the node on
 * success, or a negative error code in case of failure.
 */
static int apfs_node_free_list_alloc(struct apfs_node *node, u16 len, bool value)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
	struct apfs_nloc *head, *curr, *prev;
	offcalc rel_to_off;
	int *list_len;
	/* More iterations than entries could ever exist: detects list cycles */
	int bound = sb->s_blocksize;

	apfs_assert_in_transaction(sb, &node_raw->btn_o);

	if (value) { /* Value area */
		rel_to_off = apfs_val_off_to_off;
		head = &node_raw->btn_val_free_list;
		list_len = &node->val_free_list_len;
	} else { /* Key area */
		rel_to_off = apfs_key_off_to_off;
		head = &node_raw->btn_key_free_list;
		list_len = &node->key_free_list_len;
	}

	/*
	 * Quick reject: the counter includes leaked segments that aren't in
	 * the list, so passing this check doesn't guarantee the walk succeeds.
	 */
	if (*list_len < len)
		return -ENOSPC;

	prev = head;
	while (bound--) {
		u16 curr_off = le16_to_cpu(prev->off);
		u16 abs_off = rel_to_off(node, curr_off);
		u16 curr_len;

		if (curr_off == APFS_BTOFF_INVALID)
			return -ENOSPC;
		if (abs_off + sizeof(*curr) > sb->s_blocksize) {
			apfs_err(sb, "nloc out of bounds (%d-%d)", abs_off, (int)sizeof(*curr));
			return -EFSCORRUPTED;
		}
		curr = (void *)node_raw + abs_off;

		curr_len = le16_to_cpu(curr->len);
		if (curr_len >= len) {
			if (abs_off + curr_len > sb->s_blocksize) {
				apfs_err(sb, "entry out of bounds (%d-%d)", abs_off, curr_len);
				return -EFSCORRUPTED;
			}
			*list_len -= curr_len;
			apfs_node_free_list_unlink(prev, curr);
			/* Give back whatever is left over of this segment */
			apfs_node_free_list_add(node, abs_off + len, curr_len - len);
			return abs_off;
		}

		prev = curr;
	}

	/* Don't loop forever if the free list is corrupted and doesn't end */
	apfs_err(sb, "free list never ends");
	return -EFSCORRUPTED;
}
/**
 * apfs_node_alloc_key - Allocate free space for a new key
 * @node: the node to search
 * @len: wanted key length
 *
 * Returns the offset in the node on success, or a negative error code in case
 * of failure.
 */
static int apfs_node_alloc_key(struct apfs_node *node, u16 len)
{
	int off;

	/* No contiguous room left between key and value areas */
	if (node->free + len > node->data)
		return apfs_node_free_list_alloc(node, len, false /* value */);

	off = node->free;
	node->free += len;
	return off;
}
/**
 * apfs_node_alloc_val - Allocate free space for a new value
 * @node: the node to search
 * @len: wanted value length
 *
 * Returns the offset in the node on success, or a negative error code in case
 * of failure.
 */
static int apfs_node_alloc_val(struct apfs_node *node, u16 len)
{
	int off;

	/* No contiguous room left between key and value areas */
	if (node->free + len > node->data)
		return apfs_node_free_list_alloc(node, len, true /* value */);

	off = node->data - len;
	node->data -= len;
	return off;
}
/**
 * apfs_node_replace - Replace a record in a node
 * @query: exact query that found the record
 * @key: new on-disk record key (NULL if unchanged)
 * @key_len: length of @key
 * @val: new on-disk record value (NULL if unchanged)
 * @val_len: length of @val
 *
 * Returns 0 on success, and @query is left pointing to the same record. Returns
 * a negative error code in case of failure, which may be -ENOSPC if the node
 * seems full.
 */
int apfs_node_replace(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
	struct apfs_node *node = query->node;
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
	/* Saved so the in-memory fields can be rolled back on failure */
	int old_free = node->free, old_data = node->data;
	int old_key_free_len = node->key_free_list_len;
	int old_val_free_len = node->val_free_list_len;
	int key_off = 0, val_off = 0, err = 0;

	apfs_assert_in_transaction(sb, &node_raw->btn_o);

	if (key) {
		if (key_len <= query->key_len) {
			/* Shrinking: reuse the old slot and free the tail */
			u16 end = query->key_off + key_len;
			u16 diff = query->key_len - key_len;

			apfs_node_free_range(node, end, diff);
			key_off = query->key_off;
		} else {
			/* Growing: release the old slot, allocate a new one */
			apfs_node_free_range(node, query->key_off, query->key_len);
			key_off = apfs_node_alloc_key(node, key_len);
			if (key_off < 0) {
				err = key_off;
				goto fail;
			}
		}
	}

	if (val) {
		if (val_len <= query->len) {
			u16 end = query->off + val_len;
			u16 diff = query->len - val_len;

			apfs_node_free_range(node, end, diff);
			val_off = query->off;
		} else {
			apfs_node_free_range(node, query->off, query->len);
			val_off = apfs_node_alloc_val(node, val_len);
			if (val_off < 0) {
				err = val_off;
				goto fail;
			}
		}
	}

	/* Only copy the new contents after both allocations have succeeded */
	if (key) {
		query->key_off = key_off;
		query->key_len = key_len;
		memcpy((void *)node_raw + key_off, key, key_len);
	}
	if (val) {
		query->off = val_off;
		query->len = val_len;
		memcpy((void *)node_raw + val_off, val, val_len);
	}

	/* If the key or value were resized, update the table of contents */
	if (!apfs_node_has_fixed_kv_size(node)) {
		struct apfs_kvloc *kvloc;
		int value_end;

		value_end = sb->s_blocksize;
		if (apfs_node_is_root(node))
			value_end -= sizeof(struct apfs_btree_info);

		kvloc = (struct apfs_kvloc *)node_raw->btn_data + query->index;
		kvloc->v.off = cpu_to_le16(value_end - query->off);
		kvloc->v.len = cpu_to_le16(query->len);
		kvloc->k.off = cpu_to_le16(query->key_off - node->key);
		kvloc->k.len = cpu_to_le16(query->key_len);
	}

	apfs_update_node(node);
	return 0;

fail:
	/*
	 * This isn't very tidy, but the transaction may continue on -ENOSPC,
	 * so we must restore the in-memory node fields that may have been
	 * modified. The on-disk free lists can be ignored because the node
	 * should be split soon.
	 */
	node->free = old_free;
	node->data = old_data;
	node->key_free_list_len = old_key_free_len;
	node->val_free_list_len = old_val_free_len;
	return err;
}
/**
 * apfs_node_total_room - Total free space in a node
 * @node: the node
 *
 * Counts both the contiguous gap between the key and value areas and the
 * fragmented space tracked by the two free lists.
 */
static int apfs_node_total_room(struct apfs_node *node)
{
	int gap = node->data - node->free;
	int fragmented = node->key_free_list_len + node->val_free_list_len;

	return gap + fragmented;
}
/**
 * apfs_defragment_node - Make all free space in a node contiguous
 * @node: node to defragment
 *
 * Rewrites every record in place, which empties both free lists and recovers
 * any leaked segments. Returns 0 on success or a negative error code in case
 * of failure.
 */
static int apfs_defragment_node(struct apfs_node *node)
{
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
	struct apfs_node *tmp_node = NULL;
	int record_count, err;

	apfs_assert_in_transaction(sb, &node_raw->btn_o);

	/* Put all records in a temporary in-memory node and deal them out */
	err = apfs_node_temp_dup(node, &tmp_node);
	if (err)
		return err;
	record_count = node->records;
	/* Empty the node; apfs_copy_record_range() rebuilds it from scratch */
	node->records = 0;
	node->key_free_list_len = 0;
	node->val_free_list_len = 0;
	err = apfs_copy_record_range(node, tmp_node, 0, record_count);
	if (err) {
		apfs_err(sb, "record copy failed");
		goto fail;
	}
	apfs_update_node(node);

fail:
	apfs_node_free(tmp_node);
	return err;
}
/**
 * apfs_node_insert - Insert a new record in a node
 * @query: query run to search for the record
 * @key: on-disk record key
 * @key_len: length of @key
 * @val: on-disk record value (NULL for ghost records)
 * @val_len: length of @val (0 for ghost records)
 *
 * The new record is placed right after the one found by @query. On success,
 * returns 0 and sets @query to the new record. In case of failure, returns a
 * negative error code and leaves @query pointing to the same record. The error
 * may be -ENOSPC if the node seems full.
 */
int apfs_node_insert(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
	struct apfs_node *node = query->node;
	struct super_block *sb = node->object.sb;
	struct apfs_btree_node_phys *node_raw = (void *)node->object.data;
	int toc_entry_size;
	int old_free, old_data, old_key_free_len, old_val_free_len;
	int key_off, val_off, err;
	int needed_room;
	bool defragged = false;

	apfs_assert_in_transaction(sb, &node_raw->btn_o);

retry:
	needed_room = key_len + val_len;
	err = 0;

	if (apfs_node_has_fixed_kv_size(node))
		toc_entry_size = sizeof(struct apfs_kvoff);
	else
		toc_entry_size = sizeof(struct apfs_kvloc);

	/* Expand the table of contents if necessary */
	if (sizeof(*node_raw) +
	    (node->records + 1) * toc_entry_size > node->key) {
		int new_key_base = node->key;
		int new_free_base = node->free;
		int inc;

		/* The toc always grows by a fixed number of entries */
		inc = APFS_BTREE_TOC_ENTRY_INCREMENT * toc_entry_size;

		new_key_base += inc;
		new_free_base += inc;
		if (new_free_base > node->data) {
			/* Count the toc expansion toward the room we need */
			needed_room += inc;
			err = -ENOSPC;
			goto fail_defrag;
		}
		/* Shift the whole key area up to make room for the new toc */
		memmove((void *)node_raw + new_key_base,
			(void *)node_raw + node->key, node->free - node->key);

		node->key = new_key_base;
		node->free = new_free_base;
		query->key_off += inc;
	}

	/* Saved so the in-memory fields can be rolled back on failure */
	old_free = node->free;
	old_data = node->data;
	old_key_free_len = node->key_free_list_len;
	old_val_free_len = node->val_free_list_len;

	key_off = apfs_node_alloc_key(node, key_len);
	if (key_off < 0) {
		err = key_off;
		goto fail_update;
	}

	if (val) {
		val_off = apfs_node_alloc_val(node, val_len);
		if (val_off < 0) {
			err = val_off;
			goto fail_update;
		}
	}

	query->key_len = key_len;
	query->key_off = key_off;
	memcpy((void *)node_raw + key_off, key, key_len);

	query->len = val_len;
	if (val) {
		query->off = val_off;
		memcpy((void *)node_raw + val_off, val, val_len);
	}

	query->index++; /* The query returned the record right before @key */

	/* Add the new entry to the table of contents */
	apfs_create_toc_entry(query);

fail_update:
	/*
	 * We must update the on-disk node even on failure, because we did
	 * expand the table of contents.
	 */
	if (err) {
		node->free = old_free;
		node->data = old_data;
		node->key_free_list_len = old_key_free_len;
		node->val_free_list_len = old_val_free_len;
	}
	apfs_update_node(node);
fail_defrag:
	if (err == -ENOSPC && !defragged && needed_room <= apfs_node_total_room(node)) {
		/*
		 * If we know we should have enough room, defragment the node
		 * and retry the insertion to avoid a split. This is especially
		 * important for the ip free queue, because it has a very low
		 * hard limit in the number of nodes allowed.
		 */
		err = apfs_defragment_node(node);
		if (err) {
			apfs_err(sb, "failed to defragment node");
			return err;
		}
		defragged = true;
		goto retry;
	}
	if (err == -ENOSPC && defragged) {
		/* The retry can only fail if the room counters were wrong */
		apfs_err(sb, "node reports incorrect free space");
		err = -EFSCORRUPTED;
	}
	return err;
}
/**
 * apfs_create_single_rec_node - Creates a new node with a single record
 * @query: query run to search for the record
 * @key: on-disk record key
 * @key_len: length of @key
 * @val: on-disk record value
 * @val_len: length of @val
 *
 * The new node is placed right after the one found by @query, which must have
 * a single record. On success, returns 0 and sets @query to the new record;
 * returns a negative error code in case of failure.
 */
int apfs_create_single_rec_node(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
	struct apfs_node *prev_node = query->node;
	struct super_block *sb = prev_node->object.sb;
	struct apfs_node *new_node = NULL;
	struct apfs_btree_node_phys *prev_raw = NULL;
	struct apfs_btree_node_phys *new_raw = NULL;
	int err;

	ASSERT(query->parent);
	ASSERT(prev_node->records == 1);
	ASSERT(val && val_len);

	/* This function should only be needed for huge catalog records */
	if (prev_node->tree_type != APFS_OBJECT_TYPE_FSTREE) {
		apfs_err(sb, "huge node records in the wrong tree");
		return -EFSCORRUPTED;
	}

	apfs_btree_change_node_count(query->parent, 1 /* change */);

	new_node = apfs_create_node(sb, apfs_query_storage(query));
	if (IS_ERR(new_node)) {
		apfs_err(sb, "node creation failed");
		return PTR_ERR(new_node);
	}
	new_node->tree_type = prev_node->tree_type;
	new_node->flags = prev_node->flags;
	new_node->records = 0;
	new_node->key_free_list_len = 0;
	new_node->val_free_list_len = 0;
	/* An empty toc: key and value areas start right after the header */
	new_node->key = new_node->free = sizeof(*new_raw);
	new_node->data = sb->s_blocksize; /* Nonroot */
	prev_raw = (void *)prev_node->object.data;
	new_raw = (void *)new_node->object.data;
	apfs_assert_in_transaction(sb, &new_raw->btn_o);
	/* The new node is a sibling, so it sits at the same level */
	new_raw->btn_level = prev_raw->btn_level;
	apfs_update_node(new_node);

	/* Hand @new_node to the query; the previous node is no longer used */
	prev_node = NULL;
	prev_raw = NULL;
	apfs_node_free(query->node);
	query->node = new_node;

	/* Index -1 so the insertion lands at position 0 of the empty node */
	query->index = -1;
	err = apfs_node_insert(query, key, key_len, val, val_len);
	if (err) {
		apfs_err(sb, "node record insertion failed");
		return err;
	}

	err = apfs_attach_child(query->parent, new_node);
	if (err)
		apfs_err(sb, "child attachment failed");
	return err;
}