Files
Ernesto A. Fernández 6fcc69c05d Don't refer to record values as "data"
The paper I followed in the early days of the driver (before the
official reference was published) referred to record values as "data",
so some of my oldest code uses that name as well. This is inconsistent,
and not very clear because the word "data" shows up everywhere. Reword
all instances of this I can identify.

Signed-off-by: Ernesto A. Fernández <ernesto@corellium.com>
2024-12-04 21:04:23 -03:00

1175 lines
32 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
*/
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include "apfs.h"
struct apfs_node *apfs_query_root(const struct apfs_query *query)
{
while (query->parent)
query = query->parent;
ASSERT(apfs_node_is_root(query->node));
return query->node;
}
static u64 apfs_catalog_base_oid(struct apfs_query *query)
{
struct apfs_query *root_query = NULL;
root_query = query;
while (root_query->parent)
root_query = root_query->parent;
return root_query->node->object.oid;
}
/**
* apfs_child_from_query - Read the child id found by a successful nonleaf query
* @query: the query that found the record
* @child: Return parameter. The child id found.
*
* Reads the child id in the nonleaf node record into @child and performs a
* basic sanity check as a protection against crafted filesystems. Returns 0
* on success or -EFSCORRUPTED otherwise.
*/
static int apfs_child_from_query(struct apfs_query *query, u64 *child)
{
struct super_block *sb = query->node->object.sb;
char *raw = query->node->object.data;
if (query->flags & APFS_QUERY_CAT && apfs_is_sealed(sb)) {
struct apfs_btn_index_node_val *index_val = NULL;
if (query->len != sizeof(*index_val)) {
apfs_err(sb, "bad sealed index value length (%d)", query->len);
return -EFSCORRUPTED;
}
index_val = (struct apfs_btn_index_node_val *)(raw + query->off);
*child = le64_to_cpu(index_val->binv_child_oid) + apfs_catalog_base_oid(query);
} else {
if (query->len != 8) { /* The value on a nonleaf node is the child id */
apfs_err(sb, "bad index value length (%d)", query->len);
return -EFSCORRUPTED;
}
*child = le64_to_cpup((__le64 *)(raw + query->off));
}
return 0;
}
/**
* apfs_omap_cache_lookup - Look for an oid in an omap's cache
* @omap: the object map
* @oid: object id to look up
* @bno: on return, the block number for the oid
*
* Returns 0 on success, or -1 if this mapping is not cached.
*/
static int apfs_omap_cache_lookup(struct apfs_omap *omap, u64 oid, u64 *bno)
{
struct apfs_omap_cache *cache = &omap->omap_cache;
struct apfs_omap_rec *record = NULL;
int slot;
int ret = -1;
if (cache->disabled)
return -1;
/* Uninitialized cache records use OID 0, so check this just in case */
if (!oid)
return -1;
slot = oid & APFS_OMAP_CACHE_SLOT_MASK;
record = &cache->recs[slot];
spin_lock(&cache->lock);
if (record->oid == oid) {
*bno = record->bno;
ret = 0;
}
spin_unlock(&cache->lock);
return ret;
}
/**
* apfs_omap_cache_save - Save a record in an omap's cache
* @omap: the object map
* @oid: object id of the record
* @bno: block number for the oid
*/
static void apfs_omap_cache_save(struct apfs_omap *omap, u64 oid, u64 bno)
{
struct apfs_omap_cache *cache = &omap->omap_cache;
struct apfs_omap_rec *record = NULL;
int slot;
if (cache->disabled)
return;
slot = oid & APFS_OMAP_CACHE_SLOT_MASK;
record = &cache->recs[slot];
spin_lock(&cache->lock);
record->oid = oid;
record->bno = bno;
spin_unlock(&cache->lock);
}
/**
* apfs_omap_cache_delete - Try to delete a record from an omap's cache
* @omap: the object map
* @oid: object id of the record
*/
static void apfs_omap_cache_delete(struct apfs_omap *omap, u64 oid)
{
struct apfs_omap_cache *cache = &omap->omap_cache;
struct apfs_omap_rec *record = NULL;
int slot;
if (cache->disabled)
return;
slot = oid & APFS_OMAP_CACHE_SLOT_MASK;
record = &cache->recs[slot];
spin_lock(&cache->lock);
if (record->oid == oid) {
record->oid = 0;
record->bno = 0;
}
spin_unlock(&cache->lock);
}
/**
* apfs_mounted_xid - Returns the mounted xid for this superblock
* @sb: superblock structure
*
* This function is needed instead of APFS_NXI(@sb)->nx_xid in situations where
* we might be working with a snapshot. Snapshots are read-only and should
* mostly ignore xids, so this only appears to matter for omap lookups.
*/
static inline u64 apfs_mounted_xid(struct super_block *sb)
{
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
return sbi->s_snap_xid ? sbi->s_snap_xid : nxi->nx_xid;
}
/**
* apfs_xid_in_snapshot - Check if an xid is part of a snapshot
* @omap: the object map
* @xid: the xid to check
*/
static inline bool apfs_xid_in_snapshot(struct apfs_omap *omap, u64 xid)
{
return xid <= omap->omap_latest_snap;
}
/**
* apfs_omap_lookup_block_with_xid - Find bno of a virtual object from oid/xid
* @sb: filesystem superblock
* @omap: object map to be searched
* @id: id of the node
* @xid: transaction id
* @block: on return, the found block number
* @write: get write access to the object?
*
* Searches @omap for the most recent matching object with a transaction id
* below @xid. Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_omap_lookup_block_with_xid(struct super_block *sb, struct apfs_omap *omap, u64 id, u64 xid, u64 *block, bool write)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_query *query;
struct apfs_omap_map map = {0};
int ret = 0;
if (!write) {
if (!apfs_omap_cache_lookup(omap, id, block))
return 0;
}
query = apfs_alloc_query(omap->omap_root, NULL /* parent */);
if (!query)
return -ENOMEM;
apfs_init_omap_key(id, xid, &query->key);
query->flags |= APFS_QUERY_OMAP;
ret = apfs_btree_query(sb, &query);
if (ret) {
if (ret != -ENODATA)
apfs_err(sb, "query failed for oid 0x%llx, xid 0x%llx", id, xid);
goto fail;
}
ret = apfs_omap_map_from_query(query, &map);
if (ret) {
apfs_alert(sb, "bad object map leaf block: 0x%llx",
query->node->object.block_nr);
goto fail;
}
*block = map.bno;
if (write) {
struct apfs_omap_key key;
struct apfs_omap_val val;
struct buffer_head *new_bh;
bool preserve;
preserve = apfs_xid_in_snapshot(omap, map.xid);
new_bh = apfs_read_object_block(sb, *block, write, preserve);
if (IS_ERR(new_bh)) {
apfs_err(sb, "CoW failed for oid 0x%llx, xid 0x%llx", id, xid);
ret = PTR_ERR(new_bh);
goto fail;
}
key.ok_oid = cpu_to_le64(id);
key.ok_xid = cpu_to_le64(nxi->nx_xid);
val.ov_flags = cpu_to_le32(map.flags);
val.ov_size = cpu_to_le32(sb->s_blocksize);
val.ov_paddr = cpu_to_le64(new_bh->b_blocknr);
if (preserve)
ret = apfs_btree_insert(query, &key, sizeof(key), &val, sizeof(val));
else
ret = apfs_btree_replace(query, &key, sizeof(key), &val, sizeof(val));
if (ret)
apfs_err(sb, "CoW omap update failed (oid 0x%llx, xid 0x%llx)", id, xid);
*block = new_bh->b_blocknr;
brelse(new_bh);
}
apfs_omap_cache_save(omap, id, *block);
fail:
apfs_free_query(query);
return ret;
}
/**
* apfs_omap_lookup_block - Find the block number of a b-tree node from its id
* @sb: filesystem superblock
* @omap: object map to be searched
* @id: id of the node
* @block: on return, the found block number
* @write: get write access to the object?
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_omap_lookup_block(struct super_block *sb, struct apfs_omap *omap, u64 id, u64 *block, bool write)
{
return apfs_omap_lookup_block_with_xid(sb, omap, id, apfs_mounted_xid(sb), block, write);
}
/**
* apfs_omap_lookup_newest_block - Find newest bno for a virtual object's oid
* @sb: filesystem superblock
* @omap: object map to be searched
* @id: id of the object
* @block: on return, the found block number
* @write: get write access to the object?
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_omap_lookup_newest_block(struct super_block *sb, struct apfs_omap *omap, u64 id, u64 *block, bool write)
{
return apfs_omap_lookup_block_with_xid(sb, omap, id, -1, block, write);
}
/**
* apfs_create_omap_rec - Create a record in the volume's omap tree
* @sb: filesystem superblock
* @oid: object id
* @bno: block number
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_create_omap_rec(struct super_block *sb, u64 oid, u64 bno)
{
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_omap *omap = sbi->s_omap;
struct apfs_query *query;
struct apfs_omap_key raw_key;
struct apfs_omap_val raw_val;
int ret;
query = apfs_alloc_query(omap->omap_root, NULL /* parent */);
if (!query)
return -ENOMEM;
apfs_init_omap_key(oid, nxi->nx_xid, &query->key);
query->flags |= APFS_QUERY_OMAP;
ret = apfs_btree_query(sb, &query);
if (ret && ret != -ENODATA) {
apfs_err(sb, "query failed for oid 0x%llx, bno 0x%llx", oid, bno);
goto fail;
}
raw_key.ok_oid = cpu_to_le64(oid);
raw_key.ok_xid = cpu_to_le64(nxi->nx_xid);
raw_val.ov_flags = 0;
raw_val.ov_size = cpu_to_le32(sb->s_blocksize);
raw_val.ov_paddr = cpu_to_le64(bno);
ret = apfs_btree_insert(query, &raw_key, sizeof(raw_key),
&raw_val, sizeof(raw_val));
if (ret) {
apfs_err(sb, "insertion failed for oid 0x%llx, bno 0x%llx", oid, bno);
goto fail;
}
apfs_omap_cache_save(omap, oid, bno);
fail:
apfs_free_query(query);
return ret;
}
/**
* apfs_delete_omap_rec - Delete an existing record from the volume's omap tree
* @sb: filesystem superblock
* @oid: object id for the record
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_delete_omap_rec(struct super_block *sb, u64 oid)
{
struct apfs_sb_info *sbi = APFS_SB(sb);
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_omap *omap = sbi->s_omap;
struct apfs_query *query;
int ret;
query = apfs_alloc_query(omap->omap_root, NULL /* parent */);
if (!query)
return -ENOMEM;
apfs_init_omap_key(oid, nxi->nx_xid, &query->key);
query->flags |= APFS_QUERY_OMAP;
ret = apfs_btree_query(sb, &query);
if (ret == -ENODATA) {
apfs_err(sb, "nonexistent record (oid 0x%llx)", oid);
ret = -EFSCORRUPTED;
goto fail;
}
if (ret) {
apfs_err(sb, "query failed (oid 0x%llx)", oid);
goto fail;
}
ret = apfs_btree_remove(query);
if (ret) {
apfs_err(sb, "removal failed (oid 0x%llx)", oid);
goto fail;
}
apfs_omap_cache_delete(omap, oid);
fail:
apfs_free_query(query);
return ret;
}
/**
* apfs_alloc_query - Allocates a query structure
* @node: node to be searched
* @parent: query for the parent node
*
* Callers other than apfs_btree_query() should set @parent to NULL, and @node
* to the root of the b-tree. They should also initialize most of the query
* fields themselves; when @parent is not NULL the query will inherit them.
*
* Returns the allocated query, or NULL in case of failure.
*/
struct apfs_query *apfs_alloc_query(struct apfs_node *node,
struct apfs_query *parent)
{
struct apfs_query *query;
query = kzalloc(sizeof(*query), GFP_KERNEL);
if (!query)
return NULL;
/* To be released by free_query. */
query->node = node;
if (parent) {
query->key = parent->key;
query->flags = parent->flags & ~(APFS_QUERY_DONE | APFS_QUERY_NEXT);
query->parent = parent;
query->depth = parent->depth + 1;
}
/*
* We start the search with the last record and go backwards, but
* some queries later use the PREV flag later to list them in order.
*/
if (query->flags & APFS_QUERY_PREV)
query->index = -1;
else
query->index = node->records;
return query;
}
/**
* apfs_free_query - Free a query structure
* @query: query to free
*
* Also frees the ancestor queries, if they are kept.
*/
void apfs_free_query(struct apfs_query *query)
{
while (query) {
struct apfs_query *parent = query->parent;
/* The caller decides whether to free the root node */
if (query->depth != 0)
apfs_node_free(query->node);
kfree(query);
query = parent;
}
}
/**
* apfs_query_set_before_first - Set the query to point before the first record
* @sb: superblock structure
* @query: the query to set
*
* Queries set in this way are used to insert a record before the first one.
* Only the leaf gets set to the -1 entry; queries for other levels must be set
* to 0, since the first entry in each index node will need to be modified.
*
* Returns 0 on success, or a negative error code in case of failure.
*/
static int apfs_query_set_before_first(struct super_block *sb, struct apfs_query **query)
{
struct apfs_node *node = NULL;
struct apfs_query *parent = NULL;
u64 child_id;
u32 storage = apfs_query_storage(*query);
int err;
while ((*query)->depth < 12) {
if (apfs_node_is_leaf((*query)->node)) {
(*query)->index = -1;
return 0;
}
apfs_node_query_first(*query);
err = apfs_child_from_query(*query, &child_id);
if (err) {
apfs_alert(sb, "bad index block: 0x%llx",
(*query)->node->object.block_nr);
return err;
}
/* Now go a level deeper */
node = apfs_read_node(sb, child_id, storage, false /* write */);
if (IS_ERR(node)) {
apfs_err(sb, "failed to read child 0x%llx of node 0x%llx", child_id, (*query)->node->object.oid);
return PTR_ERR(node);
}
parent = *query;
*query = apfs_alloc_query(node, parent);
if (!*query) {
apfs_node_free(node);
*query = parent;
return -ENOMEM;
}
node = NULL;
}
apfs_err(sb, "btree is too high");
return -EFSCORRUPTED;
}
/**
* apfs_btree_query - Execute a query on a b-tree
* @sb: filesystem superblock
* @query: the query to execute
*
* Searches the b-tree starting at @query->index in @query->node, looking for
* the record corresponding to @query->key.
*
* Returns 0 in case of success and sets the @query->len, @query->off and
* @query->index fields to the results of the query. @query->node will now
* point to the leaf node holding the record.
*
* In case of failure returns an appropriate error code.
*/
int apfs_btree_query(struct super_block *sb, struct apfs_query **query)
{
struct apfs_node *node;
struct apfs_query *parent;
u64 child_id;
u32 storage = apfs_query_storage(*query);
int err;
next_node:
if ((*query)->depth >= 12) {
/*
* We need a maximum depth for the tree so we can't loop
* forever if the filesystem is damaged. 12 should be more
* than enough to map every block.
*/
apfs_err(sb, "btree is too high");
err = -EFSCORRUPTED;
goto fail;
}
err = apfs_node_query(sb, *query);
if (err == -ENODATA && !(*query)->parent && (*query)->index == -1) {
/*
* We may be trying to insert a record before all others: don't
* let the query give up at the root node.
*/
err = apfs_query_set_before_first(sb, query);
if (err) {
apfs_err(sb, "failed to set before the first record");
goto fail;
}
err = -ENODATA;
goto fail;
} else if (err == -EAGAIN) {
if (!(*query)->parent) {
/* We are at the root of the tree */
err = -ENODATA;
goto fail;
}
/* Move back up one level and continue the query */
parent = (*query)->parent;
(*query)->parent = NULL; /* Don't free the parent */
apfs_free_query(*query);
*query = parent;
goto next_node;
} else if (err) {
goto fail;
}
if (apfs_node_is_leaf((*query)->node)) /* All done */
return 0;
err = apfs_child_from_query(*query, &child_id);
if (err) {
apfs_alert(sb, "bad index block: 0x%llx",
(*query)->node->object.block_nr);
goto fail;
}
/* Now go a level deeper and search the child */
node = apfs_read_node(sb, child_id, storage, false /* write */);
if (IS_ERR(node)) {
apfs_err(sb, "failed to read node 0x%llx", child_id);
err = PTR_ERR(node);
goto fail;
}
if (node->object.oid != child_id)
apfs_debug(sb, "corrupt b-tree");
/*
* Remember the parent node and index in case the search needs
* to be continued later.
*/
parent = *query;
*query = apfs_alloc_query(node, parent);
if (!*query) {
apfs_node_free(node);
*query = parent;
err = -ENOMEM;
goto fail;
}
node = NULL;
goto next_node;
fail:
/* Don't leave stale record info here or some callers will use it */
(*query)->key_len = (*query)->len = 0;
return err;
}
static int __apfs_btree_replace(struct apfs_query *query, void *key, int key_len, void *val, int val_len);
/**
* apfs_query_join_transaction - Add the found node to the current transaction
* @query: query that found the node
*/
int apfs_query_join_transaction(struct apfs_query *query)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
u64 oid = node->object.oid;
u32 storage = apfs_query_storage(query);
struct apfs_obj_phys *raw = NULL;
/*
* Ephemeral objects are checkpoint data, and all of their xids get
* updated on commit. There is no real need to do it here as well, but
* it's better for consistency with the other object types.
*/
if (storage == APFS_OBJ_EPHEMERAL) {
ASSERT(node->object.ephemeral);
raw = (void *)node->object.data;
raw->o_xid = cpu_to_le64(APFS_NXI(sb)->nx_xid);
return 0;
}
if (buffer_trans(node->object.o_bh)) /* Already in the transaction */
return 0;
/* Root nodes should join the transaction before the query is created */
ASSERT(!apfs_node_is_root(node));
node = apfs_read_node(sb, oid, storage, true /* write */);
if (IS_ERR(node)) {
apfs_err(sb, "Cow failed for node 0x%llx", oid);
return PTR_ERR(node);
}
apfs_node_free(query->node);
query->node = node;
if (storage == APFS_OBJ_PHYSICAL && query->parent) {
__le64 bno = cpu_to_le64(node->object.block_nr);
/* The parent node needs to report the new location */
return __apfs_btree_replace(query->parent, NULL /* key */, 0 /* key_len */, &bno, sizeof(bno));
}
return 0;
}
/**
* apfs_btree_change_rec_count - Update the b-tree info before a record change
* @query: query used to insert/remove/replace the leaf record
* @change: change in the record count
* @key_len: length of the new leaf record key (0 if removed or unchanged)
* @val_len: length of the new leaf record value (0 if removed or unchanged)
*
* Don't call this function if @query->parent was reset to NULL, or if the same
* is true of any of its ancestor queries.
*/
static void apfs_btree_change_rec_count(struct apfs_query *query, int change,
int key_len, int val_len)
{
struct super_block *sb;
struct apfs_node *root;
struct apfs_btree_node_phys *root_raw;
struct apfs_btree_info *info;
if (change == -1)
ASSERT(!key_len && !val_len);
ASSERT(apfs_node_is_leaf(query->node));
root = apfs_query_root(query);
ASSERT(apfs_node_is_root(root));
sb = root->object.sb;
root_raw = (void *)root->object.data;
info = (void *)root_raw + sb->s_blocksize - sizeof(*info);
apfs_assert_in_transaction(sb, &root_raw->btn_o);
if (key_len > le32_to_cpu(info->bt_longest_key))
info->bt_longest_key = cpu_to_le32(key_len);
if (val_len > le32_to_cpu(info->bt_longest_val))
info->bt_longest_val = cpu_to_le32(val_len);
le64_add_cpu(&info->bt_key_count, change);
}
/**
* apfs_btree_change_node_count - Change the node count for a b-tree
* @query: query used to remove/create the node
* @change: change in the node count
*
* Also changes the node count in the volume superblock. Don't call this
* function if @query->parent was reset to NULL, or if the same is true of
* any of its ancestor queries.
*/
void apfs_btree_change_node_count(struct apfs_query *query, int change)
{
struct super_block *sb;
struct apfs_node *root;
struct apfs_btree_node_phys *root_raw;
struct apfs_btree_info *info;
root = apfs_query_root(query);
ASSERT(apfs_node_is_root(root));
sb = root->object.sb;
root_raw = (void *)root->object.data;
info = (void *)root_raw + sb->s_blocksize - sizeof(*info);
apfs_assert_in_transaction(sb, &root_raw->btn_o);
le64_add_cpu(&info->bt_node_count, change);
}
/**
* apfs_query_refresh - Recreate a catalog query invalidated by node splits
* @old_query: query chain to refresh
* @root: root node of the query chain
* @nodata: is the query expected to find nothing?
*
* On success, @old_query is left pointing to the same leaf record, but with
* valid ancestor queries as well. Returns a negative error code in case of
* failure, or 0 on success.
*/
static int apfs_query_refresh(struct apfs_query *old_query, struct apfs_node *root, bool nodata)
{
struct super_block *sb = NULL;
struct apfs_query *new_query = NULL;
int err = 0;
sb = root->object.sb;
if (!apfs_node_is_leaf(old_query->node)) {
apfs_alert(sb, "attempting refresh of non-leaf query");
return -EFSCORRUPTED;
}
if (apfs_node_is_root(old_query->node)) {
apfs_alert(sb, "attempting refresh of root query");
return -EFSCORRUPTED;
}
new_query = apfs_alloc_query(root, NULL /* parent */);
if (!new_query)
return -ENOMEM;
new_query->key = old_query->key;
new_query->flags = old_query->flags & ~(APFS_QUERY_DONE | APFS_QUERY_NEXT);
err = apfs_btree_query(sb, &new_query);
if (!nodata && err == -ENODATA) {
apfs_err(sb, "record should exist");
err = -EFSCORRUPTED;
goto fail;
}
if (err && err != -ENODATA) {
apfs_err(sb, "failed to rerun");
goto fail;
}
err = 0;
/* Replace the parent of the original query with the new valid one */
apfs_free_query(old_query->parent);
old_query->parent = new_query->parent;
new_query->parent = NULL;
/*
* The records may have moved around so update this too. TODO: rework
* the query struct so this stuff is not needed.
*/
ASSERT(old_query->node->object.oid == new_query->node->object.oid);
old_query->index = new_query->index;
old_query->key_off = new_query->key_off;
old_query->key_len = new_query->key_len;
old_query->off = new_query->off;
old_query->len = new_query->len;
old_query->depth = new_query->depth;
fail:
apfs_free_query(new_query);
return err;
}
/**
* __apfs_btree_insert - Insert a new record into a b-tree (at any level)
* @query: query run to search for the record
* @key: on-disk record key
* @key_len: length of @key
* @val: on-disk record value (NULL for ghost records)
* @val_len: length of @val (0 for ghost records)
*
* The new record is placed right after the one found by @query. On success,
* returns 0 and sets @query to the new record; returns a negative error code
* in case of failure, which may be -EAGAIN if a split happened and the caller
* must retry.
*/
int __apfs_btree_insert(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw;
int needed_room;
int err;
apfs_assert_query_is_valid(query);
err = apfs_query_join_transaction(query);
if (err) {
apfs_err(sb, "query join failed");
return err;
}
node = query->node;
node_raw = (void *)node->object.data;
apfs_assert_in_transaction(node->object.sb, &node_raw->btn_o);
needed_room = key_len + val_len;
if (!apfs_node_has_room(node, needed_room, false /* replace */)) {
if (node->records == 1) {
/* The new record just won't fit in the node */
err = apfs_create_single_rec_node(query, key, key_len, val, val_len);
if (err && err != -EAGAIN)
apfs_err(sb, "failed to create single-record node");
return err;
}
err = apfs_node_split(query);
if (err && err != -EAGAIN) {
apfs_err(sb, "node split failed");
return err;
}
return -EAGAIN;
}
apfs_assert_query_is_valid(query);
if (query->parent && query->index == -1) {
/* We are about to insert a record before all others */
err = __apfs_btree_replace(query->parent, key, key_len, NULL /* val */, 0 /* val_len */);
if (err) {
if (err != -EAGAIN)
apfs_err(sb, "parent update failed");
return err;
}
}
apfs_assert_query_is_valid(query);
err = apfs_node_insert(query, key, key_len, val, val_len);
if (err) {
apfs_err(sb, "node record insertion failed");
return err;
}
return 0;
}
/**
* apfs_btree_insert - Insert a new record into a b-tree leaf
* @query: query run to search for the record
* @key: on-disk record key
* @key_len: length of @key
* @val: on-disk record value (NULL for ghost records)
* @val_len: length of @val (0 for ghost records)
*
* The new record is placed right after the one found by @query. On success,
* returns 0 and sets @query to the new record; returns a negative error code
* in case of failure.
*/
int apfs_btree_insert(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct super_block *sb = NULL;
struct apfs_node *root = NULL, *leaf = NULL;
int err;
root = apfs_query_root(query);
ASSERT(apfs_node_is_root(root));
leaf = query->node;
ASSERT(apfs_node_is_leaf(leaf));
sb = root->object.sb;
while (true) {
err = __apfs_btree_insert(query, key, key_len, val, val_len);
if (err != -EAGAIN) {
if (err)
return err;
break;
}
err = apfs_query_refresh(query, root, true /* nodata */);
if (err) {
apfs_err(sb, "query refresh failed");
return err;
}
}
apfs_assert_query_is_valid(query);
apfs_btree_change_rec_count(query, 1 /* change */, key_len, val_len);
return 0;
}
/**
* __apfs_btree_remove - Remove a record from a b-tree (at any level)
* @query: exact query that found the record
*
* Returns 0 on success, or a negative error code in case of failure, which may
* be -EAGAIN if a split happened and the caller must retry.
*/
static int __apfs_btree_remove(struct apfs_query *query)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw;
int later_entries = node->records - query->index - 1;
int err;
apfs_assert_query_is_valid(query);
err = apfs_query_join_transaction(query);
if (err) {
apfs_err(sb, "query join failed");
return err;
}
node = query->node;
node_raw = (void *)query->node->object.data;
apfs_assert_in_transaction(node->object.sb, &node_raw->btn_o);
if (query->parent && node->records == 1) {
/* Just get rid of the node */
err = __apfs_btree_remove(query->parent);
if (err == -EAGAIN)
return -EAGAIN;
if (err) {
apfs_err(sb, "parent index removal failed");
return err;
}
apfs_btree_change_node_count(query, -1 /* change */);
err = apfs_delete_node(node, query->flags & APFS_QUERY_TREE_MASK);
if (err) {
apfs_err(sb, "node deletion failed");
return err;
}
return 0;
}
apfs_assert_query_is_valid(query);
/* The first key in a node must match the parent record's */
if (query->parent && query->index == 0) {
int first_key_len, first_key_off;
void *key;
first_key_len = apfs_node_locate_key(node, 1, &first_key_off);
if (!first_key_len)
return -EFSCORRUPTED;
key = (void *)node_raw + first_key_off;
err = __apfs_btree_replace(query->parent, key, first_key_len, NULL /* val */, 0 /* val_len */);
if (err == -EAGAIN)
return -EAGAIN;
if (err) {
apfs_err(sb, "parent update failed");
return err;
}
}
apfs_assert_query_is_valid(query);
/* Remove the entry from the table of contents */
if (apfs_node_has_fixed_kv_size(node)) {
struct apfs_kvoff *toc_entry;
toc_entry = (struct apfs_kvoff *)node_raw->btn_data +
query->index;
memmove(toc_entry, toc_entry + 1,
later_entries * sizeof(*toc_entry));
} else {
struct apfs_kvloc *toc_entry;
toc_entry = (struct apfs_kvloc *)node_raw->btn_data +
query->index;
memmove(toc_entry, toc_entry + 1,
later_entries * sizeof(*toc_entry));
}
apfs_node_free_range(node, query->key_off, query->key_len);
apfs_node_free_range(node, query->off, query->len);
--node->records;
if (node->records == 0) {
/* All descendants are gone, root is the whole tree */
node_raw->btn_level = 0;
node->flags |= APFS_BTNODE_LEAF;
}
apfs_update_node(node);
--query->index;
return 0;
}
/**
* apfs_btree_remove - Remove a record from a b-tree leaf
* @query: exact query that found the record
*
* Returns 0 on success, or a negative error code in case of failure.
*/
int apfs_btree_remove(struct apfs_query *query)
{
struct super_block *sb = NULL;
struct apfs_node *root = NULL, *leaf = NULL;
int err;
root = apfs_query_root(query);
ASSERT(apfs_node_is_root(root));
leaf = query->node;
ASSERT(apfs_node_is_leaf(leaf));
sb = root->object.sb;
while (true) {
err = __apfs_btree_remove(query);
if (err != -EAGAIN) {
if (err)
return err;
break;
}
err = apfs_query_refresh(query, root, false /* nodata */);
if (err) {
apfs_err(sb, "query refresh failed");
return err;
}
}
apfs_assert_query_is_valid(query);
apfs_btree_change_rec_count(query, -1 /* change */, 0 /* key_len */, 0 /* val_len */);
return 0;
}
/**
* __apfs_btree_replace - Replace a record in a b-tree (at any level)
* @query: exact query that found the record
* @key: new on-disk record key (NULL if unchanged)
* @key_len: length of @key
* @val: new on-disk record value (NULL if unchanged)
* @val_len: length of @val
*
* It's important that the order of the records is not changed by the new @key.
* This function is not needed to replace an old value with a new one of the
* same length: it can just be overwritten in place.
*
* Returns 0 on success, and @query is left pointing to the same record; returns
* a negative error code in case of failure, which may be -EAGAIN if a split
* happened and the caller must retry.
*/
static int __apfs_btree_replace(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct apfs_node *node = query->node;
struct super_block *sb = node->object.sb;
struct apfs_btree_node_phys *node_raw;
int needed_room;
int err;
ASSERT(key || val);
apfs_assert_query_is_valid(query);
err = apfs_query_join_transaction(query);
if (err) {
apfs_err(sb, "query join failed");
return err;
}
node = query->node;
node_raw = (void *)node->object.data;
apfs_assert_in_transaction(sb, &node_raw->btn_o);
needed_room = key_len + val_len;
/* We can reuse the space of the replaced key/value */
if (key)
needed_room -= query->key_len;
if (val)
needed_room -= query->len;
if (!apfs_node_has_room(node, needed_room, true /* replace */)) {
if (node->records == 1) {
apfs_alert(sb, "no room in empty node?");
return -EFSCORRUPTED;
}
err = apfs_node_split(query);
if (err && err != -EAGAIN) {
apfs_err(sb, "node split failed");
return err;
}
return -EAGAIN;
}
apfs_assert_query_is_valid(query);
/* The first key in a node must match the parent record's */
if (key && query->parent && query->index == 0) {
err = __apfs_btree_replace(query->parent, key, key_len, NULL /* val */, 0 /* val_len */);
if (err) {
if (err != -EAGAIN)
apfs_err(sb, "parent update failed");
return err;
}
}
apfs_assert_query_is_valid(query);
err = apfs_node_replace(query, key, key_len, val, val_len);
if (err) {
apfs_err(sb, "node record replacement failed");
return err;
}
return 0;
}
/**
* apfs_btree_replace - Replace a record in a b-tree leaf
* @query: exact query that found the record
* @key: new on-disk record key (NULL if unchanged)
* @key_len: length of @key
* @val: new on-disk record value (NULL if unchanged)
* @val_len: length of @val
*
* It's important that the order of the records is not changed by the new @key.
* This function is not needed to replace an old value with a new one of the
* same length: it can just be overwritten in place.
*
* Returns 0 on success, and @query is left pointing to the same record; returns
* a negative error code in case of failure.
*/
int apfs_btree_replace(struct apfs_query *query, void *key, int key_len, void *val, int val_len)
{
struct super_block *sb = NULL;
struct apfs_node *root = NULL, *leaf = NULL;
int err;
root = apfs_query_root(query);
ASSERT(apfs_node_is_root(root));
leaf = query->node;
ASSERT(apfs_node_is_leaf(leaf));
sb = root->object.sb;
while (true) {
err = __apfs_btree_replace(query, key, key_len, val, val_len);
if (err != -EAGAIN) {
if (err)
return err;
break;
}
err = apfs_query_refresh(query, root, false /* nodata */);
if (err) {
apfs_err(sb, "query refresh failed");
return err;
}
}
apfs_assert_query_is_valid(query);
apfs_btree_change_rec_count(query, 0 /* change */, key_len, val_len);
return 0;
}
/**
* apfs_query_direct_forward - Set a query to start listing records forwards
* @query: a successfully executed query
*
* Multiple queries list records backwards, but queries marked with this
* function after execution will go in the opposite direction.
*/
void apfs_query_direct_forward(struct apfs_query *query)
{
if (query->flags & APFS_QUERY_PREV)
return;
apfs_assert_query_is_valid(query);
ASSERT(apfs_node_is_leaf(query->node));
while (query) {
query->flags |= APFS_QUERY_PREV;
query = query->parent;
}
}