Ernesto A. Fernández c012e6cdb9 Commit when the main free queue has too many nodes
As mentioned in the previous commit message, generic/269 of xfstests
makes the main free queue unbalanced enough that insertions fail.
Clearing the free queue when the transaction starts is not enough to
prevent it, because the whole disaster can happen without a single
commit.

So, start forcing commits when the main queue has too many nodes. I
don't think there is a problem with being too strict here, because a
truly full main queue would be gigantic for most normal filesystems.
Just force a commit when we reach half the limit (rounded up). Of course
make an exception for filesystems with a single main fq node, since
those can't even become unbalanced in the first place and we don't want
to commit nonstop.

Signed-off-by: Ernesto A. Fernández <ernesto@corellium.com>
2025-03-18 18:38:13 -03:00
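
The forced-commit check itself lives in the transaction-handling code rather than in this file, which only tracks the node count of the main free queue (sm_main_fq_nodes below). What follows is a minimal sketch of the policy described above, using the sfq_tree_node_limit field read later in this file; the helper name is an assumption for illustration, not the driver's actual interface.

/*
 * Sketch only: force a commit once the main free queue holds half of its
 * node limit (rounded up), except when the queue can never grow past a
 * single node. The helper name is hypothetical.
 */
static bool apfs_main_fq_needs_commit(struct super_block *sb)
{
	struct apfs_spaceman *sm = APFS_SM(sb);
	struct apfs_spaceman_free_queue *fq = &sm->sm_raw->sm_fq[APFS_SFQ_MAIN];
	u16 node_limit = le16_to_cpu(fq->sfq_tree_node_limit);

	/* A single-node queue can't become unbalanced; don't commit nonstop */
	if (node_limit <= 1)
		return false;
	return sm->sm_main_fq_nodes >= DIV_ROUND_UP(node_limit, 2);
}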


// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2019 Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
*/
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "apfs.h"
/**
* apfs_spaceman_read_cib_addr - Get the address of a cib from the spaceman
* @sb: superblock structure
* @index: index of the chunk-info block
*
* Returns the block number for the chunk-info block.
*
* This is not described in the official documentation; credit for figuring it
* out should go to Joachim Metz: <https://github.com/libyal/libfsapfs>.
*/
static u64 apfs_spaceman_read_cib_addr(struct super_block *sb, int index)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
u32 offset;
__le64 *addr_p;
offset = sm->sm_addr_offset + index * sizeof(*addr_p);
addr_p = (void *)sm_raw + offset;
return le64_to_cpup(addr_p);
}
/**
* apfs_spaceman_write_cib_addr - Store the address of a cib in the spaceman
* @sb: superblock structure
* @index: index of the chunk-info block
* @addr: address of the chunk-info block
*/
static void apfs_spaceman_write_cib_addr(struct super_block *sb,
int index, u64 addr)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
u32 offset;
__le64 *addr_p;
apfs_assert_in_transaction(sb, &sm_raw->sm_o);
offset = sm->sm_addr_offset + index * sizeof(*addr_p);
addr_p = (void *)sm_raw + offset;
*addr_p = cpu_to_le64(addr);
}
/**
* apfs_max_chunks_per_cib - Find the maximum chunk count for a chunk-info block
* @sb: superblock structure
*/
static inline int apfs_max_chunks_per_cib(struct super_block *sb)
{
return (sb->s_blocksize - sizeof(struct apfs_chunk_info_block)) /
sizeof(struct apfs_chunk_info);
}
/**
* apfs_read_spaceman_dev - Read a space manager device structure
* @sb: superblock structure
* @dev: on-disk device structure
*
* Initializes the in-memory spaceman fields related to the main device; fusion
* drives are not yet supported. Returns 0 on success, or a negative error code
* in case of failure.
*/
static int apfs_read_spaceman_dev(struct super_block *sb,
struct apfs_spaceman_device *dev)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
if (dev->sm_cab_count) {
apfs_err(sb, "large devices are not supported");
return -EINVAL;
}
spaceman->sm_block_count = le64_to_cpu(dev->sm_block_count);
spaceman->sm_chunk_count = le64_to_cpu(dev->sm_chunk_count);
spaceman->sm_cib_count = le32_to_cpu(dev->sm_cib_count);
spaceman->sm_free_count = le64_to_cpu(dev->sm_free_count);
spaceman->sm_addr_offset = le32_to_cpu(dev->sm_addr_offset);
/* Check that all the cib addresses fit in the spaceman object */
if ((long long)spaceman->sm_addr_offset +
(long long)spaceman->sm_cib_count * sizeof(u64) > spaceman->sm_size) {
apfs_err(sb, "too many cibs (%u)", spaceman->sm_cib_count);
return -EFSCORRUPTED;
}
return 0;
}
/**
* apfs_spaceman_get_16 - Get a 16-bit value from an offset in the spaceman
* @sb: superblock structure
* @off: offset for the value
*
* Returns a pointer to the value, or NULL if it doesn't fit.
*/
static __le16 *apfs_spaceman_get_16(struct super_block *sb, size_t off)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = spaceman->sm_raw;
if (off > spaceman->sm_size)
return NULL;
if (off + sizeof(__le16) > spaceman->sm_size)
return NULL;
return (void *)sm_raw + off;
}
/**
* apfs_spaceman_get_64 - Get a 64-bit value from an offset in the spaceman
* @sb: superblock structure
* @off: offset for the value
*
* Returns a pointer to the value, or NULL if it doesn't fit.
*/
static __le64 *apfs_spaceman_get_64(struct super_block *sb, size_t off)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = spaceman->sm_raw;
if (off > spaceman->sm_size)
return NULL;
if (off + sizeof(__le64) > spaceman->sm_size)
return NULL;
return (void *)sm_raw + off;
}
/**
* apfs_allocate_ip_bitmap - Allocate a free ip bitmap block
* @sb: filesystem superblock
* @offset_p: on return, the offset from sm_ip_bm_base of the allocated block
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_allocate_ip_bitmap(struct super_block *sb, u16 *offset_p)
{
struct apfs_spaceman *spaceman = NULL;
struct apfs_spaceman_phys *sm_raw = NULL;
u32 free_next_offset, old_head_off;
u16 free_head, blkcnt;
__le16 *old_head_p = NULL;
spaceman = APFS_SM(sb);
sm_raw = spaceman->sm_raw;
free_next_offset = le32_to_cpu(sm_raw->sm_ip_bm_free_next_offset);
free_head = le16_to_cpu(sm_raw->sm_ip_bm_free_head);
blkcnt = (u16)le32_to_cpu(sm_raw->sm_ip_bm_block_count);
/*
* The "free_next" array is a linked list of free blocks that starts
* with the "free_head". Allocate this head then, and make the next
* block into the new head.
*/
old_head_off = free_next_offset + free_head * sizeof(*old_head_p);
old_head_p = apfs_spaceman_get_16(sb, old_head_off);
if (!old_head_p) {
apfs_err(sb, "free_next head offset out of bounds (%u)", old_head_off);
return -EFSCORRUPTED;
}
*offset_p = free_head;
free_head = le16_to_cpup(old_head_p);
sm_raw->sm_ip_bm_free_head = *old_head_p;
/* No longer free, no longer part of the linked list */
*old_head_p = cpu_to_le16(APFS_SPACEMAN_IP_BM_INDEX_INVALID);
/* Just a little sanity check because I've messed this up before */
if (free_head >= blkcnt || *offset_p >= blkcnt) {
apfs_err(sb, "free next list seems empty or corrupt");
return -EFSCORRUPTED;
}
return 0;
}
/**
* apfs_free_ip_bitmap - Free a used ip bitmap block
* @sb: filesystem superblock
* @offset: the offset from sm_ip_bm_base of the block to free
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_free_ip_bitmap(struct super_block *sb, u16 offset)
{
struct apfs_spaceman *spaceman = NULL;
struct apfs_spaceman_phys *sm_raw = NULL;
u32 free_next_offset, old_tail_off;
u16 free_tail;
__le16 *old_tail_p = NULL;
spaceman = APFS_SM(sb);
sm_raw = spaceman->sm_raw;
free_next_offset = le32_to_cpu(sm_raw->sm_ip_bm_free_next_offset);
free_tail = le16_to_cpu(sm_raw->sm_ip_bm_free_tail);
/*
* The "free_next" array is a linked list of free blocks that ends
* with the "free_tail". The block getting freed will become the new
* tail of the list.
*/
old_tail_off = free_next_offset + free_tail * sizeof(*old_tail_p);
old_tail_p = apfs_spaceman_get_16(sb, old_tail_off);
if (!old_tail_p) {
apfs_err(sb, "free_next tail offset out of bounds (%u)", old_tail_off);
return -EFSCORRUPTED;
}
*old_tail_p = cpu_to_le16(offset);
sm_raw->sm_ip_bm_free_tail = cpu_to_le16(offset);
free_tail = offset;
return 0;
}
/**
* apfs_reallocate_ip_bitmap - Find a new block for an ip bitmap
* @sb: filesystem superblock
* @offset_p: offset from sm_ip_bm_base of the bitmap block to reallocate; updated on return
*
* On success returns 0 and updates @offset_p to the new offset allocated for
* the ip bitmap. Since blocks are allocated at the head of the list and freed
* at the tail, there is no risk of reuse by future reallocations within the
* same transaction (unless there is some serious corruption, of course).
*
* Returns a negative error code in case of failure.
*/
static int apfs_reallocate_ip_bitmap(struct super_block *sb, __le16 *offset_p)
{
int err;
u16 offset;
offset = le16_to_cpup(offset_p);
err = apfs_free_ip_bitmap(sb, offset);
if (err) {
apfs_err(sb, "failed to free ip bitmap %u", offset);
return err;
}
err = apfs_allocate_ip_bitmap(sb, &offset);
if (err) {
apfs_err(sb, "failed to allocate a new ip bitmap block");
return err;
}
*offset_p = cpu_to_le16(offset);
return 0;
}
/**
* apfs_write_single_ip_bitmap - Write a single ip bitmap to disk
* @sb: filesystem superblock
* @bitmap: bitmap to write
* @idx: index of the ip bitmap to write
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_write_single_ip_bitmap(struct super_block *sb, char *bitmap, u32 idx)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *spaceman = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = spaceman->sm_raw;
struct buffer_head *bh = NULL;
u64 ip_bm_base, ip_bitmap_bno;
u32 xid_off, ip_bitmap_off;
__le64 *xid_p = NULL;
__le16 *ip_bitmap_p = NULL;
int err;
ip_bm_base = le64_to_cpu(sm_raw->sm_ip_bm_base);
/* First update the xid, which is kept in a separate array */
xid_off = le32_to_cpu(sm_raw->sm_ip_bm_xid_offset) + idx * sizeof(*xid_p);
xid_p = apfs_spaceman_get_64(sb, xid_off);
if (!xid_p) {
apfs_err(sb, "xid out of bounds (%u)", xid_off);
return -EFSCORRUPTED;
}
*xid_p = cpu_to_le64(nxi->nx_xid);
/* Now find a new location for the ip bitmap (and free the old one) */
ip_bitmap_off = le32_to_cpu(sm_raw->sm_ip_bitmap_offset) + idx * sizeof(*ip_bitmap_p);
ip_bitmap_p = apfs_spaceman_get_16(sb, ip_bitmap_off);
if (!ip_bitmap_p) {
apfs_err(sb, "bmap offset out of bounds (%u)", ip_bitmap_off);
return -EFSCORRUPTED;
}
err = apfs_reallocate_ip_bitmap(sb, ip_bitmap_p);
if (err) {
apfs_err(sb, "failed to reallocate ip bitmap %u", le16_to_cpup(ip_bitmap_p));
return err;
}
/* Finally, write the dirty bitmap to the new location */
ip_bitmap_bno = ip_bm_base + le16_to_cpup(ip_bitmap_p);
bh = apfs_getblk(sb, ip_bitmap_bno);
if (!bh) {
apfs_err(sb, "failed to map block for CoW (0x%llx)", ip_bitmap_bno);
return -EIO;
}
memcpy(bh->b_data, bitmap, sb->s_blocksize);
err = apfs_transaction_join(sb, bh);
if (err)
goto fail;
bh = NULL;
spaceman->sm_ip_bmaps[idx].dirty = false;
return 0;
fail:
brelse(bh);
bh = NULL;
return err;
}
/**
* apfs_write_ip_bitmaps - Write all dirty ip bitmaps to disk
* @sb: superblock structure
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_write_ip_bitmaps(struct super_block *sb)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = spaceman->sm_raw;
struct apfs_ip_bitmap_block_info *info = NULL;
u32 bmaps_count = spaceman->sm_ip_bmaps_count;
int err;
u32 i;
apfs_assert_in_transaction(sb, &sm_raw->sm_o);
for (i = 0; i < bmaps_count; ++i) {
info = &spaceman->sm_ip_bmaps[i];
if (!info->dirty)
continue;
err = apfs_write_single_ip_bitmap(sb, info->block, i);
if (err) {
apfs_err(sb, "failed to rotate ip bitmap %u", i);
return err;
}
}
return 0;
}
/**
* apfs_read_single_ip_bitmap - Read a single ip bitmap into memory
* @sb: filesystem superblock
* @idx: index of the ip bitmap to read
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_read_single_ip_bitmap(struct super_block *sb, u32 idx)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = spaceman->sm_raw;
struct buffer_head *bh = NULL;
char *bitmap = NULL;
u64 ip_bm_base, ip_bitmap_bno;
u32 ip_bitmap_off;
__le16 *ip_bitmap_p = NULL;
int err;
ip_bm_base = le64_to_cpu(sm_raw->sm_ip_bm_base);
ip_bitmap_off = le32_to_cpu(sm_raw->sm_ip_bitmap_offset) + idx * sizeof(*ip_bitmap_p);
ip_bitmap_p = apfs_spaceman_get_16(sb, ip_bitmap_off);
if (!ip_bitmap_p) {
apfs_err(sb, "bmap offset out of bounds (%u)", ip_bitmap_off);
return -EFSCORRUPTED;
}
bitmap = kmalloc(sb->s_blocksize, GFP_KERNEL);
if (!bitmap)
return -ENOMEM;
ip_bitmap_bno = ip_bm_base + le16_to_cpup(ip_bitmap_p);
bh = apfs_sb_bread(sb, ip_bitmap_bno);
if (!bh) {
apfs_err(sb, "failed to read ip bitmap (0x%llx)", ip_bitmap_bno);
err = -EIO;
goto fail;
}
memcpy(bitmap, bh->b_data, sb->s_blocksize);
brelse(bh);
bh = NULL;
spaceman->sm_ip_bmaps[idx].dirty = false;
spaceman->sm_ip_bmaps[idx].block = bitmap;
bitmap = NULL;
return 0;
fail:
kfree(bitmap);
bitmap = NULL;
return err;
}
/**
* apfs_read_ip_bitmaps - Read all the ip bitmaps into memory
* @sb: superblock structure
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_read_ip_bitmaps(struct super_block *sb)
{
struct apfs_spaceman *spaceman = APFS_SM(sb);
u32 bmaps_count = spaceman->sm_ip_bmaps_count;
int err;
u32 i;
for (i = 0; i < bmaps_count; ++i) {
err = apfs_read_single_ip_bitmap(sb, i);
if (err) {
apfs_err(sb, "failed to read ip bitmap %u", i);
return err;
}
}
return 0;
}
/*
* Free queue record data
*/
struct apfs_fq_rec {
u64 xid;
u64 bno;
u64 len;
};
/**
* apfs_fq_rec_from_query - Read the free queue record found by a query
* @query: the query that found the record
* @fqrec: on return, the free queue record
*
* Reads the free queue record into @fqrec and performs some basic sanity
* checks as a protection against crafted filesystems. Returns 0 on success
* or -EFSCORRUPTED otherwise.
*/
static int apfs_fq_rec_from_query(struct apfs_query *query, struct apfs_fq_rec *fqrec)
{
char *raw = query->node->object.data;
struct apfs_spaceman_free_queue_key *key;
if (query->key_len != sizeof(*key)) {
apfs_err(query->node->object.sb, "bad key length (%d)", query->key_len);
return -EFSCORRUPTED;
}
key = (struct apfs_spaceman_free_queue_key *)(raw + query->key_off);
fqrec->xid = le64_to_cpu(key->sfqk_xid);
fqrec->bno = le64_to_cpu(key->sfqk_paddr);
if (query->len == 0) {
fqrec->len = 1; /* Ghost record */
return 0;
} else if (query->len == sizeof(__le64)) {
fqrec->len = le64_to_cpup((__le64 *)(raw + query->off));
return 0;
}
apfs_err(query->node->object.sb, "bad value length (%d)", query->len);
return -EFSCORRUPTED;
}
/**
* apfs_block_in_ip - Does this block belong to the internal pool?
* @sm: in-memory spaceman structure
* @bno: block number to check
*/
static inline bool apfs_block_in_ip(struct apfs_spaceman *sm, u64 bno)
{
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
u64 start = le64_to_cpu(sm_raw->sm_ip_base);
u64 end = start + le64_to_cpu(sm_raw->sm_ip_block_count);
return bno >= start && bno < end;
}
/**
* apfs_ip_mark_free - Mark a block in the internal pool as free
* @sb: superblock structure
* @bno: block number (must belong to the ip)
*/
static int apfs_ip_mark_free(struct super_block *sb, u64 bno)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_ip_bitmap_block_info *info = NULL;
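/* Locate the in-memory bitmap block that covers this ip block, then clear its bit */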
bno -= le64_to_cpu(sm_raw->sm_ip_base);
info = &sm->sm_ip_bmaps[bno >> sm->sm_ip_bmaps_shift];
__clear_bit_le(bno & sm->sm_ip_bmaps_mask, info->block);
info->dirty = true;
return 0;
}
/*
* apfs_main_free - Mark a regular block as free
*/
static int apfs_main_free(struct super_block *sb, u64 bno);
/**
* apfs_flush_fq_rec - Delete a single fq record and mark its blocks as free
* @root: free queue root node
* @xid: transaction to target
* @len: on return, the number of freed blocks
*
* Returns 0 on success, or a negative error code in case of failure. -ENODATA
* in particular means that there are no matching records left.
*/
static int apfs_flush_fq_rec(struct apfs_node *root, u64 xid, u64 *len)
{
struct super_block *sb = root->object.sb;
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_query *query = NULL;
struct apfs_fq_rec fqrec = {0};
u64 bno;
int err;
query = apfs_alloc_query(root, NULL /* parent */);
if (!query)
return -ENOMEM;
apfs_init_free_queue_key(xid, 0 /* paddr */, &query->key);
query->flags |= APFS_QUERY_FREE_QUEUE | APFS_QUERY_ANY_NUMBER | APFS_QUERY_EXACT;
err = apfs_btree_query(sb, &query);
if (err) {
if (err != -ENODATA)
apfs_err(sb, "query failed for xid 0x%llx, paddr 0x%llx", xid, 0ULL);
goto fail;
}
err = apfs_fq_rec_from_query(query, &fqrec);
if (err) {
apfs_err(sb, "bad free queue rec for xid 0x%llx", xid);
goto fail;
}
for (bno = fqrec.bno; bno < fqrec.bno + fqrec.len; ++bno) {
if (apfs_block_in_ip(sm, bno))
err = apfs_ip_mark_free(sb, bno);
else
err = apfs_main_free(sb, bno);
if (err) {
apfs_err(sb, "freeing block 0x%llx failed (%d)", (unsigned long long)bno, err);
goto fail;
}
}
err = apfs_btree_remove(query);
if (err) {
apfs_err(sb, "removal failed for xid 0x%llx", xid);
goto fail;
}
*len = fqrec.len;
fail:
apfs_free_query(query);
return err;
}
/**
* apfs_free_queue_oldest_xid - Find the oldest xid among the free queue records
* @root: free queue root node
*/
static u64 apfs_free_queue_oldest_xid(struct apfs_node *root)
{
struct apfs_spaceman_free_queue_key *key;
char *raw = root->object.data;
int len, off;
if (root->records == 0)
return 0;
len = apfs_node_locate_key(root, 0, &off);
if (len != sizeof(*key)) {
/* TODO: abort transaction */
apfs_err(root->object.sb, "bad key length (%d)", len);
return 0;
}
key = (struct apfs_spaceman_free_queue_key *)(raw + off);
return le64_to_cpu(key->sfqk_xid);
}
/**
* apfs_flush_free_queue - Free blocks queued by old transactions
* @sb: superblock structure
* @qid: free queue to flush (APFS_SFQ_IP or APFS_SFQ_MAIN)
*
* Returns 0 on success or a negative error code in case of failure.
*/
static int apfs_flush_free_queue(struct super_block *sb, unsigned int qid)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_spaceman_free_queue *fq = &sm_raw->sm_fq[qid];
struct apfs_node *fq_root;
struct apfs_btree_info *fq_info = NULL;
u64 oldest = le64_to_cpu(fq->sfq_oldest_xid);
int err;
fq_root = apfs_read_node(sb, le64_to_cpu(fq->sfq_tree_oid),
APFS_OBJ_EPHEMERAL, true /* write */);
if (IS_ERR(fq_root)) {
apfs_err(sb, "failed to read fq root 0x%llx", le64_to_cpu(fq->sfq_tree_oid));
return PTR_ERR(fq_root);
}
while (oldest) {
/*
* Blocks freed in the current transaction can't be reused
* safely until after the commit, but I don't think there is
* any point in preserving old transactions. I'm guessing the official
* driver keeps multiple transactions going at the same time; that
* must be why they need a free queue.
*/
if (oldest == nxi->nx_xid)
break;
while (true) {
u64 count = 0;
/* Probably not very efficient... */
err = apfs_flush_fq_rec(fq_root, oldest, &count);
if (err == -ENODATA) {
err = 0;
break;
} else if (err) {
apfs_err(sb, "failed to flush fq");
goto fail;
} else {
le64_add_cpu(&fq->sfq_count, -count);
}
}
oldest = apfs_free_queue_oldest_xid(fq_root);
fq->sfq_oldest_xid = cpu_to_le64(oldest);
}
if (qid == APFS_SFQ_MAIN) {
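/* The btree info footer sits at the very end of the root node */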
fq_info = (void *)fq_root->object.data + sb->s_blocksize - sizeof(*fq_info);
sm->sm_main_fq_nodes = le64_to_cpu(fq_info->bt_node_count);
if (sm->sm_main_fq_nodes != 1) {
apfs_alert(sb, "main queue wasn't flushed in full - bug!");
err = -EFSCORRUPTED;
goto fail;
}
}
fail:
apfs_node_free(fq_root);
return err;
}
/**
* apfs_allocate_spaceman - Allocate an in-memory spaceman struct, if needed
* @sb: superblock structure
* @raw: on-disk spaceman struct
* @size: size of the on-disk spaceman
*
* Returns the spaceman and sets it in the superblock info. Also performs all
* initializations for the internal pool, including reading all the ip bitmaps.
* This is a bit out of place here, but it's convenient because it has to
* happen only once.
*
* On failure, returns an error pointer.
*/
static struct apfs_spaceman *apfs_allocate_spaceman(struct super_block *sb, struct apfs_spaceman_phys *raw, u32 size)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *spaceman = NULL;
int blk_bitcnt = sb->s_blocksize * 8;
size_t sm_size;
u32 bmap_cnt;
int err;
if (nxi->nx_spaceman)
return nxi->nx_spaceman;
/* We don't expect filesystems this big; it would be like 260 TiB */
bmap_cnt = le32_to_cpu(raw->sm_ip_bm_size_in_blocks);
if (bmap_cnt > 200) {
apfs_err(sb, "too many ip bitmap blocks (%u)", bmap_cnt);
return ERR_PTR(-EFSCORRUPTED);
}
sm_size = sizeof(*spaceman) + bmap_cnt * sizeof(spaceman->sm_ip_bmaps[0]);
spaceman = nxi->nx_spaceman = kzalloc(sm_size, GFP_KERNEL);
if (!spaceman)
return ERR_PTR(-ENOMEM);
spaceman->sm_nxi = nxi;
/*
* These two fields must be set before reading the ip bitmaps, since
* that stuff involves several variable-length arrays inside the
* spaceman object itself.
*/
spaceman->sm_raw = raw;
spaceman->sm_size = size;
spaceman->sm_ip_bmaps_count = bmap_cnt;
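/*
* Each ip bitmap block tracks blk_bitcnt blocks, so the shift and mask
* turn an ip-relative block number into a bitmap index and a bit offset.
*/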
spaceman->sm_ip_bmaps_mask = blk_bitcnt - 1;
spaceman->sm_ip_bmaps_shift = order_base_2(blk_bitcnt);
/* This must happen only once, so it's easier to just leave it here */
err = apfs_read_ip_bitmaps(sb);
if (err) {
apfs_err(sb, "failed to read the ip bitmaps");
kfree(spaceman);
nxi->nx_spaceman = spaceman = NULL;
return ERR_PTR(err);
}
return nxi->nx_spaceman;
}
/**
* apfs_read_spaceman - Find and read the space manager
* @sb: superblock structure
*
* Reads the space manager structure from disk and initializes its in-memory
* counterpart; returns 0 on success, or a negative error code in case of
* failure.
*/
int apfs_read_spaceman(struct super_block *sb)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_nx_superblock *raw_sb = nxi->nx_raw;
struct apfs_spaceman *spaceman = NULL;
struct apfs_ephemeral_object_info *sm_eph_info = NULL;
struct apfs_spaceman_phys *sm_raw;
u32 sm_flags;
u64 oid = le64_to_cpu(raw_sb->nx_spaceman_oid);
int err;
if (sb->s_flags & SB_RDONLY) /* The space manager won't be needed */
return 0;
sm_eph_info = apfs_ephemeral_object_lookup(sb, oid);
if (IS_ERR(sm_eph_info)) {
apfs_err(sb, "no spaceman object for oid 0x%llx", oid);
return PTR_ERR(sm_eph_info);
}
sm_raw = (struct apfs_spaceman_phys *)sm_eph_info->object;
sm_raw->sm_o.o_xid = cpu_to_le64(nxi->nx_xid);
spaceman = apfs_allocate_spaceman(sb, sm_raw, sm_eph_info->size);
if (IS_ERR(spaceman)) {
apfs_err(sb, "failed to allocate spaceman");
err = PTR_ERR(spaceman);
goto fail;
}
spaceman->sm_free_cache_base = spaceman->sm_free_cache_blkcnt = 0;
sm_flags = le32_to_cpu(sm_raw->sm_flags);
/* Undocumented feature, but it's too common to refuse to mount */
if (sm_flags & APFS_SM_FLAG_VERSIONED)
pr_warn_once("APFS: space manager is versioned\n");
/* Only read the main device; fusion drives are not yet supported */
err = apfs_read_spaceman_dev(sb, &sm_raw->sm_dev[APFS_SD_MAIN]);
if (err) {
apfs_err(sb, "failed to read main device");
goto fail;
}
spaceman->sm_blocks_per_chunk =
le32_to_cpu(sm_raw->sm_blocks_per_chunk);
spaceman->sm_chunks_per_cib = le32_to_cpu(sm_raw->sm_chunks_per_cib);
if (spaceman->sm_chunks_per_cib > apfs_max_chunks_per_cib(sb)) {
apfs_err(sb, "too many chunks per cib (%u)", spaceman->sm_chunks_per_cib);
err = -EFSCORRUPTED;
goto fail;
}
/*
* We flush the free queues in full when each transaction begins, to make it
* harder for the btrees to become too unbalanced.
*/
err = apfs_flush_free_queue(sb, APFS_SFQ_IP);
if (err) {
apfs_err(sb, "failed to flush ip fq");
goto fail;
}
err = apfs_flush_free_queue(sb, APFS_SFQ_MAIN);
if (err) {
apfs_err(sb, "failed to flush main fq");
goto fail;
}
return 0;
fail:
/* spaceman may be an error pointer if apfs_allocate_spaceman() failed */
if (!IS_ERR(spaceman))
spaceman->sm_raw = NULL;
return err;
}
/**
* apfs_write_spaceman - Write the in-memory spaceman fields to the disk buffer
* @sm: in-memory spaceman structure
*
* Copies the updated in-memory fields of the space manager into the on-disk
* structure; the buffer is not dirtied.
*/
static void apfs_write_spaceman(struct apfs_spaceman *sm)
{
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_spaceman_device *dev_raw = &sm_raw->sm_dev[APFS_SD_MAIN];
struct apfs_nxsb_info *nxi;
nxi = sm->sm_nxi;
ASSERT(le64_to_cpu(sm_raw->sm_o.o_xid) == nxi->nx_xid);
dev_raw->sm_free_count = cpu_to_le64(sm->sm_free_count);
}
/**
* apfs_ip_find_free - Find a free block inside the internal pool
* @sb: superblock structure
*
* Returns the block number for a free block, or 0 in case of corruption.
*/
static u64 apfs_ip_find_free(struct super_block *sb)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
int blk_bitcnt = sb->s_blocksize * 8;
u64 full_bitcnt = le64_to_cpu(sm_raw->sm_ip_block_count);
u32 i;
for (i = 0; i < sm->sm_ip_bmaps_count; ++i) {
char *bitmap = sm->sm_ip_bmaps[i].block;
u64 off_in_bmap_blk, off_in_ip;
off_in_bmap_blk = find_next_zero_bit_le(bitmap, blk_bitcnt, 0 /* offset */);
if (off_in_bmap_blk >= blk_bitcnt) /* No space in this bitmap block */
continue;
/* We found something, confirm that it's not outside the ip */
off_in_ip = (i << sm->sm_ip_bmaps_shift) + off_in_bmap_blk;
if (off_in_ip >= full_bitcnt)
break;
return le64_to_cpu(sm_raw->sm_ip_base) + off_in_ip;
}
apfs_err(sb, "internal pool seems full");
return 0;
}
/**
* apfs_chunk_find_free - Find a free block inside a chunk
* @sb: superblock structure
* @bitmap: allocation bitmap for the chunk, which should have free blocks
* @addr: number of the first block in the chunk
*
* Returns the block number for a free block, or 0 in case of corruption.
*/
static u64 apfs_chunk_find_free(struct super_block *sb, char *bitmap, u64 addr)
{
int bitcount = sb->s_blocksize * 8;
u64 bno;
bno = find_next_zero_bit_le(bitmap, bitcount, 0 /* offset */);
if (bno >= bitcount)
return 0;
return addr + bno;
}
/**
* apfs_ip_mark_used - Mark a block in the internal pool as used
* @sb: superblock structure
* @bno: block number (must belong to the ip)
*/
static void apfs_ip_mark_used(struct super_block *sb, u64 bno)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_ip_bitmap_block_info *info = NULL;
bno -= le64_to_cpu(sm_raw->sm_ip_base);
info = &sm->sm_ip_bmaps[bno >> sm->sm_ip_bmaps_shift];
__set_bit_le(bno & sm->sm_ip_bmaps_mask, info->block);
info->dirty = true;
}
/**
* apfs_chunk_mark_used - Mark a block inside a chunk as used
* @sb: superblock structure
* @bitmap: allocation bitmap for the chunk
* @bno: block number (must belong to the chunk)
*/
static inline void apfs_chunk_mark_used(struct super_block *sb, char *bitmap,
u64 bno)
{
int bitcount = sb->s_blocksize * 8;
__set_bit_le(bno & (bitcount - 1), bitmap);
}
/**
* apfs_chunk_mark_free - Mark a block inside a chunk as free
* @sb: superblock structure
* @bitmap: allocation bitmap for the chunk
* @bno: block number (must belong to the chunk)
*/
static inline int apfs_chunk_mark_free(struct super_block *sb, char *bitmap,
u64 bno)
{
int bitcount = sb->s_blocksize * 8;
return __test_and_clear_bit_le(bno & (bitcount - 1), bitmap);
}
/**
* apfs_free_queue_try_insert - Try to add a block range to its free queue
* @sb: superblock structure
* @bno: first block number to free
* @count: number of consecutive blocks to free
*
* Same as apfs_free_queue_insert_nocache(), except that this one can also fail
* with -ENOSPC if there is no room for the new record.
*/
static int apfs_free_queue_try_insert(struct super_block *sb, u64 bno, u64 count)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_spaceman_free_queue *fq;
struct apfs_node *fq_root = NULL;
struct apfs_btree_info *fq_info = NULL;
struct apfs_query *query = NULL;
struct apfs_spaceman_free_queue_key raw_key;
bool ghost = count == 1;
int needed_room;
__le64 raw_val;
u64 node_count;
u16 node_limit;
unsigned int qid;
int err;
qid = apfs_block_in_ip(sm, bno) ? APFS_SFQ_IP : APFS_SFQ_MAIN;
fq = &sm_raw->sm_fq[qid];
fq_root = apfs_read_node(sb, le64_to_cpu(fq->sfq_tree_oid),
APFS_OBJ_EPHEMERAL, true /* write */);
if (IS_ERR(fq_root)) {
apfs_err(sb, "failed to read fq root 0x%llx", le64_to_cpu(fq->sfq_tree_oid));
return PTR_ERR(fq_root);
}
query = apfs_alloc_query(fq_root, NULL /* parent */);
if (!query) {
err = -ENOMEM;
goto fail;
}
apfs_init_free_queue_key(nxi->nx_xid, bno, &query->key);
query->flags |= APFS_QUERY_FREE_QUEUE;
err = apfs_btree_query(sb, &query);
if (err && err != -ENODATA) {
apfs_err(sb, "query failed for xid 0x%llx, paddr 0x%llx", nxi->nx_xid, bno);
goto fail;
}
fq_info = (void *)fq_root->object.data + sb->s_blocksize - sizeof(*fq_info);
node_count = le64_to_cpu(fq_info->bt_node_count);
node_limit = le16_to_cpu(fq->sfq_tree_node_limit);
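/* At the node limit the tree can't grow, so the record must fit in the current leaf */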
if (node_count == node_limit) {
needed_room = sizeof(raw_key) + (ghost ? 0 : sizeof(raw_val));
if (!apfs_node_has_room(query->node, needed_room, false /* replace */)) {
err = -ENOSPC;
goto fail;
}
}
raw_key.sfqk_xid = cpu_to_le64(nxi->nx_xid);
raw_key.sfqk_paddr = cpu_to_le64(bno);
if (ghost) {
/* A lack of value (ghost record) means single-block extent */
err = apfs_btree_insert(query, &raw_key, sizeof(raw_key), NULL /* val */, 0 /* val_len */);
} else {
raw_val = cpu_to_le64(count);
err = apfs_btree_insert(query, &raw_key, sizeof(raw_key), &raw_val, sizeof(raw_val));
}
if (err) {
apfs_err(sb, "insertion failed for xid 0x%llx, paddr 0x%llx", nxi->nx_xid, bno);
goto fail;
}
if (!fq->sfq_oldest_xid)
fq->sfq_oldest_xid = cpu_to_le64(nxi->nx_xid);
le64_add_cpu(&fq->sfq_count, count);
if (qid == APFS_SFQ_MAIN)
sm->sm_main_fq_nodes = le64_to_cpu(fq_info->bt_node_count);
fail:
apfs_free_query(query);
apfs_node_free(fq_root);
return err;
}
/**
* apfs_free_queue_insert_nocache - Add a block range to its free queue
* @sb: superblock structure
* @bno: first block number to free
* @count: number of consecutive blocks to free
*
* Same as apfs_free_queue_insert(), but writes to the free queue directly,
* bypassing the cache of the latest freed block range.
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_free_queue_insert_nocache(struct super_block *sb, u64 bno, u64 count)
{
unsigned int qid;
int err;
err = apfs_free_queue_try_insert(sb, bno, count);
if (err == -ENOSPC) {
qid = apfs_block_in_ip(APFS_SM(sb), bno) ? APFS_SFQ_IP : APFS_SFQ_MAIN;
apfs_alert(sb, "free queue (%u) seems full - bug!", qid);
err = -EFSCORRUPTED;
}
if (err) {
apfs_err(sb, "fq insert failed (0x%llx-0x%llx)", bno, count);
return err;
}
return 0;
}
/**
* apfs_free_queue_insert - Add a block range to its free queue
* @sb: superblock structure
* @bno: first block number to free
* @count: number of consecutive blocks to free
*
* Uses a cache to delay the actual tree operations as much as possible.
*
* Returns 0 on success or a negative error code in case of failure.
*/
int apfs_free_queue_insert(struct super_block *sb, u64 bno, u64 count)
{
struct apfs_spaceman *sm = APFS_SM(sb);
int err;
if (sm->sm_free_cache_base == 0) {
/* Nothing yet cached */
sm->sm_free_cache_base = bno;
sm->sm_free_cache_blkcnt = count;
return 0;
}
/*
* First attempt to extend the cache of freed blocks, but never cache
* a range that doesn't belong to a single free queue.
*/
if (apfs_block_in_ip(sm, bno) == apfs_block_in_ip(sm, sm->sm_free_cache_base)) {
if (bno == sm->sm_free_cache_base + sm->sm_free_cache_blkcnt) {
sm->sm_free_cache_blkcnt += count;
return 0;
}
if (bno + count == sm->sm_free_cache_base) {
sm->sm_free_cache_base -= count;
sm->sm_free_cache_blkcnt += count;
return 0;
}
}
/* Failed to extend the cache, so flush it and replace it */
err = apfs_free_queue_insert_nocache(sb, sm->sm_free_cache_base, sm->sm_free_cache_blkcnt);
if (err) {
apfs_err(sb, "fq cache flush failed (0x%llx-0x%llx)", sm->sm_free_cache_base, sm->sm_free_cache_blkcnt);
return err;
}
sm->sm_free_cache_base = bno;
sm->sm_free_cache_blkcnt = count;
return 0;
}
/**
* apfs_chunk_alloc_free - Allocate or free a block in a given CIB and chunk
* @sb: superblock structure
* @cib_bh: buffer head for the chunk-info block
* @index: index of this chunk's info structure inside @cib
* @bno: block number
* @is_alloc: true to allocate, false to free
*/
static int apfs_chunk_alloc_free(struct super_block *sb,
struct buffer_head **cib_bh,
int index, u64 *bno, bool is_alloc)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_chunk_info_block *cib;
struct apfs_chunk_info *ci;
struct buffer_head *bmap_bh = NULL;
char *bmap = NULL;
bool old_cib = false;
bool old_bmap = false;
int err = 0;
cib = (struct apfs_chunk_info_block *)(*cib_bh)->b_data;
ci = &cib->cib_chunk_info[index];
/* Cibs and bitmaps from old transactions can't be modified in place */
if (le64_to_cpu(cib->cib_o.o_xid) < nxi->nx_xid)
old_cib = true;
if (le64_to_cpu(ci->ci_xid) < nxi->nx_xid)
old_bmap = true;
if (is_alloc && le32_to_cpu(ci->ci_free_count) < 1)
return -ENOSPC;
/* Read the current bitmap, or allocate it if necessary */
if (!ci->ci_bitmap_addr) {
u64 bmap_bno;
if (!is_alloc) {
apfs_err(sb, "attempt to free block in all-free chunk");
return -EFSCORRUPTED;
}
/* All blocks in this chunk are free */
bmap_bno = apfs_ip_find_free(sb);
if (!bmap_bno) {
apfs_err(sb, "no free blocks in ip");
return -EFSCORRUPTED;
}
bmap_bh = apfs_sb_bread(sb, bmap_bno);
} else {
bmap_bh = apfs_sb_bread(sb, le64_to_cpu(ci->ci_bitmap_addr));
}
if (!bmap_bh) {
apfs_err(sb, "failed to read bitmap block");
return -EIO;
}
bmap = bmap_bh->b_data;
if (!ci->ci_bitmap_addr) {
memset(bmap, 0, sb->s_blocksize);
old_bmap = false;
}
/* Write the bitmap to its location for the next transaction */
if (old_bmap) {
struct buffer_head *new_bmap_bh;
u64 new_bmap_bno;
new_bmap_bno = apfs_ip_find_free(sb);
if (!new_bmap_bno) {
apfs_err(sb, "no free blocks in ip");
err = -EFSCORRUPTED;
goto fail;
}
new_bmap_bh = apfs_getblk(sb, new_bmap_bno);
if (!new_bmap_bh) {
apfs_err(sb, "failed to map new bmap block (0x%llx)", new_bmap_bno);
err = -EIO;
goto fail;
}
memcpy(new_bmap_bh->b_data, bmap, sb->s_blocksize);
err = apfs_free_queue_insert(sb, bmap_bh->b_blocknr, 1);
brelse(bmap_bh);
bmap_bh = new_bmap_bh;
if (err) {
apfs_err(sb, "free queue insertion failed");
goto fail;
}
bmap = bmap_bh->b_data;
}
apfs_ip_mark_used(sb, bmap_bh->b_blocknr);
/* Write the cib to its location for the next transaction */
if (old_cib) {
struct buffer_head *new_cib_bh;
u64 new_cib_bno;
new_cib_bno = apfs_ip_find_free(sb);
if (!new_cib_bno) {
apfs_err(sb, "no free blocks in ip");
err = -EFSCORRUPTED;
goto fail;
}
new_cib_bh = apfs_getblk(sb, new_cib_bno);
if (!new_cib_bh) {
apfs_err(sb, "failed to map new cib block (0x%llx)", new_cib_bno);
err = -EIO;
goto fail;
}
memcpy(new_cib_bh->b_data, (*cib_bh)->b_data, sb->s_blocksize);
err = apfs_free_queue_insert(sb, (*cib_bh)->b_blocknr, 1);
brelse(*cib_bh);
*cib_bh = new_cib_bh;
if (err) {
apfs_err(sb, "free queue insertion failed");
goto fail;
}
err = apfs_transaction_join(sb, *cib_bh);
if (err)
goto fail;
cib = (struct apfs_chunk_info_block *)(*cib_bh)->b_data;
ci = &cib->cib_chunk_info[index];
cib->cib_o.o_oid = cpu_to_le64(new_cib_bno);
cib->cib_o.o_xid = cpu_to_le64(nxi->nx_xid);
apfs_ip_mark_used(sb, new_cib_bno);
}
/* The chunk info can be updated now */
apfs_assert_in_transaction(sb, &cib->cib_o);
ci->ci_xid = cpu_to_le64(nxi->nx_xid);
le32_add_cpu(&ci->ci_free_count, is_alloc ? -1 : 1);
ci->ci_bitmap_addr = cpu_to_le64(bmap_bh->b_blocknr);
ASSERT(buffer_trans(*cib_bh));
set_buffer_csum(*cib_bh);
/* Finally, allocate / free the actual block that was requested */
if (is_alloc) {
*bno = apfs_chunk_find_free(sb, bmap, le64_to_cpu(ci->ci_addr));
if (!*bno) {
apfs_err(sb, "no free blocks in chunk");
err = -EFSCORRUPTED;
goto fail;
}
apfs_chunk_mark_used(sb, bmap, *bno);
sm->sm_free_count -= 1;
} else {
if (!apfs_chunk_mark_free(sb, bmap, *bno)) {
apfs_err(sb, "block already marked as free (0x%llx)", *bno);
le32_add_cpu(&ci->ci_free_count, -1);
set_buffer_csum(*cib_bh);
err = -EFSCORRUPTED;
} else
sm->sm_free_count += 1;
}
mark_buffer_dirty(bmap_bh);
fail:
brelse(bmap_bh);
return err;
}
/**
* apfs_chunk_allocate_block - Allocate a single block from a chunk
* @sb: superblock structure
* @cib_bh: buffer head for the chunk-info block
* @index: index of this chunk's info structure inside @cib
* @bno: on return, the allocated block number
*
* Finds a free block in the chunk and marks it as used; the buffer at @cib_bh
* may be replaced if needed for copy-on-write. Returns 0 on success, or a
* negative error code in case of failure.
*/
static int apfs_chunk_allocate_block(struct super_block *sb,
struct buffer_head **cib_bh,
int index, u64 *bno)
{
return apfs_chunk_alloc_free(sb, cib_bh, index, bno, true);
}
/**
* apfs_cib_allocate_block - Allocate a single block from a cib
* @sb: superblock structure
* @cib_bh: buffer head for the chunk-info block
* @bno: on return, the allocated block number
* @backwards: start the search on the last chunk
*
* Finds a free block among all the chunks in the cib and marks it as used; the
* buffer at @cib_bh may be replaced if needed for copy-on-write. Returns 0 on
* success, or a negative error code in case of failure.
*/
static int apfs_cib_allocate_block(struct super_block *sb,
struct buffer_head **cib_bh, u64 *bno, bool backwards)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_chunk_info_block *cib;
u32 chunk_count;
int i;
cib = (struct apfs_chunk_info_block *)(*cib_bh)->b_data;
if (nxi->nx_flags & APFS_CHECK_NODES && !apfs_obj_verify_csum(sb, *cib_bh)) {
apfs_err(sb, "bad checksum for chunk-info block");
return -EFSBADCRC;
}
/* Avoid out-of-bounds operations on corrupted cibs */
chunk_count = le32_to_cpu(cib->cib_chunk_info_count);
if (chunk_count > sm->sm_chunks_per_cib) {
apfs_err(sb, "too many chunks in cib (%u)", chunk_count);
return -EFSCORRUPTED;
}
for (i = 0; i < chunk_count; ++i) {
int index;
int err;
index = backwards ? chunk_count - 1 - i : i;
err = apfs_chunk_allocate_block(sb, cib_bh, index, bno);
if (err == -ENOSPC) /* This chunk is full */
continue;
if (err)
apfs_err(sb, "error during allocation");
return err;
}
return -ENOSPC;
}
/**
* apfs_spaceman_allocate_block - Allocate a single on-disk block
* @sb: superblock structure
* @bno: on return, the allocated block number
* @backwards: start the search on the last chunk
*
* Finds a free block among the spaceman bitmaps and marks it as used. Returns
* 0 on success, or a negative error code in case of failure.
*/
int apfs_spaceman_allocate_block(struct super_block *sb, u64 *bno, bool backwards)
{
struct apfs_spaceman *sm = APFS_SM(sb);
int i;
for (i = 0; i < sm->sm_cib_count; ++i) {
struct buffer_head *cib_bh;
u64 cib_bno;
int index;
int err;
/* Keep extents and metadata separate to limit fragmentation */
index = backwards ? sm->sm_cib_count - 1 - i : i;
cib_bno = apfs_spaceman_read_cib_addr(sb, index);
cib_bh = apfs_sb_bread(sb, cib_bno);
if (!cib_bh) {
apfs_err(sb, "failed to read cib");
return -EIO;
}
err = apfs_cib_allocate_block(sb, &cib_bh, bno, backwards);
if (!err) {
/* The cib may have been moved */
apfs_spaceman_write_cib_addr(sb, index, cib_bh->b_blocknr);
/* The free block count has changed */
apfs_write_spaceman(sm);
}
brelse(cib_bh);
if (err == -ENOSPC) /* This cib is full */
continue;
if (err)
apfs_err(sb, "error during allocation");
return err;
}
/*
* We checked the free space before starting the transaction, so this
* isn't expected to happen.
*/
apfs_err(sb, "ran out of space during transaction");
return -ENOSPC;
}
/**
* apfs_chunk_free - Mark a regular block as free, given its CIB and chunk
* @sb: superblock structure
* @cib_bh: buffer head for the chunk-info block
* @index: index of this chunk's info structure inside @cib
* @bno: block number (must not belong to the ip)
*/
static int apfs_chunk_free(struct super_block *sb,
struct buffer_head **cib_bh,
int index, u64 bno)
{
return apfs_chunk_alloc_free(sb, cib_bh, index, &bno, false);
}
/**
* apfs_main_free - Mark a regular block as free
* @sb: superblock structure
* @bno: block number (must not belong to the ip)
*/
static int apfs_main_free(struct super_block *sb, u64 bno)
{
struct apfs_spaceman *sm = APFS_SM(sb);
struct apfs_spaceman_phys *sm_raw = sm->sm_raw;
struct apfs_sb_info *sbi = NULL;
u64 cib_idx, chunk_idx;
struct buffer_head *cib_bh;
u64 cib_bno;
int err, orphan_err;
if (!sm_raw->sm_blocks_per_chunk || !sm_raw->sm_chunks_per_cib) {
apfs_err(sb, "block or chunk count not set");
return -EINVAL;
}
/* TODO: use bitshifts instead of do_div() */
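/*
* do_div() divides in place and returns the remainder: first get the
* chunk number for @bno, then split it into a cib index and the chunk's
* index within that cib.
*/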
chunk_idx = bno;
do_div(chunk_idx, sm->sm_blocks_per_chunk);
cib_idx = chunk_idx;
chunk_idx = do_div(cib_idx, sm->sm_chunks_per_cib);
cib_bno = apfs_spaceman_read_cib_addr(sb, cib_idx);
cib_bh = apfs_sb_bread(sb, cib_bno);
if (!cib_bh) {
apfs_err(sb, "failed to read cib");
return -EIO;
}
err = apfs_chunk_free(sb, &cib_bh, chunk_idx, bno);
if (!err) {
/* The cib may have been moved */
apfs_spaceman_write_cib_addr(sb, cib_idx, cib_bh->b_blocknr);
/* The free block count has changed */
apfs_write_spaceman(sm);
}
brelse(cib_bh);
if (err) {
apfs_err(sb, "error during free");
return err;
}
/* It may be time to resume orphan cleanups, if we made enough room */
sbi = APFS_SB(sb);
orphan_err = atomic_read(&sbi->s_orphan_cleanup_err);
if (orphan_err == -ENOSPC && sm->sm_free_count >= 2 * APFS_DEL_ROOM) {
atomic_set(&sbi->s_orphan_cleanup_err, 0);
apfs_schedule_orphan_cleanup(sb);
}
return err;
}
/**
* apfs_spaceman_get_free_blkcnt - Calculate the total number of free blocks
* @sb: filesystem superblock
* @blkcnt: on return, the total number of free blocks for all devices
*
* Can be called even if the spaceman has not been read (for example, on a
* read-only mount). Returns 0 on success, or a negative error code in case of
* failure.
*/
int apfs_spaceman_get_free_blkcnt(struct super_block *sb, u64 *blkcnt)
{
struct apfs_nxsb_info *nxi = APFS_NXI(sb);
struct apfs_nx_superblock *raw_sb = NULL;
struct apfs_spaceman_phys *sm_raw = NULL;
struct apfs_ephemeral_object_info *sm_eph_info = NULL;
struct apfs_spaceman_device *dev = NULL;
u64 oid;
int err;
if (!nxi->nx_eph_list) {
err = apfs_read_ephemeral_objects(sb);
if (err) {
apfs_err(sb, "failed to read the ephemeral objects");
return err;
}
}
raw_sb = nxi->nx_raw;
oid = le64_to_cpu(raw_sb->nx_spaceman_oid);
sm_eph_info = apfs_ephemeral_object_lookup(sb, oid);
if (IS_ERR(sm_eph_info)) {
apfs_err(sb, "no spaceman object for oid 0x%llx", oid);
return PTR_ERR(sm_eph_info);
}
sm_raw = (struct apfs_spaceman_phys *)sm_eph_info->object;
*blkcnt = 0;
dev = &sm_raw->sm_dev[APFS_SD_MAIN];
*blkcnt += le64_to_cpu(dev->sm_free_count);
dev = &sm_raw->sm_dev[APFS_SD_TIER2];
*blkcnt += le64_to_cpu(dev->sm_free_count);
return 0;
}