// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
 */

#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/mount.h>
#include <linux/mpage.h>
#include <linux/blk_types.h>
#include "apfs.h"

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)
#include <linux/sched/mm.h>
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 13, 0)
#include <linux/fileattr.h>
#endif

#define MAX_PFK_LEN	512

#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0)
/*
 * Local stand-in for grab_cache_page_write_begin(), only built under the
 * surrounding kernel version guard: fetches the page at @index through
 * pagecache_get_page() with FGP_WRITEBEGIN and the mapping's own gfp mask.
 */
static struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index)
{
	return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping));
}
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)

/* Read a folio through the mpage helper, mapping blocks with apfs_get_block() */
static int apfs_read_folio(struct file *file, struct folio *folio)
{
	return mpage_read_folio(folio, apfs_get_block);
}

#else

/* Read a page through the mpage helper, mapping blocks with apfs_get_block() */
static int apfs_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, apfs_get_block);
}

#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) /* Misses mpage_readpages() */

/* Readahead through the mpage helper, mapping blocks with apfs_get_block() */
static void apfs_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, apfs_get_block);
}

#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) */

/*
 * Read a list of pages through the mpage helper, mapping blocks with
 * apfs_get_block(). The @file argument is not used.
 */
static int apfs_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *pages, unsigned int nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, apfs_get_block);
}

#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) */

/**
 * apfs_create_dstream_rec - Create a data stream record
 * @dstream: data stream info
 *
 * Does nothing if the record already exists.  TODO: support cloned files.
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_create_dstream_rec(struct apfs_dstream_info *dstream)
{
	struct super_block *sb = dstream->ds_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query;
	struct apfs_dstream_id_key raw_key;
	struct apfs_dstream_id_val raw_val;
	int ret;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_dstream_id_key(dstream->ds_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (ret != -ENODATA) /* Either an error, or the record already exists */
		goto out;

	apfs_key_set_hdr(APFS_TYPE_DSTREAM_ID, dstream->ds_id, &raw_key);
	raw_val.refcnt = cpu_to_le32(1);
	ret = apfs_btree_insert(query, &raw_key, sizeof(raw_key), &raw_val, sizeof(raw_val));
	if (ret) {
		apfs_err(sb, "insertion failed for id 0x%llx", dstream->ds_id);
		goto out;
	}
out:
	apfs_free_query(query);
	return ret;
}

static int apfs_check_dstream_refcnt(struct inode *inode);
static int apfs_put_dstream_rec(struct apfs_dstream_info *dstream);

/**
 * apfs_inode_create_exclusive_dstream - Make an inode's dstream not shared
 * @inode: the vfs inode
 *
 * If the inode has a dstream that is still shared after rechecking its
 * reference count, drops the inode's reference to it, takes a new dstream id
 * from the volume superblock, clones the extents under that id and creates
 * the matching dstream record. Does nothing if the inode has no dstream or
 * if the dstream is not shared.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
int apfs_inode_create_exclusive_dstream(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	u64 new_id;
	int err;

	if (!ai->i_has_dstream || !dstream->ds_shared)
		return 0;

	/*
	 * The ds_shared field is not updated when the other user of the
	 * dstream puts it, so it could be a false positive. Check it again
	 * before actually putting the dstream. The double query is wasteful,
	 * but I don't know if it makes sense to optimize this (TODO).
	 */
	err = apfs_check_dstream_refcnt(inode);
	if (err) {
		apfs_err(sb, "failed to check refcnt for ino 0x%llx", apfs_ino(inode));
		return err;
	}
	if (!dstream->ds_shared)
		return 0;
	/* Still shared: give up this inode's reference to the old dstream */
	err = apfs_put_dstream_rec(dstream);
	if (err) {
		apfs_err(sb, "failed to put dstream for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	/* Take the next object id from the volume superblock for the new dstream */
	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);
	new_id = le64_to_cpu(vsb_raw->apfs_next_obj_id);
	le64_add_cpu(&vsb_raw->apfs_next_obj_id, 1);

	err = apfs_clone_extents(dstream, new_id);
	if (err) {
		apfs_err(sb, "failed clone extents for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	/* Only switch to the new id once the extents exist under it */
	dstream->ds_id = new_id;
	err = apfs_create_dstream_rec(dstream);
	if (err) {
		apfs_err(sb, "failed to create dstream for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	dstream->ds_shared = false;
	return 0;
}

/**
 * apfs_inode_create_dstream_rec - Create the data stream record for an inode
 * @inode: the vfs inode
 *
 * An inode without a dstream gets a new record; an inode that already has one
 * is only made to stop sharing it.  TODO: support cloned files.
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_inode_create_dstream_rec(struct inode *inode)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	int ret;

	if (!ai->i_has_dstream) {
		ret = apfs_create_dstream_rec(&ai->i_dstream);
		if (!ret)
			ai->i_has_dstream = true;
		return ret;
	}

	/* The record is there already, but it must not be shared */
	return apfs_inode_create_exclusive_dstream(inode);
}

/**
 * apfs_dstream_adj_refcnt - Adjust dstream record refcount
 * @dstream:	data stream info
 * @delta:	desired change in reference count
 *
 * The record is removed from the catalog if its reference count reaches zero,
 * and rewritten with the new count otherwise. A missing record is reported as
 * filesystem corruption. Returns 0 on success or a negative error code in case
 * of failure.
 */
int apfs_dstream_adj_refcnt(struct apfs_dstream_info *dstream, u32 delta)
{
	struct super_block *sb = dstream->ds_sb;
	struct apfs_dstream_id_val new_val;
	struct apfs_query *query;
	void *node_data = NULL;
	u32 count;
	int err;

	ASSERT(APFS_I(dstream->ds_inode)->i_has_dstream);

	query = apfs_alloc_query(APFS_SB(sb)->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_dstream_id_key(dstream->ds_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	if (err) {
		apfs_err(sb, "query failed for id 0x%llx", dstream->ds_id);
		if (err == -ENODATA)
			err = -EFSCORRUPTED;
		goto out;
	}

	if (query->len != sizeof(new_val)) {
		apfs_err(sb, "bad value length (%d)", query->len);
		err = -EFSCORRUPTED;
		goto out;
	}

	/* Work on a copy of the on-disk value */
	node_data = query->node->object.data;
	new_val = *(struct apfs_dstream_id_val *)(node_data + query->off);
	count = le32_to_cpu(new_val.refcnt) + delta;

	if (count) {
		new_val.refcnt = cpu_to_le32(count);
		err = apfs_btree_replace(query, NULL /* key */, 0 /* key_len */, &new_val, sizeof(new_val));
		if (err)
			apfs_err(sb, "update failed for id 0x%llx", dstream->ds_id);
	} else {
		/* Nobody is using this dstream anymore */
		err = apfs_btree_remove(query);
		if (err)
			apfs_err(sb, "removal failed for id 0x%llx", dstream->ds_id);
	}

out:
	apfs_free_query(query);
	return err;
}

/**
 * apfs_put_dstream_rec - Put a reference for a data stream record
 * @dstream: data stream info
 *
 * Does nothing if the owner inode has no dstream. Otherwise the refcount is
 * decreased by one, and the record gets deleted if it goes to zero. Returns 0
 * on success or a negative error code in case of failure.
 */
static int apfs_put_dstream_rec(struct apfs_dstream_info *dstream)
{
	if (APFS_I(dstream->ds_inode)->i_has_dstream)
		return apfs_dstream_adj_refcnt(dstream, -1);
	return 0;
}

/**
 * apfs_create_crypto_rec - Create the crypto state record for an inode
 * @inode: the vfs inode
 *
 * Does nothing if the record already exists.  TODO: support cloned files.
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_create_crypto_rec(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	struct apfs_query *query;
	struct apfs_crypto_state_key raw_key;
	int ret;

	if (inode->i_size || inode->i_blocks) /* Already has a dstream */
		return 0;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_crypto_state_key(dstream->ds_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (ret != -ENODATA) /* Either an error, or the record already exists */
		goto out;

	apfs_key_set_hdr(APFS_TYPE_CRYPTO_STATE, dstream->ds_id, &raw_key);
	if (sbi->s_dflt_pfk) {
		struct apfs_crypto_state_val *raw_val = sbi->s_dflt_pfk;
		unsigned int key_len = le16_to_cpu(raw_val->state.key_len);

		ret = apfs_btree_insert(query, &raw_key, sizeof(raw_key), raw_val, sizeof(*raw_val) + key_len);
		if (ret)
			apfs_err(sb, "insertion failed for id 0x%llx", dstream->ds_id);
	} else {
		struct apfs_crypto_state_val raw_val;

		raw_val.refcnt = cpu_to_le32(1);
		raw_val.state.major_version = cpu_to_le16(APFS_WMCS_MAJOR_VERSION);
		raw_val.state.minor_version = cpu_to_le16(APFS_WMCS_MINOR_VERSION);
		raw_val.state.cpflags = 0;
		raw_val.state.persistent_class = cpu_to_le32(APFS_PROTECTION_CLASS_F);
		raw_val.state.key_os_version = 0;
		raw_val.state.key_revision = cpu_to_le16(1);
		raw_val.state.key_len = cpu_to_le16(0);
		ret = apfs_btree_insert(query, &raw_key, sizeof(raw_key), &raw_val, sizeof(raw_val));
		if (ret)
			apfs_err(sb, "insertion failed for id 0x%llx", dstream->ds_id);
	}
out:
	apfs_free_query(query);
	return ret;
}

/**
 * apfs_dflt_key_class - Returns default key class for files in volume
 * @sb: volume superblock
 *
 * The class comes from the volume's default crypto state; without one, it
 * falls back to protection class F.
 */
static unsigned int apfs_dflt_key_class(struct super_block *sb)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);

	if (sbi->s_dflt_pfk)
		return le32_to_cpu(sbi->s_dflt_pfk->state.persistent_class);
	return APFS_PROTECTION_CLASS_F;
}

/**
 * apfs_crypto_adj_refcnt - Adjust crypto state record refcount
 * @sb: volume superblock
 * @crypto_id: crypto_id to adjust
 * @delta: desired change in reference count
 *
 * This function is used when adding or removing extents, as each extent holds
 * a reference to the crypto ID. It should also be used when removing inodes,
 * and in that case it should also remove the crypto record (TODO).
 *
 * Does nothing if @crypto_id is zero. Returns 0 on success or a negative error
 * code in case of failure.
 */
int apfs_crypto_adj_refcnt(struct super_block *sb, u64 crypto_id, int delta)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query;
	struct apfs_crypto_state_val *raw_val;
	char *raw;
	int ret;

	if (!crypto_id)
		return 0;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_crypto_state_key(crypto_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (ret) {
		apfs_err(sb, "query failed for id 0x%llx", crypto_id);
		goto out;
	}

	ret = apfs_query_join_transaction(query);
	if (ret) {
		apfs_err(sb, "query join failed");
		/* The query must still be freed on this path */
		goto out;
	}

	/* The refcount is updated in place, only after the join has succeeded */
	raw = query->node->object.data;
	raw_val = (void *)raw + query->off;

	le32_add_cpu(&raw_val->refcnt, delta);

out:
	apfs_free_query(query);
	return ret;
}

/**
 * apfs_crypto_set_key - Modify content of crypto state record
 * @sb: volume superblock
 * @crypto_id: crypto_id to modify
 * @new_val: new crypto state data; new_val->refcnt is overridden
 *
 * Replaces the whole record value with @new_val plus its trailing key, but
 * keeps the reference count that was already on disk. Does nothing if
 * @crypto_id is zero.
 *
 * This function does not alter the inode's default protection class field.
 * It needs to be done separately if the class changes.
 */
static int apfs_crypto_set_key(struct super_block *sb, u64 crypto_id, struct apfs_crypto_state_val *new_val)
{
	struct apfs_crypto_state_val *old_val;
	struct apfs_query *query;
	unsigned int key_len;
	int err;

	if (!crypto_id)
		return 0;

	key_len = le16_to_cpu(new_val->state.key_len);

	query = apfs_alloc_query(APFS_SB(sb)->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_crypto_state_key(crypto_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	err = apfs_btree_query(sb, &query);
	if (err) {
		apfs_err(sb, "query failed for id 0x%llx", crypto_id);
		goto out;
	}

	/* The refcount is not up to the caller: carry over the current one */
	old_val = (void *)query->node->object.data + query->off;
	new_val->refcnt = old_val->refcnt;

	err = apfs_btree_replace(query, NULL /* key */, 0 /* key_len */, new_val, sizeof(*new_val) + key_len);
	if (err)
		apfs_err(sb, "update failed for id 0x%llx", crypto_id);

out:
	apfs_free_query(query);
	return err;
}

/**
 * apfs_crypto_get_key - Retrieve content of crypto state record
 * @sb: volume superblock
 * @crypto_id: crypto_id to look up
 * @val: result crypto state data
 * @max_len: maximum allowed value of val->state.key_len
 *
 * Copies the fixed-size crypto state plus its trailing key into @val, so the
 * caller must provide room for the structure followed by @max_len bytes.
 *
 * Returns 0 on success, -ENOENT if @crypto_id is zero, -ENOSPC if the stored
 * key is longer than @max_len, -EFSCORRUPTED if the record is shorter than
 * the key length it reports, or another negative error code if the query
 * fails.
 */
static int apfs_crypto_get_key(struct super_block *sb, u64 crypto_id, struct apfs_crypto_state_val *val,
			       unsigned int max_len)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query;
	struct apfs_crypto_state_val *raw_val;
	char *raw;
	int ret;
	unsigned int pfk_len;

	if (!crypto_id)
		return -ENOENT;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_crypto_state_key(crypto_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (ret)
		goto out;

	/* Don't trust the record: the header must fit before it gets parsed */
	if (query->len < sizeof(*raw_val)) {
		apfs_err(sb, "bad value length (%d)", query->len);
		ret = -EFSCORRUPTED;
		goto out;
	}
	raw = query->node->object.data;
	raw_val = (void *)raw + query->off;

	/* The key length is on-disk data, so it must fit in the record as well */
	pfk_len = le16_to_cpu(raw_val->state.key_len);
	if (query->len < sizeof(*raw_val) + pfk_len) {
		apfs_err(sb, "bad value length (%d)", query->len);
		ret = -EFSCORRUPTED;
		goto out;
	}
	if (pfk_len > max_len) {
		ret = -ENOSPC;
		goto out;
	}

	memcpy(val, raw_val, sizeof(*val) + pfk_len);

out:
	apfs_free_query(query);
	return ret;
}

/**
 * __apfs_write_begin - Prepare a page for a copy-on-write file write
 * @file:	passed along by the caller; not used here
 * @mapping:	address space of the inode being written
 * @pos:	file offset where the write begins
 * @len:	length of the write
 * @flags:	flags for grabbing the page on older kernels; on newer ones it
 *		is only reused as scratch storage for the saved nofs state
 * @pagep:	on success, set to the prepared page
 * @fsdata:	not used here
 *
 * Joins the inode to the transaction, makes sure that it has a dstream record
 * (and a crypto state record on encrypted volumes), grabs the page for @pos,
 * reads in the existing blocks that overlap the write and finally calls
 * __block_write_begin() with apfs_get_new_block(). apfs_write_begin() starts
 * the transaction before calling this.
 *
 * Returns 0 on success, or a negative error code in case of failure; in the
 * failure cases that come after the page was grabbed, it is unlocked and
 * released.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
int __apfs_write_begin(const struct kiocb *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int flags, struct page **pagep, void **fsdata)
#else
int __apfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int flags, struct page **pagep, void **fsdata)
#endif
{
	struct inode *inode = mapping->host;
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	struct super_block *sb = inode->i_sb;
	struct page *page;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0)
	struct folio *folio;
#endif
	struct buffer_head *bh, *head;
	unsigned int blocksize, block_start, block_end, from, to;
	pgoff_t index = pos >> PAGE_SHIFT;
	/* File block number of the first block in the page */
	sector_t iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
	loff_t i_blks_end;
	int err;

	apfs_inode_join_transaction(sb, inode);

	err = apfs_inode_create_dstream_rec(inode);
	if (err) {
		apfs_err(sb, "failed to create dstream for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	if (apfs_vol_is_encrypted(sb)) {
		err = apfs_create_crypto_rec(inode);
		if (err) {
			apfs_err(sb, "crypto creation failed for ino 0x%llx", apfs_ino(inode));
			return err;
		}
	}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)
	/* The flags parameter is reused here to hold the saved nofs state */
	flags = memalloc_nofs_save();
	page = grab_cache_page_write_begin(mapping, index);
	memalloc_nofs_restore(flags);
#else
	page = grab_cache_page_write_begin(mapping, index, flags | AOP_FLAG_NOFS);
#endif
	if (!page)
		return -ENOMEM;
	if (!page_has_buffers(page)) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
		create_empty_buffers(page, sb->s_blocksize, 0);
#else
		folio = page_folio(page);
		bh = folio_buffers(folio);
		if (!bh)
			bh = create_empty_buffers(folio, sb->s_blocksize, 0);
#endif
	}

	/* CoW moves existing blocks, so read them but mark them as unmapped */
	head = page_buffers(page);
	blocksize = head->b_size;
	/* End of the file contents, rounded up to a whole block */
	i_blks_end = (inode->i_size + sb->s_blocksize - 1) >> inode->i_blkbits;
	i_blks_end <<= inode->i_blkbits;
	if (i_blks_end >= pos) {
		/* Page offsets of the written range that falls before that end */
		from = pos & (PAGE_SIZE - 1);
		to = from + min(i_blks_end - pos, (loff_t)len);
	} else {
		/* TODO: deal with preallocated tail blocks */
		/* An empty range: no buffer will match the overlap check below */
		from = UINT_MAX;
		to = 0;
	}
	/* Walk all the buffers of the page, one block at a time */
	for (bh = head, block_start = 0; bh != head || !block_start;
	     block_start = block_end, bh = bh->b_this_page, ++iblock) {
		block_end = block_start + blocksize;
		if (to > block_start && from < block_end) {
			/* NOTE(review): presumably already handled in this transaction - confirm */
			if (buffer_trans(bh))
				continue;
			if (!buffer_mapped(bh)) {
				err = __apfs_get_block(dstream, iblock, bh,
						       false /* create */);
				if (err) {
					apfs_err(sb, "failed to map block for ino 0x%llx", apfs_ino(inode));
					goto out_put_page;
				}
			}
			/* Synchronous read of the old contents of the block */
			if (buffer_mapped(bh) && !buffer_uptodate(bh)) {
				get_bh(bh);
				lock_buffer(bh);
				bh->b_end_io = end_buffer_read_sync;
				apfs_submit_bh(REQ_OP_READ, 0, bh);
				wait_on_buffer(bh);
				if (!buffer_uptodate(bh)) {
					apfs_err(sb, "failed to read block for ino 0x%llx", apfs_ino(inode));
					err = -EIO;
					goto out_put_page;
				}
			}
			/* Unmapped, so that the call below asks for a new block */
			clear_buffer_mapped(bh);
		}
	}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
	err = __block_write_begin(page_folio(page), pos, len, apfs_get_new_block);
#else
	err = __block_write_begin(page, pos, len, apfs_get_new_block);
#endif
	if (err) {
		apfs_err(sb, "CoW failed in inode 0x%llx", apfs_ino(inode));
		goto out_put_page;
	}

	*pagep = page;
	return 0;

out_put_page:
	unlock_page(page);
	put_page(page);
	return err;
}

/*
 * The ->write_begin() callback for regular files: rejects writes that start
 * at or beyond APFS_MAX_FILE_SIZE, starts a transaction and prepares the page
 * with __apfs_write_begin(). The transaction is aborted on failure, and left
 * open on success. The signature depends on the kernel version.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
static int apfs_write_begin(const struct kiocb *file, struct address_space *mapping,
			    loff_t pos, unsigned int len,
			    struct folio **foliop, void **fsdata)
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
static int apfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned int len,
			    struct folio **foliop, void **fsdata)
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)
static int apfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned int len,
			    struct page **pagep, void **fsdata)
#else
static int apfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned int len, unsigned int flags,
			    struct page **pagep, void **fsdata)
#endif
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
	/* The helper works with pages, so convert to a folio after the call */
	struct page *page = NULL;
	struct page **pagep = &page;
#endif
	int err;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)
	/* No flags argument in these versions, but the helper still takes one */
	unsigned int flags = 0;
#endif

	if (unlikely(pos >= APFS_MAX_FILE_SIZE))
		return -EFBIG;

	err = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (err)
		return err;

	err = __apfs_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
	if (err)
		goto fail;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
	*foliop = page_folio(page);
#endif
	return 0;

fail:
	apfs_transaction_abort(sb);
	return err;
}

/*
 * Finish a write to @page: calls generic_write_end(), copies the resulting
 * inode size to the dstream and, if fewer than @len bytes were reported and
 * the requested range went past the inode size, truncates the page cache and
 * the dstream back to that size. Returns the result of generic_write_end(),
 * or a negative error code if the truncation fails.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
int __apfs_write_end(const struct kiocb *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct page *page, void *fsdata)
#else
int __apfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct page *page, void *fsdata)
#endif
{
	struct inode *inode = mapping->host;
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	int ret, err;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
	ret = generic_write_end(file, mapping, pos, len, copied, page_folio(page), fsdata);
#else
	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
#endif
	dstream->ds_size = i_size_read(inode);
	/* Short write past the end of the file: drop what lies beyond i_size */
	if (ret < len && pos + len > inode->i_size) {
		truncate_pagecache(inode, inode->i_size);
		err = apfs_truncate(dstream, inode->i_size);
		if (err) {
			apfs_err(inode->i_sb, "truncation failed for ino 0x%llx", apfs_ino(inode));
			return err;
		}
	}
	return ret;
}

/*
 * The ->write_end() callback for regular files: completes the write with
 * __apfs_write_end(), records in the transaction state whether the write
 * ended in the middle of a block, and commits the transaction. On success,
 * returns the value from __apfs_write_end(); on failure, aborts the
 * transaction and returns a negative error code. The signature depends on the
 * kernel version.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
static int apfs_write_end(const struct kiocb *file, struct address_space *mapping,
			  loff_t pos, unsigned int len, unsigned int copied,
			  struct folio *folio, void *fsdata)
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
static int apfs_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned int len, unsigned int copied,
			  struct folio *folio, void *fsdata)
#else
static int apfs_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned int len, unsigned int copied,
			  struct page *page, void *fsdata)
#endif
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct apfs_nx_transaction *trans = &APFS_NXI(sb)->nx_transaction;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
	/* The helper works with pages */
	struct page *page = &folio->page;
#endif
	int ret, err;

	ret = __apfs_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	/* Flag the transaction if the write ends in the middle of a block */
	if ((pos + ret) & (sb->s_blocksize - 1))
		trans->t_state |= APFS_NX_TRANS_INCOMPLETE_BLOCK;
	else
		trans->t_state &= ~APFS_NX_TRANS_INCOMPLETE_BLOCK;

	err = apfs_transaction_commit(sb);
	if (!err)
		return ret;

fail:
	apfs_transaction_abort(sb);
	return err;
}

/*
 * Invalidation callback that deliberately does nothing; it is installed in
 * apfs_aops, under the name that the running kernel version expects.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0) || RHEL_VERSION_GE(9, 3)
static void apfs_noop_invalidate_folio(struct folio *folio, size_t offset, size_t length)
#else
static void apfs_noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length)
#endif
{
}

/*
 * Address space operations for regular files that are not compressed (see
 * apfs_inode_set_ops()). The name of each callback depends on the kernel
 * version.
 *
 * bmap is not implemented to avoid issues with CoW on swapfiles.
 */
static const struct address_space_operations apfs_aops = {
	/* Dirtying is left to the generic buffer-based kernel helpers */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0) || RHEL_VERSION_GE(9, 2)
	.dirty_folio	= block_dirty_folio,
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
	.set_page_dirty	= __set_page_dirty_buffers,
#endif

	/* Single page/folio reads */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3)
	.read_folio	= apfs_read_folio,
#else
	.readpage	= apfs_readpage,
#endif

	/* Multiple page reads */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
	.readahead	= apfs_readahead,
#else
	.readpages	= apfs_readpages,
#endif

	/* Each write is wrapped in its own transaction by these two */
	.write_begin	= apfs_write_begin,
	.write_end	= apfs_write_end,

	/* The intention is to keep bhs around until the transaction is over */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0) || RHEL_VERSION_GE(9, 3)
	.invalidate_folio = apfs_noop_invalidate_folio,
#else
	.invalidatepage	= apfs_noop_invalidatepage,
#endif
};

/**
 * apfs_inode_set_ops - Set up an inode's operations
 * @inode:	vfs inode to set up
 * @rdev:	device id (0 if not a device file)
 * @compressed:	is this a compressed inode?
 *
 * Picks the operation tables that match the file type in the inode's mode.
 * Anything other than a regular file, directory or symlink is handed to
 * init_special_inode(), which also gets the device id @rdev.
 */
static void apfs_inode_set_ops(struct inode *inode, dev_t rdev, bool compressed)
{
	umode_t mode = inode->i_mode;

	/* A lot of operations still missing, of course */
	if (S_ISREG(mode)) {
		inode->i_op = &apfs_file_inode_operations;
		if (!compressed) {
			inode->i_fop = &apfs_file_operations;
			inode->i_mapping->a_ops = &apfs_aops;
		} else {
			inode->i_fop = &apfs_compress_file_operations;
			inode->i_mapping->a_ops = &apfs_compress_aops;
		}
		return;
	}

	if (S_ISDIR(mode)) {
		inode->i_op = &apfs_dir_inode_operations;
		inode->i_fop = &apfs_dir_operations;
		return;
	}

	if (S_ISLNK(mode)) {
		inode->i_op = &apfs_symlink_inode_operations;
		return;
	}

	inode->i_op = &apfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);
}

/**
 * apfs_inode_from_query - Read the inode found by a successful query
 * @query:	the query that found the record
 * @inode:	vfs inode to be filled with the read data
 *
 * Reads the inode record into @inode and performs some basic sanity checks,
 * mostly as a protection against crafted filesystems. Besides the fixed
 * fields, this looks for the dstream, sparse bytes and rdev extended fields,
 * and then sets up the inode operations. Returns 0 on success, or
 * -EFSCORRUPTED if the record is too short to hold an inode value.
 */
static int apfs_inode_from_query(struct apfs_query *query, struct inode *inode)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	struct apfs_inode_val *inode_val;
	char *raw = query->node->object.data;
	char *xval = NULL;
	int xlen;
	u32 rdev = 0, bsd_flags;
	bool compressed = false;

	/* The fixed-size part of the value must be there before it's parsed */
	if (query->len < sizeof(*inode_val))
		goto corrupted;

	inode_val = (struct apfs_inode_val *)(raw + query->off);

	ai->i_parent_id = le64_to_cpu(inode_val->parent_id);
	dstream->ds_id = le64_to_cpu(inode_val->private_id);
	inode->i_mode = le16_to_cpu(inode_val->mode);
	ai->i_key_class = le32_to_cpu(inode_val->default_protection_class);
	ai->i_int_flags = le64_to_cpu(inode_val->internal_flags);

	/* The on-disk ownership is remembered separately from the vfs fields */
	ai->i_saved_uid = le32_to_cpu(inode_val->owner);
	i_uid_write(inode, ai->i_saved_uid);
	ai->i_saved_gid = le32_to_cpu(inode_val->group);
	i_gid_write(inode, ai->i_saved_gid);

	ai->i_bsd_flags = bsd_flags = le32_to_cpu(inode_val->bsd_flags);
	if (bsd_flags & APFS_INOBSD_IMMUTABLE)
		inode->i_flags |= S_IMMUTABLE;
	if (bsd_flags & APFS_INOBSD_APPEND)
		inode->i_flags |= S_APPEND;

	if (!S_ISDIR(inode->i_mode)) {
		/* Everything but directories takes the link count from disk */
		set_nlink(inode, le32_to_cpu(inode_val->nlink));
	} else {
		/*
		 * Directory inodes don't store their link count, so to provide
		 * it we would have to actually count the subdirectories. The
		 * HFS/HFS+ modules just leave it at 1, and so do we, for now.
		 */
		ai->i_nchildren = le32_to_cpu(inode_val->nchildren);
	}

	/* All on-disk timestamps are converted with ns_to_timespec64() */
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	inode->i_ctime = ns_to_timespec64(le64_to_cpu(inode_val->change_time));
#else
	inode_set_ctime_to_ts(inode, ns_to_timespec64(le64_to_cpu(inode_val->change_time)));
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
	inode->i_atime = ns_to_timespec64(le64_to_cpu(inode_val->access_time));
	inode->i_mtime = ns_to_timespec64(le64_to_cpu(inode_val->mod_time));
#else
	inode_set_atime_to_ts(inode, ns_to_timespec64(le64_to_cpu(inode_val->access_time)));
	inode_set_mtime_to_ts(inode, ns_to_timespec64(le64_to_cpu(inode_val->mod_time)));
#endif
	ai->i_crtime = ns_to_timespec64(le64_to_cpu(inode_val->create_time));

	/* Assume an empty file with no dstream until proven otherwise */
	dstream->ds_size = inode->i_size = inode->i_blocks = 0;
	ai->i_has_dstream = false;
	if ((bsd_flags & APFS_INOBSD_COMPRESSED) && !S_ISDIR(inode->i_mode)) {
		/* If the size can't be obtained, the inode is left empty and uncompressed */
		if (!apfs_compress_get_size(inode, &inode->i_size)) {
			/* Block count in 512-byte units, rounded up */
			inode->i_blocks = (inode->i_size + 511) >> 9;
			compressed = true;
		}
	} else {
		xlen = apfs_find_xfield(inode_val->xfields,
					query->len - sizeof(*inode_val),
					APFS_INO_EXT_TYPE_DSTREAM, &xval);
		/* The xfield is only used if it can hold the whole structure */
		if (xlen >= sizeof(struct apfs_dstream)) {
			struct apfs_dstream *dstream_raw = (struct apfs_dstream *)xval;

			dstream->ds_size = inode->i_size = le64_to_cpu(dstream_raw->size);
			inode->i_blocks = le64_to_cpu(dstream_raw->alloced_size) >> 9;
			ai->i_has_dstream = true;
		}
	}
	xval = NULL;

	/* TODO: move each xfield read to its own function */
	dstream->ds_sparse_bytes = 0;
	xlen = apfs_find_xfield(inode_val->xfields, query->len - sizeof(*inode_val), APFS_INO_EXT_TYPE_SPARSE_BYTES, &xval);
	if (xlen >= sizeof(__le64)) {
		__le64 *sparse_bytes_p = (__le64 *)xval;

		dstream->ds_sparse_bytes = le64_to_cpup(sparse_bytes_p);
	}
	xval = NULL;

	/* The device id stays at zero if the xfield is missing or too short */
	rdev = 0;
	xlen = apfs_find_xfield(inode_val->xfields,
				query->len - sizeof(*inode_val),
				APFS_INO_EXT_TYPE_RDEV, &xval);
	if (xlen >= sizeof(__le32)) {
		__le32 *rdev_p = (__le32 *)xval;

		rdev = le32_to_cpup(rdev_p);
	}

	apfs_inode_set_ops(inode, rdev, compressed);
	return 0;

corrupted:
	apfs_err(inode->i_sb, "bad inode record for inode 0x%llx", apfs_ino(inode));
	return -EFSCORRUPTED;
}

/**
 * apfs_inode_lookup - Lookup an inode record in the catalog b-tree
 * @inode:	vfs inode to lookup
 *
 * Runs a catalog query for the apfs_ino(@inode) inode record; returns a pointer
 * to the query structure on success, or an error pointer in case of failure.
 */
static struct apfs_query *apfs_inode_lookup(const struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query;
	int ret;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return ERR_PTR(-ENOMEM);
	apfs_init_inode_key(apfs_ino(inode), &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (!ret)
		return query;

	/* Don't complain if an orphan is already gone */
	if (!current_work() || ret != -ENODATA)
		apfs_err(sb, "query failed for id 0x%llx", apfs_ino(inode));
	apfs_free_query(query);
	return ERR_PTR(ret);
}

/**
 * apfs_test_inode - Check if the inode matches a 64-bit inode number
 * @inode:	inode to test
 * @cnid:	pointer to the inode number
 */
static int apfs_test_inode(struct inode *inode, void *cnid)
{
	u64 *ino = cnid;

	return apfs_ino(inode) == *ino;
}

/**
 * apfs_set_inode - Set a 64-bit inode number on the given inode
 * @inode:	inode to set
 * @cnid:	pointer to the inode number
 */
static int apfs_set_inode(struct inode *inode, void *cnid)
{
	apfs_set_ino(inode, *(u64 *)cnid);
	return 0;
}

/**
 * apfs_iget_locked - Wrapper for iget5_locked()
 * @sb:		filesystem superblock
 * @cnid:	64-bit inode number
 *
 * Works the same as iget_locked(), but can handle 64-bit inode numbers on
 * 32-bit architectures. The inode number is both the hash value and the data
 * handed to the apfs_test_inode() and apfs_set_inode() callbacks.
 */
static struct inode *apfs_iget_locked(struct super_block *sb, u64 cnid)
{
	return iget5_locked(sb, cnid, apfs_test_inode, apfs_set_inode, &cnid);
}

/**
 * apfs_check_dstream_refcnt - Check if an inode's dstream is shared
 * @inode:	the inode to check
 *
 * Sets the value of ds_shared for the inode's dstream. Returns 0 on success,
 * or a negative error code in case of failure.
 */
static int apfs_check_dstream_refcnt(struct inode *inode)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	struct super_block *sb = inode->i_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query = NULL;
	struct apfs_dstream_id_val raw_val;
	void *raw = NULL;
	u32 refcnt;
	int ret;

	if (!ai->i_has_dstream) {
		dstream->ds_shared = false;
		return 0;
	}

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_dstream_id_key(dstream->ds_id, &query->key);
	query->flags |= APFS_QUERY_CAT | APFS_QUERY_EXACT;

	ret = apfs_btree_query(sb, &query);
	if (ret) {
		apfs_err(sb, "query failed for id 0x%llx", dstream->ds_id);
		if (ret == -ENODATA)
			ret = -EFSCORRUPTED;
		goto fail;
	}

	if (query->len != sizeof(raw_val)) {
		ret = -EFSCORRUPTED;
		goto fail;
	}
	raw = query->node->object.data;
	raw_val = *(struct apfs_dstream_id_val *)(raw + query->off);
	refcnt = le32_to_cpu(raw_val.refcnt);

	dstream->ds_shared = refcnt > 1;
fail:
	apfs_free_query(query);
	return ret;
}

/**
 * apfs_iget - Populate inode structures with metadata from disk
 * @sb:		filesystem superblock
 * @cnid:	inode number
 *
 * Populates the vfs inode and the corresponding apfs_inode_info structure.
 * An inode that is not new is returned as it is. For a new one, the record is
 * read from the catalog under a read lock on nx_big_sem, and the ownership is
 * overridden by the superblock's uid/gid if those are valid.
 *
 * Returns a pointer to the vfs inode in case of success, or an appropriate
 * error pointer otherwise.
 */
struct inode *apfs_iget(struct super_block *sb, u64 cnid)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct inode *inode;
	struct apfs_query *query;
	int err;

	inode = apfs_iget_locked(sb, cnid);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	/* Not a new inode, so there is nothing to read from disk */
	if (!(apfs_inode_state_read_once(inode) & I_NEW))
		return inode;

	down_read(&nxi->nx_big_sem);
	query = apfs_inode_lookup(inode);
	if (IS_ERR(query)) {
		err = PTR_ERR(query);
		/* Don't complain if an orphan is already gone */
		if (!current_work() || err != -ENODATA)
			apfs_err(sb, "lookup failed for ino 0x%llx", cnid);
		goto fail;
	}
	err = apfs_inode_from_query(query, inode);
	/* The query is no longer needed, whatever the result was */
	apfs_free_query(query);
	if (err)
		goto fail;
	err = apfs_check_dstream_refcnt(inode);
	if (err) {
		apfs_err(sb, "refcnt check failed for ino 0x%llx", cnid);
		goto fail;
	}
	up_read(&nxi->nx_big_sem);

	/* Allow the user to override the ownership */
	if (uid_valid(sbi->s_uid))
		inode->i_uid = sbi->s_uid;
	if (gid_valid(sbi->s_gid))
		inode->i_gid = sbi->s_gid;

	/* Inode flags are not important for now, leave them at 0 */
	unlock_new_inode(inode);
	return inode;

fail:
	/* All jumps here happen with the semaphore still held */
	up_read(&nxi->nx_big_sem);
	iget_failed(inode);
	return ERR_PTR(err);
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) /* No statx yet... */

/*
 * Variant of getattr for kernels without statx: fills @stat with the generic
 * attributes, then overrides the device with the superblock's s_anon_dev and
 * the inode number with the full apfs_ino() value. Always returns 0.
 */
int apfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
		 struct kstat *stat)
{
	struct inode *inode = d_inode(dentry);

	generic_fillattr(inode, stat);
	stat->dev = APFS_SB(inode->i_sb)->s_anon_dev;
	stat->ino = apfs_ino(inode);
	return 0;
}

#else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) */

/*
 * Variant of getattr for kernels with statx: reports the creation time as
 * the birth time, translates the inode's bsd flags to statx attributes, and
 * fills in the rest with generic_fillattr(). The device is then overridden
 * with the superblock's s_anon_dev, and the inode number with the full
 * apfs_ino() value. Always returns 0. The signature depends on the kernel
 * version.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
int apfs_getattr(const struct path *path, struct kstat *stat,
		 u32 request_mask, unsigned int query_flags)
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
int apfs_getattr(struct user_namespace *mnt_userns,
		 const struct path *path, struct kstat *stat, u32 request_mask,
		 unsigned int query_flags)
#else
int apfs_getattr(struct mnt_idmap *idmap,
		 const struct path *path, struct kstat *stat, u32 request_mask,
		 unsigned int query_flags)
#endif
{
	struct inode *inode = d_inode(path->dentry);
	struct apfs_inode_info *ai = APFS_I(inode);

	/* The birth time is always reported, whether requested or not */
	stat->result_mask |= STATX_BTIME;
	stat->btime = ai->i_crtime;

	if (ai->i_bsd_flags & APFS_INOBSD_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	if (ai->i_bsd_flags & APFS_INOBSD_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (ai->i_bsd_flags & APFS_INOBSD_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;
	if (ai->i_bsd_flags & APFS_INOBSD_COMPRESSED)
		stat->attributes |= STATX_ATTR_COMPRESSED;

	/* Same four attributes that may get set above */
	stat->attributes_mask |= STATX_ATTR_APPEND;
	stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask |= STATX_ATTR_NODUMP;
	stat->attributes_mask |= STATX_ATTR_COMPRESSED;

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
	generic_fillattr(inode, stat);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
	generic_fillattr(mnt_userns, inode, stat);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	generic_fillattr(idmap, inode, stat);
#else
	generic_fillattr(idmap, request_mask, inode, stat);
#endif

	stat->dev = APFS_SB(inode->i_sb)->s_anon_dev;
	stat->ino = apfs_ino(inode);
	return 0;
}

#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) */

/**
 * apfs_build_inode_val - Allocate and initialize the value for an inode record
 * @inode:	vfs inode to record
 * @qname:	filename for primary link
 * @val_p:	on return, a pointer to the new on-disk value structure
 *
 * Returns the length of the value, or a negative error code in case of failure.
 */
static int apfs_build_inode_val(struct inode *inode, const struct qstr *qname,
				struct apfs_inode_val **val_p)
{
	struct apfs_inode_val *val;
	struct apfs_x_field xkey;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0)
	struct timespec64 ts;
#endif
	int total_xlen, val_len;
	bool is_device = S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode);
	__le32 rdev;

	/* The only required xfield is the name, and the id if it's a device */
	total_xlen = sizeof(struct apfs_xf_blob);
	total_xlen += sizeof(xkey) + round_up(qname->len + 1, 8);
	if (is_device)
		total_xlen += sizeof(xkey) + round_up(sizeof(rdev), 8);

	val_len = sizeof(*val) + total_xlen;
	val = kzalloc(val_len, GFP_KERNEL);
	if (!val)
		return -ENOMEM;

	val->parent_id = cpu_to_le64(APFS_I(inode)->i_parent_id);
	val->private_id = cpu_to_le64(apfs_ino(inode));

#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
	val->mod_time = cpu_to_le64(timespec64_to_ns(&inode->i_mtime));
#else
	ts = inode_get_mtime(inode);
	val->mod_time = cpu_to_le64(timespec64_to_ns(&ts));
#endif
	val->create_time = val->change_time = val->access_time = val->mod_time;

	if (S_ISDIR(inode->i_mode))
		val->nchildren = 0;
	else
		val->nlink = cpu_to_le32(1);

	val->owner = cpu_to_le32(i_uid_read(inode));
	val->group = cpu_to_le32(i_gid_read(inode));
	val->mode = cpu_to_le16(inode->i_mode);

	/* The buffer was just allocated: none of these functions should fail */
	apfs_init_xfields(val->xfields, total_xlen);
	xkey.x_type = APFS_INO_EXT_TYPE_NAME;
	xkey.x_flags = APFS_XF_DO_NOT_COPY;
	xkey.x_size = cpu_to_le16(qname->len + 1);
	apfs_insert_xfield(val->xfields, total_xlen, &xkey, qname->name);
	if (is_device) {
		rdev = cpu_to_le32(inode->i_rdev);
		xkey.x_type = APFS_INO_EXT_TYPE_RDEV;
		xkey.x_flags = 0; /* TODO: proper flags here? */
		xkey.x_size = cpu_to_le16(sizeof(rdev));
		apfs_insert_xfield(val->xfields, total_xlen, &xkey, &rdev);
	}

	*val_p = val;
	return val_len;
}

/*
 * apfs_inode_rename - Update the primary name reported in an inode record
 * @inode:	the in-memory inode
 * @new_name:	name of the new primary link (NULL if unchanged)
 * @query:	the query that found the inode record
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_inode_rename(struct inode *inode, char *new_name,
			     struct apfs_query *query)
{
	char *raw = query->node->object.data;
	struct apfs_inode_val *new_val = NULL;
	int buflen, namelen;
	struct apfs_x_field xkey;
	int xlen;
	int err;

	if (!new_name)
		return 0;

	namelen = strlen(new_name) + 1; /* Count the null-termination */
	buflen = query->len;
	buflen += sizeof(struct apfs_x_field) + round_up(namelen, 8);
	new_val = kzalloc(buflen, GFP_KERNEL);
	if (!new_val)
		return -ENOMEM;
	memcpy(new_val, raw + query->off, query->len);

	/* TODO: can we assume that all inode records have an xfield blob? */
	xkey.x_type = APFS_INO_EXT_TYPE_NAME;
	xkey.x_flags = APFS_XF_DO_NOT_COPY;
	xkey.x_size = cpu_to_le16(namelen);
	xlen = apfs_insert_xfield(new_val->xfields, buflen - sizeof(*new_val),
				  &xkey, new_name);
	if (!xlen) {
		/* Buffer has enough space, but the metadata claims otherwise */
		apfs_err(inode->i_sb, "bad xfields on inode 0x%llx", apfs_ino(inode));
		err = -EFSCORRUPTED;
		goto fail;
	}

	/* Just remove the old record and create a new one */
	err = apfs_btree_replace(query, NULL /* key */, 0 /* key_len */, new_val, sizeof(*new_val) + xlen);
	if (err)
		apfs_err(inode->i_sb, "update failed for ino 0x%llx", apfs_ino(inode));

fail:
	kfree(new_val);
	return err;
}

/**
 * apfs_create_dstream_xfield - Create the inode xfield for a new data stream
 * @inode:	the in-memory inode
 * @query:	the query that found the inode record
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_dstream_xfield(struct inode *inode,
				      struct apfs_query *query)
{
	char *raw = query->node->object.data;
	struct apfs_inode_val *new_val;
	struct apfs_dstream dstream_raw = {0};
	struct apfs_x_field xkey;
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	int xlen;
	int buflen;
	int err;

	buflen = query->len;
	buflen += sizeof(struct apfs_x_field) + sizeof(dstream_raw);
	new_val = kzalloc(buflen, GFP_KERNEL);
	if (!new_val)
		return -ENOMEM;
	memcpy(new_val, raw + query->off, query->len);

	dstream_raw.size = cpu_to_le64(inode->i_size);
	dstream_raw.alloced_size = cpu_to_le64(apfs_alloced_size(dstream));
	if (apfs_vol_is_encrypted(inode->i_sb))
		dstream_raw.default_crypto_id = cpu_to_le64(dstream->ds_id);

	/* TODO: can we assume that all inode records have an xfield blob? */
	xkey.x_type = APFS_INO_EXT_TYPE_DSTREAM;
	xkey.x_flags = APFS_XF_SYSTEM_FIELD;
	xkey.x_size = cpu_to_le16(sizeof(dstream_raw));
	xlen = apfs_insert_xfield(new_val->xfields, buflen - sizeof(*new_val),
				  &xkey, &dstream_raw);
	if (!xlen) {
		/* Buffer has enough space, but the metadata claims otherwise */
		apfs_err(inode->i_sb, "bad xfields on inode 0x%llx", apfs_ino(inode));
		err = -EFSCORRUPTED;
		goto fail;
	}

	/* Just remove the old record and create a new one */
	err = apfs_btree_replace(query, NULL /* key */, 0 /* key_len */, new_val, sizeof(*new_val) + xlen);
	if (err)
		apfs_err(inode->i_sb, "update failed for ino 0x%llx", apfs_ino(inode));

fail:
	kfree(new_val);
	return err;
}

/**
 * apfs_inode_resize - Update the sizes reported in an inode record
 * @inode:	the in-memory inode
 * @query:	the query that found the inode record
 *
 * If the record already has a dstream xfield it gets updated in place;
 * otherwise a new xfield is created for it.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_inode_resize(struct inode *inode, struct apfs_query *query)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_inode_val *rec;
	struct apfs_dstream *ds_raw;
	char *node_data;
	char *xf_val;
	int xf_len;
	int ret;

	/* All dstream records must have a matching xfield, even if empty */
	if (!ai->i_has_dstream)
		return 0;

	ret = apfs_query_join_transaction(query);
	if (ret) {
		apfs_err(inode->i_sb, "query join failed");
		return ret;
	}
	node_data = query->node->object.data;
	rec = (void *)node_data + query->off;

	xf_len = apfs_find_xfield(rec->xfields, query->len - sizeof(*rec),
				  APFS_INO_EXT_TYPE_DSTREAM, &xf_val);

	/* This inode has no dstream xfield, so we need to create it */
	if (!xf_len)
		return apfs_create_dstream_xfield(inode, query);

	if (xf_len != sizeof(*ds_raw)) {
		apfs_err(inode->i_sb, "bad xlen (%d) on inode 0x%llx", xf_len, apfs_ino(inode));
		return -EFSCORRUPTED;
	}
	ds_raw = (struct apfs_dstream *)xf_val;

	/* TODO: count bytes read and written */
	ds_raw->size = cpu_to_le64(inode->i_size);
	ds_raw->alloced_size = cpu_to_le64(apfs_alloced_size(&ai->i_dstream));
	return 0;
}

/**
 * apfs_create_sparse_xfield - Create an inode xfield to count sparse bytes
 * @inode:	the in-memory inode
 * @query:	the query that found the inode record
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_create_sparse_xfield(struct inode *inode, struct apfs_query *query)
{
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	char *raw = query->node->object.data;
	struct apfs_inode_val *new_val;
	__le64 sparse_bytes;
	struct apfs_x_field xkey;
	int xlen;
	int buflen;
	int err;

	buflen = query->len;
	buflen += sizeof(struct apfs_x_field) + sizeof(sparse_bytes);
	new_val = kzalloc(buflen, GFP_KERNEL);
	if (!new_val)
		return -ENOMEM;
	memcpy(new_val, raw + query->off, query->len);

	sparse_bytes = cpu_to_le64(dstream->ds_sparse_bytes);

	/* TODO: can we assume that all inode records have an xfield blob? */
	xkey.x_type = APFS_INO_EXT_TYPE_SPARSE_BYTES;
	xkey.x_flags = APFS_XF_SYSTEM_FIELD | APFS_XF_CHILDREN_INHERIT;
	xkey.x_size = cpu_to_le16(sizeof(sparse_bytes));
	xlen = apfs_insert_xfield(new_val->xfields, buflen - sizeof(*new_val), &xkey, &sparse_bytes);
	if (!xlen) {
		/* Buffer has enough space, but the metadata claims otherwise */
		apfs_err(inode->i_sb, "bad xfields on inode 0x%llx", apfs_ino(inode));
		err = -EFSCORRUPTED;
		goto fail;
	}

	/* Just remove the old record and create a new one */
	err = apfs_btree_replace(query, NULL /* key */, 0 /* key_len */, new_val, sizeof(*new_val) + xlen);
	if (err)
		apfs_err(inode->i_sb, "update failed for ino 0x%llx", apfs_ino(inode));

fail:
	kfree(new_val);
	return err;
}

/**
 * apfs_inode_resize_sparse - Update sparse byte count reported in inode record
 * @inode:	the in-memory inode
 * @query:	the query that found the inode record
 *
 * An existing sparse bytes xfield is updated in place. If there is none, a new
 * one is created, but only when the count is not zero.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 *
 * TODO: should the xfield be removed if the count reaches 0? Should the inode
 * flag change?
 */
static int apfs_inode_resize_sparse(struct inode *inode, struct apfs_query *query)
{
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	struct apfs_inode_val *rec;
	__le64 *count_raw;
	char *node_data;
	char *xf_val;
	int xf_len;
	int ret;

	ret = apfs_query_join_transaction(query);
	if (ret) {
		apfs_err(inode->i_sb, "query join failed");
		return ret;
	}
	node_data = query->node->object.data;
	rec = (void *)node_data + query->off;

	xf_len = apfs_find_xfield(rec->xfields, query->len - sizeof(*rec),
				  APFS_INO_EXT_TYPE_SPARSE_BYTES, &xf_val);

	if (!xf_len) {
		/* No xfield yet: only create one if there is something to count */
		if (!dstream->ds_sparse_bytes)
			return 0;
		return apfs_create_sparse_xfield(inode, query);
	}

	if (xf_len != sizeof(*count_raw)) {
		apfs_err(inode->i_sb, "bad xlen (%d) on inode 0x%llx", xf_len, apfs_ino(inode));
		return -EFSCORRUPTED;
	}
	count_raw = (__le64 *)xf_val;

	*count_raw = cpu_to_le64(dstream->ds_sparse_bytes);
	return 0;
}

/**
 * apfs_update_inode - Update an existing inode record
 * @inode:	the modified in-memory inode
 * @new_name:	name of the new primary link (NULL if unchanged)
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
int apfs_update_inode(struct inode *inode, char *new_name)
{
	struct super_block *sb = inode->i_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	struct apfs_query *query;
	struct apfs_btree_node_phys *node_raw;
	struct apfs_inode_val *inode_raw;
	int err;

	err = apfs_flush_extent_cache(dstream);
	if (err) {
		apfs_err(sb, "extent cache flush failed for inode 0x%llx", apfs_ino(inode));
		return err;
	}

	query = apfs_inode_lookup(inode);
	if (IS_ERR(query)) {
		apfs_err(sb, "lookup failed for ino 0x%llx", apfs_ino(inode));
		return PTR_ERR(query);
	}

	/* TODO: copy the record to memory and make all xfield changes there */
	err = apfs_inode_rename(inode, new_name, query);
	if (err) {
		apfs_err(sb, "rename failed for ino 0x%llx", apfs_ino(inode));
		goto fail;
	}

	err = apfs_inode_resize(inode, query);
	if (err) {
		apfs_err(sb, "resize failed for ino 0x%llx", apfs_ino(inode));
		goto fail;
	}

	err = apfs_inode_resize_sparse(inode, query);
	if (err) {
		apfs_err(sb, "sparse resize failed for ino 0x%llx", apfs_ino(inode));
		goto fail;
	}
	if (dstream->ds_sparse_bytes)
		ai->i_int_flags |= APFS_INODE_IS_SPARSE;

	/* TODO: just use apfs_btree_replace()? */
	err = apfs_query_join_transaction(query);
	if (err) {
		apfs_err(sb, "query join failed");
		goto fail;
	}
	node_raw = (void *)query->node->object.data;
	apfs_assert_in_transaction(sb, &node_raw->btn_o);
	inode_raw = (void *)node_raw + query->off;

	inode_raw->parent_id = cpu_to_le64(ai->i_parent_id);
	inode_raw->private_id = cpu_to_le64(dstream->ds_id);
	inode_raw->mode = cpu_to_le16(inode->i_mode);
	inode_raw->owner = cpu_to_le32(i_uid_read(inode));
	inode_raw->group = cpu_to_le32(i_gid_read(inode));
	inode_raw->default_protection_class = cpu_to_le32(ai->i_key_class);
	inode_raw->internal_flags = cpu_to_le64(ai->i_int_flags);
	inode_raw->bsd_flags = cpu_to_le32(ai->i_bsd_flags);

	/* Don't persist the uid/gid provided by the user on mount */
	if (uid_valid(sbi->s_uid))
		inode_raw->owner = cpu_to_le32(ai->i_saved_uid);
	if (gid_valid(sbi->s_gid))
		inode_raw->group = cpu_to_le32(ai->i_saved_gid);

#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	inode_raw->change_time = cpu_to_le64(timespec64_to_ns(&inode->i_ctime));
#else
	struct timespec64 ictime = inode_get_ctime(inode);
	inode_raw->change_time = cpu_to_le64(timespec64_to_ns(&ictime));
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
	inode_raw->access_time = cpu_to_le64(timespec64_to_ns(&inode->i_atime));
	inode_raw->mod_time = cpu_to_le64(timespec64_to_ns(&inode->i_mtime));
#else
	struct timespec64 ts = inode_get_mtime(inode);
	inode_raw->mod_time = cpu_to_le64(timespec64_to_ns(&ts));
	ts = inode_get_atime(inode);
	inode_raw->access_time = cpu_to_le64(timespec64_to_ns(&ts));
#endif
	inode_raw->create_time = cpu_to_le64(timespec64_to_ns(&ai->i_crtime));

	if (S_ISDIR(inode->i_mode)) {
		inode_raw->nchildren = cpu_to_le32(ai->i_nchildren);
	} else {
		/* The remaining link for orphan inodes is not counted */
		inode_raw->nlink = cpu_to_le32(inode->i_nlink);
	}

fail:
	apfs_free_query(query);
	return err;
}

/**
 * apfs_delete_inode - Delete an inode record
 * @inode: the vfs inode to delete
 *
 * Deletes the xattrs, then the extents from the front of the data stream, and
 * finally the dstream record and the inode record itself. On full success the
 * in-memory inode is flagged as cleaned.
 *
 * Returns 0 on success or a negative error code in case of failure, which may
 * be -EAGAIN if the inode was not deleted in full.
 */
static int apfs_delete_inode(struct inode *inode)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	struct super_block *sb = inode->i_sb;
	struct apfs_query *query;
	u64 orig_ds_id;
	int err;

	err = apfs_delete_all_xattrs(inode);
	if (err) {
		apfs_err(sb, "xattr deletion failed for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	/* Remember the id so that a change can be noticed further down */
	orig_ds_id = dstream->ds_id;

	/*
	 * This is very wasteful since all the new extents and references will
	 * get deleted right away, but it only affects clones, so I don't see a
	 * big reason to improve it (TODO)
	 */
	err = apfs_inode_create_exclusive_dstream(inode);
	if (err) {
		apfs_err(sb, "dstream creation failed for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	/* TODO: what about partial deletion of xattrs? Is that allowed? */
	err = apfs_inode_delete_front(inode);
	if (err == -EAGAIN) {
		/*
		 * If the inode had too many extents, only the first few get
		 * deleted and the inode remains in the orphan list for now.
		 * I don't know why the deletion starts at the front, but it
		 * seems to be what the official driver does.
		 */
		if (dstream->ds_id != orig_ds_id) {
			err = apfs_update_inode(inode, NULL /* new_name */);
			if (err) {
				apfs_err(sb, "dstream id update failed for orphan 0x%llx", apfs_ino(inode));
				return err;
			}
		}
		return -EAGAIN;
	}
	if (err) {
		apfs_err(sb, "head deletion failed for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	err = apfs_put_dstream_rec(dstream);
	if (err) {
		apfs_err(sb, "failed to put dstream for ino 0x%llx", apfs_ino(inode));
		return err;
	}
	ai->i_has_dstream = false;

	query = apfs_inode_lookup(inode);
	if (IS_ERR(query)) {
		apfs_err(sb, "lookup failed for ino 0x%llx", apfs_ino(inode));
		return PTR_ERR(query);
	}
	err = apfs_btree_remove(query);
	apfs_free_query(query);
	if (err) {
		apfs_err(sb, "removal failed for ino 0x%llx", apfs_ino(inode));
		return err;
	}

	ai->i_cleaned = true;
	return 0;
}

/**
 * apfs_clean_single_orphan - Clean the given orphan file
 * @inode:	inode for the file to clean
 *
 * Runs the deletion inside its own transaction. The orphan link is only
 * removed once the inode has been deleted in full.
 *
 * Returns 0 on success or a negative error code in case of failure, which may
 * be -EAGAIN if the file could not be deleted in full.
 */
static int apfs_clean_single_orphan(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	u64 ino = apfs_ino(inode);
	bool partial = false;
	int ret;

	ret = apfs_transaction_start(sb, APFS_TRANS_DEL);
	if (ret)
		return ret;

	ret = apfs_delete_inode(inode);
	if (ret == -EAGAIN) {
		/* Commit what was done so far and report it after that */
		partial = true;
	} else if (ret) {
		apfs_err(sb, "failed to delete orphan 0x%llx", ino);
		goto abort;
	} else {
		ret = apfs_delete_orphan_link(inode);
		if (ret) {
			apfs_err(sb, "failed to unlink orphan 0x%llx", ino);
			goto abort;
		}
	}

	ret = apfs_transaction_commit(sb);
	if (ret)
		goto abort;
	if (partial)
		return -EAGAIN;
	return 0;

abort:
	apfs_transaction_abort(sb);
	return ret;
}

/**
 * apfs_clean_any_orphan - Pick an orphan and delete as much as reasonable
 * @sb:		filesystem superblock
 *
 * An orphan that has other references to its inode is left alone.
 *
 * Returns 0 on success, or a negative error code in case of failure, which may
 * be -ENODATA if there are no more orphan files or -EAGAIN if a file could not
 * be deleted in full.
 */
static int apfs_clean_any_orphan(struct super_block *sb)
{
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct inode *orphan;
	u64 ino;
	int ret;

	down_read(&nxi->nx_big_sem);
	ret = apfs_any_orphan_ino(sb, &ino);
	up_read(&nxi->nx_big_sem);
	if (ret == -ENODATA)
		return -ENODATA;
	if (ret) {
		apfs_err(sb, "failed to find orphan inode numbers");
		return ret;
	}

	orphan = apfs_iget(sb, ino);
	if (IS_ERR(orphan)) {
		ret = PTR_ERR(orphan);
		if (ret == -ENODATA) {
			/*
			 * This happens rarely for files with no extents, if we
			 * hit a race with ->evict_inode(). Not a problem: the
			 * file is gone.
			 */
			apfs_notice(sb, "orphan 0x%llx not found", ino);
			return 0;
		}
		apfs_err(sb, "iget failed for orphan 0x%llx", ino);
		return ret;
	}

	/* Only clean the orphan if ours is the single reference */
	if (atomic_read(&orphan->i_count) <= 1) {
		ret = apfs_clean_single_orphan(orphan);
		if (ret && ret != -EAGAIN)
			apfs_err(sb, "failed to clean orphan 0x%llx", ino);
	}

	iput(orphan);
	return ret;
}

/**
 * apfs_schedule_orphan_cleanup - Schedule cleanup for orphan inodes
 * @sb: filesystem superblock
 *
 * Queues the superblock's orphan cleanup work, unless the filesystem is being
 * unmounted or a previous cleanup left an error recorded.
 */
void apfs_schedule_orphan_cleanup(struct super_block *sb)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);

	/*
	 * Two reasons to skip the scheduling:
	 *
	 * - During unmount, completing all of the cleanup could take a while,
	 *   so just leave future mounts to handle the orphans.
	 * - After an unexpected error, retrying nonstop won't do any good and
	 *   it will flood dmesg. We will retry eventually for ENOSPC, but
	 *   that's handled elsewhere.
	 */
	if (atomic_read(&sb->s_active) == 0 ||
	    atomic_read(&sbi->s_orphan_cleanup_err))
		return;

	schedule_work(&sbi->s_orphan_cleanup_work);
}

/**
 * apfs_clean_orphans - Delete as many orphan files as is reasonable
 * @sb: filesystem superblock
 *
 * Makes up to 100 cleanup attempts. If orphans may remain after that, or if
 * one of them could not be deleted in full, another cleanup gets scheduled.
 * Unexpected errors are recorded in the superblock info.
 *
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_clean_orphans(struct super_block *sb)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	int attempt, err;

	for (attempt = 0; attempt < 100; ++attempt) {
		err = apfs_clean_any_orphan(sb);
		switch (err) {
		case 0:
			continue;
		case -ENODATA:
			/* No orphans left */
			return 0;
		case -EAGAIN:
			goto reschedule;
		default:
			apfs_err(sb, "failed to delete an orphan file");
			atomic_set(&sbi->s_orphan_cleanup_err, err);
			return err;
		}
	}

reschedule:
	/*
	 * If a file is too big, or if there are too many files, take a break
	 * and continue later.
	 */
	apfs_schedule_orphan_cleanup(sb);
	return 0;
}

void apfs_evict_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct apfs_inode_info *ai = APFS_I(inode);
	int err;

	if (is_bad_inode(inode) || inode->i_nlink || ai->i_cleaned)
		goto out;

	if (!ai->i_has_dstream || ai->i_dstream.ds_size == 0) {
		/* For files with no extents, scheduled cleanup wastes time */
		err = apfs_clean_single_orphan(inode);
		if (err) {
			apfs_err(sb, "failed to clean orphan 0x%llx (err:%d)", apfs_ino(inode), err);
			atomic_set(&APFS_SB(sb)->s_orphan_cleanup_err, err);
		}
		goto out;
	}

	/* If the inode still has extents then schedule cleanup for the rest */
	apfs_schedule_orphan_cleanup(sb);
out:
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
}

/**
 * apfs_orphan_cleanup_work - Work function for the cleanup of orphan inodes
 * @work: the s_orphan_cleanup_work member of the superblock info
 *
 * Refuses to run on read-only mounts. Failures are only logged.
 */
void apfs_orphan_cleanup_work(struct work_struct *work)
{
	struct apfs_sb_info *sbi = container_of(work, struct apfs_sb_info, s_orphan_cleanup_work);
	struct super_block *sb = sbi->s_private_dir->i_sb;
	int ret;

	if (sb->s_flags & SB_RDONLY) {
		apfs_alert(sb, "attempt to flush orphans in read-only mount");
		return;
	}

	ret = apfs_clean_orphans(sb);
	if (ret)
		apfs_err(sb, "orphan cleanup failed (err:%d)", ret);
}

/**
 * apfs_insert_inode_locked - Wrapper for insert_inode_locked4()
 * @inode: vfs inode to insert in cache
 *
 * Works the same as insert_inode_locked(), but can handle 64-bit inode numbers
 * on 32-bit architectures.
 *
 * Returns whatever insert_inode_locked4() returns.
 */
static int apfs_insert_inode_locked(struct inode *inode)
{
	u64 ino = apfs_ino(inode);
	int ret;

	/* The full 64-bit number is matched through the test callback */
	ret = insert_inode_locked4(inode, ino, apfs_test_inode, &ino);
	return ret;
}

/**
 * apfs_new_inode - Create a new in-memory inode
 * @dir:	parent inode
 * @mode:	mode bits for the new inode
 * @rdev:	device id (0 if not a device file)
 *
 * Takes the next object id from the volume superblock as the inode number,
 * initializes the in-memory inode and inserts it locked into the inode cache.
 * The object counters and the modification time of the volume superblock are
 * updated as well, so the caller must be inside a transaction.
 *
 * Returns a pointer to the new vfs inode on success, or an error pointer in
 * case of failure.
 */
struct inode *apfs_new_inode(struct inode *dir, umode_t mode, dev_t rdev)
{
	struct super_block *sb = dir->i_sb;
	struct apfs_superblock *vsb_raw = APFS_SB(sb)->s_vsb_raw;
	struct inode *inode;
	struct apfs_inode_info *ai;
	struct apfs_dstream_info *dstream;
	u64 cnid;
	struct timespec64 now;

	/* Updating on-disk structures here is odd, but it works for now */
	apfs_assert_in_transaction(sb, &vsb_raw->apfs_o);

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ai = APFS_I(inode);
	dstream = &ai->i_dstream;

	/* Consume the next object id of the volume as the inode number */
	cnid = le64_to_cpu(vsb_raw->apfs_next_obj_id);
	le64_add_cpu(&vsb_raw->apfs_next_obj_id, 1);
	apfs_set_ino(inode, cnid);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
	inode_init_owner(inode, dir, mode);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
	inode_init_owner(&init_user_ns, inode, dir, mode);
#else
	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
#endif

	/* Keep a copy of the ownership that was just set on the inode */
	ai->i_saved_uid = i_uid_read(inode);
	ai->i_saved_gid = i_gid_read(inode);
	ai->i_parent_id = apfs_ino(dir);
	set_nlink(inode, 1);
	ai->i_nchildren = 0;
	/* Only regular files in encrypted volumes get a key class */
	if (apfs_vol_is_encrypted(sb) && S_ISREG(mode))
		ai->i_key_class = apfs_dflt_key_class(sb);
	else
		ai->i_key_class = 0;
	ai->i_int_flags = APFS_INODE_NO_RSRC_FORK;
	ai->i_bsd_flags = 0;

	/* The data stream starts out empty, with the same id as the inode */
	ai->i_has_dstream = false;
	dstream->ds_id = cnid;
	dstream->ds_size = 0;
	dstream->ds_sparse_bytes = 0;
	dstream->ds_shared = false;

	/* All the timestamps of the inode get initialized to the current time */
	now = current_time(inode);
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	inode->i_atime = inode->i_mtime = inode->i_ctime = ai->i_crtime = now;
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
	inode_set_ctime_to_ts(inode, now);
	inode->i_atime = inode->i_mtime = ai->i_crtime = now;
#else
	ai->i_crtime = simple_inode_init_ts(inode);
#endif
	vsb_raw->apfs_last_mod_time = cpu_to_le64(timespec64_to_ns(&now));

	/* Account for the new object in the matching volume counter */
	if (S_ISREG(mode))
		le64_add_cpu(&vsb_raw->apfs_num_files, 1);
	else if (S_ISDIR(mode))
		le64_add_cpu(&vsb_raw->apfs_num_directories, 1);
	else if (S_ISLNK(mode))
		le64_add_cpu(&vsb_raw->apfs_num_symlinks, 1);
	else
		le64_add_cpu(&vsb_raw->apfs_num_other_fsobjects, 1);

	if (apfs_insert_inode_locked(inode)) {
		/* The inode number should have been free, but wasn't */
		apfs_err(sb, "next obj_id (0x%llx) not free", cnid);
		make_bad_inode(inode);
		iput(inode);
		return ERR_PTR(-EFSCORRUPTED);
	}

	/* No need to dirty the inode, we'll write it to disk right away */
	apfs_inode_set_ops(inode, rdev, false /* compressed */);
	return inode;
}

/**
 * apfs_create_inode_rec - Create an inode record in the catalog b-tree
 * @sb:		filesystem superblock
 * @inode:	vfs inode to record
 * @dentry:	dentry for primary link
 *
 * Returns 0 on success or a negative error code in case of failure.
 */
int apfs_create_inode_rec(struct super_block *sb, struct inode *inode,
			  struct dentry *dentry)
{
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_query *query;
	struct apfs_inode_key raw_key;
	struct apfs_inode_val *raw_val;
	int val_len;
	int ret;

	query = apfs_alloc_query(sbi->s_cat_root, NULL /* parent */);
	if (!query)
		return -ENOMEM;
	apfs_init_inode_key(apfs_ino(inode), &query->key);
	query->flags |= APFS_QUERY_CAT;

	ret = apfs_btree_query(sb, &query);
	if (ret && ret != -ENODATA) {
		apfs_err(sb, "query failed for ino 0x%llx", apfs_ino(inode));
		goto fail;
	}

	apfs_key_set_hdr(APFS_TYPE_INODE, apfs_ino(inode), &raw_key);

	val_len = apfs_build_inode_val(inode, &dentry->d_name, &raw_val);
	if (val_len < 0) {
		ret = val_len;
		goto fail;
	}

	ret = apfs_btree_insert(query, &raw_key, sizeof(raw_key), raw_val, val_len);
	if (ret)
		apfs_err(sb, "insertion failed for ino 0x%llx", apfs_ino(inode));
	kfree(raw_val);

fail:
	apfs_free_query(query);
	return ret;
}

/**
 * apfs_setsize - Change the size of a regular file
 * @inode:	the vfs inode
 * @new_size:	the new size
 *
 * Does nothing if the size is unchanged. Otherwise the mtime and ctime are set
 * to the current time, a dstream record gets created for the inode, and the
 * data stream is truncated before the vfs size is updated.
 *
 * Returns 0 on success or a negative error code in case of failure.
 */
static int apfs_setsize(struct inode *inode, loff_t new_size)
{
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	struct super_block *sb = inode->i_sb;
	int ret;

	if (inode->i_size == new_size)
		return 0;

#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	inode->i_mtime = inode->i_ctime = current_time(inode);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 7, 0)
	inode->i_mtime = inode_set_ctime_current(inode);
#else
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
#endif

	ret = apfs_inode_create_dstream_rec(inode);
	if (ret) {
		apfs_err(sb, "failed to create dstream for ino 0x%llx", apfs_ino(inode));
		return ret;
	}

	/* Must be called before i_size is changed */
	ret = apfs_truncate(dstream, new_size);
	if (ret) {
		apfs_err(sb, "truncation failed for ino 0x%llx", apfs_ino(inode));
		return ret;
	}

	/* Update the vfs size first and then mirror it in the dstream info */
	truncate_setsize(inode, new_size);
	dstream->ds_size = i_size_read(inode);
	return 0;
}

/*
 * apfs_setattr - Change the attributes of an inode
 *
 * Validates the request, resizes the file if needed, copies the rest of the
 * attributes to the inode and commits it all in a single transaction. The
 * prototype depends on the kernel version. Returns 0 on success or a negative
 * error code in case of failure, which is -EFBIG if the requested size is
 * above APFS_MAX_FILE_SIZE.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
int apfs_setattr(struct dentry *dentry, struct iattr *iattr)
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
int apfs_setattr(struct user_namespace *mnt_userns,
		 struct dentry *dentry, struct iattr *iattr)
#else
int apfs_setattr(struct mnt_idmap *idmap,
		 struct dentry *dentry, struct iattr *iattr)
#endif
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;
	/* Size changes are only acted upon for regular files */
	bool resizing = S_ISREG(inode->i_mode) && (iattr->ia_valid & ATTR_SIZE);
	bool shrinking = false;
	int err;

	if (resizing && iattr->ia_size > APFS_MAX_FILE_SIZE)
		return -EFBIG;
	if (resizing && iattr->ia_size < inode->i_size)
		shrinking = true;

	/* The caller's idmap/namespace is not used for the checks */
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
	err = setattr_prepare(dentry, iattr);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
	err = setattr_prepare(&init_user_ns, dentry, iattr);
#else
	err = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
#endif
	if (err)
		return err;

	/* TODO: figure out why ->write_inode() isn't firing */
	/* Shrinking a file is run as a deletion transaction */
	err = apfs_transaction_start(sb, shrinking ? APFS_TRANS_DEL : APFS_TRANS_REG);
	if (err)
		return err;
	apfs_inode_join_transaction(sb, inode);

	if (resizing) {
		err = apfs_setsize(inode, iattr->ia_size);
		if (err) {
			apfs_err(sb, "setsize failed for ino 0x%llx", apfs_ino(inode));
			goto fail;
		}
	}

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
	setattr_copy(inode, iattr);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)
	setattr_copy(&init_user_ns, inode, iattr);
#else
	setattr_copy(&nop_mnt_idmap, inode, iattr);
#endif

	mark_inode_dirty(inode);
	err = apfs_transaction_commit(sb);
	if (err)
		goto fail;
	return 0;

fail:
	/* Reached for both a failed resize and a failed commit */
	apfs_transaction_abort(sb);
	return err;
}

/*
 * apfs_update_time - Update the timestamps of an inode inside a transaction
 *
 * Joins the inode to a new transaction, lets generic_update_time() do the
 * actual update, and commits. The prototype depends on the kernel version.
 * Returns 0 on success or a negative error code in case of failure.
 *
 * TODO: this only seems to be necessary because ->write_inode() isn't firing
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(7, 0, 0)
int apfs_update_time(struct inode *inode, enum fs_update_time time, unsigned int flags)
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
int apfs_update_time(struct inode *inode, struct timespec64 *time, int flags)
#else
int apfs_update_time(struct inode *inode, int flags)
#endif
{
	struct super_block *sb = inode->i_sb;
	int err;

	err = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (err)
		return err;
	apfs_inode_join_transaction(sb, inode);

	/*
	 * NOTE(review): the prototype above only checks for < 6.6.0, but the
	 * call below also checks !RHEL_VERSION_GE(9, 6). On a RHEL >= 9.6
	 * kernel older than 6.6 the function takes @time and still calls the
	 * two-argument generic_update_time() - confirm that this mismatch
	 * between the two conditions is intended.
	 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(7, 0, 0)
	generic_update_time(inode, time, flags);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && !RHEL_VERSION_GE(9, 6)
	generic_update_time(inode, time, flags);
#else
	generic_update_time(inode, flags);
#endif

	err = apfs_transaction_commit(sb);
	if (err)
		goto fail;
	return 0;

fail:
	apfs_transaction_abort(sb);
	return err;
}

/**
 * apfs_ioc_set_dflt_pfk - Replace the default pfk kept in the superblock info
 * @file:	file that the ioctl was issued on
 * @user_pfk:	userspace buffer with a wrapped crypto state header, followed
 *		by as many key bytes as the header's key_len field claims
 *
 * The previous default, if any, is freed. The swap is done with nx_big_sem
 * held for writing.
 *
 * Returns 0 on success, -EFAULT if the user buffer can't be read, -EFBIG if
 * the key is longer than MAX_PFK_LEN, or -ENOMEM.
 */
static int apfs_ioc_set_dflt_pfk(struct file *file, void __user *user_pfk)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct apfs_sb_info *sbi = APFS_SB(sb);
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct apfs_wrapped_crypto_state pfk_hdr;
	struct apfs_crypto_state_val *pfk;
	unsigned int key_len;

	/*
	 * The pointer comes straight from userspace, so use the checked
	 * copy_from_user() and not the variant that skips access_ok().
	 */
	if (copy_from_user(&pfk_hdr, user_pfk, sizeof(pfk_hdr)))
		return -EFAULT;
	key_len = le16_to_cpu(pfk_hdr.key_len);
	if (key_len > MAX_PFK_LEN)
		return -EFBIG;
	pfk = kmalloc(sizeof(*pfk) + key_len, GFP_KERNEL);
	if (!pfk)
		return -ENOMEM;
	if (copy_from_user(&pfk->state, user_pfk, sizeof(pfk_hdr) + key_len)) {
		kfree(pfk);
		return -EFAULT;
	}
	/*
	 * Userspace could have changed the header between the two reads, so
	 * restore the copy that was validated: the stored key_len must never
	 * exceed the size of the allocation.
	 */
	memcpy(&pfk->state, &pfk_hdr, sizeof(pfk_hdr));
	pfk->refcnt = cpu_to_le32(1);

	down_write(&nxi->nx_big_sem);

	/* kfree() ignores NULL, so there is no need to check for an old key */
	kfree(sbi->s_dflt_pfk);
	sbi->s_dflt_pfk = pfk;

	up_write(&nxi->nx_big_sem);

	return 0;
}

/**
 * apfs_ioc_set_dir_class - Set the key class of an inode from userspace
 * @file:	file that the ioctl was issued on
 * @user_class:	userspace pointer to the new class
 *
 * The in-memory class is changed first, and then the inode goes through a
 * transaction of its own.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_ioc_set_dir_class(struct file *file, u32 __user *user_class)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	u32 new_class;
	int ret;

	if (get_user(new_class, user_class))
		return -EFAULT;

	APFS_I(inode)->i_key_class = new_class;

	ret = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (ret)
		return ret;
	apfs_inode_join_transaction(sb, inode);

	ret = apfs_transaction_commit(sb);
	if (ret)
		apfs_transaction_abort(sb);
	return ret;
}

/**
 * apfs_ioc_set_pfk - Ioctl handler for APFS_IOC_SET_PFK
 * @file:	affected file
 * @user_pfk:	userspace buffer holding a wrapped crypto state (header + key)
 *
 * Hands the caller's crypto state to apfs_crypto_set_key() for the file's
 * data stream id and, if the persistent class in the header differs from the
 * inode's current key class, updates the inode as well. All of it happens
 * inside a single transaction.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_ioc_set_pfk(struct file *file, void __user *user_pfk)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct apfs_wrapped_crypto_state pfk_hdr;
	struct apfs_crypto_state_val *pfk;
	struct apfs_inode_info *ai = APFS_I(inode);
	struct apfs_dstream_info *dstream = &ai->i_dstream;
	const u8 __user *user_key = (const u8 __user *)user_pfk + sizeof(pfk_hdr);
	unsigned int key_len, key_class;
	int err;

	/*
	 * The pointer comes straight from the ioctl argument, so use the
	 * checked accessor: the __copy variant skips the access_ok() test.
	 */
	if (copy_from_user(&pfk_hdr, user_pfk, sizeof(pfk_hdr)))
		return -EFAULT;
	key_len = le16_to_cpu(pfk_hdr.key_len);
	if (key_len > MAX_PFK_LEN)
		return -EFBIG;
	pfk = kmalloc(sizeof(*pfk) + key_len, GFP_KERNEL);
	if (!pfk)
		return -ENOMEM;

	/*
	 * Reuse the header that was already validated and only read the key
	 * bytes now. Fetching the header a second time would let a racing
	 * userspace thread change key_len or the class after they were
	 * checked, so the stored state could disagree with the allocation
	 * size and with the class applied to the inode below.
	 */
	memcpy(&pfk->state, &pfk_hdr, sizeof(pfk_hdr));
	if (copy_from_user((u8 *)&pfk->state + sizeof(pfk_hdr), user_key, key_len)) {
		kfree(pfk);
		return -EFAULT;
	}
	pfk->refcnt = cpu_to_le32(1);

	err = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (err) {
		kfree(pfk);
		return err;
	}

	err = apfs_crypto_set_key(sb, dstream->ds_id, pfk);
	if (err)
		goto fail;

	/* The inode only needs to be rewritten if its class has changed */
	key_class = le32_to_cpu(pfk_hdr.persistent_class);
	if (ai->i_key_class != key_class) {
		ai->i_key_class = key_class;
		apfs_inode_join_transaction(sb, inode);
	}

	err = apfs_transaction_commit(sb);
	if (err)
		goto fail;
	/* The buffer is ours to free on every path, success included */
	kfree(pfk);
	return 0;

fail:
	apfs_transaction_abort(sb);
	kfree(pfk);
	return err;
}

/**
 * apfs_ioc_get_class - Ioctl handler for APFS_IOC_GET_CLASS
 * @file:	queried file
 * @user_class:	userspace location that receives the inode's key class
 *
 * Returns 0 on success, or -EFAULT if the value can't be written out.
 */
static int apfs_ioc_get_class(struct file *file, u32 __user *user_class)
{
	u32 key_class = APFS_I(file_inode(file))->i_key_class;

	return put_user(key_class, user_class) ? -EFAULT : 0;
}

/**
 * apfs_ioc_get_pfk - Ioctl handler for APFS_IOC_GET_PFK
 * @file:	queried file
 * @user_pfk:	userspace buffer for the wrapped crypto state; on entry, the
 *		key_len field of its header gives the room available for the key
 *
 * Looks up the crypto state for the file's data stream id with
 * apfs_crypto_get_key() and copies its header and key back to the caller.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_ioc_get_pfk(struct file *file, void __user *user_pfk)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct apfs_nxsb_info *nxi = APFS_NXI(sb);
	struct apfs_wrapped_crypto_state pfk_hdr;
	struct apfs_crypto_state_val *pfk;
	unsigned int max_len, key_len;
	struct apfs_dstream_info *dstream = &APFS_I(inode)->i_dstream;
	int err;

	/*
	 * The pointer comes straight from the ioctl argument, so use the
	 * checked accessors: the __copy variants skip the access_ok() test.
	 */
	if (copy_from_user(&pfk_hdr, user_pfk, sizeof(pfk_hdr)))
		return -EFAULT;
	max_len = le16_to_cpu(pfk_hdr.key_len);
	if (max_len > MAX_PFK_LEN)
		return -EFBIG;
	pfk = kmalloc(sizeof(*pfk) + max_len, GFP_KERNEL);
	if (!pfk)
		return -ENOMEM;

	/* The lock is only needed around the lookup itself */
	down_read(&nxi->nx_big_sem);
	err = apfs_crypto_get_key(sb, dstream->ds_id, pfk, max_len);
	up_read(&nxi->nx_big_sem);
	if (err)
		goto out;

	/*
	 * NOTE(review): this trusts apfs_crypto_get_key() to never report a
	 * key_len above max_len; confirm, since the copy below would read
	 * past the allocation otherwise.
	 */
	key_len = le16_to_cpu(pfk->state.key_len);
	if (copy_to_user(user_pfk, &pfk->state, sizeof(pfk_hdr) + key_len))
		err = -EFAULT;

out:
	kfree(pfk);
	return err;
}

/*
 * Older kernels have no vfs_ioc_setflags_prepare(), so don't implement the
 * SETFLAGS/GETFLAGS ioctls there. It should be easy to fix, but it's not
 * really needed at all. Be careful with this macro check, because it nests
 * over a few others.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)

/**
 * apfs_getflags - Translate an inode's bsd flags to FS_IOC_GETFLAGS format
 * @inode: the vfs inode
 *
 * Only the append, immutable and nodump flags get reported.
 */
static unsigned int apfs_getflags(struct inode *inode)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	unsigned int fs_flags;

	fs_flags = (ai->i_bsd_flags & APFS_INOBSD_APPEND) ? FS_APPEND_FL : 0;
	fs_flags |= (ai->i_bsd_flags & APFS_INOBSD_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
	fs_flags |= (ai->i_bsd_flags & APFS_INOBSD_NODUMP) ? FS_NODUMP_FL : 0;

	return fs_flags;
}

/**
 * apfs_setflags - Apply FS_IOC_SETFLAGS flags to an inode
 * @inode: the vfs inode
 * @flags: requested flags, in FS_IOC_SETFLAGS format
 *
 * Sets or clears the append, immutable and nodump bsd flags to match @flags,
 * and mirrors the first two in the vfs inode flags.
 */
static void apfs_setflags(struct inode *inode, unsigned int flags)
{
	struct apfs_inode_info *ai = APFS_I(inode);
	unsigned int vfs_flags = 0;

	/* Nodump has no vfs counterpart, so only two flags are mirrored */
	if (flags & FS_APPEND_FL)
		vfs_flags |= S_APPEND;
	if (flags & FS_IMMUTABLE_FL)
		vfs_flags |= S_IMMUTABLE;

	if (flags & FS_APPEND_FL)
		ai->i_bsd_flags |= APFS_INOBSD_APPEND;
	else
		ai->i_bsd_flags &= ~APFS_INOBSD_APPEND;

	if (flags & FS_IMMUTABLE_FL)
		ai->i_bsd_flags |= APFS_INOBSD_IMMUTABLE;
	else
		ai->i_bsd_flags &= ~APFS_INOBSD_IMMUTABLE;

	if (flags & FS_NODUMP_FL)
		ai->i_bsd_flags |= APFS_INOBSD_NODUMP;
	else
		ai->i_bsd_flags &= ~APFS_INOBSD_NODUMP;

	inode_set_flags(inode, vfs_flags, S_IMMUTABLE | S_APPEND);
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0)

/**
 * apfs_ioc_getflags - Handle the FS_IOC_GETFLAGS ioctl
 * @file:	queried file
 * @arg:	userspace location that receives the flags
 *
 * Returns the result of put_user(): 0 on success, or a negative error code
 * in case of failure.
 */
static int apfs_ioc_getflags(struct file *file, int __user *arg)
{
	struct inode *inode = file_inode(file);
	unsigned int flags;

	flags = apfs_getflags(inode);
	return put_user(flags, arg);
}

/**
 * apfs_do_ioc_setflags - Change the flags of a locked inode
 * @inode:	affected vfs inode
 * @newflags:	inode flags to set, in FS_IOC_SETFLAGS format
 *
 * Does the real work for apfs_ioc_setflags(). The caller must hold the inode
 * lock for writing. The change is committed in its own transaction.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_do_ioc_setflags(struct inode *inode, unsigned int newflags)
{
	struct super_block *sb = inode->i_sb;
	int ret;

	lockdep_assert_held_write(&inode->i_rwsem);

	/* Let the vfs compare the current and requested flags first */
	ret = vfs_ioc_setflags_prepare(inode, apfs_getflags(inode), newflags);
	if (ret)
		return ret;

	ret = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (ret)
		return ret;

	apfs_inode_join_transaction(sb, inode);
	apfs_setflags(inode, newflags);
	inode->i_ctime = current_time(inode);

	ret = apfs_transaction_commit(sb);
	if (!ret)
		return 0;

	apfs_transaction_abort(sb);
	return ret;
}

/**
 * apfs_ioc_setflags - Handle the FS_IOC_SETFLAGS ioctl
 * @file:	affected file
 * @arg:	userspace location of the requested flags
 *
 * Rejects read-only mounts, callers that fail inode_owner_or_capable(), and
 * any flag other than append, immutable or nodump; then does the change under
 * the inode lock with write access to the mount.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
static int apfs_ioc_setflags(struct file *file, int __user *arg)
{
	const unsigned int supported = FS_APPEND_FL | FS_IMMUTABLE_FL | FS_NODUMP_FL;
	struct inode *inode = file_inode(file);
	unsigned int newflags;
	int err;

	if (inode->i_sb->s_flags & SB_RDONLY)
		return -EROFS;

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
	if (!inode_owner_or_capable(inode))
		return -EPERM;
#else
	if (!inode_owner_or_capable(&init_user_ns, inode))
		return -EPERM;
#endif

	if (get_user(newflags, arg))
		return -EFAULT;
	if ((newflags & supported) != newflags)
		return -EOPNOTSUPP;

	err = mnt_want_write_file(file);
	if (err)
		return err;

	inode_lock(inode);
	err = apfs_do_ioc_setflags(inode, newflags);
	inode_unlock(inode);
	mnt_drop_write_file(file);

	return err;
}

#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) && !RHEL_VERSION_GE(9, 6)

/**
 * apfs_fileattr_get - Report an inode's flags through a fileattr structure
 * @dentry:	dentry for the queried inode
 * @fa:		structure to fill
 *
 * Always returns 0.
 */
int apfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);

	fileattr_fill_flags(fa, apfs_getflags(inode));
	return 0;
}

/**
 * apfs_fileattr_set - Change an inode's flags from a fileattr structure
 * @mnt_userns:	user namespace of the mount (unused)
 * @dentry:	dentry for the affected inode
 * @fa:		requested attributes
 *
 * Only the append, immutable and nodump flags are supported, and fsx
 * attributes are rejected. The inode lock is expected to be held for writing.
 * The change is committed in its own transaction.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
int apfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa)
{
	const unsigned int supported = FS_APPEND_FL | FS_IMMUTABLE_FL | FS_NODUMP_FL;
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;
	int ret;

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;
	if ((fa->flags & ~supported) || fileattr_has_fsx(fa))
		return -EOPNOTSUPP;

	lockdep_assert_held_write(&inode->i_rwsem);

	ret = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (ret)
		return ret;

	apfs_inode_join_transaction(sb, inode);
	apfs_setflags(inode, fa->flags);
	inode->i_ctime = current_time(inode);

	ret = apfs_transaction_commit(sb);
	if (!ret)
		return 0;

	apfs_transaction_abort(sb);
	return ret;
}

#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) */

/**
 * apfs_fileattr_get - Report an inode's flags through a fileattr structure
 * @dentry:	dentry for the queried inode
 * @fa:		structure to fill
 *
 * Always returns 0.
 */
int apfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);

	fileattr_fill_flags(fa, apfs_getflags(inode));
	return 0;
}

/**
 * apfs_fileattr_set - Change an inode's flags from a fileattr structure
 * @idmap:	idmap of the mount (unused)
 * @dentry:	dentry for the affected inode
 * @fa:		requested attributes
 *
 * Only the append, immutable and nodump flags are supported, and fsx
 * attributes are rejected. The inode lock is expected to be held for writing.
 * The change is committed in its own transaction.
 *
 * Returns 0 on success, or a negative error code in case of failure.
 */
int apfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa)
{
	const unsigned int supported = FS_APPEND_FL | FS_IMMUTABLE_FL | FS_NODUMP_FL;
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;
	int ret;

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;
	if ((fa->flags & ~supported) || fileattr_has_fsx(fa))
		return -EOPNOTSUPP;

	lockdep_assert_held_write(&inode->i_rwsem);

	ret = apfs_transaction_start(sb, APFS_TRANS_REG);
	if (ret)
		return ret;

	apfs_inode_join_transaction(sb, inode);
	apfs_setflags(inode, fa->flags);
	/*
	 * NOTE(review): this variant is also built for RHEL >= 9.6 on older
	 * kernel versions; confirm that the plain version check is right there.
	 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
	inode->i_ctime = current_time(inode);
#else
	inode_set_ctime_current(inode);
#endif

	ret = apfs_transaction_commit(sb);
	if (!ret)
		return 0;

	apfs_transaction_abort(sb);
	return ret;
}

#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) */

#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) */

long apfs_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;

	switch (cmd) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
	case FS_IOC_GETFLAGS:
		return apfs_ioc_getflags(file, argp);
	case FS_IOC_SETFLAGS:
		return apfs_ioc_setflags(file, argp);
#endif
	case APFS_IOC_SET_DFLT_PFK:
		return apfs_ioc_set_dflt_pfk(file, argp);
	case APFS_IOC_SET_DIR_CLASS:
		return apfs_ioc_set_dir_class(file, argp);
	case APFS_IOC_GET_CLASS:
		return apfs_ioc_get_class(file, argp);
	case APFS_IOC_TAKE_SNAPSHOT:
		return apfs_ioc_take_snapshot(file, argp);
	default:
		return -ENOTTY;
	}
}

long apfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;

	switch (cmd) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
	case FS_IOC_GETFLAGS:
		return apfs_ioc_getflags(file, argp);
	case FS_IOC_SETFLAGS:
		return apfs_ioc_setflags(file, argp);
#endif
	case APFS_IOC_SET_PFK:
		return apfs_ioc_set_pfk(file, argp);
	case APFS_IOC_GET_CLASS:
		return apfs_ioc_get_class(file, argp);
	case APFS_IOC_GET_PFK:
		return apfs_ioc_get_pfk(file, argp);
	default:
		return -ENOTTY;
	}
}