// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2019 Ernesto A. Fernández */ #include #include #include "apfs.h" /** * apfs_checkpoint_end - End the new checkpoint * @sb: filesystem superblock * * Flushes all changes to disk, and commits the new checkpoint by setting the * fletcher checksum on its superblock. Returns 0 on success, or a negative * error code in case of failure. */ static int apfs_checkpoint_end(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_obj_phys *obj = &nxi->nx_raw->nx_o; struct buffer_head *bh = NULL; struct apfs_blkdev_info *bd_info = nxi->nx_blkdev_info; #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0) struct address_space *bdev_map = bd_info->blki_bdev->bd_mapping; #else struct inode *bdev_inode = bd_info->blki_bdev->bd_inode; struct address_space *bdev_map = bdev_inode->i_mapping; #endif int err; ASSERT(!(sb->s_flags & SB_RDONLY)); bh = apfs_getblk(sb, nxi->nx_bno); if (!bh) { apfs_err(sb, "failed to map new checkpoint superblock"); return -EIO; } obj->o_xid = cpu_to_le64(nxi->nx_xid); apfs_obj_set_csum(sb, obj); memcpy(bh->b_data, obj, sb->s_blocksize); err = filemap_write_and_wait(bdev_map); if (err) goto out; mark_buffer_dirty(bh); err = sync_dirty_buffer(bh); if (err) goto out; err = filemap_write_and_wait(bdev_map); out: brelse(bh); bh = NULL; return err; } /** * apfs_transaction_has_room - Is there enough free space for this transaction? * @sb: superblock structure * @kind: transaction kind for space preallocation */ static bool apfs_transaction_has_room(struct super_block *sb, enum apfs_trans_kind kind) { struct apfs_nx_transaction *trans = NULL; struct apfs_spaceman *sm = NULL; u64 max_blks; trans = &APFS_NXI(sb)->nx_transaction; sm = APFS_SM(sb); /* * It's hard to keep track of the maximum number of blocks that each * operation could need because of the complexities of apfs btrees, * plus snapshots and clones. I try to keep things simple here. */ switch (kind) { case APFS_TRANS_REG: /* * For transactions that are expected to reduce free space, we * use a very coarse bound (512 KiB) that is certain to be much * more than enough, and will always leave room for deletions. */ max_blks = APFS_REG_ROOM; break; case APFS_TRANS_DEL: /* * For transactions that are likely to increase free space, we * use a much tighter bound (80 KiB), so that users have the * opportunity to fix their ENOSPC situation. * * For huge filesystems with huge numbers of records, there is * a tiny chance that this might be too permissive, in which * case the transaction will later abort. I think that's * acceptable. */ max_blks = APFS_DEL_ROOM; break; case APFS_TRANS_SYNC: /* * We try to allow sync as much as possible, for the user's * peace of mind and because flushing the free queues could * make some room. * * Even if we do nothing the transaction still allocates a new * volume superblock, and new roots for the omap and catalog, * which consumes 6 blocks in total. This could be avoided... */ if (trans->t_starts_count == 0) max_blks = APFS_SYNC_ROOM; else max_blks = 0; break; default: apfs_alert(sb, "invalid transaction kind %u - bug!", kind); return false; } return max_blks <= sm->sm_free_count; } /** * apfs_read_single_ephemeral_object - Read a single ephemeral object to memory * @sb: filesystem superblock * @map: checkpoint mapping for the object * * Returns 0 on success or a negative error code in case of failure. */ static int apfs_read_single_ephemeral_object(struct super_block *sb, struct apfs_checkpoint_mapping *map) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_superblock *raw_sb = NULL; struct apfs_ephemeral_object_info *list = NULL; struct buffer_head *bh = NULL; char *object = NULL; int count; u32 data_blks, size; u64 data_base, bno, oid; int err, i, data_idx; raw_sb = nxi->nx_raw; data_base = le64_to_cpu(raw_sb->nx_xp_data_base); data_blks = le32_to_cpu(raw_sb->nx_xp_data_blocks); list = nxi->nx_eph_list; count = nxi->nx_eph_count; if (count >= APFS_EPHEMERAL_LIST_LIMIT) { apfs_err(sb, "too many ephemeral objects?"); return -EOPNOTSUPP; } bno = le64_to_cpu(map->cpm_paddr); oid = le64_to_cpu(map->cpm_oid); size = le32_to_cpu(map->cpm_size); if (size > sb->s_blocksize << 1) { /* * No reason not to support bigger objects, but there has to be * a limit somewhere and this is all I've seen so far. */ apfs_warn(sb, "ephemeral object has more than 2 blocks"); return -EOPNOTSUPP; } if (!size || (size & (sb->s_blocksize - 1))) { apfs_err(sb, "invalid object size (0x%x)", size); return -EFSCORRUPTED; } object = kmalloc(size, GFP_KERNEL); if (!object) return -ENOMEM; data_idx = bno - data_base; for (i = 0; i < size >> sb->s_blocksize_bits; ++i) { bh = apfs_sb_bread(sb, data_base + data_idx); if (!bh) { apfs_err(sb, "failed to read ephemeral block"); err = -EIO; goto fail; } memcpy(object + (i << sb->s_blocksize_bits), bh->b_data, sb->s_blocksize); brelse(bh); bh = NULL; /* Somewhat surprisingly, objects can wrap around */ if (++data_idx == data_blks) data_idx = 0; } /* * The official reference requires that we always verify ephemeral * checksums on mount, so do it even if the user didn't ask. We should * actually try to mount an older checkpoint when this fails (TODO), * which I guess means that the official driver writes all checkpoint * blocks at once, instead of leaving the superblock for last like we * do. */ if (!apfs_multiblock_verify_csum(object, size)) { apfs_err(sb, "bad checksum for ephemeral object 0x%llx", oid); err = -EFSBADCRC; goto fail; } list[count].oid = oid; list[count].size = size; list[count].object = object; object = NULL; nxi->nx_eph_count = count + 1; return 0; fail: kfree(object); object = NULL; return err; } /** * apfs_read_single_cpm_block - Read all ephemeral objects in a cpm block * @sb: filesystem superblock * @cpm_bno: block number for the cpm block * * Returns 0 on success or a negative error code in case of failure. */ static int apfs_read_single_cpm_block(struct super_block *sb, u64 cpm_bno) { struct buffer_head *bh = NULL; struct apfs_checkpoint_map_phys *cpm = NULL; u32 map_count; int err, i; bh = apfs_sb_bread(sb, cpm_bno); if (!bh) { apfs_err(sb, "failed to read cpm block"); return -EIO; } if (!apfs_obj_verify_csum(sb, bh)) { /* * The reference seems to imply that we need to check these on * mount, and retry an older checkpoint on failure (TODO). */ apfs_err(sb, "bad checksum for cpm block at 0x%llx", cpm_bno); err = -EFSBADCRC; goto out; } cpm = (struct apfs_checkpoint_map_phys *)bh->b_data; map_count = le32_to_cpu(cpm->cpm_count); if (map_count > apfs_max_maps_per_block(sb)) { apfs_err(sb, "block has too many maps (%d)", map_count); err = -EFSCORRUPTED; goto out; } for (i = 0; i < map_count; ++i) { err = apfs_read_single_ephemeral_object(sb, &cpm->cpm_map[i]); if (err) { apfs_err(sb, "failed to read ephemeral object %u", i); goto out; } } out: brelse(bh); cpm = NULL; return err; } static void apfs_force_readonly(struct apfs_nxsb_info *nxi); /** * apfs_read_ephemeral_objects - Read all ephemeral objects to memory * @sb: superblock structure * * Returns 0 on success or a negative error code in case of failure. */ int apfs_read_ephemeral_objects(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_superblock *raw_sb = nxi->nx_raw; u64 desc_base; u32 desc_index, desc_blks, desc_len, i; int err; if (nxi->nx_eph_list) { apfs_alert(sb, "attempt to reread ephemeral object list"); return -EFSCORRUPTED; } nxi->nx_eph_list = kzalloc(APFS_EPHEMERAL_LIST_SIZE, GFP_KERNEL); if (!nxi->nx_eph_list) return -ENOMEM; nxi->nx_eph_count = 0; desc_base = le64_to_cpu(raw_sb->nx_xp_desc_base); desc_index = le32_to_cpu(raw_sb->nx_xp_desc_index); desc_blks = le32_to_cpu(raw_sb->nx_xp_desc_blocks); desc_len = le32_to_cpu(raw_sb->nx_xp_desc_len); /* Last block in the area is superblock; the rest are mapping blocks */ for (i = 0; i < desc_len - 1; ++i) { u64 cpm_bno = desc_base + (desc_index + i) % desc_blks; err = apfs_read_single_cpm_block(sb, cpm_bno); if (err) { apfs_err(sb, "failed to read cpm block %u", i); /* No transaction to abort yet */ apfs_force_readonly(nxi); return err; } } return 0; } static void apfs_trans_commit_work(struct work_struct *work) { struct super_block *sb = NULL; struct apfs_nxsb_info *nxi = NULL; struct apfs_nx_transaction *trans = NULL; int err; trans = container_of(to_delayed_work(work), struct apfs_nx_transaction, t_work); nxi = container_of(trans, struct apfs_nxsb_info, nx_transaction); sb = trans->t_work_sb; /* * If sb is set then the transaction already started, there is no need * for apfs_transaction_start() here. It would be cleaner to call it * anyway (and check in there if sb is set), but maxops is a problem * because we don't need any space. I really need to rethink that stuff * (TODO). */ down_write(&nxi->nx_big_sem); if (!sb || sb->s_flags & SB_RDONLY) { /* The commit already took place, or there was an abort */ up_write(&nxi->nx_big_sem); return; } trans->t_state |= APFS_NX_TRANS_FORCE_COMMIT; err = apfs_transaction_commit(sb); if (err) { apfs_err(sb, "queued commit failed (err:%d)", err); apfs_transaction_abort(sb); } } /** * apfs_transaction_init - Initialize the transaction struct for the container * @trans: the transaction structure */ void apfs_transaction_init(struct apfs_nx_transaction *trans) { trans->t_state = 0; INIT_DELAYED_WORK(&trans->t_work, apfs_trans_commit_work); INIT_LIST_HEAD(&trans->t_inodes); INIT_LIST_HEAD(&trans->t_buffers); trans->t_buffers_count = 0; trans->t_starts_count = 0; } /** * apfs_transaction_start - Begin a new transaction * @sb: superblock structure * @kind: transaction kind for space preallocation * * Also locks the filesystem for writing; returns 0 on success or a negative * error code in case of failure. */ int apfs_transaction_start(struct super_block *sb, enum apfs_trans_kind kind) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; int err; down_write(&nxi->nx_big_sem); if (sb->s_flags & SB_RDONLY) { /* A previous transaction has failed; this should be rare */ up_write(&nxi->nx_big_sem); return -EROFS; } /* * Ephemeral objects are read only once, kept in memory, and committed * to disk along with each transaction. */ if (!nxi->nx_eph_list) { err = apfs_read_ephemeral_objects(sb); if (err) { up_write(&nxi->nx_big_sem); apfs_err(sb, "failed to read the ephemeral objects"); return err; } } if (nx_trans->t_starts_count == 0) { ++nxi->nx_xid; nxi->nx_raw->nx_next_xid = cpu_to_le64(nxi->nx_xid + 1); err = apfs_read_spaceman(sb); if (err) { apfs_err(sb, "failed to read the spaceman"); goto fail; } } /* Don't start transactions unless we are sure they fit in disk */ if (!apfs_transaction_has_room(sb, kind)) { /* Commit what we have so far to flush the queues */ nx_trans->t_state |= APFS_NX_TRANS_FORCE_COMMIT; err = apfs_transaction_commit(sb); if (err) { apfs_err(sb, "commit failed"); goto fail; } return -ENOSPC; } if (nx_trans->t_starts_count == 0) { err = apfs_map_volume_super(sb, true /* write */); if (err) { apfs_err(sb, "CoW failed for volume super"); goto fail; } /* TODO: don't copy these nodes for transactions that don't use them */ err = apfs_read_omap(sb, true /* write */); if (err) { apfs_err(sb, "CoW failed for omap"); goto fail; } err = apfs_read_catalog(sb, true /* write */); if (err) { apfs_err(sb, "Cow failed for catalog"); goto fail; } } nx_trans->t_starts_count++; return 0; fail: apfs_transaction_abort(sb); return err; } /** * apfs_transaction_flush_all_inodes - Flush inode metadata to the buffer heads * @sb: superblock structure * * This messes a lot with the disk layout, so it must be called ahead of time * if we need it to be stable for the rest or the transaction (for example, if * we are setting up a snapshot). */ int apfs_transaction_flush_all_inodes(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; int err = 0, curr_err; ASSERT(!(sb->s_flags & SB_RDONLY)); while (!list_empty(&nx_trans->t_inodes)) { struct apfs_inode_info *ai = NULL; struct inode *inode = NULL; ai = list_first_entry(&nx_trans->t_inodes, struct apfs_inode_info, i_list); inode = &ai->vfs_inode; /* This is a bit wasteful if the inode will get deleted */ curr_err = apfs_update_inode(inode, NULL /* new_name */); if (curr_err) err = curr_err; apfs_inode_state_clear_raw(inode, I_DIRTY_ALL); /* * The same inode may get dirtied again as soon as we release * the lock, and we don't want to miss that. */ list_del_init(&ai->i_list); nx_trans->t_state |= APFS_NX_TRANS_COMMITTING; up_write(&nxi->nx_big_sem); /* Unlocked, so it may call evict() and wait for writeback */ iput(inode); down_write(&nxi->nx_big_sem); nx_trans->t_state = 0; /* Transaction aborted during writeback, error code is lost */ if (sb->s_flags & SB_RDONLY) { apfs_err(sb, "abort during inode writeback"); return -EROFS; } } return err; } /** * apfs_write_single_ephemeral_object - Write a single ephemeral object to bh's * @sb: filesystem superblock * @obj_raw: contents of the object * @map: checkpoint mapping for the object, already updated * * Returns 0 on success or a negative error code in case of failure. */ static int apfs_write_single_ephemeral_object(struct super_block *sb, struct apfs_obj_phys *obj_raw, const struct apfs_checkpoint_mapping *map) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_superblock *raw_sb = NULL; struct buffer_head *bh = NULL; u64 data_base, bno; u32 data_blks, size; int err, i, data_idx; raw_sb = nxi->nx_raw; data_base = le64_to_cpu(raw_sb->nx_xp_data_base); data_blks = le32_to_cpu(raw_sb->nx_xp_data_blocks); bno = le64_to_cpu(map->cpm_paddr); size = le32_to_cpu(map->cpm_size); obj_raw->o_xid = cpu_to_le64(nxi->nx_xid); apfs_multiblock_set_csum((char *)obj_raw, size); data_idx = bno - data_base; for (i = 0; i < size >> sb->s_blocksize_bits; ++i) { bh = apfs_getblk(sb, data_base + data_idx); if (!bh) { apfs_err(sb, "failed to map ephemeral block"); return -EIO; } err = apfs_transaction_join(sb, bh); if (err) { brelse(bh); bh = NULL; return err; } memcpy(bh->b_data, (char *)obj_raw + (i << sb->s_blocksize_bits), sb->s_blocksize); brelse(bh); bh = NULL; /* Somewhat surprisingly, objects can wrap around */ if (++data_idx == data_blks) data_idx = 0; } return 0; } /** * apfs_write_ephemeral_objects - Write all ephemeral objects to bh's * @sb: filesystem superblock * * Returns 0 on sucess, or a negative error code in case of failure. */ static int apfs_write_ephemeral_objects(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_superblock *raw_sb = nxi->nx_raw; struct apfs_checkpoint_map_phys *cpm = NULL; struct buffer_head *cpm_bh = NULL; struct apfs_ephemeral_object_info *eph_info = NULL; u64 cpm_bno; u64 desc_base, data_base; u32 desc_index, desc_blks, desc_len, desc_next; u32 data_index, data_blks, data_len, data_next; u32 desc_limit, data_limit; u32 obj_blkcnt; int err, i, cpm_start; if (!nxi->nx_eph_list) { apfs_alert(sb, "missing ephemeral object list"); return -EFSCORRUPTED; } desc_next = le32_to_cpu(raw_sb->nx_xp_desc_next); desc_base = le64_to_cpu(raw_sb->nx_xp_desc_base); desc_index = desc_next; /* New checkpoint */ desc_blks = le32_to_cpu(raw_sb->nx_xp_desc_blocks); desc_len = 0; /* For now */ data_next = le32_to_cpu(raw_sb->nx_xp_data_next); data_base = le64_to_cpu(raw_sb->nx_xp_data_base); data_index = data_next; /* New checkpoint */ data_blks = le32_to_cpu(raw_sb->nx_xp_data_blocks); data_len = 0; /* For now */ /* * The reference doesn't mention anything about this, but I need to * put some sort of a limit or else the rings could wrap around and * corrupt themselves. */ desc_limit = desc_blks >> 2; data_limit = data_blks >> 2; for (i = 0; i < nxi->nx_eph_count; ++i) { if (data_len == data_limit) { apfs_err(sb, "too many checkpoint data blocks"); return -EFSCORRUPTED; } if (!cpm) { cpm_start = i; if (desc_len == desc_limit) { apfs_err(sb, "too many checkpoint descriptor blocks"); return -EFSCORRUPTED; } cpm_bno = desc_base + (desc_index + desc_len) % desc_blks; err = apfs_create_cpm_block(sb, cpm_bno, &cpm_bh); if (err) { apfs_err(sb, "failed to create cpm block"); return err; } cpm = (void *)cpm_bh->b_data; desc_len += 1; } eph_info = &nxi->nx_eph_list[i]; data_next = (data_index + data_len) % data_blks; obj_blkcnt = eph_info->size >> sb->s_blocksize_bits; err = apfs_create_cpoint_map(sb, cpm, eph_info->object, data_base + data_next, eph_info->size); if (err) { if (err == -ENOSPC) cpm->cpm_flags = 0; /* No longer the last */ brelse(cpm_bh); cpm = NULL; cpm_bh = NULL; if (err == -ENOSPC) { --i; continue; } apfs_err(sb, "failed to create cpm map %d", i); return err; } err = apfs_write_single_ephemeral_object(sb, eph_info->object, &cpm->cpm_map[i - cpm_start]); if (err) { brelse(cpm_bh); cpm = NULL; cpm_bh = NULL; apfs_err(sb, "failed to write ephemeral object %d", i); return err; } data_len += obj_blkcnt; } /* * The checkpoint superblock can't be set until the very end of the * transaction commit, but allocate its block here already. */ nxi->nx_bno = desc_base + (desc_index + desc_len) % desc_blks; desc_len += 1; desc_next = (desc_index + desc_len) % desc_blks; data_next = (data_index + data_len) % data_blks; raw_sb->nx_xp_desc_next = cpu_to_le32(desc_next); raw_sb->nx_xp_desc_index = cpu_to_le32(desc_index); raw_sb->nx_xp_desc_len = cpu_to_le32(desc_len); raw_sb->nx_xp_data_next = cpu_to_le32(data_next); raw_sb->nx_xp_data_index = cpu_to_le32(data_index); raw_sb->nx_xp_data_len = cpu_to_le32(data_len); return 0; } /** * apfs_transaction_commit_nx - Definitely commit the current transaction * @sb: superblock structure */ static int apfs_transaction_commit_nx(struct super_block *sb) { struct apfs_spaceman *sm = APFS_SM(sb); struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; struct apfs_bh_info *bhi, *tmp; int err = 0; ASSERT(!(sb->s_flags & SB_RDONLY)); /* Before committing the bhs, write all inode metadata to them */ err = apfs_transaction_flush_all_inodes(sb); if (err) { apfs_err(sb, "failed to flush all inodes"); return err; } /* * Now that nothing else will be freed, flush the last update to the * free queues so that it can be committed to disk along with all the * ephemeral objects. */ if (sm->sm_free_cache_base) { err = apfs_free_queue_insert_nocache(sb, sm->sm_free_cache_base, sm->sm_free_cache_blkcnt); if (err) { apfs_err(sb, "fq cache flush failed (0x%llx-0x%llx)", sm->sm_free_cache_base, sm->sm_free_cache_blkcnt); return err; } sm->sm_free_cache_base = sm->sm_free_cache_blkcnt = 0; } /* * Writing the ip bitmaps modifies the spaceman, so it must happen * before we commit the ephemeral objects. It must also happen after we * flush the free queue, in case the last freed range was in the ip. */ err = apfs_write_ip_bitmaps(sb); if (err) { apfs_err(sb, "failed to commit the ip bitmaps"); return err; } err = apfs_write_ephemeral_objects(sb); if (err) return err; list_for_each_entry(bhi, &nx_trans->t_buffers, list) { struct buffer_head *bh = bhi->bh; ASSERT(buffer_trans(bh)); if (buffer_csum(bh)) apfs_obj_set_csum(sb, (void *)bh->b_data); clear_buffer_dirty(bh); get_bh(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; apfs_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } list_for_each_entry_safe(bhi, tmp, &nx_trans->t_buffers, list) { struct buffer_head *bh = bhi->bh; #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) struct folio *folio = NULL; #else struct page *page = NULL; #endif bool is_metadata; ASSERT(buffer_trans(bh)); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { apfs_err(sb, "failed to write some blocks"); return -EIO; } list_del(&bhi->list); clear_buffer_trans(bh); nx_trans->t_buffers_count--; bh->b_private = NULL; bhi->bh = NULL; kfree(bhi); bhi = NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) folio = page_folio(bh->b_page); folio_get(folio); #else page = bh->b_page; get_page(page); #endif is_metadata = buffer_csum(bh); clear_buffer_csum(bh); put_bh(bh); bh = NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) folio_lock(folio); folio_mkclean(folio); #else /* Future writes to mmapped areas should fault for CoW */ lock_page(page); page_mkclean(page); #endif /* XXX: otherwise, the page cache fills up and crashes the machine */ if (!is_metadata) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) try_to_free_buffers(folio); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 19, 0) || RHEL_VERSION_GE(9, 3) try_to_free_buffers(page_folio(page)); #else try_to_free_buffers(page); #endif } #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) folio_unlock(folio); folio_put(folio); #else unlock_page(page); put_page(page); #endif } err = apfs_checkpoint_end(sb); if (err) { apfs_err(sb, "failed to end the checkpoint"); return err; } nx_trans->t_starts_count = 0; nx_trans->t_buffers_count = 0; return 0; } /** * apfs_transaction_need_commit - Evaluate if a commit is required * @sb: superblock structure */ static bool apfs_transaction_need_commit(struct super_block *sb) { struct apfs_spaceman *sm = APFS_SM(sb); struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; if (nx_trans->t_state & APFS_NX_TRANS_DEFER_COMMIT) { nx_trans->t_state &= ~APFS_NX_TRANS_DEFER_COMMIT; return false; } /* Avoid nested commits on inode writeback */ if (nx_trans->t_state & APFS_NX_TRANS_COMMITTING) return false; if (nx_trans->t_state & APFS_NX_TRANS_FORCE_COMMIT) { nx_trans->t_state = 0; return true; } if (sm) { struct apfs_spaceman_phys *sm_raw = sm->sm_raw; struct apfs_spaceman_free_queue *fq_ip = &sm_raw->sm_fq[APFS_SFQ_IP]; struct apfs_spaceman_free_queue *fq_main = &sm_raw->sm_fq[APFS_SFQ_MAIN]; int buffers_max = nxi->nx_trans_buffers_max; int starts_max = APFS_TRANS_STARTS_MAX; int mq_max = APFS_TRANS_MAIN_QUEUE_MAX; int maxnodes; /* * Try to avoid committing halfway through a data block write, * otherwise the block will be put through copy-on-write again, * causing unnecessary fragmentation. */ if (nx_trans->t_state & APFS_NX_TRANS_INCOMPLETE_BLOCK) { buffers_max += 50; starts_max += 50; mq_max += 20; } if (nx_trans->t_buffers_count > buffers_max) return true; if (nx_trans->t_starts_count > starts_max) return true; /* * The internal pool has enough blocks to map the container * exactly 3 times. Don't allow large transactions if we can't * be sure the bitmap changes will all fit. */ if (le64_to_cpu(fq_ip->sfq_count) * 3 > le64_to_cpu(sm_raw->sm_ip_block_count)) return true; /* Don't let the main queue get too full either */ if (le64_to_cpu(fq_main->sfq_count) > mq_max) return true; /* * The main free queue can become unbalanced enough to reach * the node limit while being mostly empty. For now, the only * way I have to rebalance it is to flush it entirely with a * new transaction. We could wait longer to do it, but I don't * see the point. */ maxnodes = le16_to_cpu(fq_main->sfq_tree_node_limit); maxnodes = (maxnodes + 1) >> 1; if (sm->sm_main_fq_nodes > 1 && sm->sm_main_fq_nodes >= maxnodes) return true; } return false; } /** * apfs_transaction_commit - Possibly commit the current transaction * @sb: superblock structure * * On success returns 0 and releases the big filesystem lock. On failure, * returns a negative error code, and the caller is responsibly for aborting * the transaction. */ int apfs_transaction_commit(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *trans = NULL; int err = 0; trans = &nxi->nx_transaction; if (apfs_transaction_need_commit(sb)) { err = apfs_transaction_commit_nx(sb); if (err) { apfs_err(sb, "transaction commit failed"); return err; } trans->t_work_sb = NULL; cancel_delayed_work(&trans->t_work); } else { trans->t_work_sb = sb; mod_delayed_work(system_wq, &trans->t_work, msecs_to_jiffies(100)); } up_write(&nxi->nx_big_sem); return 0; } /** * apfs_inode_join_transaction - Add an inode to the current transaction * @sb: superblock structure * @inode: vfs inode to add */ void apfs_inode_join_transaction(struct super_block *sb, struct inode *inode) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; struct apfs_inode_info *ai = APFS_I(inode); ASSERT(!(sb->s_flags & SB_RDONLY)); lockdep_assert_held_write(&nxi->nx_big_sem); if (!list_empty(&ai->i_list)) /* Already in the transaction */ return; ihold(inode); list_add(&ai->i_list, &nx_trans->t_inodes); } /** * apfs_transaction_join - Add a buffer head to the current transaction * @sb: superblock structure * @bh: the buffer head */ int apfs_transaction_join(struct super_block *sb, struct buffer_head *bh) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; struct apfs_bh_info *bhi; ASSERT(!(sb->s_flags & SB_RDONLY)); lockdep_assert_held_write(&nxi->nx_big_sem); if (buffer_trans(bh)) /* Already part of the only transaction */ return 0; /* TODO: use a slab cache */ bhi = kzalloc(sizeof(*bhi), GFP_NOFS); if (!bhi) return -ENOMEM; get_bh(bh); bhi->bh = bh; list_add(&bhi->list, &nx_trans->t_buffers); nx_trans->t_buffers_count++; set_buffer_trans(bh); bh->b_private = bhi; return 0; } /** * apfs_force_readonly - Set the whole container as read-only * @nxi: container superblock info */ static void apfs_force_readonly(struct apfs_nxsb_info *nxi) { struct apfs_sb_info *sbi = NULL; struct super_block *sb = NULL; list_for_each_entry(sbi, &nxi->vol_list, list) { sb = sbi->s_vobject.sb; sb->s_flags |= SB_RDONLY; } nxi->nx_flags &= ~APFS_READWRITE; } /** * apfs_transaction_abort - Abort the current transaction * @sb: superblock structure * * Releases the big filesystem lock and clears the in-memory transaction data; * the on-disk changes are irrelevant because the superblock checksum hasn't * been written yet. Leaves the filesystem in read-only state. */ void apfs_transaction_abort(struct super_block *sb) { struct apfs_nxsb_info *nxi = APFS_NXI(sb); struct apfs_nx_transaction *nx_trans = &nxi->nx_transaction; struct apfs_bh_info *bhi, *tmp; struct apfs_inode_info *ai, *ai_tmp; if (sb->s_flags & SB_RDONLY) { /* Transaction already aborted, do nothing */ ASSERT(list_empty(&nx_trans->t_inodes)); ASSERT(list_empty(&nx_trans->t_buffers)); up_write(&nxi->nx_big_sem); return; } nx_trans->t_state = 0; apfs_err(sb, "aborting transaction"); --nxi->nx_xid; list_for_each_entry_safe(bhi, tmp, &nx_trans->t_buffers, list) { struct buffer_head *bh = bhi->bh; bh->b_private = NULL; clear_buffer_dirty(bh); clear_buffer_trans(bh); clear_buffer_csum(bh); brelse(bh); bhi->bh = NULL; list_del(&bhi->list); kfree(bhi); } /* * It's not possible to undo in-memory changes from old operations in * the aborted transaction. To avoid corruption, never write again. */ apfs_force_readonly(nxi); up_write(&nxi->nx_big_sem); list_for_each_entry_safe(ai, ai_tmp, &nx_trans->t_inodes, i_list) { list_del_init(&ai->i_list); iput(&ai->vfs_inode); } }