From a0ff9971802b664aaa12481c46cec11eae77ea51 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 19 Apr 2021 17:35:54 +0200 Subject: [PATCH 01/19] repart: fix incorrect error code propagation --- src/partition/repart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 04d5bb18f4..6ad12772fd 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -2710,7 +2710,7 @@ static int do_copy_files(Partition *p, const char *fs) { pfd = chase_symlinks_and_open(dn, fs, CHASE_PREFIX_ROOT|CHASE_WARN, O_RDONLY|O_DIRECTORY|O_CLOEXEC, NULL); if (pfd < 0) - return log_error_errno(tfd, "Failed to open parent directory of target: %m"); + return log_error_errno(pfd, "Failed to open parent directory of target: %m"); tfd = openat(pfd, basename(*target), O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC, 0700); if (tfd < 0) From 0ade2213e6a515f30dd587d9378958675e0ea245 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 19 Apr 2021 17:50:02 +0200 Subject: [PATCH 02/19] repart: port more code to generic path_simplify_and_warn() We have this nice helper, hence use it when parsing paths and logging about it. 
--- src/partition/repart.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 6ad12772fd..46da0d04ed 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -1139,11 +1139,9 @@ static int config_parse_copy_files( return 0; } - if (!path_is_absolute(resolved_source) || !path_is_normalized(resolved_source)) { - log_syntax(unit, LOG_WARNING, filename, line, 0, - "Invalid path name in CopyFiles= source, ignoring: %s", resolved_source); + r = path_simplify_and_warn(resolved_source, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) return 0; - } r = specifier_printf(target, specifier_table, NULL, &resolved_target); if (r < 0) { @@ -1152,11 +1150,9 @@ static int config_parse_copy_files( return 0; } - if (!path_is_absolute(resolved_target) || !path_is_normalized(resolved_target)) { - log_syntax(unit, LOG_WARNING, filename, line, 0, - "Invalid path name in CopyFiles= source, ignoring: %s", resolved_target); + r = path_simplify_and_warn(resolved_target, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) return 0; - } r = strv_consume_pair(&partition->copy_files, TAKE_PTR(resolved_source), TAKE_PTR(resolved_target)); if (r < 0) From ef9c184d3d2971f41ffc4bc9b8b4df17a704a29e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 22:19:08 +0100 Subject: [PATCH 03/19] dissect: split read-only flag into two Let's have one flag to request that when dissecting an image the loopback device is made read-only, and another one to request that when it is mounted to make it read-only. Previously both concepts were always done read-only together. (Of course, making the loopback device read-only but mounting it read-write doesn't make too much sense, but the kernel should catch that for us, no need to make restrictions from our side there) Use-case for this: in systemd-repart we'd like to operate on images for adding partitions. 
Thus we'd like to have the loopback device writable, but if we read repart.d/ snippets from it, we want to do that read-only. --- src/core/namespace.c | 2 +- src/dissect/dissect.c | 2 +- src/shared/dissect-image.c | 6 +++--- src/shared/dissect-image.h | 11 +++++++---- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 7eb42ee405..9f9d47d34a 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1853,7 +1853,7 @@ int setup_namespace( r = loop_device_make_by_path( root_image, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, &loop_device); if (r < 0) diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 65ddb1d149..c21d3e47e4 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -770,7 +770,7 @@ static int run(int argc, char *argv[]) { r = loop_device_make_by_path( arg_image, - FLAGS_SET(arg_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : O_RDWR, + FLAGS_SET(arg_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR, FLAGS_SET(arg_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, &d); if (r < 0) diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index ce8a683bd6..1624e9daa8 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -1408,7 +1408,7 @@ static int mount_partition( if (streq(fstype, "crypto_LUKS")) return -EUNATCH; - rw = m->rw && !(flags & DISSECT_IMAGE_READ_ONLY); + rw = m->rw && !(flags & DISSECT_IMAGE_MOUNT_READ_ONLY); if (FLAGS_SET(flags, DISSECT_IMAGE_FSCK) && rw) { r = run_fsck(node, fstype); @@ -1756,7 +1756,7 @@ static int decrypt_partition( return log_debug_errno(r, "Failed to load LUKS metadata: %m"); r = sym_crypt_activate_by_passphrase(cd, name, CRYPT_ANY_SLOT, passphrase, strlen(passphrase), - ((flags & DISSECT_IMAGE_READ_ONLY) ? CRYPT_ACTIVATE_READONLY : 0) | + ((flags & DISSECT_IMAGE_DEVICE_READ_ONLY) ? CRYPT_ACTIVATE_READONLY : 0) | ((flags & DISSECT_IMAGE_DISCARD_ON_CRYPTO) ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0)); if (r < 0) { log_debug_errno(r, "Failed to activate LUKS device: %m"); @@ -2674,7 +2674,7 @@ int mount_image_privately_interactively( r = loop_device_make_by_path( image, - FLAGS_SET(flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : O_RDWR, + FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR, FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, &d); if (r < 0) diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h index f07955230b..d51049e78a 100644 --- a/src/shared/dissect-image.h +++ b/src/shared/dissect-image.h @@ -87,13 +87,13 @@ static inline PartitionDesignator PARTITION_VERITY_OF(PartitionDesignator p) { } typedef enum DissectImageFlags { - DISSECT_IMAGE_READ_ONLY = 1 << 0, + DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */ DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */ DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */ DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */ - DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP | - DISSECT_IMAGE_DISCARD | - DISSECT_IMAGE_DISCARD_ON_CRYPTO, + DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_DISCARD | + DISSECT_IMAGE_DISCARD_ON_CRYPTO, DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */ DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */ DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */ @@ -107,6 +107,9 @@ typedef enum DissectImageFlags { DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */ DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */ DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */ + DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */ + DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY | + DISSECT_IMAGE_MOUNT_READ_ONLY, } DissectImageFlags; struct DissectedImage { From 
0efb3f83da1f2210b21aa26d3d2ecba3fca118ef Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 18 Mar 2021 22:47:24 +0100 Subject: [PATCH 04/19] repart: move NOP destructors into shared code --- src/partition/repart.c | 6 ------ src/shared/cryptsetup-util.h | 8 ++++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 46da0d04ed..0be7acf42e 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -76,12 +76,6 @@ /* LUKS2 takes off 16M of the partition size with its metadata by default */ #define LUKS2_METADATA_SIZE (16*1024*1024) -#if !HAVE_LIBCRYPTSETUP -struct crypt_device; -static inline void sym_crypt_free(struct crypt_device* cd) {} -static inline void sym_crypt_freep(struct crypt_device** cd) {} -#endif - /* Note: When growing and placing new partitions we always align to 4K sector size. It's how newer hard disks * are designed, and if everything is aligned to that performance is best. And for older hard disks with 512B * sector size devices were generally assumed to have an even number of sectors, hence at the worst we'll diff --git a/src/shared/cryptsetup-util.h b/src/shared/cryptsetup-util.h index 5ebb0ac576..855997f335 100644 --- a/src/shared/cryptsetup-util.h +++ b/src/shared/cryptsetup-util.h @@ -61,4 +61,12 @@ int cryptsetup_get_token_as_json(struct crypt_device *cd, int idx, const char *v int cryptsetup_get_keyslot_from_token(JsonVariant *v); int cryptsetup_add_token_json(struct crypt_device *cd, JsonVariant *v); +#else + +/* If libcryptsetup is not available, let's at least define the basic type and NOP destructors for it, to + * make a little bit less #ifdeferry necessary in main programs. 
 */ +struct crypt_device; +static inline void sym_crypt_free(struct crypt_device* cd) {} +static inline void sym_crypt_freep(struct crypt_device** cd) {} + #endif From 8e5f3cecdf71817bbfbf6a9c61acdb47b676d5d6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 11:16:22 +0100 Subject: [PATCH 05/19] repart: slightly improve error message if partition is not on dm-crypt/dm-verity --- src/partition/repart.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 0be7acf42e..155be0610c 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -3885,7 +3885,9 @@ static int acquire_root_devno(const char *p, int mode, char **ret, int *ret_fd) /* From dm-crypt to backing partition */ r = block_get_originating(devno, &devno); - if (r < 0) + if (r == -ENOENT) + log_debug_errno(r, "Device '%s' has no dm-crypt/dm-verity device, no need to look for underlying block device.", p); + else if (r < 0) log_debug_errno(r, "Failed to find underlying block device for '%s', ignoring: %m", p); /* From partition to whole disk containing it */ From 252d6267111dc4db5ee8ab29ef520ec309ba1167 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 11:19:00 +0100 Subject: [PATCH 06/19] repart: add --image= switch This is similar to the --image= switch in the other tools, like systemd-sysusers or systemd-tmpfiles, i.e. it applies the configuration from the image to the image. 
This is particularly useful for downloading a minimized GPT image, and then extending it to the desired size via: # systemd-repart --image=foo.image --size=5G --- man/repart.d.xml | 6 +- man/systemd-repart.xml | 33 ++++--- src/partition/repart.c | 213 +++++++++++++++++++++++++++++++---------- 3 files changed, 189 insertions(+), 63 deletions(-) diff --git a/man/repart.d.xml b/man/repart.d.xml index b6346b3f85..67f687947e 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -486,7 +486,11 @@ The copy operation is executed before the file system is registered in the partition table, thus ensuring that a file system populated this way only ever exists fully initialized. - This option cannot be combined with CopyBlocks=. + This option cannot be combined with CopyBlocks=. + + When systemd-repart is invoked with the --image= or + --root= command line switches the source paths specified are taken relative to the + specified root directory or disk image root. diff --git a/man/systemd-repart.xml b/man/systemd-repart.xml index a5a0890c52..380ba57884 100644 --- a/man/systemd-repart.xml +++ b/man/systemd-repart.xml @@ -40,16 +40,17 @@ repart.d5. - If invoked with no arguments, it operates on the block device backing the root file system partition - of the OS, thus growing and adding partitions of the booted OS image itself. When called in the initial - RAM disk it operates on the block device backing /sysroot/ instead, i.e. on the - block device the system will soon transition into. The systemd-repart.service - service is generally run at boot in the initial RAM disk, in order to augment the partition table of the - OS before its partitions are mounted. systemd-repart (mostly) operates in a purely - incremental mode: it only grows existing and adds new partitions; it does not shrink, delete or move - existing partitions. 
The service is intended to be run on every boot, but when it detects that the - partition table already matches the installed repart.d/*.conf configuration - files, it executes no operation. + If invoked with no arguments, it operates on the block device backing the root file system + partition of the running OS, thus growing and adding partitions of the booted OS image itself. If + --image= is used it will operate on the specified image file. When called in the + initrd it operates on the block device backing /sysroot/ instead, + i.e. on the block device the system will soon transition into. The + systemd-repart.service service is generally run at boot in the initial RAM disk, in + order to augment the partition table of the OS before its partitions are + mounted. systemd-repart (mostly) operates in a purely incremental mode: it only grows + existing and adds new partitions; it does not shrink, delete or move existing partitions. The service is + intended to be run on every boot, but when it detects that the partition table already matches the + installed repart.d/*.conf configuration files, it executes no operation. systemd-repart is intended to be used when deploying OS images, to automatically adjust them to the system they are running on, during first boot. This way the deployed image can be @@ -251,13 +252,21 @@ Takes a path to a directory to use as root file system when searching for - repart.d/*.conf files and for the machine ID file to use as seed. By default - when invoked on the regular system this defaults to the host's root file system + repart.d/*.conf files, for the machine ID file to use as seed and for the + CopyFiles= and CopyBlocks= source files and directories. By + default when invoked on the regular system this defaults to the host's root file system /. If invoked from the initial RAM disk this defaults to /sysroot/, so that the tool operates on the configuration and machine ID stored in the root file system later transitioned into itself. 
+ + + + Takes a path to a disk image file or device to mount and use in a similar fashion to + --root=, see above. + + diff --git a/src/partition/repart.c b/src/partition/repart.c index 155be0610c..081f95ebe7 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -92,6 +92,7 @@ static enum { static bool arg_dry_run = true; static const char *arg_node = NULL; static char *arg_root = NULL; +static char *arg_image = NULL; static char *arg_definitions = NULL; static bool arg_discard = true; static bool arg_can_factory_reset = false; @@ -110,6 +111,7 @@ static char *arg_tpm2_device = NULL; static uint32_t arg_tpm2_pcr_mask = UINT32_MAX; STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); STATIC_DESTRUCTOR_REGISTER(arg_definitions, freep); STATIC_DESTRUCTOR_REGISTER(arg_key, erase_and_freep); STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep); @@ -3483,6 +3485,7 @@ static int help(void) { " them\n" " --can-factory-reset Test whether factory reset is defined\n" " --root=PATH Operate relative to root path\n" + " --image=PATH Operate relative to image file\n" " --definitions=DIR Find partition definitions in specified directory\n" " --key-file=PATH Key to use when encrypting partitions\n" " --tpm2-device=PATH Path to TPM2 device node to use\n" @@ -3513,6 +3516,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_FACTORY_RESET, ARG_CAN_FACTORY_RESET, ARG_ROOT, + ARG_IMAGE, ARG_SEED, ARG_PRETTY, ARG_DEFINITIONS, @@ -3534,6 +3538,7 @@ static int parse_argv(int argc, char *argv[]) { { "factory-reset", required_argument, NULL, ARG_FACTORY_RESET }, { "can-factory-reset", no_argument, NULL, ARG_CAN_FACTORY_RESET }, { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, { "seed", required_argument, NULL, ARG_SEED }, { "pretty", required_argument, NULL, ARG_PRETTY }, { "definitions", required_argument, NULL, ARG_DEFINITIONS }, @@ -3613,7 +3618,13 @@ static int parse_argv(int argc, char 
*argv[]) { break; case ARG_ROOT: - r = parse_path_argument(optarg, false, &arg_root); + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); if (r < 0) return r; break; @@ -3763,9 +3774,18 @@ static int parse_argv(int argc, char *argv[]) { return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "If --empty=create is specified, --size= must be specified, too."); + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + else if (!arg_image && !arg_root && in_initrd()) { + /* Default to operation on /sysroot when invoked in the initrd! */ + arg_root = strdup("/sysroot"); + if (!arg_root) + return log_oom(); + } + arg_node = argc > optind ? argv[optind] : NULL; - if (IN_SET(arg_empty, EMPTY_FORCE, EMPTY_REQUIRE, EMPTY_CREATE) && !arg_node) + if (IN_SET(arg_empty, EMPTY_FORCE, EMPTY_REQUIRE, EMPTY_CREATE) && !arg_node && !arg_image) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "A path to a device node or loopback file must be specified when --empty=force, --empty=require or --empty=create are used."); @@ -3838,39 +3858,44 @@ static int remove_efi_variable_factory_reset(void) { return 0; } -static int acquire_root_devno(const char *p, int mode, char **ret, int *ret_fd) { +static int acquire_root_devno( + const char *p, + const char *root, + int mode, + char **ret, + int *ret_fd) { + + _cleanup_free_ char *found_path = NULL; + dev_t devno, fd_devno = MODE_INVALID; _cleanup_close_ int fd = -1; struct stat st; - dev_t devno, fd_devno = MODE_INVALID; int r; assert(p); assert(ret); assert(ret_fd); - fd = open(p, mode); + fd = chase_symlinks_and_open(p, root, CHASE_PREFIX_ROOT, mode, &found_path); if (fd < 0) - return -errno; + return fd; if (fstat(fd, &st) < 0) return -errno; if (S_ISREG(st.st_mode)) { - char *s; - - s = 
strdup(p); - if (!s) - return log_oom(); - - *ret = s; + *ret = TAKE_PTR(found_path); *ret_fd = TAKE_FD(fd); - return 0; } - if (S_ISBLK(st.st_mode)) + if (S_ISBLK(st.st_mode)) { + /* Refuse referencing explicit block devices if a root dir is specified, after all we should + * be able to leave the image the root path constraints us to. */ + if (root) + return -EPERM; + fd_devno = devno = st.st_rdev; - else if (S_ISDIR(st.st_mode)) { + } else if (S_ISDIR(st.st_mode)) { devno = st.st_dev; if (major(devno) == 0) { @@ -3930,7 +3955,9 @@ static int find_root(char **ret, int *ret_fd) { return 0; } - r = acquire_root_devno(arg_node, O_RDONLY|O_CLOEXEC, ret, ret_fd); + /* Note that we don't specify a root argument here: if the user explicitly configured a node + * we'll take it relative to the host, not the image */ + r = acquire_root_devno(arg_node, NULL, O_RDONLY|O_CLOEXEC, ret, ret_fd); if (r == -EUCLEAN) return btrfs_log_dev_root(LOG_ERR, r, arg_node); if (r < 0) @@ -3958,7 +3985,7 @@ static int find_root(char **ret, int *ret_fd) { } else p = t; - r = acquire_root_devno(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC, ret, ret_fd); + r = acquire_root_devno(p, arg_root, O_RDONLY|O_DIRECTORY|O_CLOEXEC, ret, ret_fd); if (r < 0) { if (r == -EUCLEAN) return btrfs_log_dev_root(LOG_ERR, r, p); @@ -4005,9 +4032,15 @@ static int resize_pt(int fd) { return 1; } -static int resize_backing_fd(const char *node, int *fd) { +static int resize_backing_fd( + const char *node, /* The primary way we access the disk image to operate on */ + int *fd, /* An O_RDONLY fd referring to that inode */ + const char *backing_file, /* If the above refers to a loopback device, the backing regular file for that, which we can grow */ + LoopDevice *loop_device) { + char buf1[FORMAT_BYTES_MAX], buf2[FORMAT_BYTES_MAX]; _cleanup_close_ int writable_fd = -1; + uint64_t current_size; struct stat st; int r; @@ -4028,25 +4061,64 @@ static int resize_backing_fd(const char *node, int *fd) { if (fstat(*fd, &st) < 0) return 
log_error_errno(errno, "Failed to stat '%s': %m", node); - r = stat_verify_regular(&st); - if (r < 0) - return log_error_errno(r, "Specified path '%s' is not a regular file, cannot resize: %m", node); + if (S_ISBLK(st.st_mode)) { + if (!backing_file) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), "Cannot resize block device '%s'.", node); - assert_se(format_bytes(buf1, sizeof(buf1), st.st_size)); + assert(loop_device); + + if (ioctl(*fd, BLKGETSIZE64, &current_size) < 0) + return log_error_errno(errno, "Failed to determine size of block device %s: %m", node); + } else { + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Specified path '%s' is not a regular file or loopback block device, cannot resize: %m", node); + + assert(!backing_file); + assert(!loop_device); + current_size = st.st_size; + } + + assert_se(format_bytes(buf1, sizeof(buf1), current_size)); assert_se(format_bytes(buf2, sizeof(buf2), arg_size)); - if ((uint64_t) st.st_size >= arg_size) { + if (current_size >= arg_size) { log_info("File '%s' already is of requested size or larger, not growing. (%s >= %s)", node, buf1, buf2); return 0; } - /* The file descriptor is read-only. In order to grow the file we need to have a writable fd. We - * reopen the file for that temporarily. We keep the writable fd only open for this operation though, - * as fdisk can't accept it anyway. */ + if (S_ISBLK(st.st_mode)) { + assert(backing_file); - writable_fd = fd_reopen(*fd, O_WRONLY|O_CLOEXEC); - if (writable_fd < 0) - return log_error_errno(writable_fd, "Failed to reopen backing file '%s' writable: %m", node); + /* This is a loopback device. We can't really grow those directly, but we can grow the + * backing file, hence let's do that. 
*/ + + writable_fd = open(backing_file, O_WRONLY|O_CLOEXEC|O_NONBLOCK); + if (writable_fd < 0) + return log_error_errno(errno, "Failed to open backing file '%s': %m", backing_file); + + if (fstat(writable_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat() backing file '%s': %m", backing_file); + + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Backing file '%s' of block device is not a regular file: %m", backing_file); + + if ((uint64_t) st.st_size != current_size) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Size of backing file '%s' of loopback block device '%s' don't match, refusing.", node, backing_file); + } else { + assert(S_ISREG(st.st_mode)); + assert(!backing_file); + + /* The file descriptor is read-only. In order to grow the file we need to have a writable fd. We + * reopen the file for that temporarily. We keep the writable fd only open for this operation though, + * as fdisk can't accept it anyway. */ + + writable_fd = fd_reopen(*fd, O_WRONLY|O_CLOEXEC); + if (writable_fd < 0) + return log_error_errno(writable_fd, "Failed to reopen backing file '%s' writable: %m", node); + } if (!arg_discard) { if (fallocate(writable_fd, 0, 0, arg_size) < 0) { @@ -4057,16 +4129,12 @@ static int resize_backing_fd(const char *node, int *fd) { /* Fallback to truncation, if fallocate() is not supported. 
*/ log_debug("Backing file system does not support fallocate(), falling back to ftruncate()."); } else { - r = resize_pt(writable_fd); - if (r < 0) - return r; - - if (st.st_size == 0) /* Likely regular file just created by us */ + if (current_size == 0) /* Likely regular file just created by us */ log_info("Allocated %s for '%s'.", buf2, node); else log_info("File '%s' grown from %s to %s by allocation.", node, buf1, buf2); - return 1; + goto done; } } @@ -4074,14 +4142,21 @@ static int resize_backing_fd(const char *node, int *fd) { return log_error_errno(errno, "Failed to grow '%s' from %s to %s by truncation: %m", node, buf1, buf2); + if (current_size == 0) /* Likely regular file just created by us */ + log_info("Sized '%s' to %s.", node, buf2); + else + log_info("File '%s' grown from %s to %s by truncation.", node, buf1, buf2); + +done: r = resize_pt(writable_fd); if (r < 0) return r; - if (st.st_size == 0) /* Likely regular file just created by us */ - log_info("Sized '%s' to %s.", node, buf2); - else - log_info("File '%s' grown from %s to %s by truncation.", node, buf1, buf2); + if (loop_device) { + r = loop_device_refresh_size(loop_device, UINT64_MAX, arg_size); + if (r < 0) + return log_error_errno(r, "Failed to update loop device size: %m"); + } return 1; } @@ -4116,23 +4191,19 @@ static int determine_auto_size(Context *c) { } static int run(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; _cleanup_(context_freep) Context* context = NULL; _cleanup_free_ char *node = NULL; _cleanup_close_ int backing_fd = -1; - bool from_scratch; + bool from_scratch, node_is_our_loop = false; int r; log_show_color(true); log_parse_environment(); log_open(); - if (in_initrd()) { - /* Default to operation on /sysroot when invoked in the initrd! 
*/ - arg_root = strdup("/sysroot"); - if (!arg_root) - return log_oom(); - } - r = parse_argv(argc, argv); if (r <= 0) return r; @@ -4145,6 +4216,40 @@ static int run(int argc, char *argv[]) { if (r < 0) return r; + if (arg_image) { + assert(!arg_root); + + /* Mount this strictly read-only: we shall modify the partition table, not the file + * systems */ + r = mount_image_privately_interactively( + arg_image, + DISSECT_IMAGE_MOUNT_READ_ONLY | + (arg_node ? DISSECT_IMAGE_DEVICE_READ_ONLY : 0) | /* If a different node to make changes to is specified let's open the device in read-only mode) */ + DISSECT_IMAGE_GPT_ONLY | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT, + &mounted_dir, + &loop_device, + &decrypted_image); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + + if (!arg_node) { + arg_node = strdup(loop_device->node); + if (!arg_node) + return log_oom(); + + /* Remember that the the device we are about to manipulate is actually the one we + * allocated here, and thus to increase its backing file we know what to do */ + node_is_our_loop = true; + } + } + context = context_new(arg_seed); if (!context) return log_oom(); @@ -4163,7 +4268,11 @@ static int run(int argc, char *argv[]) { return r; if (arg_size != UINT64_MAX) { - r = resize_backing_fd(node, &backing_fd); + r = resize_backing_fd( + node, + &backing_fd, + node_is_our_loop ? arg_image : NULL, + node_is_our_loop ? loop_device : NULL); if (r < 0) return r; } @@ -4225,7 +4334,11 @@ static int run(int argc, char *argv[]) { context_unload_partition_table(context); assert_se(arg_size != UINT64_MAX); - r = resize_backing_fd(node, &backing_fd); + r = resize_backing_fd( + node, + &backing_fd, + node_is_our_loop ? arg_image : NULL, + node_is_our_loop ? 
loop_device : NULL); if (r < 0) return r; From d17db7b2bf716dbf6c20691755b4710a8068e284 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 20:31:29 +0100 Subject: [PATCH 07/19] repart: when we can't fit in all partitions explain how large the image would have to be --- src/partition/repart.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 081f95ebe7..1ee5eaa25e 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -4167,8 +4167,6 @@ static int determine_auto_size(Context *c) { Partition *p; assert_se(c); - assert_se(arg_size == UINT64_MAX); - assert_se(arg_size_auto); LIST_FOREACH(partitions, p, c->partitions) { uint64_t m; @@ -4352,9 +4350,13 @@ static int run(int argc, char *argv[]) { if (context_allocate_partitions(context)) break; /* Success! */ - if (!context_drop_one_priority(context)) - return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), - "Can't fit requested partitions into free space, refusing."); + if (!context_drop_one_priority(context)) { + r = log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Can't fit requested partitions into free space, refusing."); + + determine_auto_size(context); + return r; + } } /* Now assign free space according to the weight logic */ From 55d380144a7a2cb7afb880ce3fc9c481eb5efde2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 22:24:48 +0100 Subject: [PATCH 08/19] repart: add one more overflow check --- src/partition/repart.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/partition/repart.c b/src/partition/repart.c index 1ee5eaa25e..ac6e4e7f86 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -2227,6 +2227,9 @@ static int context_discard_range( range[0] = round_up_size(offset, 512); + if (offset > UINT64_MAX - size) + return -ERANGE; + end = offset + size; if (end <= range[0]) return 0; From 22163eb51b682afe969f9381d56315dade874ec1 Mon Sep 17 00:00:00 2001 From: 
Lennart Poettering Date: Fri, 19 Mar 2021 22:25:09 +0100 Subject: [PATCH 09/19] repart: handle DISCARD failing with EBUSY gracefully --- src/partition/repart.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/partition/repart.c b/src/partition/repart.c index ac6e4e7f86..351b3d1b5e 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -2269,6 +2269,11 @@ static int context_discard_partition(Context *context, Partition *p) { log_info("Storage does not support discard, not discarding data in future partition %" PRIu64 ".", p->partno); return 0; } + if (r == -EBUSY) { + /* Let's handle this gracefully: https://bugzilla.kernel.org/show_bug.cgi?id=211167 */ + log_info("Block device is busy, not discarding partition %" PRIu64 " because it probably is mounted.", p->partno); + return 0; + } if (r == 0) { log_info("Partition %" PRIu64 " too short for discard, skipping.", p->partno); return 0; From be9ce0188ebb414319f0c003f805ea02b5eb473e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 22:45:15 +0100 Subject: [PATCH 10/19] repart: deal with empty partition label sensibly libfdisk appears to return NULL when encountering an empty partition label, let's handle this sanely, and treat NULL and "" for the current label as the same, but for the new label as distinct: there NULL means nothing is set, and "" means an actual empty label. --- src/partition/repart.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 351b3d1b5e..0db8d4aba1 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -963,6 +963,9 @@ static int config_parse_label( assert(rvalue); assert(label); + /* Nota bene: the empty label is a totally valid one. Let's hence not follow our usual rule of + * assigning the empty string to reset to default here, but really accept it as label to set. 
*/ + r = specifier_printf(rvalue, specifier_table, NULL, &resolved); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, @@ -1873,7 +1876,7 @@ static int context_dump_partitions(Context *context, const char *node) { r = table_add_many( t, TABLE_STRING, gpt_partition_type_uuid_to_string_harder(p->type_uuid, uuid_buffer), - TABLE_STRING, label ?: "-", TABLE_SET_COLOR, label ? NULL : ansi_grey(), + TABLE_STRING, empty_to_null(label) ?: "-", TABLE_SET_COLOR, empty_to_null(label) ? NULL : ansi_grey(), TABLE_UUID, sd_id128_is_null(p->new_uuid) ? p->current_uuid : p->new_uuid, TABLE_STRING, p->definition_path ? basename(p->definition_path) : "-", TABLE_SET_COLOR, p->definition_path ? NULL : ansi_grey(), TABLE_STRING, partname ?: "-", TABLE_SET_COLOR, partname ? NULL : ansi_highlight(), @@ -2444,7 +2447,7 @@ static int partition_encrypt( volume_key, volume_key_size, &(struct crypt_params_luks2) { - .label = p->new_label, + .label = strempty(p->new_label), .sector_size = 512U, }); if (r < 0) @@ -2838,7 +2841,7 @@ static int context_mkfs(Context *context) { if (r < 0) return r; - r = make_filesystem(fsdev, p->format, p->new_label, fs_uuid, arg_discard); + r = make_filesystem(fsdev, p->format, strempty(p->new_label), fs_uuid, arg_discard); if (r < 0) { encrypted_dev_fd = safe_close(encrypted_dev_fd); (void) deactivate_luks(cd, encrypted); @@ -3025,7 +3028,7 @@ static int context_acquire_partition_uuids_and_labels(Context *context) { if (p->current_label) { free(p->new_label); - p->new_label = strdup(p->current_label); + p->new_label = strdup(strempty(p->current_label)); if (!p->new_label) return log_oom(); } @@ -3108,9 +3111,7 @@ static int context_mangle_partitions(Context *context) { } if (!streq_ptr(p->new_label, p->current_label)) { - assert(!isempty(p->new_label)); - - r = fdisk_partition_set_name(p->current_partition, p->new_label); + r = fdisk_partition_set_name(p->current_partition, strempty(p->new_label)); if (r < 0) return log_error_errno(r, "Failed to set 
partition label: %m"); @@ -3134,7 +3135,7 @@ static int context_mangle_partitions(Context *context) { assert(p->offset % 512 == 0); assert(p->new_size % 512 == 0); assert(!sd_id128_is_null(p->new_uuid)); - assert(!isempty(p->new_label)); + assert(p->new_label); t = fdisk_new_parttype(); if (!t) @@ -3172,7 +3173,7 @@ static int context_mangle_partitions(Context *context) { if (r < 0) return log_error_errno(r, "Failed to set partition UUID: %m"); - r = fdisk_partition_set_name(q, p->new_label); + r = fdisk_partition_set_name(q, strempty(p->new_label)); if (r < 0) return log_error_errno(r, "Failed to set partition label: %m"); From 78eee6ce4d45528e63d0857b4629de39a2140db5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 19 Mar 2021 22:45:28 +0100 Subject: [PATCH 11/19] repart: use free_and_strdup_warn() where appropriate --- src/partition/repart.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/partition/repart.c b/src/partition/repart.c index 0db8d4aba1..bbf4fbbc83 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -3027,10 +3027,9 @@ static int context_acquire_partition_uuids_and_labels(Context *context) { p->new_uuid = p->current_uuid; if (p->current_label) { - free(p->new_label); - p->new_label = strdup(strempty(p->current_label)); - if (!p->new_label) - return log_oom(); + r = free_and_strdup_warn(&p->new_label, strempty(p->current_label)); + if (r < 0) + return r; } continue; @@ -3046,10 +3045,10 @@ static int context_acquire_partition_uuids_and_labels(Context *context) { } if (!isempty(p->current_label)) { - free(p->new_label); - p->new_label = strdup(p->current_label); /* never change initialized labels */ - if (!p->new_label) - return log_oom(); + /* never change initialized labels */ + r = free_and_strdup_warn(&p->new_label, p->current_label); + if (r < 0) + return r; } else if (!p->new_label) { /* Not explicitly set by user! 
*/ From d83d80486326607ed3b6e3c73f1f18e9baabf3ba Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Sat, 20 Mar 2021 14:05:28 +0100 Subject: [PATCH 12/19] repart: add high-level setting for creating dirs in formatted file systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we already had the CopyFiles= option in systemd-repart drop-in files, as a mechanism for populating freshly formatted file systems with files and directories. This adds MakeDirectories= in similar style, and creates simple directories as listed. The option is of course entirely redundant, since the same can be done with CopyFiles= simply by copying in a directory. It's kinda nice to encode the dirs to create directly in the drop-in files however, instead of providing a directory subtree to copy in somewhere, to make the files more self-contained — since often just creating dirs is entirely sufficient. The main usecase for this are GPT OS images that carry only a /usr/ tree, and for which a root file system is only formatted on first boot via repart. Without any additional CopyFiles=/MakeDirectories= configuration these root file systems are entirely empty of course initially. To mount in the /usr/ tree, a directory inode for /usr/ to mount over needs to be created. systemd-nspawn will do so automatically when booting up the image, as will the initrd during boot. However, this requires the image to be writable – which is OK for nspawn and initrd-based boots, but there are plenty tools where read-only operation is desirable after repart ran, before the image was booted for the first time. Specifically, "systemd-dissect" opens the image in read-only to inspect its contents, and this will only work if /usr/ can be properly mounted. Moreover systemd-dissect --mount --read-only won't succeed either if the fs is read-only. 
Via MakeDirectories= we now provide a way that ensures that the image can be mounted/inspected in a fully read-only way immediately after systemd-repart completed. Specifically, let's consider a GPT disk image shipping with a file usr/lib/repart.d/50-root.conf: [Partition] Type=root Format=btrfs MakeDirectories=/usr MakeDirectories=/efi With this in place systemd-repart will create a root partition when run, and add /usr and /efi into it as directory inodes. This ensures that the whole image can then be mounted truly read-only and /usr and /efi can be overmounted by the /usr partition and the ESP. --- man/repart.d.xml | 27 ++++++++++++++ src/partition/repart.c | 84 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 6 deletions(-) diff --git a/man/repart.d.xml b/man/repart.d.xml index 67f687947e..43540234a7 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -493,6 +493,33 @@ specified root directory or disk image root. + + MakeDirectories= + + Takes one or more absolute paths, separated by whitespace, each declaring a directory + to create within the new file system. Behaviour is similar to CopyFiles=, but + instead of copying in a set of files this just creates the specified directories with the default + mode of 0755 owned by the root user and group, plus all their parent directories (with the same + ownership and access mode). To configure directories with different ownership or access mode, use + CopyFiles= and specify a source tree to copy containing appropriately + owned/configured directories. This option may be used more than once to create multiple + directories. When CopyFiles= and MakeDirectories= are used + together the former is applied first. If a directory listed already exists no operation is executed + (in particular, the ownership/access mode of the directories is left as is). 
+ + The primary usecase for this option is to create a minimal set of directories that may be + mounted over by other partitions contained in the same disk image. For example, a disk image where + the root file system is formatted at first boot might want to automatically pre-create + /usr/ in it this way, so that the usr partition may + over-mount it. + + Consider using + systemd-tmpfiles8 + with its option to pre-create other, more complex directory hierarchies (as + well as other inodes) with fine-grained control of ownership, access modes and other file + attributes. + + Encrypt= diff --git a/src/partition/repart.c b/src/partition/repart.c index bbf4fbbc83..b011eb8895 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -162,6 +162,7 @@ struct Partition { char *format; char **copy_files; + char **make_directories; EncryptMode encrypt; LIST_FIELDS(Partition, partitions); @@ -258,6 +259,7 @@ static Partition* partition_free(Partition *p) { free(p->format); strv_free(p->copy_files); + strv_free(p->make_directories); return mfree(p); } @@ -1160,6 +1162,55 @@ static int config_parse_copy_files( return 0; } +static int config_parse_make_dirs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Partition *partition = data; + const char *p = rvalue; + int r; + + assert(rvalue); + assert(partition); + + for (;;) { + _cleanup_free_ char *word = NULL, *d = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = specifier_printf(word, specifier_table, NULL, &d); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in MakeDirectories= parameter, ignoring: %s", 
word); + continue; + } + + r = path_simplify_and_warn(d, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = strv_consume(&partition->make_directories, TAKE_PTR(d)); + if (r < 0) + return log_oom(); + } +} + static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_encrypt, encrypt_mode, EncryptMode, ENCRYPT_OFF, "Invalid encryption mode"); static int partition_read_definition(Partition *p, const char *path) { @@ -1179,6 +1230,7 @@ static int partition_read_definition(Partition *p, const char *path) { { "Partition", "CopyBlocks", config_parse_path, 0, &p->copy_blocks_path }, { "Partition", "Format", config_parse_fstype, 0, &p->format }, { "Partition", "CopyFiles", config_parse_copy_files, 0, p }, + { "Partition", "MakeDirectories", config_parse_make_dirs, 0, p }, { "Partition", "Encrypt", config_parse_encrypt, 0, &p->encrypt }, {} }; @@ -1205,15 +1257,15 @@ static int partition_read_definition(Partition *p, const char *path) { return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), "Type= not defined, refusing."); - if (p->copy_blocks_path && (p->format || !strv_isempty(p->copy_files))) + if (p->copy_blocks_path && (p->format || !strv_isempty(p->copy_files) || !strv_isempty(p->make_directories))) return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), "Format= and CopyBlocks= cannot be combined, refusing."); - if (!strv_isempty(p->copy_files) && streq_ptr(p->format, "swap")) + if ((!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories)) && streq_ptr(p->format, "swap")) return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), "Format=swap and CopyFiles= cannot be combined, refusing."); - if (!p->format && (!strv_isempty(p->copy_files) || (p->encrypt != ENCRYPT_OFF && !p->copy_blocks_path))) { + if (!p->format && (!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories) || (p->encrypt != ENCRYPT_OFF && !p->copy_blocks_path))) { /* Pick "ext4" as file system if we are configured to 
copy files or encrypt the device */ p->format = strdup("ext4"); if (!p->format) @@ -2732,13 +2784,30 @@ static int do_copy_files(Partition *p, const char *fs) { return 0; } -static int partition_copy_files(Partition *p, const char *node) { +static int do_make_directories(Partition *p, const char *fs) { + char **d; + int r; + + assert(p); + assert(fs); + + STRV_FOREACH(d, p->make_directories) { + + r = mkdir_p_root(fs, *d, UID_INVALID, GID_INVALID, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create directory '%s' in file system: %m", *d); + } + + return 0; +} + +static int partition_populate(Partition *p, const char *node) { int r; assert(p); assert(node); - if (strv_isempty(p->copy_files)) + if (strv_isempty(p->copy_files) && strv_isempty(p->make_directories)) return 0; log_info("Populating partition %" PRIu64 " with files.", p->partno); @@ -2766,6 +2835,9 @@ static int partition_copy_files(Partition *p, const char *node) { if (do_copy_files(p, fs) < 0) _exit(EXIT_FAILURE); + if (do_make_directories(p, fs) < 0) + _exit(EXIT_FAILURE); + r = syncfs_path(AT_FDCWD, fs); if (r < 0) { log_error_errno(r, "Failed to synchronize written files: %m"); @@ -2855,7 +2927,7 @@ static int context_mkfs(Context *context) { if (flock(encrypted_dev_fd, LOCK_UN) < 0) return log_error_errno(errno, "Failed to unlock LUKS device: %m"); - r = partition_copy_files(p, fsdev); + r = partition_populate(p, fsdev); if (r < 0) { encrypted_dev_fd = safe_close(encrypted_dev_fd); (void) deactivate_luks(cd, encrypted); From f3859d5f5591494c3022d6e9e4af1538174526fd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 22 Mar 2021 21:22:22 +0100 Subject: [PATCH 13/19] loop-util: store device major/minor in LoopDevice object Let's store this away. It's useful when matching up mounts (i.e. struct stat's .st_dev field) with loopback devices. 
--- src/shared/loop-util.c | 6 ++++++ src/shared/loop-util.h | 1 + 2 files changed, 7 insertions(+) diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c index 84f415aa61..2aa4936965 100644 --- a/src/shared/loop-util.c +++ b/src/shared/loop-util.c @@ -353,6 +353,7 @@ int loop_device_make( .nr = nr, .node = TAKE_PTR(loopdev), .relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */ + .devno = st.st_rdev, }; *ret = d; @@ -425,6 +426,10 @@ int loop_device_make( UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); } + if (fstat(loop_with_fd, &st) < 0) + return -errno; + assert(S_ISBLK(st.st_mode)); + d = new(LoopDevice, 1); if (!d) return -ENOMEM; @@ -432,6 +437,7 @@ int loop_device_make( .fd = TAKE_FD(loop_with_fd), .node = TAKE_PTR(loopdev), .nr = nr, + .devno = st.st_rdev, }; *ret = d; diff --git a/src/shared/loop-util.h b/src/shared/loop-util.h index 9538daea31..619b34716b 100644 --- a/src/shared/loop-util.h +++ b/src/shared/loop-util.h @@ -10,6 +10,7 @@ typedef struct LoopDevice LoopDevice; struct LoopDevice { int fd; int nr; + dev_t devno; char *node; bool relinquished; }; From e81acfd251eb9e6afc6cc7ee589f89e9553935f5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 22 Mar 2021 21:23:40 +0100 Subject: [PATCH 14/19] gpt: add some simple helpers for categorizing GPT partition types --- src/shared/gpt.c | 40 ++++++++++++++++++++++++++++++++++++++++ src/shared/gpt.h | 5 +++++ 2 files changed, 45 insertions(+) diff --git a/src/shared/gpt.c b/src/shared/gpt.c index a96f5ee02d..5f2de0d947 100644 --- a/src/shared/gpt.c +++ b/src/shared/gpt.c @@ -106,3 +106,43 @@ int gpt_partition_label_valid(const char *s) { return char16_strlen(recoded) <= 36; } + +bool gpt_partition_type_is_root(sd_id128_t id) { + return sd_id128_equal(id, GPT_ROOT_X86) || + sd_id128_equal(id, GPT_ROOT_X86_64) || + sd_id128_equal(id, GPT_ROOT_ARM) || + sd_id128_equal(id, GPT_ROOT_ARM_64) || + sd_id128_equal(id, GPT_ROOT_IA64) || + 
sd_id128_equal(id, GPT_ROOT_RISCV32) || + sd_id128_equal(id, GPT_ROOT_RISCV64); +} + +bool gpt_partition_type_is_root_verity(sd_id128_t id) { + return sd_id128_equal(id, GPT_ROOT_X86_VERITY) || + sd_id128_equal(id, GPT_ROOT_X86_64_VERITY) || + sd_id128_equal(id, GPT_ROOT_ARM_VERITY) || + sd_id128_equal(id, GPT_ROOT_ARM_64_VERITY) || + sd_id128_equal(id, GPT_ROOT_IA64_VERITY) || + sd_id128_equal(id, GPT_ROOT_RISCV32_VERITY) || + sd_id128_equal(id, GPT_ROOT_RISCV64_VERITY); +} + +bool gpt_partition_type_is_usr(sd_id128_t id) { + return sd_id128_equal(id, GPT_USR_X86) || + sd_id128_equal(id, GPT_USR_X86_64) || + sd_id128_equal(id, GPT_USR_ARM) || + sd_id128_equal(id, GPT_USR_ARM_64) || + sd_id128_equal(id, GPT_USR_IA64) || + sd_id128_equal(id, GPT_USR_RISCV32) || + sd_id128_equal(id, GPT_USR_RISCV64); +} + +bool gpt_partition_type_is_usr_verity(sd_id128_t id) { + return sd_id128_equal(id, GPT_USR_X86_VERITY) || + sd_id128_equal(id, GPT_USR_X86_64_VERITY) || + sd_id128_equal(id, GPT_USR_ARM_VERITY) || + sd_id128_equal(id, GPT_USR_ARM_64_VERITY) || + sd_id128_equal(id, GPT_USR_IA64_VERITY) || + sd_id128_equal(id, GPT_USR_RISCV32_VERITY) || + sd_id128_equal(id, GPT_USR_RISCV64_VERITY); +} diff --git a/src/shared/gpt.h b/src/shared/gpt.h index 2e0f50c3c6..22b1d68d5f 100644 --- a/src/shared/gpt.h +++ b/src/shared/gpt.h @@ -128,3 +128,8 @@ typedef struct GptPartitionType { extern const GptPartitionType gpt_partition_type_table[]; int gpt_partition_label_valid(const char *s); + +bool gpt_partition_type_is_root(sd_id128_t id); +bool gpt_partition_type_is_root_verity(sd_id128_t id); +bool gpt_partition_type_is_usr(sd_id128_t id); +bool gpt_partition_type_is_usr_verity(sd_id128_t id); From 5c08da586fc8fe7cda4010e0057cb79ba1d74335 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 23 Mar 2021 14:12:38 +0100 Subject: [PATCH 15/19] repart: add CopyBlocks=auto support When using systemd-repart as an installer that replicates the install medium on another medium it is 
useful to reference the root partition/usr partition or verity data that is currently booted, in particular in A/B scenarios where we have two copies and want to reference the one we currently use. Let's add a CopyBlocks=auto for this case: for a partition that uses that we'll copy a suitable partition from the host. CopyBlocks=auto finds the partition to copy like this: based on the configured partition type uuid we determine the usual mount point (i.e. for the /usr partition type we determine /usr/, and so on). We then figure out the block device behind that path, through dm-verity and dm-crypt if necessary. Finally, we compare the partition type uuid of the partition found that way with the one we are supposed to fill and only use it if it matches (the latter is primarily important on dm-verity setups where a volume is likely backed by two partitions and we need to find the right one). This is particularly fun to use in conjunction with --image= (where we'll restrict the device search onto the specified device, for security reasons), as this allows "duplicating" an image like this: # systemd-repart --image=source.raw --empty=create --size=auto target.raw If the right repart data is embedded into "source.raw" this will be able to create and initialize a partition table on target.raw that carries all needed partitions, and will stream the source's file systems onto it as configured. --- man/repart.d.xml | 25 ++- src/partition/repart.c | 463 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 446 insertions(+), 42 deletions(-) diff --git a/man/repart.d.xml b/man/repart.d.xml index 43540234a7..d404645588 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -422,12 +422,25 @@ CopyBlocks= - Takes a path to a regular file, block device node or directory. If specified and the - partition is newly created the data from the specified path is written to the newly created - partition, on the block level. 
If a directory is specified the backing block device of the file - system the directory is on is determined and the data read directly from that. This option is useful - to efficiently replicate existing file systems on the block level on a new partition, for example to - build a simple OS installer or OS image builder. + Takes a path to a regular file, block device node or directory, or the special value + auto. If specified and the partition is newly created, the data from the specified + path is written to the newly created partition, on the block level. If a directory is specified, the + backing block device of the file system the directory is on is determined, and the data read directly + from that. This option is useful to efficiently replicate existing file systems onto new partitions + on the block level — for example to build a simple OS installer or an OS image builder. + + If the special value auto is specified, the source to copy from is + automatically picked up from the running system (or the image specified with + --image= — if used). A partition that matches both the configured partition type (as + declared with Type= above), and the currently mounted directory appropriate for + that partition type is determined. For example, if the partition type is set to + root the partition backing the root directory (/) is used as + source to copy from — if its partition type is set to root as well. If the + declared type is usr the partition backing /usr/ is used as + source to copy blocks from — if its partition type is set to usr too. The logic is + capable of automatically tracking down the backing partitions for encrypted and Verity-enabled + volumes. CopyBlocks=auto is useful for implementing "self-replicating" systems, + i.e. systems that are their own installer. The file specified here must have a size that is a multiple of the basic block size 512 and not be empty. 
If this option is used, the size allocation algorithm is slightly altered: the partition is diff --git a/src/partition/repart.c b/src/partition/repart.c index b011eb8895..12ad0dd49e 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -26,6 +26,7 @@ #include "conf-parser.h" #include "cryptsetup-util.h" #include "def.h" +#include "dirent-util.h" #include "efivars.h" #include "errno-util.h" #include "fd-util.h" @@ -44,6 +45,7 @@ #include "mkdir.h" #include "mkfs-util.h" #include "mount-util.h" +#include "mountpoint-util.h" #include "parse-argument.h" #include "parse-util.h" #include "path-util.h" @@ -157,6 +159,7 @@ struct Partition { FreeArea *allocated_to_area; char *copy_blocks_path; + bool copy_blocks_auto; int copy_blocks_fd; uint64_t copy_blocks_size; @@ -1162,6 +1165,53 @@ static int config_parse_copy_files( return 0; } +static int config_parse_copy_blocks( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *d = NULL; + Partition *partition = data; + int r; + + assert(rvalue); + assert(partition); + + if (isempty(rvalue)) { + partition->copy_blocks_path = mfree(partition->copy_blocks_path); + partition->copy_blocks_auto = false; + return 0; + } + + if (streq(rvalue, "auto")) { + partition->copy_blocks_path = mfree(partition->copy_blocks_path); + partition->copy_blocks_auto = true; + return 0; + } + + r = specifier_printf(rvalue, specifier_table, NULL, &d); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in CopyBlocks= source path, ignoring: %s", rvalue); + return 0; + } + + r = path_simplify_and_warn(d, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + free_and_replace(partition->copy_blocks_path, d); + partition->copy_blocks_auto = false; + return 0; +} + static int config_parse_make_dirs( const 
char *unit, const char *filename, @@ -1216,22 +1266,22 @@ static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_encrypt, encrypt_mode, static int partition_read_definition(Partition *p, const char *path) { ConfigTableItem table[] = { - { "Partition", "Type", config_parse_type, 0, &p->type_uuid }, - { "Partition", "Label", config_parse_label, 0, &p->new_label }, - { "Partition", "UUID", config_parse_id128, 0, &p->new_uuid }, - { "Partition", "Priority", config_parse_int32, 0, &p->priority }, - { "Partition", "Weight", config_parse_weight, 0, &p->weight }, - { "Partition", "PaddingWeight", config_parse_weight, 0, &p->padding_weight }, - { "Partition", "SizeMinBytes", config_parse_size4096, 1, &p->size_min }, - { "Partition", "SizeMaxBytes", config_parse_size4096, -1, &p->size_max }, - { "Partition", "PaddingMinBytes", config_parse_size4096, 1, &p->padding_min }, - { "Partition", "PaddingMaxBytes", config_parse_size4096, -1, &p->padding_max }, - { "Partition", "FactoryReset", config_parse_bool, 0, &p->factory_reset }, - { "Partition", "CopyBlocks", config_parse_path, 0, &p->copy_blocks_path }, - { "Partition", "Format", config_parse_fstype, 0, &p->format }, - { "Partition", "CopyFiles", config_parse_copy_files, 0, p }, - { "Partition", "MakeDirectories", config_parse_make_dirs, 0, p }, - { "Partition", "Encrypt", config_parse_encrypt, 0, &p->encrypt }, + { "Partition", "Type", config_parse_type, 0, &p->type_uuid }, + { "Partition", "Label", config_parse_label, 0, &p->new_label }, + { "Partition", "UUID", config_parse_id128, 0, &p->new_uuid }, + { "Partition", "Priority", config_parse_int32, 0, &p->priority }, + { "Partition", "Weight", config_parse_weight, 0, &p->weight }, + { "Partition", "PaddingWeight", config_parse_weight, 0, &p->padding_weight }, + { "Partition", "SizeMinBytes", config_parse_size4096, 1, &p->size_min }, + { "Partition", "SizeMaxBytes", config_parse_size4096, -1, &p->size_max }, + { "Partition", "PaddingMinBytes", config_parse_size4096, 1, 
&p->padding_min }, + { "Partition", "PaddingMaxBytes", config_parse_size4096, -1, &p->padding_max }, + { "Partition", "FactoryReset", config_parse_bool, 0, &p->factory_reset }, + { "Partition", "CopyBlocks", config_parse_copy_blocks, 0, p }, + { "Partition", "Format", config_parse_fstype, 0, &p->format }, + { "Partition", "CopyFiles", config_parse_copy_files, 0, p }, + { "Partition", "MakeDirectories", config_parse_make_dirs, 0, p }, + { "Partition", "Encrypt", config_parse_encrypt, 0, &p->encrypt }, {} }; int r; @@ -1257,15 +1307,16 @@ static int partition_read_definition(Partition *p, const char *path) { return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), "Type= not defined, refusing."); - if (p->copy_blocks_path && (p->format || !strv_isempty(p->copy_files) || !strv_isempty(p->make_directories))) + if ((p->copy_blocks_path || p->copy_blocks_auto) && + (p->format || !strv_isempty(p->copy_files) || !strv_isempty(p->make_directories))) return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), - "Format= and CopyBlocks= cannot be combined, refusing."); + "Format=/CopyFiles=/MakeDirectories= and CopyBlocks= cannot be combined, refusing."); if ((!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories)) && streq_ptr(p->format, "swap")) return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), "Format=swap and CopyFiles= cannot be combined, refusing."); - if (!p->format && (!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories) || (p->encrypt != ENCRYPT_OFF && !p->copy_blocks_path))) { + if (!p->format && (!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories) || (p->encrypt != ENCRYPT_OFF && !(p->copy_blocks_path || p->copy_blocks_auto)))) { /* Pick "ext4" as file system if we are configured to copy files or encrypt the device */ p->format = strdup("ext4"); if (!p->format) @@ -3459,7 +3510,311 @@ static int context_can_factory_reset(Context *context) { return false; } -static int 
context_open_copy_block_paths(Context *context) { +static int resolve_copy_blocks_auto_candidate( + dev_t partition_devno, + sd_id128_t partition_type_uuid, + dev_t restrict_devno, + sd_id128_t *ret_uuid) { + + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -1; + const char *pttype, *t; + sd_id128_t pt_parsed, u; + blkid_partition pp; + dev_t whole_devno; + blkid_partlist pl; + struct stat st; + int r; + + /* Checks if the specified partition has the specified GPT type UUID, and is located on the specified + * 'restrict_devno' device. The type check is particularly relevant if we have Verity volume which is + * backed by two separate partitions: the data and the hash partitions, and we need to find the right + * one of the two. */ + + r = block_get_whole_disk(partition_devno, &whole_devno); + if (r < 0) + return log_error_errno( + r, + "Unable to determine containing block device of partition %u:%u: %m", + major(partition_devno), minor(partition_devno)); + + if (restrict_devno != (dev_t) -1 && + restrict_devno != whole_devno) + return log_error_errno( + SYNTHETIC_ERRNO(EPERM), + "Partition %u:%u is located outside of block device %u:%u, refusing.", + major(partition_devno), minor(partition_devno), + major(restrict_devno), minor(restrict_devno)); + + r = device_path_make_major_minor(S_IFBLK, whole_devno, &p); + if (r < 0) + return log_error_errno(r, "Failed to convert block device to device node path: %m"); + + fd = open(p, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return log_error_errno(r, "Failed to open '%s': %m", p); + + if (fstat(fd, &st) < 0) + return log_error_errno(r, "Failed to stat '%s': %m", p); + + if (!S_ISBLK(st.st_mode) || st.st_rdev != whole_devno) + return log_error_errno( + SYNTHETIC_ERRNO(EPERM), + "Opened and determined block device don't match, refusing."); + + b = blkid_new_probe(); + if (!b) + return log_oom(); + + errno = 0; + r = blkid_probe_set_device(b, fd, 0, 0); + if 
(r != 0) + return log_error_errno(errno_or_else(ENOMEM), "Failed to open block device '%s': %m", p); + + (void) blkid_probe_enable_partitions(b, 1); + (void) blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (IN_SET(r, -2, 1)) { /* nothing found or ambiguous result */ + log_debug("Didn't find partition table on block device '%s'.", p); + return false; + } + if (r != 0) + return log_error_errno(errno_or_else(EIO), "Unable to probe for partition table of '%s': %m", p); + + (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL); + if (!streq_ptr(pttype, "gpt")) { + log_debug("Didn't find a GPT partition table on '%s'.", p); + return false; + } + + errno = 0; + pl = blkid_probe_get_partitions(b); + if (!pl) + return log_error_errno(errno_or_else(EIO), "Unable read partition table of '%s': %m", p); + errno = 0; + + pp = blkid_partlist_devno_to_partition(pl, partition_devno); + if (!pp) { + log_debug("Partition %u:%u has no matching partition table entry on '%s'.", + major(partition_devno), minor(partition_devno), p); + return false; + } + + t = blkid_partition_get_type_string(pp); + if (isempty(t)) { + log_debug("Partition %u:%u has no type on '%s'.", + major(partition_devno), minor(partition_devno), p); + return false; + } + + r = sd_id128_from_string(t, &pt_parsed); + if (r < 0) { + log_debug_errno(r, "Failed to parse partition type \"%s\": %m", t); + return false; + } + + if (!sd_id128_equal(pt_parsed, partition_type_uuid)) { + log_debug("Partition %u:%u has non-matching partition type " SD_ID128_FORMAT_STR " (needed: " SD_ID128_FORMAT_STR "), ignoring.", + major(partition_devno), minor(partition_devno), + SD_ID128_FORMAT_VAL(pt_parsed), SD_ID128_FORMAT_VAL(partition_type_uuid)); + return false; + } + + t = blkid_partition_get_uuid(pp); + if (isempty(t)) { + log_debug("Partition %u:%u has no UUID.", + major(partition_devno), minor(partition_devno)); + return false; + } + + r = sd_id128_from_string(t, 
&u); + if (r < 0) { + log_debug_errno(r, "Failed to parse partition UUID \"%s\": %m", t); + return false; + } + + log_debug("Automatically found partition %u:%u of right type " SD_ID128_FORMAT_STR ".", + major(partition_devno), minor(partition_devno), + SD_ID128_FORMAT_VAL(pt_parsed)); + + if (ret_uuid) + *ret_uuid = u; + + return true; +} + +static int find_backing_devno( + const char *path, + const char *root, + dev_t *ret) { + + _cleanup_free_ char *resolved = NULL; + int r; + + assert(path); + + r = chase_symlinks(path, root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r < 0) + return r; + + r = path_is_mount_point(resolved, NULL, 0); + if (r < 0) + return r; + if (r == 0) /* Not a mount point, then it's not a partition of its own, let's not automatically use it. */ + return -ENOENT; + + r = get_block_device(resolved, ret); + if (r < 0) + return r; + if (r == 0) /* Not backed by physical file system, we can't use this */ + return -ENOENT; + + return 0; +} + +static int resolve_copy_blocks_auto( + sd_id128_t type_uuid, + const char *root, + dev_t restrict_devno, + char **ret_path, + sd_id128_t *ret_uuid) { + + const char *try1 = NULL, *try2 = NULL; + char p[SYS_BLOCK_PATH_MAX("/slaves")]; + _cleanup_(closedirp) DIR *d = NULL; + sd_id128_t found_uuid = SD_ID128_NULL; + dev_t devno, found = 0; + int r; + + assert(ret_path); + + /* Enforce some security restrictions: CopyBlocks=auto should not be an avenue to get outside of the + * --root=/--image= confinement. Specifically, refuse CopyBlocks= in combination with --root= at all, + * and restrict block device references in the --image= case to loopback block device we set up. + * + * restrict_devno contain the dev_t of the loop back device we operate on in case of --image=, and + * thus declares which device (and its partition subdevices) we shall limit access to. 
If + * restrict_devno is zero no device probing access shall be allowed at all (used for --root=) and if + * it is (dev_t) -1 then free access shall be allowed (if neither switch is used). */ + + if (restrict_devno == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Automatic discovery of backing block devices not permitted in --root= mode, refusing."); + + /* Handles CopyBlocks=auto, and finds the right source partition to copy from. We look for matching + * partitions in the host, using the appropriate directory as key and ensuring that the partition + * type matches. */ + + if (gpt_partition_type_is_root(type_uuid)) + try1 = "/"; + else if (gpt_partition_type_is_usr(type_uuid)) + try1 = "/usr/"; + else if (gpt_partition_type_is_root_verity(type_uuid)) + try1 = "/"; + else if (gpt_partition_type_is_usr_verity(type_uuid)) + try1 = "/usr/"; + else if (sd_id128_equal(type_uuid, GPT_ESP)) { + try1 = "/efi/"; + try2 = "/boot/"; + } else if (sd_id128_equal(type_uuid, GPT_XBOOTLDR)) + try1 = "/boot/"; + else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Partition type " SD_ID128_FORMAT_STR " not supported from automatic source block device discovery.", + SD_ID128_FORMAT_VAL(type_uuid)); + + r = find_backing_devno(try1, root, &devno); + if (r == -ENOENT && try2) + r = find_backing_devno(try2, root, &devno); + if (r < 0) + return log_error_errno(r, "Failed to resolve automatic CopyBlocks= path for partition type " SD_ID128_FORMAT_STR ", sorry: %m", + SD_ID128_FORMAT_VAL(type_uuid)); + + xsprintf_sys_block_path(p, "/slaves", devno); + d = opendir(p); + if (d) { + struct dirent *de; + + for (;;) { + _cleanup_free_ char *q = NULL, *t = NULL; + sd_id128_t u; + dev_t sl; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return log_error_errno(errno, "Failed to read directory '%s': %m", p); + + break; + } + + if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN)) + continue; + + q = path_join(p, de->d_name, "/dev"); + if (!q) + return log_oom(); + 
+ r = read_one_line_file(q, &t); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", q); + + r = parse_dev(t, &sl); + if (r < 0) { + log_debug_errno(r, "Failed to parse %s, ignoring: %m", q); + continue; + } + if (major(sl) == 0) { + log_debug_errno(r, "Device backing %s is special, ignoring: %m", q); + continue; + } + + r = resolve_copy_blocks_auto_candidate(sl, type_uuid, restrict_devno, &u); + if (r < 0) + return r; + if (r > 0) { + /* We found a matching one! */ + if (found != 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "Multiple matching partitions found, refusing."); + + found = sl; + found_uuid = u; + } + } + } else if (errno != ENOENT) + return log_error_errno(errno, "Failed open %s: %m", p); + else { + r = resolve_copy_blocks_auto_candidate(devno, type_uuid, restrict_devno, &found_uuid); + if (r < 0) + return r; + if (r > 0) + found = devno; + } + + if (found == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "Unable to automatically discover suitable partition to copy blocks from."); + + r = device_path_make_major_minor(S_IFBLK, found, ret_path); + if (r < 0) + return log_error_errno(r, "Failed to convert dev_t to device node path: %m"); + + if (ret_uuid) + *ret_uuid = found_uuid; + + return 0; +} + +static int context_open_copy_block_paths( + Context *context, + const char *root, + dev_t restrict_devno) { + Partition *p; int r; @@ -3467,6 +3822,8 @@ static int context_open_copy_block_paths(Context *context) { LIST_FOREACH(partitions, p, context->partitions) { _cleanup_close_ int source_fd = -1; + _cleanup_free_ char *opened = NULL; + sd_id128_t uuid = SD_ID128_NULL; uint64_t size; struct stat st; @@ -3476,16 +3833,39 @@ static int context_open_copy_block_paths(Context *context) { if (PARTITION_EXISTS(p)) /* Never copy over partitions that already exist! 
*/ continue; - if (!p->copy_blocks_path) + if (p->copy_blocks_path) { + + source_fd = chase_symlinks_and_open(p->copy_blocks_path, root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &opened); + if (source_fd < 0) + return log_error_errno(source_fd, "Failed to open '%s': %m", p->copy_blocks_path); + + if (fstat(source_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat block copy file '%s': %m", opened); + + if (!S_ISREG(st.st_mode) && restrict_devno != (dev_t) -1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Copying from block device node is not permitted in --image=/--root= mode, refusing."); + + } else if (p->copy_blocks_auto) { + + r = resolve_copy_blocks_auto(p->type_uuid, root, restrict_devno, &opened, &uuid); + if (r < 0) + return r; + + source_fd = open(opened, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (source_fd < 0) + return log_error_errno(errno, "Failed to open automatically determined source block copy device '%s': %m", opened); + + if (fstat(source_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat block copy file '%s': %m", opened); + + /* If we found it automatically, it must be a block device, let's enforce that */ + if (!S_ISBLK(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), + "Automatically detected source block copy device '%s' is not a block device, refusing: %m", opened); + } else continue; - source_fd = open(p->copy_blocks_path, O_RDONLY|O_CLOEXEC|O_NOCTTY); - if (source_fd < 0) - return log_error_errno(errno, "Failed to open block copy file '%s': %m", p->copy_blocks_path); - - if (fstat(source_fd, &st) < 0) - return log_error_errno(errno, "Failed to stat block copy file '%s': %m", p->copy_blocks_path); - if (S_ISDIR(st.st_mode)) { _cleanup_free_ char *bdev = NULL; @@ -3500,14 +3880,14 @@ static int context_open_copy_block_paths(Context *context) { r = btrfs_get_block_device_fd(source_fd, &devt); if (r == -EUCLEAN) - return btrfs_log_dev_root(LOG_ERR, r, p->copy_blocks_path); + return 
btrfs_log_dev_root(LOG_ERR, r, opened); if (r < 0) - return log_error_errno(r, "Unable to determine backing block device of '%s': %m", p->copy_blocks_path); + return log_error_errno(r, "Unable to determine backing block device of '%s': %m", opened); r = device_path_make_major_minor(S_IFBLK, devt, &bdev); } if (r < 0) - return log_error_errno(r, "Failed to determine block device path for block device backing '%s': %m", p->copy_blocks_path); + return log_error_errno(r, "Failed to determine block device path for block device backing '%s': %m", opened); safe_close(source_fd); @@ -3528,15 +3908,21 @@ static int context_open_copy_block_paths(Context *context) { if (ioctl(source_fd, BLKGETSIZE64, &size) != 0) return log_error_errno(errno, "Failed to determine size of block device to copy from: %m"); } else - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path to copy blocks from '%s' is not a regular file, block device or directory, refusing: %m", p->copy_blocks_path); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path to copy blocks from '%s' is not a regular file, block device or directory, refusing: %m", opened); if (size <= 0) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has zero size, refusing.", p->copy_blocks_path); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has zero size, refusing.", opened); if (size % 512 != 0) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has size that is not multiple of 512, refusing.", p->copy_blocks_path); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has size that is not multiple of 512, refusing.", opened); p->copy_blocks_fd = TAKE_FD(source_fd); p->copy_blocks_size = size; + + free_and_replace(p->copy_blocks_path, opened); + + /* When copying from an existing partition copy that partitions UUID if none is configured explicitly */ + if (sd_id128_is_null(p->new_uuid) && 
!sd_id128_is_null(uuid)) + p->new_uuid = uuid; } return 0; @@ -3970,7 +4356,7 @@ static int acquire_root_devno( if (S_ISBLK(st.st_mode)) { /* Refuse referencing explicit block devices if a root dir is specified, after all we should - * be able to leave the image the root path constraints us to. */ + * not be able to leave the image the root path constrains us to. */ if (root) return -EPERM; @@ -4399,7 +4785,12 @@ static int run(int argc, char *argv[]) { return r; /* Open all files to copy blocks from now, since we want to take their size into consideration */ - r = context_open_copy_block_paths(context); + r = context_open_copy_block_paths( + context, + arg_root, + loop_device ? loop_device->devno : /* if --image= is specified, only allow partitions on the loopback device*/ + arg_root && !arg_image ? 0 : /* if --root= is specified, don't accept any block device */ + (dev_t) -1); /* if neither is specified, make no restrictions */ if (r < 0) return r; From e73309c532999cb15490a78575dd882b24bbe96f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 23 Mar 2021 16:16:42 +0100 Subject: [PATCH 16/19] repart: add new ReadOnly= and Flags= settings for repart dropins Let's make the GPT partition flags configurable when creating new partitions. This is primarily useful for the read-only flag (which we want to set for verity enabled partitions). This adds two settings for this: Flags= and ReadOnly=, which strictly speaking are redundant. The main reason to have both is that usually the ReadOnly= setting is the one one wants to control, and it's more generic. Moreover we might later on introduce inheriting of flags from CopyBlocks= partitions, where one might want to control most flags as is except for the RO flag and similar, hence let's keep them separate.
--- man/repart.d.xml | 23 +++++++++++++ src/partition/repart.c | 75 ++++++++++++++++++++++++++++++++++++++++++ src/shared/gpt.c | 11 +++++++ src/shared/gpt.h | 2 ++ 4 files changed, 111 insertions(+) diff --git a/man/repart.d.xml b/man/repart.d.xml index d404645588..5223f50364 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -565,6 +565,29 @@ factory reset operation. This functionality is useful to implement schemes where images can be reset into their original state by removing partitions and creating them anew. Defaults to off. + + + Flags= + + Configures the 64-bit GPT partition flags to set for the partition when creating + it. This option has no effect if the partition already exists. If not specified the flags value is + set to all zeroes, except if the partition type (as configured with Type= above) + refers to a Verity partition, in which case bit 60 is set (i.e. the read-only bit). This bit may also + be configured separately via ReadOnly=, see below. Specify the flags value in + hexadecimal (by prefixing it with 0x), binary (prefix 0b) or + decimal (no prefix). + + + + ReadOnly= + + Configures the Read-Only partition flag (bit 60) of the partition table entry. This + option is a friendly way to set bit 60 of the partition flags value without setting any of the other + bits, and may be set via Flags= too, see above. + + If both Flags= and ReadOnly= are set the latter controls + the value of the flag.
+ diff --git a/src/partition/repart.c b/src/partition/repart.c index 12ad0dd49e..af319d2e33 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -168,6 +168,9 @@ struct Partition { char **make_directories; EncryptMode encrypt; + uint64_t gpt_flags; + int read_only; + LIST_FIELDS(Partition, partitions); }; @@ -239,6 +242,7 @@ static Partition *partition_new(void) { .offset = UINT64_MAX, .copy_blocks_fd = -1, .copy_blocks_size = UINT64_MAX, + .read_only = -1, }; return p; @@ -1263,6 +1267,34 @@ static int config_parse_make_dirs( static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_encrypt, encrypt_mode, EncryptMode, ENCRYPT_OFF, "Invalid encryption mode"); +static int config_parse_gpt_flags( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *gpt_flags = data; + int r; + + assert(rvalue); + assert(gpt_flags); + + r = safe_atou64(rvalue, gpt_flags); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse Flags= value, ignoring: %s", rvalue); + return 0; + } + + return 0; +} + static int partition_read_definition(Partition *p, const char *path) { ConfigTableItem table[] = { @@ -1282,6 +1314,8 @@ static int partition_read_definition(Partition *p, const char *path) { { "Partition", "CopyFiles", config_parse_copy_files, 0, p }, { "Partition", "MakeDirectories", config_parse_make_dirs, 0, p }, { "Partition", "Encrypt", config_parse_encrypt, 0, &p->encrypt }, + { "Partition", "Flags", config_parse_gpt_flags, 0, &p->gpt_flags }, + { "Partition", "ReadOnly", config_parse_tristate, 0, &p->read_only }, {} }; int r; @@ -1323,6 +1357,12 @@ static int partition_read_definition(Partition *p, const char *path) { return log_oom(); } + /* Verity partitions are read only, let's imply the RO flag hence, unless explicitly configured otherwise. 
*/ + if ((gpt_partition_type_is_root_verity(p->type_uuid) || + gpt_partition_type_is_usr_verity(p->type_uuid)) && + p->read_only < 0) + p->read_only = true; + return 0; } @@ -3184,6 +3224,24 @@ static int context_acquire_partition_uuids_and_labels(Context *context) { return 0; } +static int set_gpt_flags(struct fdisk_partition *q, uint64_t flags) { + _cleanup_free_ char *a = NULL; + + for (unsigned i = 0; i < sizeof(flags) * 8; i++) { + uint64_t bit = UINT64_C(1) << i; + char buf[DECIMAL_STR_MAX(unsigned)+1]; + + if (!FLAGS_SET(flags, bit)) + continue; + + xsprintf(buf, "%u", i); + if (!strextend_with_separator(&a, ",", buf)) + return -ENOMEM; + } + + return fdisk_partition_set_attrs(q, a); +} + static int context_mangle_partitions(Context *context) { Partition *p; int r; @@ -3252,6 +3310,7 @@ static int context_mangle_partitions(Context *context) { _cleanup_(fdisk_unref_partitionp) struct fdisk_partition *q = NULL; _cleanup_(fdisk_unref_parttypep) struct fdisk_parttype *t = NULL; char ids[ID128_UUID_STRING_MAX]; + uint64_t f; assert(!p->new_partition); assert(p->offset % 512 == 0); @@ -3299,6 +3358,22 @@ static int context_mangle_partitions(Context *context) { if (r < 0) return log_error_errno(r, "Failed to set partition label: %m"); + /* Merge the read only setting with the literal flags */ + f = p->gpt_flags; + if (p->read_only >= 0) { + if (gpt_partition_type_knows_read_only(p->type_uuid)) + SET_FLAG(f, GPT_FLAG_READ_ONLY, p->read_only); + else { + char buffer[ID128_UUID_STRING_MAX]; + log_warning("Configured ReadOnly=yes for partition type '%s' that doesn't support it, ignoring.", + gpt_partition_type_uuid_to_string_harder(p->type_uuid, buffer)); + } + } + + r = set_gpt_flags(q, f); + if (r < 0) + return log_error_errno(r, "Failed to set GPT partition flags: %m"); + log_info("Adding new partition %" PRIu64 " to partition table.", p->partno); r = fdisk_add_partition(context->fdisk_context, q, NULL); diff --git a/src/shared/gpt.c b/src/shared/gpt.c index 
5f2de0d947..846b2fe48f 100644 --- a/src/shared/gpt.c +++ b/src/shared/gpt.c @@ -146,3 +146,14 @@ bool gpt_partition_type_is_usr_verity(sd_id128_t id) { sd_id128_equal(id, GPT_USR_RISCV32_VERITY) || sd_id128_equal(id, GPT_USR_RISCV64_VERITY); } + +bool gpt_partition_type_knows_read_only(sd_id128_t id) { + return gpt_partition_type_is_root(id) || + gpt_partition_type_is_usr(id) || + sd_id128_equal(id, GPT_HOME) || + sd_id128_equal(id, GPT_SRV) || + sd_id128_equal(id, GPT_VAR) || + sd_id128_equal(id, GPT_TMP) || + gpt_partition_type_is_root_verity(id) || /* pretty much implied, but let's set the bit to make things really clear */ + gpt_partition_type_is_usr_verity(id); /* ditto */ +} diff --git a/src/shared/gpt.h b/src/shared/gpt.h index 22b1d68d5f..f3a74813f0 100644 --- a/src/shared/gpt.h +++ b/src/shared/gpt.h @@ -133,3 +133,5 @@ bool gpt_partition_type_is_root(sd_id128_t id); bool gpt_partition_type_is_root_verity(sd_id128_t id); bool gpt_partition_type_is_usr(sd_id128_t id); bool gpt_partition_type_is_usr_verity(sd_id128_t id); + +bool gpt_partition_type_knows_read_only(sd_id128_t id); From b620bf332f575ba9b8e4cd60c93446a0c35c23e8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 19 Apr 2021 22:47:33 +0200 Subject: [PATCH 17/19] dissect: ext4 and loopback files are unimpressed by read-only access Even if we set up a loopback device read-only and mount it read-only this means nothing, ext4 will still write through to the backing storage file. Yes, I lost 6h debugging time on this. Apparently, we have to specify "norecovery" when mounting such file systems, to force them into truly read-only mode. Let's do so. 
--- src/shared/dissect-image.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 1624e9daa8..f6971038de 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -1463,6 +1463,27 @@ static int mount_partition( if (!strextend_with_separator(&options, ",", m->mount_options)) return -ENOMEM; + /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the + * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices) + * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses + * from the upper file system still get propagated through to the underlying file system, + * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify + * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to + * carry a per file system table here. + * + * Note that this means that we might not be able to mount corrupted file systems as read-only + * anymore (since in some cases the kernel implementations will refuse mounting when corrupted, + * read-only and "norecovery" is specified). But I think for the case of automatically determined + * mount options for loopback devices this is the right choice, since otherwise using the same + * loopback file twice even in read-only mode, is going to fail badly sooner or later. The usecase of + * making reuse of the immutable images "just work" is more relevant to us than having read-only + * access that actually modifies stuff work on such image files. Or to say this differently: if + * people want their file systems to be fixed up they should just open them in writable mode, where + * all these problems don't exist. 
*/ + if (!rw && STRPTR_IN_SET(fstype, "ext3", "ext4", "xfs", "btrfs")) + if (!strextend_with_separator(&options, ",", "norecovery")) + return -ENOMEM; + r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options); if (r < 0) return r; From 5a3b86404a65d1f47da2d387fd84cebd1b3418a2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 19 Apr 2021 17:30:33 +0200 Subject: [PATCH 18/19] test: add test for new repart features --- test/TEST-58-REPART/Makefile | 1 + test/TEST-58-REPART/test.sh | 6 +++ test/units/testsuite-58.service | 6 +++ test/units/testsuite-58.sh | 72 +++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 120000 test/TEST-58-REPART/Makefile create mode 100755 test/TEST-58-REPART/test.sh create mode 100644 test/units/testsuite-58.service create mode 100755 test/units/testsuite-58.sh diff --git a/test/TEST-58-REPART/Makefile b/test/TEST-58-REPART/Makefile new file mode 120000 index 0000000000..e9f93b1104 --- /dev/null +++ b/test/TEST-58-REPART/Makefile @@ -0,0 +1 @@ +../TEST-01-BASIC/Makefile \ No newline at end of file diff --git a/test/TEST-58-REPART/test.sh b/test/TEST-58-REPART/test.sh new file mode 100755 index 0000000000..d94a9cbfdc --- /dev/null +++ b/test/TEST-58-REPART/test.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -e +TEST_DESCRIPTION="test systemd-repart" +. 
$TEST_BASE_DIR/test-functions + +do_test "$@" 56 diff --git a/test/units/testsuite-58.service b/test/units/testsuite-58.service new file mode 100644 index 0000000000..d8ad589ca0 --- /dev/null +++ b/test/units/testsuite-58.service @@ -0,0 +1,6 @@ +[Unit] +Description=TEST-56-EXIT-TYPE + +[Service] +ExecStart=/usr/lib/systemd/tests/testdata/units/%N.sh +Type=oneshot diff --git a/test/units/testsuite-58.sh b/test/units/testsuite-58.sh new file mode 100755 index 0000000000..772f1b78d4 --- /dev/null +++ b/test/units/testsuite-58.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -eux + +export SYSTEMD_LOG_LEVEL=debug +export PAGER=cat + +mkdir -p /tmp/testsuite-58-defs/ + +# First part: create a disk image and verify its in order + +cat > /tmp/testsuite-58-defs/esp.conf < /tmp/testsuite-58-defs/usr.conf < /tmp/testsuite-58-defs/root.conf < /tmp/testsuite-58.dump + +grep -qxF '/var/tmp/testsuite-58.img1 : start= 2048, size= 20480, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, uuid=39107B09-615D-48FB-BA37-C663885FCE67, name="esp"' /tmp/testsuite-58.dump +grep -qxF '/var/tmp/testsuite-58.img2 : start= 22528, size= 20480, type=4F68BCE3-E8CD-4DB1-96E7-FBCAF984B709, uuid=60F33797-1D71-4DCB-AA6F-20564F036CD0, name="root-x86-64"' /tmp/testsuite-58.dump +grep -qxF '/var/tmp/testsuite-58.img3 : start= 43008, size= 20480, type=8484680C-9521-48C6-9C11-B0720656F69E, uuid=7E3369DD-D653-4513-ADF5-B993A9F20C16, name="usr-x86-64", attrs="GUID:60"' /tmp/testsuite-58.dump + +# Second part, duplicate it with CopyBlocks=auto + +cat > /tmp/testsuite-58-defs/esp.conf < /tmp/testsuite-58-defs/usr.conf < /tmp/testsuite-58-defs/root.conf </testok + +exit 0 From 7cc3966693f753fe743314f4a5915d4c5bb210d3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 23 Mar 2021 19:00:32 +0100 Subject: [PATCH 19/19] update TODO --- TODO | 2 -- 1 file changed, 2 deletions(-) diff --git a/TODO b/TODO index 2a8d46d0ab..b05e61552c 100644 --- a/TODO +++ b/TODO @@ -400,8 +400,6 @@ Features: * systemd-repart: allow 
sizing partitions as factor of available RAM, so that we can reasonably size swap partitions for hibernation. -* systemd-repart: allow managing the gpt read-only partition flag + auto-mount flag - * systemd-repart: allow boolean option that ensures that if existing partition doesn't exist within the configured size bounds the whole command fails. This is useful to implement ESP vs. XBOOTLDR schemes in installers: have one set