diff --git a/TODO b/TODO index 2072f1e2da..4cf7eec7cd 100644 --- a/TODO +++ b/TODO @@ -35,6 +35,10 @@ Features: the quota of a the user indicated in User= via unit file settings, like the other resource management concepts. Would mix nicely with DynamicUser=1 +* add dissect_image_warn() as a wrapper around dissect_image() that prints + friendly log messages for the returned errors, so that we don't have to + duplicate that in nspawn, systemd-dissect and PID 1. + * maybe set a new set of env vars for services, based on RuntimeDirectory=, StateDirectory=, LogsDirectory=, CacheDirectory= and ConfigurationDirectory= automatically. For example, there could be $RUNTIME_DIRECTORY, diff --git a/src/basic/btrfs-util.c b/src/basic/btrfs-util.c index c2061addd1..783ff13a36 100644 --- a/src/basic/btrfs-util.c +++ b/src/basic/btrfs-util.c @@ -1212,7 +1212,7 @@ static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol if (!S_ISDIR(st.st_mode)) return -EINVAL; - subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (subvol_fd < 0) return -errno; @@ -1292,7 +1292,7 @@ static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol * hence we need to open the * containing directory first */ - child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (child_fd < 0) return -errno; @@ -1641,7 +1641,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum if (!c) return -ENOMEM; - old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (old_child_fd < 0) return -errno; @@ -1649,7 +1649,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum if (!np) return -ENOMEM; - new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (new_child_fd < 0) return -errno; @@ -1660,7 +1660,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum * into place. */ if (subvolume_fd < 0) { - subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (subvolume_fd < 0) return -errno; } diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c index b90f343ed3..946f779a32 100644 --- a/src/basic/fs-util.c +++ b/src/basic/fs-util.c @@ -766,12 +766,11 @@ int chase_symlinks(const char *path, const char *original_root, unsigned flags, return -ENOMEM; } - } - - /* Prefix what's left to do with what we just read, and start the loop again, - * but remain in the current directory. */ - - joined = strjoin("/", destination, todo); + /* Prefix what's left to do with what we just read, and start the loop again, but + * remain in the current directory. */ + joined = strjoin(destination, todo); + } else + joined = strjoin("/", destination, todo); if (!joined) return -ENOMEM; diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 06564e94b1..e5264d09df 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -216,6 +216,10 @@ int main(int argc, char *argv[]) { log_error_errno(r, "No suitable root partition found in image %s.", arg_image); goto finish; } + if (r == -EPROTONOSUPPORT) { + log_error_errno(r, "Device %s is loopback block device with partition scanning turned off, please turn it on.", arg_image); + goto finish; + } if (r < 0) { log_error_errno(r, "Failed to dissect image: %m"); goto finish; diff --git a/src/machine/image-dbus.c b/src/machine/image-dbus.c index 18e0e34896..e534470feb 100644 --- a/src/machine/image-dbus.c +++ b/src/machine/image-dbus.c @@ -431,6 +431,7 @@ int bus_image_method_get_os_release( break; case IMAGE_RAW: + case IMAGE_BLOCK: r = raw_image_get_os_release(image, &v, error); break; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4e3803be82..cbe98a3ff8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2089,7 +2089,7 @@ static int determine_names(void) { return -ENOENT; } - if (i->type == IMAGE_RAW) + if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) r = free_and_strdup(&arg_image, i->path); else r = free_and_strdup(&arg_directory, i->path); @@ -3883,6 +3883,10 @@ int main(int argc, char *argv[]) { log_error_errno(r, "--image= is not supported, compiled without blkid support."); goto finish; } + if (r == -EPROTONOSUPPORT) { + log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on."); + goto finish; + } if (r < 0) { log_error_errno(r, "Failed to dissect image: %m"); goto finish; diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 4b59f2ca7d..de9510df63 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -266,18 +266,34 @@ int dissect_image(int fd, const void *root_hash, size_t root_hash_size, DissectI return -EIO; } if (n < z + 1) { - unsigned j; + unsigned j = 0; /* The kernel has probed fewer partitions than blkid? Maybe the kernel prober is still running * or it got EBUSY because udev already opened the device. Let's reprobe the device, which is a * synchronous call that waits until probing is complete. */ - for (j = 0; j < 20; j++) { + for (;;) { + if (j++ > 20) + return -EBUSY; - r = ioctl(fd, BLKRRPART, 0); - if (r < 0) + if (ioctl(fd, BLKRRPART, 0) < 0) { r = -errno; - if (r >= 0 || r != -EBUSY) + + if (r == -EINVAL) { + struct loop_info64 info; + + /* If we are running on a loop device that has partition scanning off, + * return an explicit recognizable error about this, so that callers + * can generate a proper message explaining the situation. */ + + if (ioctl(fd, LOOP_GET_STATUS64, &info) >= 0 && (info.lo_flags & LO_FLAGS_PARTSCAN) == 0) { + log_debug("Device is loop device and partition scanning is off!"); + return -EPROTONOSUPPORT; + } + } + if (r != -EBUSY) + return r; + } else break; /* If something else has the device open, such as an udev rule, the ioctl will return @@ -286,11 +302,8 @@ int dissect_image(int fd, const void *root_hash, size_t root_hash_size, DissectI * * This is really something they should fix in the kernel! */ - usleep(50 * USEC_PER_MSEC); + (void) usleep(50 * USEC_PER_MSEC); } - - if (r < 0) - return r; } e = udev_enumerate_unref(e); diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index 859e5ffc1a..24209fa35d 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -171,9 +171,8 @@ static int image_make( assert(filename); - /* We explicitly *do* follow symlinks here, since we want to - * allow symlinking trees into /var/lib/machines/, and treat - * them normally. */ + /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block + * devices into /var/lib/machines/, and treat them normally. */ if (fstatat(dfd, filename, &st, 0) < 0) return -errno; @@ -286,6 +285,58 @@ static int image_make( (*ret)->limit = (*ret)->limit_exclusive = st.st_size; return 1; + + } else if (S_ISBLK(st.st_mode)) { + _cleanup_close_ int block_fd = -1; + uint64_t size = UINT64_MAX; + + /* A block device */ + + if (!ret) + return 1; + + if (!pretty) + pretty = filename; + + block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (block_fd < 0) + log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path, filename); + else { + if (fstat(block_fd, &st) < 0) + return -errno; + if (!S_ISBLK(st.st_mode)) /* Verify that what we opened is actually what we think it is */ + return -ENOTTY; + + if (!read_only) { + int state = 0; + + if (ioctl(block_fd, BLKROGET, &state) < 0) + log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path, filename); + else if (state) + read_only = true; + } + + if (ioctl(block_fd, BLKGETSIZE64, &size) < 0) + log_debug_errno(errno, "Failed to issue BLKFLSBUF on device %s/%s, ignoring: %m", path, filename); + + block_fd = safe_close(block_fd); + } + + r = image_new(IMAGE_BLOCK, + pretty, + path, + filename, + !(st.st_mode & 0222) || read_only, + 0, + 0, + ret); + if (r < 0) + return r; + + if (size != 0 && size != UINT64_MAX) + (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size; + + return 1; } return 0; @@ -432,9 +483,15 @@ int image_remove(Image *i) { switch (i->type) { case IMAGE_SUBVOLUME: - r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); - if (r < 0) - return r; + + /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the + * big guns */ + if (unlink(i->path) < 0) { + r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + if (r < 0) + return r; + } + break; case IMAGE_DIRECTORY: @@ -446,6 +503,17 @@ int image_remove(Image *i) { break; + case IMAGE_BLOCK: + + /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node + * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink + * the thing (it's most likely a symlink after all). */ + + if (path_startswith(i->path, "/dev")) + break; + + /* fallthrough */ + case IMAGE_RAW: if (unlink(i->path) < 0) return -errno; @@ -536,6 +604,15 @@ int image_rename(Image *i, const char *new_name) { new_path = file_in_same_dir(i->path, new_name); break; + case IMAGE_BLOCK: + + /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */ + if (path_startswith(i->path, "/dev")) + return -EROFS; + + new_path = file_in_same_dir(i->path, new_name); + break; + case IMAGE_RAW: { const char *fn; @@ -659,6 +736,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) { r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK); break; + case IMAGE_BLOCK: default: return -EOPNOTSUPP; } @@ -737,6 +815,26 @@ int image_read_only(Image *i, bool b) { break; } + case IMAGE_BLOCK: { + _cleanup_close_ int fd = -1; + struct stat st; + int state = b; + + fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + if (!S_ISBLK(st.st_mode)) + return -ENOTTY; + + if (ioctl(fd, BLKROSET, &state) < 0) + return -errno; + + break; + } + default: return -EOPNOTSUPP; } @@ -772,13 +870,24 @@ int image_path_lock(const char *path, int operation, LockFile *global, LockFile return -EBUSY; if (stat(path, &st) >= 0) { - if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0) + if (S_ISBLK(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev)); + else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino); + else + return -ENOTTY; + + if (r < 0) return -ENOMEM; } - r = make_lock_file_for(path, operation, &t); - if (r < 0) - return r; + /* For block devices we don't need the "local" lock, as the major/minor lock above should be sufficient, since + * block devices are device local anyway. */ + if (!path_startswith(path, "/dev")) { + r = make_lock_file_for(path, operation, &t); + if (r < 0) + return r; + } if (p) { mkdir_p("/run/systemd/nspawn/locks", 0700); @@ -860,6 +969,7 @@ static const char* const image_type_table[_IMAGE_TYPE_MAX] = { [IMAGE_DIRECTORY] = "directory", [IMAGE_SUBVOLUME] = "subvolume", [IMAGE_RAW] = "raw", + [IMAGE_BLOCK] = "block", }; DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType); diff --git a/src/shared/machine-image.h b/src/shared/machine-image.h index 7410168c4f..50d89e4392 100644 --- a/src/shared/machine-image.h +++ b/src/shared/machine-image.h @@ -33,6 +33,7 @@ typedef enum ImageType { IMAGE_DIRECTORY, IMAGE_SUBVOLUME, IMAGE_RAW, + IMAGE_BLOCK, _IMAGE_TYPE_MAX, _IMAGE_TYPE_INVALID = -1 } ImageType;