diff --git a/src/core/namespace.c b/src/core/namespace.c index d774467658..831dcaa81e 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -2058,16 +2058,11 @@ int setup_namespace( root_image, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &loop_device); if (r < 0) return log_debug_errno(r, "Failed to create loop device for root image: %m"); - /* Make sure udevd won't issue BLKRRPART (which might flush out the loaded partition table) - * while we are still trying to mount things */ - r = loop_device_flock(loop_device, LOCK_SH); - if (r < 0) - return log_debug_errno(r, "Failed to lock loopback device with LOCK_SH: %m"); - r = dissect_image( loop_device->fd, &verity, @@ -2432,7 +2427,6 @@ int setup_namespace( } } - dissected_image_relinquish(dissected_image); loop_device_relinquish(loop_device); } else if (root_directory) { diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 17291cf22f..f0094a390f 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -685,7 +685,6 @@ static int action_mount(DissectedImage *m, LoopDevice *d) { return log_error_errno(r, "Failed to relinquish DM devices: %m"); } - dissected_image_relinquish(m); loop_device_relinquish(d); return 0; } @@ -738,7 +737,6 @@ static int action_copy(DissectedImage *m, LoopDevice *d) { return log_error_errno(r, "Failed to relinquish DM devices: %m"); } - dissected_image_relinquish(m); loop_device_relinquish(d); if (arg_action == ACTION_COPY_FROM) { @@ -864,7 +862,7 @@ static int action_umount(const char *path) { _cleanup_free_ char *canonical = NULL, *devname = NULL; _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; dev_t devno; - int r, k; + int r; fd = chase_symlinks_and_open(path, NULL, 0, O_DIRECTORY, &canonical); if (fd == -ENOTDIR) @@ -887,14 +885,10 @@ static int action_umount(const char *path) { return log_error_errno(r, "Failed to get devname of block device " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(devno)); - r = loop_device_open(devname, 0, &d); + r = loop_device_open(devname, 0, LOCK_EX, &d); if (r < 0) return log_error_errno(r, "Failed to open loop device '%s': %m", devname); - r = loop_device_flock(d, LOCK_EX); - if (r < 0) - return log_error_errno(r, "Failed to lock loop device '%s': %m", devname); - /* We've locked the loop device, now we're ready to unmount. To allow the unmount to succeed, we have * to close the O_PATH fd we opened earlier. */ fd = safe_close(fd); @@ -907,25 +901,12 @@ static int action_umount(const char *path) { loop_device_unrelinquish(d); if (arg_rmdir) { - k = RET_NERRNO(rmdir(canonical)); - if (k < 0) - log_error_errno(k, "Failed to remove mount directory '%s': %m", canonical); - } else - k = 0; - - /* Before loop_device_unrefp() kicks in, let's explicitly remove all the partition subdevices of the - * loop device. We do this to ensure that all traces of the loop device are gone by the time this - * command exits. */ - r = block_device_remove_all_partitions(d->fd); - if (r == -EBUSY) { - log_error_errno(r, "One or more partitions of '%s' are busy, ignoring", devname); - r = 0; + r = RET_NERRNO(rmdir(canonical)); + if (r < 0) + return log_error_errno(r, "Failed to remove mount directory '%s': %m", canonical); } - if (r < 0) - log_error_errno(r, "Failed to remove one or more partitions of '%s': %m", devname); - - return k < 0 ? k : r; + return 0; } static int run(int argc, char *argv[]) { @@ -959,16 +940,11 @@ static int run(int argc, char *argv[]) { arg_image, FLAGS_SET(arg_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR, FLAGS_SET(arg_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &d); if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image); - /* Make sure udevd doesn't issue BLKRRPART underneath us thus making devices disappear in the middle, - * that we assume already are there. */ - r = loop_device_flock(d, LOCK_SH); - if (r < 0) - return log_error_errno(r, "Failed to lock loopback device: %m"); - r = dissect_image_and_warn( d->fd, arg_image, diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index a95f384ecb..bd16ae333c 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -772,8 +772,6 @@ static int enumerate_partitions(dev_t devnum) { r = k; } - dissected_image_relinquish(m); - return r; } diff --git a/src/home/homework-luks.c b/src/home/homework-luks.c index 68a5ac9c2e..eea282fe12 100644 --- a/src/home/homework-luks.c +++ b/src/home/homework-luks.c @@ -1284,7 +1284,7 @@ int home_setup_luks( return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine backing device for DM %s.", setup->dm_name); if (!setup->loop) { - r = loop_device_open(n, O_RDWR, &setup->loop); + r = loop_device_open(n, O_RDWR, LOCK_UN, &setup->loop); if (r < 0) return log_error_errno(r, "Failed to open loopback device %s: %m", n); } @@ -1378,7 +1378,7 @@ int home_setup_luks( return r; } - r = loop_device_make(setup->image_fd, O_RDWR, offset, size, 0, &setup->loop); + r = loop_device_make(setup->image_fd, O_RDWR, offset, size, 0, LOCK_UN, &setup->loop); if (r == -ENOENT) { log_error_errno(r, "Loopback block device support is not available on this system."); return -ENOLINK; /* make recognizable */ @@ -2299,7 +2299,7 @@ int home_create_luks( log_info("Writing of partition table completed."); - r = loop_device_make(setup->image_fd, O_RDWR, partition_offset, partition_size, 0, &setup->loop); + r = loop_device_make(setup->image_fd, O_RDWR, partition_offset, partition_size, 0, LOCK_EX, &setup->loop); if (r < 0) { if (r == -ENOENT) { /* this means /dev/loop-control doesn't exist, i.e. we are in a container * or similar and loopback bock devices are not available, return a @@ -2311,10 +2311,6 @@ int home_create_luks( return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path); } - r = loop_device_flock(setup->loop, LOCK_EX); /* make sure udev won't read before we are done */ - if (r < 0) - return log_error_errno(r, "Failed to take lock on loop device: %m"); - log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip); r = luks_format(setup->loop->node, diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 75af4c58f9..45e138d48a 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5746,19 +5746,13 @@ static int run(int argc, char *argv[]) { arg_image, arg_read_only ? O_RDONLY : O_RDWR, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &loop); if (r < 0) { log_error_errno(r, "Failed to set up loopback block device: %m"); goto finish; } - /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */ - r = loop_device_flock(loop, LOCK_SH); - if (r < 0) { - log_error_errno(r, "Failed to take lock on loopback block device: %m"); - goto finish; - } - r = dissect_image_and_warn( loop->fd, arg_image, diff --git a/src/partition/repart.c b/src/partition/repart.c index 815e81052b..bb95c0aab5 100644 --- a/src/partition/repart.c +++ b/src/partition/repart.c @@ -2789,14 +2789,10 @@ static int context_copy_blocks(Context *context) { assert_se((whole_fd = fdisk_get_devfd(context->fdisk_context)) >= 0); if (p->encrypt != ENCRYPT_OFF) { - r = loop_device_make(whole_fd, O_RDWR, p->offset, p->new_size, 0, &d); + r = loop_device_make(whole_fd, O_RDWR, p->offset, p->new_size, 0, LOCK_EX, &d); if (r < 0) return log_error_errno(r, "Failed to make loopback device of future partition %" PRIu64 ": %m", p->partno); - r = loop_device_flock(d, LOCK_EX); - if (r < 0) - return log_error_errno(r, "Failed to lock loopback device: %m"); - r = partition_encrypt(context, p, d->node, &cd, &encrypted, &encrypted_dev_fd); if (r < 0) return log_error_errno(r, "Failed to encrypt device: %m"); @@ -3038,14 +3034,10 @@ static int context_mkfs(Context *context) { /* Loopback block devices are not only useful to turn regular files into block devices, but * also to cut out sections of block devices into new block devices. */ - r = loop_device_make(fd, O_RDWR, p->offset, p->new_size, 0, &d); + r = loop_device_make(fd, O_RDWR, p->offset, p->new_size, 0, LOCK_EX, &d); if (r < 0) return log_error_errno(r, "Failed to make loopback device of future partition %" PRIu64 ": %m", p->partno); - r = loop_device_flock(d, LOCK_EX); - if (r < 0) - return log_error_errno(r, "Failed to lock loopback device: %m"); - if (p->encrypt != ENCRYPT_OFF) { r = partition_encrypt(context, p, d->node, &cd, &encrypted, &encrypted_dev_fd); if (r < 0) diff --git a/src/portable/portable.c b/src/portable/portable.c index 256362355c..0f6c71042e 100644 --- a/src/portable/portable.c +++ b/src/portable/portable.c @@ -333,7 +333,7 @@ static int portable_extract_by_path( assert(path); - r = loop_device_make_by_path(path, O_RDONLY, LO_FLAGS_PARTSCAN, &d); + r = loop_device_make_by_path(path, O_RDONLY, LO_FLAGS_PARTSCAN, LOCK_SH, &d); if (r == -EISDIR) { _cleanup_free_ char *image_name = NULL; @@ -359,10 +359,6 @@ static int portable_extract_by_path( /* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it * there, and extract the metadata we need. The metadata is sent from the child back to us. */ - r = loop_device_flock(d, LOCK_SH); - if (r < 0) - return log_debug_errno(r, "Failed to acquire lock on loopback block device: %m"); - BLOCK_SIGNALS(SIGCHLD); r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir); diff --git a/src/shared/discover-image.c b/src/shared/discover-image.c index ea0afb81a5..3733cf06b2 100644 --- a/src/shared/discover-image.c +++ b/src/shared/discover-image.c @@ -1192,13 +1192,7 @@ int image_read_metadata(Image *i) { _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; - r = loop_device_make_by_path(i->path, O_RDONLY, LO_FLAGS_PARTSCAN, &d); - if (r < 0) - return r; - - /* Make sure udevd doesn't issue BLKRRPART in the background which might make our partitions - * disappear temporarily. */ - r = loop_device_flock(d, LOCK_SH); + r = loop_device_make_by_path(i->path, O_RDONLY, LO_FLAGS_PARTSCAN, LOCK_SH, &d); if (r < 0) return r; diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 4782330a0c..6b3a18c011 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -150,20 +150,9 @@ static void check_partition_flags( } #endif -static void dissected_partition_done(int fd, DissectedPartition *p) { - assert(fd >= 0); +static void dissected_partition_done(DissectedPartition *p) { assert(p); -#if HAVE_BLKID - if (p->node && p->partno > 0 && !p->relinquished) { - int r; - - r = block_device_remove_partition(fd, p->node, p->partno); - if (r < 0) - log_debug_errno(r, "BLKPG_DEL_PARTITION failed, ignoring: %m"); - } -#endif - free(p->fstype); free(p->node); free(p->label); @@ -312,14 +301,9 @@ int dissect_image( return -ENOMEM; *m = (DissectedImage) { - .fd = -1, .has_init_system = -1, }; - m->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); - if (m->fd < 0) - return -errno; - r = sd_device_get_sysname(d, &sysname); if (r < 0) return log_debug_errno(r, "Failed to get device sysname: %m"); @@ -775,14 +759,10 @@ int dissect_image( * scheme in OS images. */ if (!PARTITION_DESIGNATOR_VERSIONED(designator) || - strverscmp_improved(m->partitions[designator].label, label) >= 0) { - r = block_device_remove_partition(fd, node, nr); - if (r < 0) - log_debug_errno(r, "BLKPG_DEL_PARTITION failed, ignoring: %m"); + strverscmp_improved(m->partitions[designator].label, label) >= 0) continue; - } - dissected_partition_done(fd, m->partitions + designator); + dissected_partition_done(m->partitions + designator); } if (fstype) { @@ -852,12 +832,8 @@ int dissect_image( const char *sid, *options = NULL; /* First one wins */ - if (m->partitions[PARTITION_XBOOTLDR].found) { - r = block_device_remove_partition(fd, node, nr); - if (r < 0) - log_debug_errno(r, "BLKPG_DEL_PARTITION failed, ignoring: %m"); + if (m->partitions[PARTITION_XBOOTLDR].found) continue; - } sid = blkid_partition_get_uuid(pp); if (sid) @@ -1171,9 +1147,8 @@ DissectedImage* dissected_image_unref(DissectedImage *m) { return NULL; for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) - dissected_partition_done(m->fd, m->partitions + i); + dissected_partition_done(m->partitions + i); - safe_close(m->fd); free(m->image_name); free(m->hostname); strv_free(m->machine_info); @@ -1183,16 +1158,6 @@ DissectedImage* dissected_image_unref(DissectedImage *m) { return mfree(m); } -void dissected_image_relinquish(DissectedImage *m) { - assert(m); - - /* Partitions are automatically removed when the underlying loop device is closed. We just need to - * make sure we don't try to remove the partitions early. */ - - for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) - m->partitions[i].relinquished = true; -} - static int is_loop_device(const char *path) { char s[SYS_BLOCK_PATH_MAX("/../loop/")]; struct stat st; @@ -3001,15 +2966,11 @@ int mount_image_privately_interactively( image, FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR, FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &d); if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", image); - /* Make sure udevd doesn't issue BLKRRPART behind our backs */ - r = loop_device_flock(d, LOCK_SH); - if (r < 0) - return r; - r = dissect_image_and_warn(d->fd, image, &verity, NULL, d->diskseq, d->uevent_seqnum_not_before, d->timestamp_not_before, flags, &dissected_image); if (r < 0) return r; @@ -3046,7 +3007,6 @@ int mount_image_privately_interactively( return log_error_errno(r, "Failed to relinquish DM devices: %m"); } - dissected_image_relinquish(dissected_image); loop_device_relinquish(d); *ret_directory = TAKE_PTR(created_dir); @@ -3117,14 +3077,11 @@ int verity_dissect_and_mount( src_fd >= 0 ? FORMAT_PROC_FD_PATH(src_fd) : src, -1, verity.data_path ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &loop_device); if (r < 0) return log_debug_errno(r, "Failed to create loop device for image: %m"); - r = loop_device_flock(loop_device, LOCK_SH); - if (r < 0) - return log_debug_errno(r, "Failed to lock loop device: %m"); - r = dissect_image( loop_device->fd, &verity, @@ -3209,7 +3166,6 @@ int verity_dissect_and_mount( return log_debug_errno(r, "Failed to relinquish decrypted image: %m"); } - dissected_image_relinquish(dissected_image); loop_device_relinquish(loop_device); return 0; diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h index 5230189c16..55bb8a1dad 100644 --- a/src/shared/dissect-image.h +++ b/src/shared/dissect-image.h @@ -31,7 +31,6 @@ struct DissectedPartition { char *mount_options; uint64_t size; uint64_t offset; - bool relinquished; }; typedef enum PartitionDesignator { @@ -204,8 +203,6 @@ typedef enum DissectImageFlags { } DissectImageFlags; struct DissectedImage { - int fd; /* Backing fd */ - bool encrypted:1; bool has_verity:1; /* verity available in image, but not necessarily used */ bool has_verity_sig:1; /* pkcs#7 signature embedded in image */ @@ -261,7 +258,6 @@ int dissect_image_and_warn(int fd, const char *name, const VeritySettings *verit DissectedImage* dissected_image_unref(DissectedImage *m); DEFINE_TRIVIAL_CLEANUP_FUNC(DissectedImage*, dissected_image_unref); -void dissected_image_relinquish(DissectedImage *m); int dissected_image_decrypt(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags, DecryptedImage **ret); int dissected_image_decrypt_interactively(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags, DecryptedImage **ret); diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c index def2fa89fb..32941c7fa1 100644 --- a/src/shared/loop-util.c +++ b/src/shared/loop-util.c @@ -117,13 +117,28 @@ static int device_has_block_children(sd_device *d) { return !!sd_device_enumerator_get_device_first(e); } +static int open_lock_fd(int primary_fd, int operation) { + int lock_fd; + + assert(primary_fd >= 0); + + lock_fd = fd_reopen(primary_fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (lock_fd < 0) + return lock_fd; + if (flock(lock_fd, operation) < 0) + return -errno; + + return lock_fd; +} + static int loop_configure( int fd, int nr, const struct loop_config *c, bool *try_loop_configure, uint64_t *ret_seqnum_not_before, - usec_t *ret_timestamp_not_before) { + usec_t *ret_timestamp_not_before, + int *ret_lock_fd) { _cleanup_(sd_device_unrefp) sd_device *d = NULL; _cleanup_free_ char *sysname = NULL; @@ -152,18 +167,15 @@ static int loop_configure( * long time udev would possibly never run on it again, even though the fd is unlocked, simply * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to * automatically release the lock, after we are done. */ - lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + lock_fd = open_lock_fd(fd, LOCK_EX); if (lock_fd < 0) return lock_fd; - if (flock(lock_fd, LOCK_EX) < 0) - return -errno; /* Let's see if the device is really detached, i.e. currently has no associated partition block * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that - * superficially is detached but still has partition block devices associated for it. They only go - * away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially - * nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition - * block devices. */ + * superficially is detached but still has partition block devices associated for it. Let's then + * manually remove the partitions via BLKPG, and tell the caller we did that via EUCLEAN, so they try + * again. */ r = device_has_block_children(d); if (r < 0) return r; @@ -174,8 +186,14 @@ static int loop_configure( if (r > 0) return -EBUSY; - return -EUCLEAN; /* Bound but children? Tell caller to reattach something so that the - * partition block devices are gone too. */ + /* Unbound but has children? Remove all partitions, and report this to the caller, to try + * again, and count this as an attempt. */ + + r = block_device_remove_all_partitions(fd); + if (r < 0) + return r; + + return -EUCLEAN; } if (*try_loop_configure) { @@ -247,12 +265,7 @@ static int loop_configure( goto fail; } - if (ret_seqnum_not_before) - *ret_seqnum_not_before = seqnum; - if (ret_timestamp_not_before) - *ret_timestamp_not_before = timestamp; - - return 0; + goto success; } } @@ -317,65 +330,34 @@ static int loop_configure( log_debug_errno(errno, "Failed to enable direct IO mode on loopback device /dev/loop%i, ignoring: %m", nr); } +success: if (ret_seqnum_not_before) *ret_seqnum_not_before = seqnum; if (ret_timestamp_not_before) *ret_timestamp_not_before = timestamp; + if (ret_lock_fd) + *ret_lock_fd = TAKE_FD(lock_fd); return 0; fail: + /* Close the lock fd explicitly before clearing the loopback block device, since an additional open + * fd would block the clearing to succeed */ + lock_fd = safe_close(lock_fd); (void) ioctl(fd, LOOP_CLR_FD); return r; } -static int attach_empty_file(int loop, int nr) { - _cleanup_close_ int fd = -1; - - /* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state - * where they are detached but nonetheless have partitions, when used heavily. Accessing these - * partitions results in immediatey IO errors. There's no pretty way to get rid of them - * again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to - * reassociate them with a new fd however. This is what we do here hence: we associate the devices - * with an empty file (i.e. an image that definitely has no partitions). We then immediately clear it - * again. This suffices to make the partitions go away. Ugly but appears to work. */ - - log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr); - - fd = open_tmpfile_unlinkable(NULL, O_RDONLY); - if (fd < 0) - return fd; - - if (flock(loop, LOCK_EX) < 0) - return -errno; - - if (ioctl(loop, LOOP_SET_FD, fd) < 0) - return -errno; - - if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) { - .lo_flags = LO_FLAGS_READ_ONLY| - LO_FLAGS_AUTOCLEAR| - LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */ - }) < 0) - return -errno; - - if (ioctl(loop, LOOP_CLR_FD) < 0) - return -errno; - - /* The caller is expected to immediately close the loopback device after this, so that the BSD lock - * is released, and udev sees the changes. */ - return 0; -} - static int loop_device_make_internal( int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t loop_flags, + int lock_op, LoopDevice **ret) { - _cleanup_close_ int direct_io_fd = -1; + _cleanup_close_ int direct_io_fd = -1, lock_fd = -1; _cleanup_free_ char *loopdev = NULL; bool try_loop_configure = true; struct loop_config config; @@ -393,6 +375,12 @@ static int loop_device_make_internal( return -errno; if (S_ISBLK(st.st_mode)) { + if (lock_op != LOCK_UN) { + lock_fd = open_lock_fd(fd, lock_op); + if (lock_fd < 0) + return lock_fd; + } + if (ioctl(fd, LOOP_GET_STATUS64, &config.info) >= 0) { /* Oh! This is a loopback device? That's interesting! */ @@ -430,6 +418,7 @@ static int loop_device_make_internal( return -ENOMEM; *d = (LoopDevice) { .fd = TAKE_FD(copy), + .lock_fd = TAKE_FD(lock_fd), .nr = nr, .node = TAKE_PTR(loopdev), .relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */ @@ -501,7 +490,10 @@ static int loop_device_make_internal( * unnecessary busywork less likely. Note that this is just something we do to optimize our * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not * necessary, it just means it's less likely we have to iterate through this loop again and - * again if our own code races against our own code. */ + * again if our own code races against our own code. + * + * Note: our lock protocol is to take the /dev/loop-control lock first, and the block device + * lock second, if both are taken, and always in this order, to avoid ABBA locking issues. */ if (flock(control, LOCK_EX) < 0) return -errno; @@ -519,17 +511,13 @@ static int loop_device_make_internal( if (!ERRNO_IS_DEVICE_ABSENT(errno)) return -errno; } else { - r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, ×tamp); + r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, ×tamp, &lock_fd); if (r >= 0) { loop_with_fd = TAKE_FD(loop); break; } - if (r == -EUCLEAN) { - /* Make left-over partition disappear hack (see above) */ - r = attach_empty_file(loop, nr); - if (r < 0 && r != -EBUSY) - return r; - } else if (r != -EBUSY) + if (!IN_SET(r, -EBUSY, -EUCLEAN)) /* Busy, or some left-over partition devices that + * were cleaned up. */ return r; } @@ -583,11 +571,26 @@ static int loop_device_make_internal( if (r < 0 && r != -EOPNOTSUPP) return r; + switch (lock_op & ~LOCK_NB) { + case LOCK_EX: /* Already in effect */ + break; + case LOCK_SH: /* Downgrade */ + if (flock(lock_fd, lock_op) < 0) + return -errno; + break; + case LOCK_UN: /* Release */ + lock_fd = safe_close(lock_fd); + break; + default: + assert_not_reached(); + } + d = new(LoopDevice, 1); if (!d) return -ENOMEM; *d = (LoopDevice) { .fd = TAKE_FD(loop_with_fd), + .lock_fd = TAKE_FD(lock_fd), .node = TAKE_PTR(loopdev), .nr = nr, .devno = st.st_rdev, @@ -622,6 +625,7 @@ int loop_device_make( uint64_t offset, uint64_t size, uint32_t loop_flags, + int lock_op, LoopDevice **ret) { assert(fd >= 0); @@ -633,6 +637,7 @@ int loop_device_make( offset, size, loop_flags_mangle(loop_flags), + lock_op, ret); } @@ -640,6 +645,7 @@ int loop_device_make_by_path( const char *path, int open_flags, uint32_t loop_flags, + int lock_op, LoopDevice **ret) { int r, basic_flags, direct_flags, rdwr_flags; @@ -693,46 +699,74 @@ int loop_device_make_by_path( direct ? "enabled" : "disabled", direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : ""); - return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, ret); + return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, lock_op, ret); } LoopDevice* loop_device_unref(LoopDevice *d) { + _cleanup_close_ int control = -1; + int r; + if (!d) return NULL; + /* Release any lock we might have on the device first. We want to open+lock the /dev/loop-control + * device below, but our lock protocol says that if both control and block device locks are taken, + * the control lock needs to be taken first, the block device lock second — in order to avoid ABBA + * locking issues. Moreover, we want to issue LOOP_CLR_FD on the block device further down, and that + * would fail if we had another fd open to the device. */ + d->lock_fd = safe_close(d->lock_fd); + + /* Let's open the control device early, and lock it, so that we can release our block device and + * delete it in a synchronized fashion, and allocators won't needlessly see the block device as free + * while we are about to delete it. */ + if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { + control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (control < 0) + log_debug_errno(errno, "Failed to open loop control device, cannot remove loop device '%s', ignoring: %m", strna(d->node)); + else if (flock(control, LOCK_EX) < 0) + log_debug_errno(errno, "Failed to lock loop control device, ignoring: %m"); + } + + /* Then let's release the loopback block device */ if (d->fd >= 0) { /* Implicitly sync the device, since otherwise in-flight blocks might not get written */ if (fsync(d->fd) < 0) log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m"); - if (d->nr >= 0 && !d->relinquished) { - if (ioctl(d->fd, LOOP_CLR_FD) < 0) - log_debug_errno(errno, "Failed to clear loop device: %m"); + if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { + /* We are supposed to clear the loopback device. Let's do this synchronously: lock + * the device, manually remove all partitions and then clear it. This should ensure + * udev doesn't concurrently access the devices, and we can be reasonably sure that + * once we are done here the device is cleared and all its partition children + * removed. Note that we lock our primary device fd here (and not a separate locking + * fd, as we do during allocation, since we want to keep the lock all the way through + * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) */ + if (flock(d->fd, LOCK_EX) < 0) + log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m"); + + r = block_device_remove_all_partitions(d->fd); + if (r < 0) + log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m"); + + if (ioctl(d->fd, LOOP_CLR_FD) < 0) + log_debug_errno(errno, "Failed to clear loop device, ignoring: %m"); } safe_close(d->fd); } - if (d->nr >= 0 && !d->relinquished) { - _cleanup_close_ int control = -1; - - control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); - if (control < 0) - log_warning_errno(errno, - "Failed to open loop control device, cannot remove loop device %s: %m", - strna(d->node)); - else - for (unsigned n_attempts = 0;;) { - if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0) - break; - if (errno != EBUSY || ++n_attempts >= 64) { - log_warning_errno(errno, "Failed to remove device %s: %m", strna(d->node)); - break; - } - (void) usleep(50 * USEC_PER_MSEC); + /* Now that the block device is released, let's also try to remove it */ + if (control >= 0) + for (unsigned n_attempts = 0;;) { + if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0) + break; + if (errno != EBUSY || ++n_attempts >= 64) { + log_debug_errno(errno, "Failed to remove device %s: %m", strna(d->node)); + break; } - } + (void) usleep(50 * USEC_PER_MSEC); + } free(d->node); return mfree(d); @@ -752,8 +786,13 @@ void loop_device_unrelinquish(LoopDevice *d) { d->relinquished = false; } -int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) { - _cleanup_close_ int loop_fd = -1; +int loop_device_open( + const char *loop_path, + int open_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_close_ int loop_fd = -1, lock_fd = -1; _cleanup_free_ char *p = NULL; struct loop_info64 info; struct stat st; @@ -782,6 +821,12 @@ int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) { } else nr = -1; + if ((lock_op & ~LOCK_NB) != LOCK_UN) { + lock_fd = open_lock_fd(loop_fd, lock_op); + if (lock_fd < 0) + return lock_fd; + } + p = strdup(loop_path); if (!p) return -ENOMEM; @@ -792,6 +837,7 @@ int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) { *d = (LoopDevice) { .fd = TAKE_FD(loop_fd), + .lock_fd = TAKE_FD(lock_fd), .nr = nr, .node = TAKE_PTR(p), .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */ @@ -878,7 +924,9 @@ static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) { int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { struct loop_info64 info; + assert(d); + assert(d->fd >= 0); /* Changes the offset/start of the loop device relative to the beginning of the underlying file or * block device. If this loop device actually refers to a partition and not a loopback device, we'll @@ -886,9 +934,6 @@ int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { * * If either offset or size is UINT64_MAX we won't change that parameter. */ - if (d->fd < 0) - return -EBADF; - if (d->nr < 0) /* not a loopback device */ return resize_partition(d->fd, offset, size); @@ -914,22 +959,36 @@ int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { } int loop_device_flock(LoopDevice *d, int operation) { + assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX)); assert(d); - if (d->fd < 0) - return -EBADF; + /* When unlocking just close the lock fd */ + if ((operation & ~LOCK_NB) == LOCK_UN) { + d->lock_fd = safe_close(d->lock_fd); + return 0; + } - return RET_NERRNO(flock(d->fd, operation)); + /* If we had no lock fd so far, create one and lock it right-away */ + if (d->lock_fd < 0) { + assert(d->fd >= 0); + + d->lock_fd = open_lock_fd(d->fd, operation); + if (d->lock_fd < 0) + return d->lock_fd; + + return 0; + } + + /* Otherwise change the current lock mode on the existing fd */ + return RET_NERRNO(flock(d->lock_fd, operation)); } int loop_device_sync(LoopDevice *d) { assert(d); + assert(d->fd >= 0); /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that * we can check the return value though. */ - if (d->fd < 0) - return -EBADF; - return RET_NERRNO(fsync(d->fd)); } diff --git a/src/shared/loop-util.h b/src/shared/loop-util.h index a33d7e3e59..bdbdac60eb 100644 --- a/src/shared/loop-util.h +++ b/src/shared/loop-util.h @@ -10,7 +10,8 @@ typedef struct LoopDevice LoopDevice; struct LoopDevice { int fd; - int nr; + int lock_fd; + int nr; /* The loopback device index (i.e. 4 for /dev/loop4); if this object encapsulates a non-loopback block device, set to -1 */ dev_t devno; char *node; bool relinquished; @@ -19,9 +20,12 @@ struct LoopDevice { usec_t timestamp_not_before; /* CLOCK_MONOTONIC timestamp taken immediately before attaching the loopback device, or USEC_INFINITY if we don't know */ }; -int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t loop_flags, LoopDevice **ret); -int loop_device_make_by_path(const char *path, int open_flags, uint32_t loop_flags, LoopDevice **ret); -int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret); +/* Returns true if LoopDevice object is not actually a loopback device but some other block device we just wrap */ +#define LOOP_DEVICE_IS_FOREIGN(d) ((d)->nr < 0) + +int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t loop_flags, int lock_op, LoopDevice **ret); +int loop_device_make_by_path(const char *path, int open_flags, uint32_t loop_flags, int lock_op, LoopDevice **ret); +int loop_device_open(const char *loop_path, int open_flags, int lock_op, LoopDevice **ret); LoopDevice* loop_device_unref(LoopDevice *d); DEFINE_TRIVIAL_CLEANUP_FUNC(LoopDevice*, loop_device_unref); diff --git a/src/sysext/sysext.c b/src/sysext/sysext.c index a8593cec4d..6e533f335c 100644 --- a/src/sysext/sysext.c +++ b/src/sysext/sysext.c @@ -534,14 +534,11 @@ static int merge_subprocess(Hashmap *images, const char *workspace) { img->path, O_RDONLY, FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, &d); if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path); - r = loop_device_flock(d, LOCK_SH); - if (r < 0) - return log_error_errno(r, "Failed to lock loopback device: %m"); - r = dissect_image_and_warn( d->fd, img->path, @@ -585,7 +582,6 @@ static int merge_subprocess(Hashmap *images, const char *workspace) { return log_error_errno(r, "Failed to relinquish DM devices: %m"); } - dissected_image_relinquish(m); loop_device_relinquish(d); break; } diff --git a/src/test/test-loop-block.c b/src/test/test-loop-block.c index 9179ef5d60..fb5e0a2fc9 100644 --- a/src/test/test-loop-block.c +++ b/src/test/test-loop-block.c @@ -49,16 +49,13 @@ static void* thread_func(void *ptr) { assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); - r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop); + r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &loop); if (r < 0) log_error_errno(r, "Failed to allocate loopback device: %m"); assert_se(r >= 0); log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted); - /* Let's make sure udev doesn't call BLKRRPART in the background, while we try to mount the device. */ - assert_se(loop_device_flock(loop, LOCK_SH) >= 0); - r = dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, DISSECT_IMAGE_READ_ONLY, &dissected); if (r < 0) log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node); @@ -215,7 +212,7 @@ static int run(int argc, char *argv[]) { assert_se(pclose(sfdisk) == 0); sfdisk = NULL; - assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop) >= 0); + assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, LOCK_EX, &loop) >= 0); #if HAVE_BLKID _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL; @@ -223,11 +220,6 @@ static int run(int argc, char *argv[]) { pthread_t threads[arg_n_threads]; sd_id128_t id; - /* Take an explicit lock while we format the file systems, in accordance with - * https://systemd.io/BLOCK_DEVICE_LOCKING/. We don't want udev to interfere and probe while we write - * or even issue BLKRRPART or similar while we are working on this. */ - assert_se(loop_device_flock(loop, LOCK_EX) >= 0); - assert_se(dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, 0, &dissected) >= 0); assert_se(dissected->partitions[PARTITION_ESP].found);