mirror of
https://github.com/Dasharo/systemd.git
synced 2026-03-06 15:02:31 -08:00
Merge pull request #24467 from qdeslandes/nspawn_rootidmap
nspawn: add rootidmap as --bind option
This commit is contained in:
@@ -1363,16 +1363,37 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
|
||||
multiple times for creating multiple independent bind mount points.</para>
|
||||
|
||||
<para>Mount options are comma-separated. <option>rbind</option> and <option>norbind</option> control whether
|
||||
to create a recursive or a regular bind mount. Defaults to "rbind". <option>idmap</option> and <option>noidmap</option>
|
||||
control if the bind mount should use filesystem id mappings. Using this option requires support by the source filesystem
|
||||
for id mappings. Defaults to "noidmap".</para>
|
||||
to create a recursive or a regular bind mount. Defaults to "rbind". <option>noidmap</option>,
|
||||
<option>idmap</option>, and <option>rootidmap</option> control ID mapping.</para>
|
||||
|
||||
<para>Using <option>idmap</option> or <option>rootidmap</option> requires support by the source filesystem
|
||||
for user/group ID mapped mounts. Defaults to "noidmap". With <option>x</option> being the container's UID range
|
||||
offset, <option>y</option> being the length of the container's UID range, and <option>p</option> being the
|
||||
owner UID of the bind mount source inode on the host:
|
||||
|
||||
<itemizedlist>
|
||||
<listitem><para>If <option>noidmap</option> is used, any user <option>z</option> in the range
|
||||
<option>0 … y</option> seen from inside of the container is mapped to <option>x + z</option> in the
|
||||
<option>x … x + y</option> range on the host. All host users outside of that range are mapped to
|
||||
<option>nobody</option> inside the container.</para></listitem>
|
||||
<listitem><para>If <option>idmap</option> is used, any user <option>z</option> in the UID range
|
||||
<option>0 … y</option> as seen from inside the container is mapped to the same <option>z</option>
|
||||
in the same <option>0 … y</option> range on the host. All host users outside of that range are
|
||||
mapped to <option>nobody</option> inside the container.</para></listitem>
|
||||
<listitem><para>If <option>rootidmap</option> is used, the user <option>0</option> seen from inside
|
||||
of the container is mapped to <option>p</option> on the host. All other host users
|
||||
are mapped to <option>nobody</option> inside the container.</para></listitem>
|
||||
</itemizedlist></para>
|
||||
|
||||
<para>Whichever ID mapping option is used, the same mapping will be used for user and group IDs. If
|
||||
<option>rootidmap</option> is used, the group owning the bind mounted directory will have no effect.</para>
|
||||
|
||||
<para>Note that when this option is used in combination with <option>--private-users</option>, the resulting
|
||||
mount points will be owned by the <constant>nobody</constant> user. That's because the mount and its files and
|
||||
directories continue to be owned by the relevant host users and groups, which do not exist in the container,
|
||||
and thus show up under the wildcard UID 65534 (nobody). If such bind mounts are created, it is recommended to
|
||||
make them read-only, using <option>--bind-ro=</option>. Alternatively you can use the "idmap" mount option to
|
||||
map the filesystem ids.</para></listitem>
|
||||
map the filesystem IDs.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
@@ -67,9 +67,9 @@ int take_etc_passwd_lock(const char *root);
|
||||
#define UID_NOBODY ((uid_t) 65534U)
|
||||
#define GID_NOBODY ((gid_t) 65534U)
|
||||
|
||||
/* If REMOUNT_IDMAP_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host root
|
||||
* user accessing the idmapped mount to this user ID on the backing fs. This is the last valid UID in the
|
||||
* *signed* 32bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
|
||||
/* If REMOUNT_IDMAPPING_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host
|
||||
* root user accessing the idmapped mount to this user ID on the backing fs. This is the last valid UID in
|
||||
* the *signed* 32bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
|
||||
* definitely cannot use the first 0…65536 UIDs for that, since in most cases that's precisely the file range
|
||||
* we intend to map to some high UID range, and since UID mappings have to be bijective we thus cannot use
|
||||
* them at all. Furthermore the UID range beyond INT32_MAX (i.e. the range above the signed 32bit range) is
|
||||
|
||||
@@ -708,10 +708,10 @@ int mount_all(const char *dest,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, bool *idmapped) {
|
||||
static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, RemountIdmapping *idmapping) {
|
||||
unsigned long flags = *mount_flags;
|
||||
char *opts = NULL;
|
||||
bool flag_idmapped = *idmapped;
|
||||
RemountIdmapping new_idmapping = *idmapping;
|
||||
int r;
|
||||
|
||||
assert(options);
|
||||
@@ -730,16 +730,18 @@ static int parse_mount_bind_options(const char *options, unsigned long *mount_fl
|
||||
else if (streq(word, "norbind"))
|
||||
flags &= ~MS_REC;
|
||||
else if (streq(word, "idmap"))
|
||||
flag_idmapped = true;
|
||||
new_idmapping = REMOUNT_IDMAPPING_HOST_ROOT;
|
||||
else if (streq(word, "noidmap"))
|
||||
flag_idmapped = false;
|
||||
new_idmapping = REMOUNT_IDMAPPING_NONE;
|
||||
else if (streq(word, "rootidmap"))
|
||||
new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER;
|
||||
else
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
|
||||
"Invalid bind mount option: %s", word);
|
||||
}
|
||||
|
||||
*mount_flags = flags;
|
||||
*idmapped = flag_idmapped;
|
||||
*idmapping = new_idmapping;
|
||||
/* in the future mount_opts will hold string options for mount(2) */
|
||||
*mount_opts = opts;
|
||||
|
||||
@@ -751,13 +753,13 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
|
||||
unsigned long mount_flags = MS_BIND | MS_REC;
|
||||
struct stat source_st, dest_st;
|
||||
int r;
|
||||
bool idmapped = false;
|
||||
RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE;
|
||||
|
||||
assert(dest);
|
||||
assert(m);
|
||||
|
||||
if (m->options) {
|
||||
r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapped);
|
||||
r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapping);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
@@ -815,8 +817,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
|
||||
return log_error_errno(r, "Read-only bind mount failed: %m");
|
||||
}
|
||||
|
||||
if (idmapped) {
|
||||
r = remount_idmap(where, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
if (idmapping != REMOUNT_IDMAPPING_NONE) {
|
||||
r = remount_idmap(where, uid_shift, uid_range, source_st.st_uid, idmapping);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
|
||||
}
|
||||
|
||||
@@ -3806,7 +3806,7 @@ static int outer_child(
|
||||
IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
|
||||
arg_uid_shift != 0) {
|
||||
|
||||
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
|
||||
if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
|
||||
/* This might fail because the kernel or file system doesn't support idmapping. We
|
||||
* can't really distinguish this nicely, nor do we have any guarantees about the
|
||||
|
||||
@@ -1380,7 +1380,7 @@ static int mount_partition(
|
||||
(void) fs_grow(node, p);
|
||||
|
||||
if (remap_uid_gid) {
|
||||
r = remount_idmap(p, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
r = remount_idmap(p, uid_shift, uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -1053,32 +1053,43 @@ int make_mount_point(const char *path) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int make_userns(uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags) {
|
||||
static int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
|
||||
_cleanup_close_ int userns_fd = -1;
|
||||
_cleanup_free_ char *line = NULL;
|
||||
|
||||
/* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
|
||||
* process whose only purpose is to give us a new user namespace. It's killed when we got it. */
|
||||
|
||||
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
|
||||
return log_oom_debug();
|
||||
|
||||
/* If requested we'll include an entry in the mapping so that the host root user can make changes to
|
||||
* the uidmapped mount like it normally would. Specifically, we'll map the user with UID_HOST_ROOT on
|
||||
* the backing fs to UID 0. This is useful, since nspawn code wants to create various missing inodes
|
||||
* in the OS tree before booting into it, and this becomes very easy and straightforward to do if it
|
||||
* can just do it under its own regular UID. Note that in that case the container's runtime uidmap
|
||||
* (i.e. the one the container payload processes run in) will leave this UID unmapped, i.e. if we
|
||||
* accidentally leave files owned by host root in the already uidmapped tree around they'll show up
|
||||
* as owned by 'nobody', which is safe. (Of course, we shouldn't leave such inodes around, but always
|
||||
* chown() them to the container's own UID range, but it's good to have a safety net, in case we
|
||||
* forget it.) */
|
||||
if (flags & REMOUNT_IDMAP_HOST_ROOT)
|
||||
if (strextendf(&line,
|
||||
UID_FMT " " UID_FMT " " UID_FMT "\n",
|
||||
UID_MAPPED_ROOT, 0u, 1u) < 0)
|
||||
if (IN_SET(idmapping, REMOUNT_IDMAPPING_NONE, REMOUNT_IDMAPPING_HOST_ROOT)) {
|
||||
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
|
||||
return log_oom_debug();
|
||||
|
||||
/* If requested we'll include an entry in the mapping so that the host root user can make
|
||||
* changes to the uidmapped mount like it normally would. Specifically, we'll map the user
|
||||
* with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
|
||||
* to create various missing inodes in the OS tree before booting into it, and this becomes
|
||||
* very easy and straightforward to do if it can just do it under its own regular UID. Note
|
||||
* that in that case the container's runtime uidmap (i.e. the one the container payload
|
||||
* processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
|
||||
* by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
|
||||
* which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
|
||||
* to the container's own UID range, but it's good to have a safety net, in case we
|
||||
* forget it.) */
|
||||
if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
|
||||
if (strextendf(&line,
|
||||
UID_FMT " " UID_FMT " " UID_FMT "\n",
|
||||
UID_MAPPED_ROOT, 0u, 1u) < 0)
|
||||
return log_oom_debug();
|
||||
}
|
||||
|
||||
if (idmapping == REMOUNT_IDMAPPING_HOST_OWNER) {
|
||||
/* Remap the owner of the bind mounted directory to the root user within the container. This
|
||||
* way every file written by root within the container to the bind-mounted directory will
|
||||
* be owned by the original user. All other users will remain unmapped. */
|
||||
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
|
||||
return log_oom_debug();
|
||||
}
|
||||
|
||||
/* We always assign the same UID and GID ranges */
|
||||
userns_fd = userns_acquire(line, line);
|
||||
if (userns_fd < 0)
|
||||
@@ -1091,7 +1102,8 @@ int remount_idmap(
|
||||
const char *p,
|
||||
uid_t uid_shift,
|
||||
uid_t uid_range,
|
||||
RemountIdmapFlags flags) {
|
||||
uid_t owner,
|
||||
RemountIdmapping idmapping) {
|
||||
|
||||
_cleanup_close_ int mount_fd = -1, userns_fd = -1;
|
||||
int r;
|
||||
@@ -1107,7 +1119,7 @@ int remount_idmap(
|
||||
return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
|
||||
|
||||
/* Create a user namespace mapping */
|
||||
userns_fd = make_userns(uid_shift, uid_range, flags);
|
||||
userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
|
||||
if (userns_fd < 0)
|
||||
return userns_fd;
|
||||
|
||||
|
||||
@@ -118,7 +118,8 @@ int mount_image_in_namespace(pid_t target, const char *propagate_path, const cha
|
||||
|
||||
int make_mount_point(const char *path);
|
||||
|
||||
typedef enum RemountIdmapFlags {
|
||||
typedef enum RemountIdmapping {
|
||||
REMOUNT_IDMAPPING_NONE,
|
||||
/* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the
|
||||
* uidmapped fs. This is useful to ensure that the host root user can safely add inodes to the
|
||||
* uidmapped fs (which otherwise wouldn't work as the host root user is not defined on the uidmapped
|
||||
@@ -126,10 +127,16 @@ typedef enum RemountIdmapFlags {
|
||||
* these inodes are quickly re-chown()ed to more suitable UIDs/GIDs. Any code that intends to be able
|
||||
* to add inodes to file systems mapped this way should set this flag, but given it comes with
|
||||
* certain security implications defaults to off, and requires explicit opt-in. */
|
||||
REMOUNT_IDMAP_HOST_ROOT = 1 << 0,
|
||||
} RemountIdmapFlags;
|
||||
REMOUNT_IDMAPPING_HOST_ROOT,
|
||||
/* Define a mapping from root user within the container to the owner of the bind mounted directory.
|
||||
* This ensures no root-owned files will be written in a bind-mounted directory owned by a different
|
||||
* user. No other users are mapped. */
|
||||
REMOUNT_IDMAPPING_HOST_OWNER,
|
||||
_REMOUNT_IDMAPPING_MAX,
|
||||
_REMOUNT_IDMAPPING_INVALID = -EINVAL,
|
||||
} RemountIdmapping;
|
||||
|
||||
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags);
|
||||
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping);
|
||||
|
||||
/* Creates a mount point (not parents) based on the source path or stat - ie, a file or a directory */
|
||||
int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode);
|
||||
|
||||
Reference in New Issue
Block a user