Merge pull request #24467 from qdeslandes/nspawn_rootidmap

nspawn: add rootidmap as --bind option
This commit is contained in:
Yu Watanabe
2022-09-06 03:45:53 +09:00
committed by GitHub
7 changed files with 84 additions and 42 deletions

View File

@@ -1363,16 +1363,37 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
multiple times for creating multiple independent bind mount points.</para>
<para>Mount options are comma-separated. <option>rbind</option> and <option>norbind</option> control whether
to create a recursive or a regular bind mount. Defaults to "rbind". <option>idmap</option> and <option>noidmap</option>
control if the bind mount should use filesystem id mappings. Using this option requires support by the source filesystem
for id mappings. Defaults to "noidmap".</para>
to create a recursive or a regular bind mount. Defaults to "rbind". <option>noidmap</option>,
<option>idmap</option>, and <option>rootidmap</option> control ID mapping.</para>
<para>Using <option>idmap</option> or <option>rootidmap</option> requires support by the source filesystem
for user/group ID mapped mounts. Defaults to "noidmap". With <option>x</option> being the container's UID range
offset, <option>y</option> being the length of the container's UID range, and <option>p</option> being the
owner UID of the bind mount source inode on the host:
<itemizedlist>
<listitem><para>If <option>noidmap</option> is used, any user <option>z</option> in the range
<option>0 … y</option> seen from inside of the container is mapped to <option>x + z</option> in the
<option>x … x + y</option> range on the host. All host users outside of that range are mapped to
<option>nobody</option> inside the container.</para></listitem>
<listitem><para>If <option>idmap</option> is used, any user <option>z</option> in the UID range
<option>0 … y</option> as seen from inside the container is mapped to the same <option>z</option>
in the same <option>0 … y</option> range on the host. All host users outside of that range are
mapped to <option>nobody</option> inside the container.</para></listitem>
<listitem><para>If <option>rootidmap</option> is used, the user <option>0</option> seen from inside
of the container is mapped to <option>p</option> on the host. Other host users are mapped to
<option>nobody</option> inside the container.</para></listitem>
</itemizedlist></para>
<para>Whichever ID mapping option is used, the same mapping will be used for user and group IDs. If
<option>rootidmap</option> is used, the group owning the bind mounted directory will have no effect.</para>
<para>Note that when this option is used in combination with <option>--private-users</option>, the resulting
mount points will be owned by the <constant>nobody</constant> user. That's because the mount and its files and
directories continue to be owned by the relevant host users and groups, which do not exist in the container,
and thus show up under the wildcard UID 65534 (nobody). If such bind mounts are created, it is recommended to
make them read-only, using <option>--bind-ro=</option>. Alternatively you can use the "idmap" mount option to
map the filesystem ids.</para></listitem>
map the filesystem IDs.</para></listitem>
</varlistentry>
<varlistentry>

View File

@@ -67,9 +67,9 @@ int take_etc_passwd_lock(const char *root);
#define UID_NOBODY ((uid_t) 65534U)
#define GID_NOBODY ((gid_t) 65534U)
/* If REMOUNT_IDMAP_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host root
* user accessing the idmapped mount to this user ID on the backing fs. This is the last valid UID in the
* *signed* 32bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
/* If REMOUNT_IDMAPPING_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host
* root user accessing the idmapped mount to this user ID on the backing fs. This is the last valid UID in
* the *signed* 32bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
* definitely cannot use the first 0…65536 UIDs for that, since in most cases that's precisely the file range
* we intend to map to some high UID range, and since UID mappings have to be bijective we thus cannot use
* them at all. Furthermore the UID range beyond INT32_MAX (i.e. the range above the signed 32bit range) is

View File

@@ -708,10 +708,10 @@ int mount_all(const char *dest,
return 0;
}
static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, bool *idmapped) {
static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, RemountIdmapping *idmapping) {
unsigned long flags = *mount_flags;
char *opts = NULL;
bool flag_idmapped = *idmapped;
RemountIdmapping new_idmapping = *idmapping;
int r;
assert(options);
@@ -730,16 +730,18 @@ static int parse_mount_bind_options(const char *options, unsigned long *mount_fl
else if (streq(word, "norbind"))
flags &= ~MS_REC;
else if (streq(word, "idmap"))
flag_idmapped = true;
new_idmapping = REMOUNT_IDMAPPING_HOST_ROOT;
else if (streq(word, "noidmap"))
flag_idmapped = false;
new_idmapping = REMOUNT_IDMAPPING_NONE;
else if (streq(word, "rootidmap"))
new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER;
else
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"Invalid bind mount option: %s", word);
}
*mount_flags = flags;
*idmapped = flag_idmapped;
*idmapping = new_idmapping;
/* in the future mount_opts will hold string options for mount(2) */
*mount_opts = opts;
@@ -751,13 +753,13 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
unsigned long mount_flags = MS_BIND | MS_REC;
struct stat source_st, dest_st;
int r;
bool idmapped = false;
RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE;
assert(dest);
assert(m);
if (m->options) {
r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapped);
r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapping);
if (r < 0)
return r;
}
@@ -815,8 +817,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
return log_error_errno(r, "Read-only bind mount failed: %m");
}
if (idmapped) {
r = remount_idmap(where, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
if (idmapping != REMOUNT_IDMAPPING_NONE) {
r = remount_idmap(where, uid_shift, uid_range, source_st.st_uid, idmapping);
if (r < 0)
return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
}

View File

@@ -3806,7 +3806,7 @@ static int outer_child(
IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
arg_uid_shift != 0) {
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
/* This might fail because the kernel or file system doesn't support idmapping. We
* can't really distinguish this nicely, nor do we have any guarantees about the

View File

@@ -1380,7 +1380,7 @@ static int mount_partition(
(void) fs_grow(node, p);
if (remap_uid_gid) {
r = remount_idmap(p, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
r = remount_idmap(p, uid_shift, uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
if (r < 0)
return r;
}

View File

@@ -1053,32 +1053,43 @@ int make_mount_point(const char *path) {
return 1;
}
static int make_userns(uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags) {
static int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
_cleanup_close_ int userns_fd = -1;
_cleanup_free_ char *line = NULL;
/* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
* process whose only purpose is to give us a new user namespace. It's killed when we got it. */
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
return log_oom_debug();
/* If requested we'll include an entry in the mapping so that the host root user can make changes to
* the uidmapped mount like it normally would. Specifically, we'll map the user with UID_HOST_ROOT on
* the backing fs to UID 0. This is useful, since nspawn code wants to create various missing inodes
* in the OS tree before booting into it, and this becomes very easy and straightforward to do if it
* can just do it under its own regular UID. Note that in that case the container's runtime uidmap
* (i.e. the one the container payload processes run in) will leave this UID unmapped, i.e. if we
* accidentally leave files owned by host root in the already uidmapped tree around they'll show up
* as owned by 'nobody', which is safe. (Of course, we shouldn't leave such inodes around, but always
* chown() them to the container's own UID range, but it's good to have a safety net, in case we
* forget it.) */
if (flags & REMOUNT_IDMAP_HOST_ROOT)
if (strextendf(&line,
UID_FMT " " UID_FMT " " UID_FMT "\n",
UID_MAPPED_ROOT, 0u, 1u) < 0)
if (IN_SET(idmapping, REMOUNT_IDMAPPING_NONE, REMOUNT_IDMAPPING_HOST_ROOT)) {
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
return log_oom_debug();
/* If requested we'll include an entry in the mapping so that the host root user can make
* changes to the uidmapped mount like it normally would. Specifically, we'll map the user
* with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
* to create various missing inodes in the OS tree before booting into it, and this becomes
* very easy and straightforward to do if it can just do it under its own regular UID. Note
* that in that case the container's runtime uidmap (i.e. the one the container payload
* processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
* by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
* which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
* to the container's own UID range, but it's good to have a safety net, in case we
* forget it.) */
if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
if (strextendf(&line,
UID_FMT " " UID_FMT " " UID_FMT "\n",
UID_MAPPED_ROOT, 0u, 1u) < 0)
return log_oom_debug();
}
if (idmapping == REMOUNT_IDMAPPING_HOST_OWNER) {
/* Remap the owner of the bind mounted directory to the root user within the container. This
* way every file written by root within the container to the bind-mounted directory will
* be owned by the original user. All other users will remain unmapped. */
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
return log_oom_debug();
}
/* We always assign the same UID and GID ranges */
userns_fd = userns_acquire(line, line);
if (userns_fd < 0)
@@ -1091,7 +1102,8 @@ int remount_idmap(
const char *p,
uid_t uid_shift,
uid_t uid_range,
RemountIdmapFlags flags) {
uid_t owner,
RemountIdmapping idmapping) {
_cleanup_close_ int mount_fd = -1, userns_fd = -1;
int r;
@@ -1107,7 +1119,7 @@ int remount_idmap(
return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
/* Create a user namespace mapping */
userns_fd = make_userns(uid_shift, uid_range, flags);
userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
if (userns_fd < 0)
return userns_fd;

View File

@@ -118,7 +118,8 @@ int mount_image_in_namespace(pid_t target, const char *propagate_path, const cha
int make_mount_point(const char *path);
typedef enum RemountIdmapFlags {
typedef enum RemountIdmapping {
REMOUNT_IDMAPPING_NONE,
/* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the
* uidmapped fs. This is useful to ensure that the host root user can safely add inodes to the
* uidmapped fs (which otherwise wouldn't work as the host root user is not defined on the uidmapped
@@ -126,10 +127,16 @@ typedef enum RemountIdmapFlags {
* these inodes are quickly re-chown()ed to more suitable UIDs/GIDs. Any code that intends to be able
* to add inodes to file systems mapped this way should set this flag, but given it comes with
* certain security implications defaults to off, and requires explicit opt-in. */
REMOUNT_IDMAP_HOST_ROOT = 1 << 0,
} RemountIdmapFlags;
REMOUNT_IDMAPPING_HOST_ROOT,
/* Define a mapping from root user within the container to the owner of the bind mounted directory.
* This ensures no root-owned files will be written in a bind-mounted directory owned by a different
* user. No other users are mapped. */
REMOUNT_IDMAPPING_HOST_OWNER,
_REMOUNT_IDMAPPING_MAX,
_REMOUNT_IDMAPPING_INVALID = -EINVAL,
} RemountIdmapping;
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags);
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping);
/* Creates a mount point (not parents) based on the source path or stat - ie, a file or a directory */
int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode);