Merge pull request #26826 from poettering/mntfsd

unprivileged DDI mounts + dynamic userns range allocation via IPC
This commit is contained in:
Luca Boccassi
2024-04-07 19:43:34 +01:00
committed by GitHub
81 changed files with 7892 additions and 430 deletions

12
TODO
View File

@@ -268,6 +268,18 @@ Features:
* use udev rule networkd ownership property to take ownership of network
interfaces nspawn creates
* mountfsd/nsresourced
- userdb: maybe allow callers to map one uid to their own uid
- bpflsm: allow writes if resulting UID on disk would be userns' owner UID
- make encrypted DDIs work (password…)
- add API for creating a new file system from scratch (together with some
dm-integrity/HMAC key). Should probably work using systemd-repart (access
via varlink).
- add API to make an existing file "trusted" via dm-integrity/HMAC key
- port: portabled
- port: tmpfiles, sysusers and similar
- let's see if we can make runtime bind mounts into unpriv nspawn work
* add a kernel cmdline switch (and cred?) for marking a system to be
"headless", in which case we never open /dev/console for reading, only for
writing. This would then mean: systemd-firstboot would process creds but not

View File

@@ -676,3 +676,21 @@ Tools using the Varlink protocol (such as `varlinkctl`) or sd-bus (such as
service. Takes a file system path: if specified the tool will listen on an
`AF_UNIX` stream socket on the specified path in addition to whatever else it
would listen on.
`systemd-mountfsd`:
* `$SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES` takes a boolean argument. If true
disk images from the usual disk image directories (`/var/lib/machines/`,
`/var/lib/confexts/`, …) will be considered "trusted", i.e. are validated
with a more relaxed image policy (typically not requiring Verity signature
checking) than those from other directories (where Verity signature checks
are mandatory). If false all images are treated the same, regardless if
placed in the usual disk image directories or elsewhere. If not set defaults
to a compile time setting.
* `$SYSTEMD_MOUNTFSD_IMAGE_POLICY_TRUSTED`,
`$SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED` specify the default image policies
to apply to trusted and untrusted disk images, respectively. An image is considered trusted if
placed in a trusted disk image directory (see above), or if suitable polkit
authentication was acquired. See `systemd.image-policy(7)` for the valid
syntax for image policy strings.

View File

@@ -999,6 +999,7 @@ manpages = [
['systemd-measure', '1', [], 'HAVE_TPM2 HAVE_BLKID HAVE_OPENSSL'],
['systemd-modules-load.service', '8', ['systemd-modules-load'], 'HAVE_KMOD'],
['systemd-mount', '1', ['systemd-umount'], ''],
['systemd-mountfsd.service', '8', ['systemd-mountfsd'], 'ENABLE_MOUNTFSD'],
['systemd-network-generator.service', '8', ['systemd-network-generator'], ''],
['systemd-networkd-wait-online.service',
'8',
@@ -1007,6 +1008,10 @@ manpages = [
['systemd-networkd.service', '8', ['systemd-networkd'], 'ENABLE_NETWORKD'],
['systemd-notify', '1', [], ''],
['systemd-nspawn', '1', [], ''],
['systemd-nsresourced.service',
'8',
['systemd-nsresourced'],
'ENABLE_NSRESOURCED'],
['systemd-oomd.service', '8', ['systemd-oomd'], 'ENABLE_OOMD'],
['systemd-path', '1', [], ''],
['systemd-pcrlock',

View File

@@ -0,0 +1,70 @@
<?xml version='1.0'?> <!--*-nxml-*-->
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
<refentry id="systemd-mountfsd.service" conditional='ENABLE_MOUNTFSD'>
<refentryinfo>
<title>systemd-mountfsd.service</title>
<productname>systemd</productname>
</refentryinfo>
<refmeta>
<refentrytitle>systemd-mountfsd.service</refentrytitle>
<manvolnum>8</manvolnum>
</refmeta>
<refnamediv>
<refname>systemd-mountfsd.service</refname>
<refname>systemd-mountfsd</refname>
<refpurpose>Disk Image File System Mount Service</refpurpose>
</refnamediv>
<refsynopsisdiv>
<para><filename>systemd-mountfsd.service</filename></para>
<para><filename>/usr/lib/systemd/systemd-mountfsd</filename></para>
</refsynopsisdiv>
<refsect1>
<title>Description</title>
<para><command>systemd-mountfsd</command> is a system service that dissects disk images, and returns mount
file descriptors for the file systems contained therein to clients, via a Varlink IPC API.</para>
<para>The disk images provided must contain a raw file system image or must follow the <ulink
url="https://uapi-group.org/specifications/specs/discoverable_partitions_specification/">Discoverable
Partitions Specification</ulink>. Before mounting any file systems authenticity of the disk image is
established in one or a combination of the following ways:</para>
<orderedlist>
<listitem><para>If the disk image is located in a regular file in one of the directories
<filename>/var/lib/machines/</filename>, <filename>/var/lib/portables/</filename>,
<filename>/var/lib/extensions/</filename>, <filename>/var/lib/confexts/</filename> or their
counterparts in <filename>/etc/</filename>, <filename>/run/</filename>,
<filename>/usr/lib/</filename>, it is assumed to be trusted.</para></listitem>
<listitem><para>If the disk image contains a Verity enabled disk image, along with a signature
partition with a key in the kernel keyring or in <filename>/etc/verity.d/</filename> (and related
directories) the disk image is considered trusted.</para></listitem>
</orderedlist>
<para>This service provides one <ulink url="https://varlink.org/">Varlink</ulink> service:
<constant>io.systemd.MountFileSystem</constant> which accepts a file descriptor to a regular file or
block device, and returns a number of <function>fsmount()</function> file descriptors,
which the client may then attach to paths of their choice.</para>
<para>The returned mounts are automatically allowlisted in the per-user-namespace allowlist maintained by
<citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>.</para>
<para>The file systems are automatically fsck'ed before mounting.</para>
</refsect1>
<refsect1>
<title>See Also</title>
<para>
<citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
<citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
</para>
</refsect1>
</refentry>

View File

@@ -118,6 +118,28 @@
keeps track of running containers, and provides programming interfaces to interact with them.</para>
</refsect1>
<refsect1>
<title>Unprivileged Operation</title>
<para><command>systemd-nspawn</command> may be invoked with or without privileges. The full functionality
is currently only available when invoked with privileges. When invoked without privileges, various
limitations apply, including, but not limited to:</para>
<itemizedlist>
<listitem><para>Only disk image based containers are supported (i.e. <option>--image=</option>).
Directory based ones (i.e. <option>--directory=</option>) are not supported.</para></listitem>
<listitem><para>Machine registration via <option>--machine=</option> is not supported.</para></listitem>
<listitem><para>Only <option>--private-network</option> and <option>--network-veth</option> networking modes are supported.</para></listitem>
</itemizedlist>
<para>When running in unprivileged mode, some needed functionality is provided via
<citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
and
<citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>.</para>
</refsect1>
<refsect1>
<title>Options</title>
@@ -1910,6 +1932,8 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
<member><citerefentry><refentrytitle>systemd.slice</refentrytitle><manvolnum>5</manvolnum></citerefentry></member>
<member><citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
<member><citerefentry><refentrytitle>importctl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
<member><citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
<member><citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
<member><citerefentry project='url'><refentrytitle url='https://btrfs.readthedocs.io/en/latest/btrfs.html'>btrfs</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
</simplelist></para>
</refsect1>

View File

@@ -0,0 +1,81 @@
<?xml version='1.0'?> <!--*-nxml-*-->
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
<refentry id="systemd-nsresourced.service" conditional='ENABLE_NSRESOURCED'>
<refentryinfo>
<title>systemd-nsresourced.service</title>
<productname>systemd</productname>
</refentryinfo>
<refmeta>
<refentrytitle>systemd-nsresourced.service</refentrytitle>
<manvolnum>8</manvolnum>
</refmeta>
<refnamediv>
<refname>systemd-nsresourced.service</refname>
<refname>systemd-nsresourced</refname>
<refpurpose>User Namespace Resource Delegation Service</refpurpose>
</refnamediv>
<refsynopsisdiv>
<para><filename>systemd-nsresourced.service</filename></para>
<para><filename>/usr/lib/systemd/systemd-nsresourced</filename></para>
</refsynopsisdiv>
<refsect1>
<title>Description</title>
<para><command>systemd-nsresourced</command> is a system service that permits transient delegation of a
UID/GID range to a user namespace (see <citerefentry
project='man-pages'><refentrytitle>user_namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>)
allocated by a client, via a Varlink IPC API.</para>
<para>Unprivileged clients may allocate a user namespace, and then request a UID/GID range to be assigned
to it via this service. The user namespace may then be used to run containers and other sandboxes, and/or
apply it to an id-mapped mount.</para>
<para>Allocations of UIDs/GIDs this way are transient: when a user namespace goes away, its UID/GID range
is returned to the pool of available ranges. In order to ensure that clients cannot gain persistency in
their transient UID/GID range a BPF-LSM based policy is enforced that ensures that user namespaces set up
this way can only write to file systems they allocate themselves or that are explicitly allowlisted via
<command>systemd-nsresourced</command>.</para>
<para><command>systemd-nsresourced</command> automatically ensures that any registered UID ranges show up
in the system's NSS database via the <ulink url="https://systemd.io/USER_GROUP_API">User/Group Record
Lookup API via Varlink</ulink>.</para>
<para>Currently, only UID/GID ranges consisting of either exactly 1 or exactly 65536 UIDs/GIDs can be
registered with this service. Moreover, UIDs and GIDs are always allocated together, and
symmetrically.</para>
<para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
this is sufficient to implement fully unprivileged container environments, as implemented by
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>, fully
unprivileged <varname>RootImage=</varname> (see
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>) or
fully unprivileged disk image tools such as
<citerefentry><refentrytitle>systemd-dissect</refentrytitle><manvolnum>1</manvolnum></citerefentry>.</para>
<para>This service provides one <ulink url="https://varlink.org/">Varlink</ulink> service:
<constant>io.systemd.NamespaceResource</constant> allows registering user namespaces, and assigning mounts,
cgroups and network interfaces to them.</para>
</refsect1>
<refsect1>
<title>See Also</title>
<para>
<citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
<citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>,
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>,
<citerefentry><refentrytitle>systemd-dissect</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
<citerefentry project='man-pages'><refentrytitle>user_namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>
</para>
</refsect1>
</refentry>

View File

@@ -272,6 +272,8 @@ conf.set_quoted('SYSTEMD_TEST_DATA', testdata_dir)
conf.set_quoted('SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH', bindir / 'systemd-tty-ask-password-agent')
conf.set_quoted('SYSTEMD_UPDATE_HELPER_PATH', libexecdir / 'systemd-update-helper')
conf.set_quoted('SYSTEMD_USERWORK_PATH', libexecdir / 'systemd-userwork')
conf.set_quoted('SYSTEMD_MOUNTWORK_PATH', libexecdir / 'systemd-mountwork')
conf.set_quoted('SYSTEMD_NSRESOURCEWORK_PATH', libexecdir / 'systemd-nsresourcework')
conf.set_quoted('SYSTEMD_VERITYSETUP_PATH', libexecdir / 'systemd-veritysetup')
conf.set_quoted('SYSTEM_CONFIG_UNIT_DIR', pkgsysconfdir / 'system')
conf.set_quoted('SYSTEM_DATA_UNIT_DIR', systemunitdir)
@@ -1597,6 +1599,8 @@ conf.set10('ENABLE_REMOTE', have)
feature = get_option('vmspawn').disable_auto_if(conf.get('BUILD_MODE_DEVELOPER') == 0)
conf.set10('ENABLE_VMSPAWN', feature.allowed())
conf.set10('DEFAULT_MOUNTFSD_TRUSTED_DIRECTORIES', get_option('default-mountfsd-trusted-directories'))
foreach term : ['analyze',
'backlight',
'binfmt',
@@ -1617,8 +1621,10 @@ foreach term : ['analyze',
'localed',
'logind',
'machined',
'mountfsd',
'networkd',
'nscd',
'nsresourced',
'nss-myhostname',
'nss-systemd',
'oomd',
@@ -1839,6 +1845,79 @@ conf.set10('ENABLE_UKIFY', want_ukify)
#####################################################################
check_efi_alignment_py = find_program('tools/check-efi-alignment.py')
#####################################################################
use_provided_vmlinux_h = false
use_generated_vmlinux_h = false
provided_vmlinux_h_path = get_option('vmlinux-h-path')
# For the more complex BPF programs we really want a vmlinux.h (which is arch
# specific, but only somewhat bound to kernel version). Ideally the kernel
# development headers would ship that, but right now they don't. Hence address
# this in two ways:
#
# 1. Provide a vmlinux.h at build time
# 2. Generate the file on the fly where possible (which requires /sys/ to be mounted)
#
# We generally prefer the former (to support reproducible builds), but will
# fallback to the latter.
if conf.get('BPF_FRAMEWORK') == 1
enable_vmlinux_h = get_option('vmlinux-h')
if enable_vmlinux_h == 'auto'
if provided_vmlinux_h_path != ''
use_provided_vmlinux_h = true
elif fs.exists('/sys/kernel/btf/vmlinux') and \
bpftool.found() and \
(host_machine.cpu_family() == build_machine.cpu_family()) and \
host_machine.cpu_family() in ['x86_64', 'aarch64']
# We will only generate a vmlinux.h from the running
# kernel if the host and build machine are of the same
# family. Also for now we focus on x86_64 and aarch64,
# since other archs don't seem to be ready yet.
use_generated_vmlinux_h = true
endif
elif enable_vmlinux_h == 'provided'
use_provided_vmlinux_h = true
elif enable_vmlinux_h == 'generated'
if not fs.exists('/sys/kernel/btf/vmlinux')
error('BTF data from kernel not available (/sys/kernel/btf/vmlinux missing), cannot generate vmlinux.h, but was asked to.')
endif
if not bpftool.found()
error('bpftool not available, cannot generate vmlinux.h, but was asked to.')
endif
use_generated_vmlinux_h = true
endif
endif
if use_provided_vmlinux_h
if not fs.exists(provided_vmlinux_h_path)
error('Path to provided vmlinux.h does not exist.')
endif
vmlinux_h_dependency = []
bpf_o_unstripped_cmd += ['-I' + fs.parent(provided_vmlinux_h_path)]
message('Using provided @0@'.format(provided_vmlinux_h_path))
elif use_generated_vmlinux_h
vmlinux_h_dependency = custom_target(
'vmlinux.h',
output: 'vmlinux.h',
command : [ bpftool, 'btf', 'dump', 'file', '/sys/kernel/btf/vmlinux', 'format', 'c' ],
capture : true)
bpf_o_unstripped_cmd += ['-I' + fs.parent(vmlinux_h_dependency.full_path())]
message('Using generated @0@'.format(vmlinux_h_dependency.full_path()))
else
message('Using neither provided nor generated vmlinux.h, some features will not be available.')
endif
conf.set10('HAVE_VMLINUX_H', use_provided_vmlinux_h or use_generated_vmlinux_h)
#####################################################################
check_version_history_py = find_program('tools/check-version-history.py')
elf2efi_py = find_program('tools/elf2efi.py')
export_dbus_interfaces_py = find_program('tools/dbus_exporter.py')
@@ -2201,11 +2280,13 @@ subdir('src/locale')
subdir('src/login')
subdir('src/machine')
subdir('src/machine-id-setup')
subdir('src/mountfsd')
subdir('src/modules-load')
subdir('src/mount')
subdir('src/network')
subdir('src/notify')
subdir('src/nspawn')
subdir('src/nsresourced')
subdir('src/nss-myhostname')
subdir('src/nss-mymachines')
subdir('src/nss-resolve')

View File

@@ -124,6 +124,8 @@ option('portabled', type : 'boolean',
description : 'install the systemd-portabled stack')
option('sysext', type : 'boolean',
description : 'install the systemd-sysext stack')
option('mountfsd', type : 'boolean',
description : 'install the systemd-mountfsd stack')
option('userdb', type : 'boolean',
description : 'install the systemd-userdbd stack')
option('homed', type : 'feature', deprecated : { 'true' : 'enabled', 'false' : 'disabled' },
@@ -140,6 +142,8 @@ option('remote', type : 'feature', deprecated : { 'true' : 'enabled', 'false' :
description : 'support for "journal over the network"')
option('create-log-dirs', type : 'boolean',
description : 'create /var/log/journal{,/remote}')
option('nsresourced', type : 'boolean',
description : 'install the systemd-nsresourced stack')
option('nss-myhostname', type : 'boolean',
description : 'install nss-myhostname module')
option('nss-mymachines', type : 'feature', deprecated : { 'true' : 'enabled', 'false' : 'disabled' },
@@ -519,6 +523,13 @@ option('analyze', type: 'boolean', value: true,
description : 'install systemd-analyze')
option('bpf-compiler', type : 'combo', choices : ['clang', 'gcc'],
description: 'compiler used to build BPF programs')
description : 'compiler used to build BPF programs')
option('bpf-framework', type : 'feature', deprecated : { 'true' : 'enabled', 'false' : 'disabled' },
description: 'build BPF programs from source code in restricted C')
description : 'build BPF programs from source code in restricted C')
option('vmlinux-h', type : 'combo', choices : ['auto', 'provided', 'generated', 'disabled'],
description : 'which vmlinux.h to use')
option('vmlinux-h-path', type : 'string', value : '',
description : 'path to vmlinux.h to use')
option('default-mountfsd-trusted-directories', type : 'boolean', value: false,
description : 'controls whether mountfsd should apply a relaxed policy on DDIs in system DDI directories')

View File

@@ -0,0 +1,31 @@
# SPDX-License-Identifier: MIT-0
#
# This config file is installed as part of systemd.
# It may be freely copied and edited (following the MIT No Attribution license).
#
# To make local modifications, one of the following methods may be used:
# 1. add a drop-in file that extends this file by creating the
# /etc/systemd/network/80-namespace-ns.network.d/ directory and creating a
# new .conf file there.
# 2. copy this file into /etc/systemd/network or one of the other paths checked
# by systemd-networkd and edit it there.
# This file should not be edited in place, because it'll be overwritten on upgrades.
# This network file matches the host-side of the virtual Ethernet link
# created by systemd-nsresourced's network support. See systemd-nsresourced.service(8) for
# details.
[Match]
Kind=veth
Name=ns-*
[Network]
# Default to using a /28 prefix, giving up to 13 addresses per namespace
Address=0.0.0.0/28
LinkLocalAddressing=yes
DHCPServer=yes
IPMasquerade=both
LLDP=yes
EmitLLDP=customer-bridge
IPv6AcceptRA=no
IPv6SendRA=yes

View File

@@ -11,6 +11,7 @@ if conf.get('ENABLE_NETWORKD') == 1
'80-container-ve.link',
'80-container-vz.network',
'80-container-vz.link',
'80-namespace-ns.network',
'80-vm-vt.network',
'80-vm-vt.link',
'80-wifi-adhoc.network',

View File

@@ -23,9 +23,11 @@ enable systemd-homed.service
enable systemd-homed-activate.service
enable systemd-homed-firstboot.service
enable systemd-journald-audit.socket
enable systemd-mountfsd.socket
enable systemd-network-generator.service
enable systemd-networkd.service
enable systemd-networkd-wait-online.service
enable systemd-nsresourced.socket
enable systemd-pstore.service
enable systemd-resolved.service
enable systemd-sysext.service

View File

@@ -22,6 +22,7 @@
#include "log.h"
#include "login-util.h"
#include "macro.h"
#include "missing_fs.h"
#include "missing_magic.h"
#include "missing_threads.h"
#include "mkdir.h"
@@ -39,6 +40,38 @@
#include "user-util.h"
#include "xattr-util.h"
int cg_path_open(const char *controller, const char *path) {
_cleanup_free_ char *fs = NULL;
int r;
r = cg_get_path(controller, path, /* item=*/ NULL, &fs);
if (r < 0)
return r;
return RET_NERRNO(open(fs, O_DIRECTORY|O_CLOEXEC));
}
int cg_cgroupid_open(int cgroupfs_fd, uint64_t id) {
_cleanup_close_ int fsfd = -EBADF;
if (cgroupfs_fd < 0) {
fsfd = open("/sys/fs/cgroup", O_CLOEXEC|O_DIRECTORY);
if (fsfd < 0)
return -errno;
cgroupfs_fd = fsfd;
}
cg_file_handle fh = CG_FILE_HANDLE_INIT;
CG_FILE_HANDLE_CGROUPID(fh) = id;
int fd = open_by_handle_at(cgroupfs_fd, &fh.file_handle, O_DIRECTORY|O_CLOEXEC);
if (fd < 0)
return -errno;
return fd;
}
static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, const char *item) {
_cleanup_free_ char *fs = NULL;
FILE *f;
@@ -1404,7 +1437,7 @@ int cg_pid_get_machine_name(pid_t pid, char **ret_machine) {
int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
cg_file_handle fh = CG_FILE_HANDLE_INIT;
int mnt_id = -1;
int mnt_id;
assert(path);
assert(ret);
@@ -1418,6 +1451,20 @@ int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
return 0;
}
/* Determines the cgroup ID of the cgroup directory referenced by 'fd': asks name_to_handle_at() (with
 * AT_EMPTY_PATH, i.e. for the fd itself) for a file handle and extracts the 64bit ID from its payload.
 *
 * Returns 0 and stores the ID in *ret on success, -errno on failure. */
int cg_fd_get_cgroupid(int fd, uint64_t *ret) {
        cg_file_handle handle = CG_FILE_HANDLE_INIT;
        int mount_id = -1; /* filled in by the kernel, not used further */
        int k;

        assert(fd >= 0);
        assert(ret);

        k = name_to_handle_at(fd, "", &handle.file_handle, &mount_id, AT_EMPTY_PATH);
        if (k < 0)
                return -errno;

        *ret = CG_FILE_HANDLE_CGROUPID(handle);
        return 0;
}
int cg_path_get_session(const char *path, char **ret_session) {
_cleanup_free_ char *unit = NULL;
char *start, *end;

View File

@@ -180,6 +180,9 @@ typedef enum CGroupUnified {
* generate paths with multiple adjacent / removed.
*/
int cg_path_open(const char *controller, const char *path);
int cg_cgroupid_open(int fsfd, uint64_t id);
int cg_enumerate_processes(const char *controller, const char *path, FILE **ret);
int cg_read_pid(FILE *f, pid_t *ret);
int cg_read_pidref(FILE *f, PidRef *ret);
@@ -267,6 +270,7 @@ int cg_is_empty_recursive(const char *controller, const char *path);
int cg_get_root_path(char **path);
int cg_path_get_cgroupid(const char *path, uint64_t *ret);
int cg_fd_get_cgroupid(int fd, uint64_t *ret);
int cg_path_get_session(const char *path, char **ret_session);
int cg_path_get_owner_uid(const char *path, uid_t *ret_uid);
int cg_path_get_unit(const char *path, char **ret_unit);
@@ -352,5 +356,10 @@ typedef union {
uint8_t space[offsetof(struct file_handle, f_handle) + sizeof(uint64_t)];
} cg_file_handle;
#define CG_FILE_HANDLE_INIT { .file_handle.handle_bytes = sizeof(uint64_t) }
#define CG_FILE_HANDLE_INIT \
(cg_file_handle) { \
.file_handle.handle_bytes = sizeof(uint64_t), \
.file_handle.handle_type = FILEID_KERNFS, \
}
#define CG_FILE_HANDLE_CGROUPID(fh) (*(uint64_t*) (fh).file_handle.f_handle)

View File

@@ -17,7 +17,7 @@ static inline int make_lock_file(const char *p, int operation, LockFile *ret) {
int make_lock_file_for(const char *p, int operation, LockFile *ret);
void release_lock_file(LockFile *f);
#define LOCK_FILE_INIT { .dir_fd = -EBADF, .fd = -EBADF }
#define LOCK_FILE_INIT (LockFile) { .dir_fd = -EBADF, .fd = -EBADF }
/* POSIX locks with the same interface as flock(). */
int posix_lock(int fd, int operation);

View File

@@ -111,3 +111,8 @@ assert_cc(FS_PROJINHERIT_FL == 0x20000000);
#else
assert_cc(FS_KEY_DESCRIPTOR_SIZE == 8);
#endif
/* linux/exportfs.h */
#ifndef FILEID_KERNFS
#define FILEID_KERNFS 0xfe
#endif

View File

@@ -34,6 +34,14 @@ const struct namespace_info namespace_info[] = {
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
/* Maps a CLONE_NEWxyz flag to the matching NamespaceType, by searching namespace_info[]. Only the
 * namespace-related clone flag bits take part in the comparison, all other bits of clone_flag are
 * ignored. Returns _NAMESPACE_TYPE_INVALID if no entry matches. */
static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
        const unsigned long ns_mask =
                CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|
                CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME;
        NamespaceType candidate = 0;

        while (candidate < _NAMESPACE_TYPE_MAX) {
                if ((namespace_info[candidate].clone_flag & ns_mask) == (clone_flag & ns_mask))
                        return candidate;

                candidate++;
        }

        return _NAMESPACE_TYPE_INVALID;
}
int namespace_open(
pid_t pid,
int *ret_pidns_fd,
@@ -214,6 +222,88 @@ int detach_mount_namespace(void) {
return 0;
}
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid) {
        uid_t from_uid;
        gid_t from_gid;
        int r;

        /* Tries detach_mount_namespace() first. If that doesn't work due to permissions, opens up an
         * unprivileged user namespace with a mapping of the originating UID/GID to the specified target
         * UID/GID. Then, tries detach_mount_namespace() again.
         *
         * Or in other words: tries much harder to get a mount namespace, making use of unprivileged user
         * namespaces if need be.
         *
         * Note that after this function completed:
         *
         * → if we had privs, afterwards uids/gids on files and processes are as before
         *
         * → if we had no privs, our own id and all our files will show up owned by target_uid/target_gid,
         * and everything else owned by nobody.
         *
         * Yes, that's quite a difference.
         *
         * Returns -EINVAL if either target ID is invalid, otherwise the result of
         * detach_mount_namespace() or a negative errno-style error from setting up the user namespace. */

        if (!uid_is_valid(target_uid))
                return -EINVAL;
        if (!gid_is_valid(target_gid))
                return -EINVAL;

        r = detach_mount_namespace();
        if (r != -EPERM)
                return r;

        /* Sample our IDs *before* unsharing: once we are in the new user namespace and no mapping has
         * been installed yet, getuid()/getgid() only report the overflow IDs, which is not what we
         * want to put into the "outside" column of the maps. */
        from_uid = getuid();
        from_gid = getgid();

        if (unshare(CLONE_NEWUSER) < 0)
                return log_debug_errno(errno, "Failed to acquire user namespace: %m");

        r = write_string_filef("/proc/self/uid_map", 0,
                               UID_FMT " " UID_FMT " 1\n", target_uid, from_uid);
        if (r < 0)
                return log_debug_errno(r, "Failed to write uid map: %m");

        /* setgroups must be denied before an unprivileged process may write gid_map, hence the order */
        r = write_string_file("/proc/self/setgroups", "deny", 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to write setgroups file: %m");

        r = write_string_filef("/proc/self/gid_map", 0,
                               GID_FMT " " GID_FMT " 1\n", target_gid, from_gid);
        if (r < 0)
                return log_debug_errno(r, "Failed to write gid map: %m");

        return detach_mount_namespace();
}
/* Joins the user namespace referenced by userns_fd, resets our UID/GID via reset_uid_gid(), and then
 * calls detach_mount_namespace() from within that user namespace. Returns the result of
 * detach_mount_namespace(), or a negative errno-style error if one of the earlier steps failed. */
int detach_mount_namespace_userns(int userns_fd) {
        int q;

        assert(userns_fd >= 0);

        q = setns(userns_fd, CLONE_NEWUSER);
        if (q < 0)
                return log_debug_errno(errno, "Failed to join user namespace: %m");

        q = reset_uid_gid();
        if (q < 0)
                return log_debug_errno(q, "Failed to become root in user namespace: %m");

        q = detach_mount_namespace();
        return q;
}
int userns_acquire_empty(void) {
_cleanup_(sigkill_waitp) pid_t pid = 0;
_cleanup_close_ int userns_fd = -EBADF;
int r;
r = safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS, &pid);
if (r < 0)
return r;
if (r == 0)
/* Child. We do nothing here, just freeze until somebody kills us. */
freeze();
r = namespace_open(pid, NULL, NULL, NULL, &userns_fd, NULL);
if (r < 0)
return log_error_errno(r, "Failed to open userns fd: %m");
return TAKE_FD(userns_fd);
}
int userns_acquire(const char *uid_map, const char *gid_map) {
char path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1];
_cleanup_(sigkill_waitp) pid_t pid = 0;
@@ -344,3 +434,50 @@ int parse_userns_uid_range(const char *s, uid_t *ret_uid_shift, uid_t *ret_uid_r
return 0;
}
/* Opens our own namespace of the specified type, via the matching procfs namespace file of the calling
 * process. Returns the fd on success. If the file does not exist and proc_mounted() reports that
 * /proc/ is not mounted, -ENOSYS is returned instead of -ENOENT. */
int namespace_open_by_type(NamespaceType type) {
        const char *ns_path;
        int nsfd;

        assert(type >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        ns_path = pid_namespace_path(0, type);

        nsfd = RET_NERRNO(open(ns_path, O_RDONLY|O_NOCTTY|O_CLOEXEC));
        if (nsfd != -ENOENT)
                return nsfd;

        /* File is missing: distinguish "no procfs" from a plain ENOENT */
        return proc_mounted() == 0 ? -ENOSYS : nsfd;
}
/* Checks whether the namespace fd refers to the namespace (of the same type) the calling process
 * itself is in.
 *
 * The type of the namespace is queried from the fd via NS_GET_NSTYPE. If request_type is >= 0 it must
 * match the type found, otherwise -EUCLEAN is returned. A type we cannot map yields -EBADF. The
 * comparison itself is done by stat()ing both the fd and our own procfs namespace file and handing
 * both to stat_inode_same(), whose result is returned. -ENOSYS is returned if our namespace file is
 * missing and proc_mounted() says /proc/ is not mounted. */
int is_our_namespace(int fd, NamespaceType request_type) {
        struct stat theirs, ours;
        NamespaceType actual_type;
        const char *own_path;
        int flag;

        assert(fd >= 0);

        flag = ioctl(fd, NS_GET_NSTYPE);
        if (flag < 0)
                return -errno;

        actual_type = clone_flag_to_namespace_type(flag);
        if (actual_type < 0)
                return -EBADF; /* Uh? Unknown namespace type? */

        /* It's a namespace, but not of the right type? */
        if (request_type >= 0 && actual_type != request_type)
                return -EUCLEAN;

        if (fstat(fd, &theirs) < 0)
                return -errno;

        own_path = pid_namespace_path(0, actual_type);
        if (stat(own_path, &ours) >= 0)
                return stat_inode_same(&ours, &theirs);

        if (errno != ENOENT)
                return -errno;

        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
}

View File

@@ -34,6 +34,8 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int
int fd_is_ns(int fd, unsigned long nsflag);
int detach_mount_namespace(void);
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid);
int detach_mount_namespace_userns(int userns_fd);
static inline bool userns_shift_range_valid(uid_t shift, uid_t range) {
/* Checks that the specified userns range makes sense, i.e. contains at least one UID, and the end
@@ -50,8 +52,15 @@ static inline bool userns_shift_range_valid(uid_t shift, uid_t range) {
return true;
}
int userns_acquire_empty(void);
int userns_acquire(const char *uid_map, const char *gid_map);
int netns_acquire(void);
int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type);
int parse_userns_uid_range(const char *s, uid_t *ret_uid_shift, uid_t *ret_uid_range);
int namespace_open_by_type(NamespaceType type);
int is_our_namespace(int fd, NamespaceType type);

View File

@@ -10,6 +10,7 @@
#include "format-util.h"
#include "macro.h"
#include "path-util.h"
#include "process-util.h"
#include "sort-util.h"
#include "stat-util.h"
#include "uid-range.h"
@@ -204,7 +205,7 @@ int uid_map_read_one(FILE *f, uid_t *ret_base, uid_t *ret_shift, uid_t *ret_rang
return 0;
}
int uid_range_load_userns(UIDRange **ret, const char *path) {
int uid_range_load_userns(UIDRange **ret, const char *path, UIDRangeUsernsMode mode) {
_cleanup_(uid_range_freep) UIDRange *range = NULL;
_cleanup_fclose_ FILE *f = NULL;
int r;
@@ -216,9 +217,11 @@ int uid_range_load_userns(UIDRange **ret, const char *path) {
* To simplify things this will modify the passed array in case of later failure. */
assert(ret);
assert(mode >= 0);
assert(mode < _UID_RANGE_USERNS_MODE_MAX);
if (!path)
path = "/proc/self/uid_map";
path = IN_SET(mode, UID_RANGE_USERNS_INSIDE, UID_RANGE_USERNS_OUTSIDE) ? "/proc/self/uid_map" : "/proc/self/gid_map";
f = fopen(path, "re");
if (!f) {
@@ -243,7 +246,11 @@ int uid_range_load_userns(UIDRange **ret, const char *path) {
if (r < 0)
return r;
r = uid_range_add_internal(&range, uid_base, uid_range, /* coalesce = */ false);
r = uid_range_add_internal(
&range,
IN_SET(mode, UID_RANGE_USERNS_INSIDE, GID_RANGE_USERNS_INSIDE) ? uid_base : uid_shift,
uid_range,
/* coalesce = */ false);
if (r < 0)
return r;
}
@@ -253,3 +260,103 @@ int uid_range_load_userns(UIDRange **ret, const char *path) {
*ret = TAKE_PTR(range);
return 0;
}
/* Loads the UID or GID map (selected by 'mode') of the user namespace referenced by userns_fd.
 *
 * For that a helper process is forked off that joins the user namespace with setns() and then
 * freezes; the parent reads the helper's uid_map/gid_map file from procfs through
 * uid_range_load_userns(). A pipe is used so that the parent only looks at the map file after the
 * helper has reported that setns() succeeded.
 *
 * Returns the result of uid_range_load_userns() on success. Returns -EPROTO if the helper went away
 * without reporting success, or another negative errno-style error. */
int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) {
        _cleanup_(close_pairp) int sync_pipe[2] = EBADF_PAIR;
        _cleanup_(sigkill_waitp) pid_t helper = 0;
        const char *map_path;
        ssize_t k;
        char token;
        int q;

        assert(userns_fd >= 0);
        assert(mode >= 0);
        assert(mode < _UID_RANGE_USERNS_MODE_MAX);
        assert(ret);

        if (pipe2(sync_pipe, O_CLOEXEC) < 0)
                return -errno;

        q = safe_fork_full(
                        "(sd-mkuserns)",
                        /* stdio_fds= */ NULL,
                        (int[]) { sync_pipe[1], userns_fd }, 2,
                        FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL,
                        &helper);
        if (q < 0)
                return q;
        if (q == 0) {
                /* Child: enter the namespace, tell the parent about it, then wait to be killed. */

                if (setns(userns_fd, CLONE_NEWUSER) < 0) {
                        log_debug_errno(errno, "Failed to join userns: %m");
                        _exit(EXIT_FAILURE);
                }

                userns_fd = safe_close(userns_fd);

                k = write(sync_pipe[1], &(const char) { 'x' }, 1);
                if (k < 0) {
                        log_debug_errno(errno, "Failed to write to fifo: %m");
                        _exit(EXIT_FAILURE);
                }

                assert(k == 1);
                freeze();
        }

        /* Parent: drop our copy of the write end, so that we see EOF if the child dies early */
        sync_pipe[1] = safe_close(sync_pipe[1]);

        k = read(sync_pipe[0], &token, 1);
        if (k < 0)
                return -errno;
        if (k == 0)
                return -EPROTO;

        assert(k == 1);
        assert(token == 'x');

        map_path = procfs_file_alloca(
                        helper,
                        IN_SET(mode, UID_RANGE_USERNS_INSIDE, UID_RANGE_USERNS_OUTSIDE) ? "uid_map" : "gid_map");

        return uid_range_load_userns(ret, map_path, mode);
}
bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) {
if (!range)
return false;
/* Avoid overflow */
if (start > UINT32_MAX - nr)
nr = UINT32_MAX - start;
if (nr == 0)
return false;
FOREACH_ARRAY(entry, range->entries, range->n_entries)
if (start < entry->start + entry->nr &&
start + nr >= entry->start)
return true;
return false;
}
/* Compares two UID ranges entry by entry: they are equal if they are the same object (including both
 * being NULL), or if both have the same number of entries and each pair of entries at the same index
 * has identical start and nr fields. */
bool uid_range_equal(const UIDRange *a, const UIDRange *b) {
        size_t idx = 0;

        if (a == b)
                return true;

        if (!a || !b || a->n_entries != b->n_entries)
                return false;

        while (idx < a->n_entries) {
                if (a->entries[idx].start != b->entries[idx].start ||
                    a->entries[idx].nr != b->entries[idx].nr)
                        return false;

                idx++;
        }

        return true;
}

View File

@@ -33,4 +33,46 @@ static inline bool uid_range_contains(const UIDRange *range, uid_t uid) {
int uid_map_read_one(FILE *f, uid_t *ret_base, uid_t *ret_shift, uid_t *ret_range);
int uid_range_load_userns(UIDRange **ret, const char *path);
/* Returns the number of entries stored in the range object; a NULL range counts as having none. */
static inline size_t uid_range_entries(const UIDRange *range) {
        if (!range)
                return 0;

        return range->n_entries;
}
/* Returns the sum of the nr fields of all entries of the range, i.e. the total number of IDs covered.
 * A NULL range has size 0. */
static inline unsigned uid_range_size(const UIDRange *range) {
        unsigned total = 0;

        if (range)
                for (size_t i = 0; i < range->n_entries; i++)
                        total += range->entries[i].nr;

        return total;
}
/* Returns true if the range covers no ID at all: that is the case for a NULL range, a range without
 * entries, and a range all of whose entries have nr == 0. */
static inline bool uid_range_is_empty(const UIDRange *range) {
        if (range)
                for (size_t i = 0; i < range->n_entries; i++)
                        if (range->entries[i].nr > 0)
                                return false;

        return true;
}
bool uid_range_equal(const UIDRange *a, const UIDRange *b);
/* Selects what uid_range_load_userns() and uid_range_load_userns_by_fd() load:
 *
 * - the UID_* modes read a "uid_map" file, the GID_* modes a "gid_map" file (this also determines
 * the default path used when no path is passed).
 * - the *_INSIDE modes record the "base" value of each map line, the *_OUTSIDE modes the "shift"
 * value (presumably: the IDs as seen from inside resp. outside of the user namespace — confirm
 * against uid_map_read_one()). */
typedef enum UIDRangeUsernsMode {
UID_RANGE_USERNS_INSIDE,
UID_RANGE_USERNS_OUTSIDE,
GID_RANGE_USERNS_INSIDE,
GID_RANGE_USERNS_OUTSIDE,
_UID_RANGE_USERNS_MODE_MAX, /* number of valid modes, used for range checks in assert()s */
_UID_RANGE_USERNS_MODE_INVALID = -EINVAL,
} UIDRangeUsernsMode;
int uid_range_load_userns(UIDRange **ret, const char *path, UIDRangeUsernsMode mode);
int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret);
bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr);

View File

@@ -4805,7 +4805,7 @@ static int short_uid_range(const char *path) {
/* Taint systemd if the UID range assigned to this environment doesn't at least cover 0…65534,
* i.e. from root to nobody. */
r = uid_range_load_userns(&p, path);
r = uid_range_load_userns(&p, path, UID_RANGE_USERNS_INSIDE);
if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
return false;
if (r < 0)

Some files were not shown because too many files have changed in this diff Show More