From fc772c61e88afbae74e713c818a9d5115983aa1a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 7 Nov 2023 16:37:39 +0100 Subject: [PATCH 01/10] sd-bus: add ability to connect to bus as a specific user --- src/libsystemd/sd-bus/bus-internal.h | 3 + src/libsystemd/sd-bus/bus-socket.c | 96 ++++++++++++++++++++++++++-- src/libsystemd/sd-bus/sd-bus.c | 27 +++++++- 3 files changed, 121 insertions(+), 5 deletions(-) diff --git a/src/libsystemd/sd-bus/bus-internal.h b/src/libsystemd/sd-bus/bus-internal.h index 8229c432c2..e0f4746b7e 100644 --- a/src/libsystemd/sd-bus/bus-internal.h +++ b/src/libsystemd/sd-bus/bus-internal.h @@ -254,6 +254,9 @@ struct sd_bus { char *address; unsigned address_index; + uid_t connect_as_uid; + gid_t connect_as_gid; + int last_connect_error; enum bus_auth auth; diff --git a/src/libsystemd/sd-bus/bus-socket.c b/src/libsystemd/sd-bus/bus-socket.c index f0f518a350..07179e0705 100644 --- a/src/libsystemd/sd-bus/bus-socket.c +++ b/src/libsystemd/sd-bus/bus-socket.c @@ -503,11 +503,38 @@ static int bus_socket_write_auth(sd_bus *b) { if (b->prefer_writev) k = writev(b->output_fd, b->auth_iovec + b->auth_index, ELEMENTSOF(b->auth_iovec) - b->auth_index); else { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {}; + struct msghdr mh = { .msg_iov = b->auth_iovec + b->auth_index, .msg_iovlen = ELEMENTSOF(b->auth_iovec) - b->auth_index, }; + if (uid_is_valid(b->connect_as_uid) || gid_is_valid(b->connect_as_gid)) { + + /* If we shall connect under some specific UID/GID, then synthesize an + * SCM_CREDENTIALS record accordingly. After all we want to adopt this UID/GID both + * for SO_PEERCRED (where we have to fork()) and SCM_CREDENTIALS (where we can just + * fake it via sendmsg()) */ + + struct ucred ucred = { + .pid = getpid_cached(), + .uid = uid_is_valid(b->connect_as_uid) ? b->connect_as_uid : getuid(), + .gid = gid_is_valid(b->connect_as_gid) ? 
b->connect_as_gid : getgid(), + }; + + mh.msg_control = &control; + mh.msg_controllen = sizeof(control); + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&mh); + *cmsg = (struct cmsghdr) { + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_CREDENTIALS, + .cmsg_len = CMSG_LEN(sizeof(struct ucred)), + }; + + memcpy(CMSG_DATA(cmsg), &ucred, sizeof(struct ucred)); + } + k = sendmsg(b->output_fd, &mh, MSG_DONTWAIT|MSG_NOSIGNAL); if (k < 0 && errno == ENOTSOCK) { b->prefer_writev = true; @@ -949,6 +976,66 @@ static int bind_description(sd_bus *b, int fd, int family) { return 0; } +static int connect_as(int fd, const struct sockaddr *sa, socklen_t salen, uid_t uid, gid_t gid) { + _cleanup_(close_pairp) int pfd[2] = EBADF_PAIR; + ssize_t n; + int r; + + /* Shortcut if we are not supposed to drop privileges */ + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return RET_NERRNO(connect(fd, sa, salen)); + + /* This changes identity to the specified uid/gid and issues connect() as that. This is useful to + * make sure SO_PEERCRED reports the selected UID/GID rather than the usual one of the caller. 
*/ + + if (pipe2(pfd, O_CLOEXEC) < 0) + return -errno; + + r = safe_fork("(sd-setresuid)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_WAIT, /* ret_pid= */ NULL); + if (r < 0) + return r; + if (r == 0) { + /* child */ + + pfd[0] = safe_close(pfd[0]); + + r = RET_NERRNO(setgroups(0, NULL)); + if (r < 0) + goto child_finish; + + if (gid_is_valid(gid)) { + r = RET_NERRNO(setresgid(gid, gid, gid)); + if (r < 0) + goto child_finish; + } + + if (uid_is_valid(uid)) { + r = RET_NERRNO(setresuid(uid, uid, uid)); + if (r < 0) + goto child_finish; + } + + r = RET_NERRNO(connect(fd, sa, salen)); + if (r < 0) + goto child_finish; + + r = 0; + + child_finish: + n = write(pfd[1], &r, sizeof(r)); + if (n != sizeof(r)) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + n = read(pfd[0], &r, sizeof(r)); + if (n != sizeof(r)) + return -EIO; + + return r; +} + int bus_socket_connect(sd_bus *b) { bool inotify_done = false; int r; @@ -980,8 +1067,9 @@ int bus_socket_connect(sd_bus *b) { b->output_fd = b->input_fd; bus_socket_setup(b); - if (connect(b->input_fd, &b->sockaddr.sa, b->sockaddr_size) < 0) { - if (errno == EINPROGRESS) { + r = connect_as(b->input_fd, &b->sockaddr.sa, b->sockaddr_size, b->connect_as_uid, b->connect_as_gid); + if (r < 0) { + if (r == -EINPROGRESS) { /* If we have any inotify watches open, close them now, we don't need them anymore, as * we have successfully initiated a connection */ @@ -994,7 +1082,7 @@ int bus_socket_connect(sd_bus *b) { return 1; } - if (IN_SET(errno, ENOENT, ECONNREFUSED) && /* ENOENT → unix socket doesn't exist at all; ECONNREFUSED → unix socket stale */ + if (IN_SET(r, -ENOENT, -ECONNREFUSED) && /* ENOENT → unix socket doesn't exist at all; ECONNREFUSED → unix socket stale */ b->watch_bind && b->sockaddr.sa.sa_family == AF_UNIX && b->sockaddr.un.sun_path[0] != 0) { @@ -1022,7 +1110,7 @@ int bus_socket_connect(sd_bus *b) { inotify_done = true; } else - return -errno; + return r; } else break; } diff --git 
a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c index a25369e930..22784e8f66 100644 --- a/src/libsystemd/sd-bus/sd-bus.c +++ b/src/libsystemd/sd-bus/sd-bus.c @@ -259,6 +259,8 @@ _public_ int sd_bus_new(sd_bus **ret) { .ucred = UCRED_INVALID, .pidfd = -EBADF, .runtime_scope = _RUNTIME_SCOPE_INVALID, + .connect_as_uid = UID_INVALID, + .connect_as_gid = GID_INVALID, }; /* We guarantee that wqueue always has space for at least one entry */ @@ -716,7 +718,7 @@ static void skip_address_key(const char **p) { } static int parse_unix_address(sd_bus *b, const char **p, char **guid) { - _cleanup_free_ char *path = NULL, *abstract = NULL; + _cleanup_free_ char *path = NULL, *abstract = NULL, *uids = NULL, *gids = NULL; size_t l; int r; @@ -744,6 +746,18 @@ static int parse_unix_address(sd_bus *b, const char **p, char **guid) { else if (r > 0) continue; + r = parse_address_key(p, "uid", &uids); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "gid", &gids); + if (r < 0) + return r; + else if (r > 0) + continue; + skip_address_key(p); } @@ -780,6 +794,17 @@ static int parse_unix_address(sd_bus *b, const char **p, char **guid) { b->sockaddr_size = offsetof(struct sockaddr_un, sun_path) + 1 + l; } + if (uids) { + r = parse_uid(uids, &b->connect_as_uid); + if (r < 0) + return r; + } + if (gids) { + r = parse_gid(gids, &b->connect_as_gid); + if (r < 0) + return r; + } + b->is_local = true; return 0; From 9b94ae834b8b11e3ac7d64b11b5f42eb26dbac65 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 25 Oct 2023 22:16:52 +0200 Subject: [PATCH 02/10] units: add systemd-capsule@.service --- units/capsule.slice | 13 +++++++++++++ units/capsule@.service.in | 33 +++++++++++++++++++++++++++++++++ units/meson.build | 2 ++ units/user/capsule@.target | 15 +++++++++++++++ units/user/meson.build | 1 + 5 files changed, 64 insertions(+) create mode 100644 units/capsule.slice create mode 100644 units/capsule@.service.in create mode 100644 
units/user/capsule@.target diff --git a/units/capsule.slice b/units/capsule.slice new file mode 100644 index 0000000000..cb8995a03f --- /dev/null +++ b/units/capsule.slice @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Capsule Slice +Documentation=man:systemd.special(7) +Before=slices.target diff --git a/units/capsule@.service.in b/units/capsule@.service.in new file mode 100644 index 0000000000..f2bb9e3a45 --- /dev/null +++ b/units/capsule@.service.in @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=Capsule Service Manager for %i +Documentation=man:capsule@.service(5) +After=dbus.service systemd-oomd.service + +[Service] +User=c-%i +DynamicUser=yes +Type=notify-reload +ExecStart={{LIBEXECDIR}}/systemd --user --unit=capsule@%i.target +Environment=HOME=/var/lib/capsules/%i +Environment=XDG_RUNTIME_DIR=/run/capsules/%i +StateDirectory=capsules/%i +RuntimeDirectory=capsules/%i +LogExtraFields=CAPSULE=%i +Slice=capsule.slice +KillMode=mixed +Delegate=pids memory cpu +DelegateSubgroup=init.scope +TasksMax=infinity +TimeoutStopSec={{ DEFAULT_USER_TIMEOUT_SEC*4//3 }}s +KeyringMode=inherit +OOMScoreAdjust=100 +MemoryPressureWatch=skip diff --git a/units/meson.build b/units/meson.build index 1458cc5986..5bd2c6774e 100644 --- a/units/meson.build +++ b/units/meson.build @@ -745,6 +745,8 @@ units = [ { 'file' : 'user-runtime-dir@.service.in' }, { 'file' : 'user.slice' }, { 'file' : 'user@.service.in' }, + { 'file' : 'capsule@.service.in' }, + { 'file' : 'capsule.slice' }, { 'file' : 'var-lib-machines.mount', 'conditions' : ['ENABLE_MACHINED'], diff --git a/units/user/capsule@.target b/units/user/capsule@.target new file mode 100644 index 0000000000..986e3ad92e --- /dev/null +++ b/units/user/capsule@.target @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=Main Capsule Target for %i +Documentation=man:systemd.special(7) +Requires=basic.target +After=basic.target +AllowIsolate=yes diff --git a/units/user/meson.build b/units/user/meson.build index 850ac2c297..248745d933 100644 --- a/units/user/meson.build +++ b/units/user/meson.build @@ -11,6 +11,7 @@ units = [ 'graphical-session.target', 'paths.target', 'printer.target', + 'capsule@.target', 'session.slice', 'shutdown.target', 'smartcard.target', From 9367af8a295ed0a998bbfe780f96dd1bf101559d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 7 Nov 2023 18:19:25 +0100 Subject: [PATCH 03/10] util: add capsule-util.[ch] with helpers for capsules For now, there's only a routine for validating capsule names. More will be added later. --- src/shared/capsule-util.c | 17 +++++++++++++++++ src/shared/capsule-util.h | 4 ++++ src/shared/meson.build | 1 + 3 files changed, 22 insertions(+) create mode 100644 src/shared/capsule-util.c create mode 100644 src/shared/capsule-util.h diff --git a/src/shared/capsule-util.c b/src/shared/capsule-util.c new file mode 100644 index 0000000000..3689a78a8b --- /dev/null +++ b/src/shared/capsule-util.c @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "capsule-util.h" +#include "path-util.h" +#include "user-util.h" + +int capsule_name_is_valid(const char *name) { + + if (!filename_is_valid(name)) + return false; + + _cleanup_free_ char *prefixed = strjoin("c-", name); + if (!prefixed) + return -ENOMEM; + + return valid_user_group_name(prefixed, /* flags= */ 0); +} diff --git a/src/shared/capsule-util.h b/src/shared/capsule-util.h new file mode 100644 index 0000000000..437153be9e --- /dev/null +++ b/src/shared/capsule-util.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int capsule_name_is_valid(const char *name); diff --git a/src/shared/meson.build b/src/shared/meson.build index d540631f21..11f2b5ade9 100644 --- a/src/shared/meson.build +++ 
b/src/shared/meson.build @@ -140,6 +140,7 @@ shared_sources = files( 'pkcs11-util.c', 'plymouth-util.c', 'pretty-print.c', + 'capsule-util.c', 'ptyfwd.c', 'qrcode-util.c', 'quota-util.c', From ad963c3f5680796ccd094b81f35ff7aa20b57247 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 26 Oct 2023 09:18:37 +0200 Subject: [PATCH 04/10] bus-util: add ability to connect directly to capsule instances of systemd --user --- src/shared/bus-util.c | 179 +++++++++++++++++++++++++++++++++++++++--- src/shared/bus-util.h | 5 ++ 2 files changed, 171 insertions(+), 13 deletions(-) diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c index 3f54914591..842e747c50 100644 --- a/src/shared/bus-util.c +++ b/src/shared/bus-util.c @@ -18,13 +18,17 @@ #include "bus-internal.h" #include "bus-label.h" #include "bus-util.h" +#include "capsule-util.h" +#include "chase.h" #include "daemon-util.h" #include "data-fd-util.h" #include "fd-util.h" +#include "format-util.h" #include "memstream-util.h" #include "path-util.h" #include "socket-util.h" #include "stdio-util.h" +#include "uid-classification.h" static int name_owner_change_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { sd_event *e = ASSERT_PTR(userdata); @@ -268,6 +272,140 @@ int bus_connect_user_systemd(sd_bus **ret_bus) { return 0; } +static int pin_capsule_socket(const char *capsule, const char *suffix, uid_t *ret_uid, gid_t *ret_gid) { + _cleanup_close_ int inode_fd = -EBADF; + _cleanup_free_ char *p = NULL; + struct stat st; + int r; + + assert(capsule); + assert(suffix); + + p = path_join("/run/capsules", capsule, suffix); + if (!p) + return -ENOMEM; + + /* We enter territory owned by the user, hence let's be paranoid about symlinks and ownership */ + r = chase(p, /* root= */ NULL, CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS, /* ret_path= */ NULL, &inode_fd); + if (r < 0) + return r; + + if (fstat(inode_fd, &st) < 0) + return -errno; + + /* Paranoid safety check */ + if (uid_is_system(st.st_uid) || 
gid_is_system(st.st_gid)) + return -EPERM; + + *ret_uid = st.st_uid; + *ret_gid = st.st_gid; + + return TAKE_FD(inode_fd); +} + +int bus_connect_capsule_systemd(const char *capsule, sd_bus **ret_bus) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int inode_fd = -EBADF; + _cleanup_free_ char *pp = NULL; + uid_t uid; + gid_t gid; + int r; + + assert(capsule); + assert(ret_bus); + + r = capsule_name_is_valid(capsule); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + /* Connects to a capsule's user bus. We need to do so under the capsule's UID/GID, otherwise the + * the service manager might refuse our connection. Hence fake it. */ + + inode_fd = pin_capsule_socket(capsule, "systemd/private", &uid, &gid); + if (inode_fd < 0) + return inode_fd; + + pp = bus_address_escape(FORMAT_PROC_FD_PATH(inode_fd)); + if (!pp) + return -ENOMEM; + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + if (asprintf(&bus->address, "unix:path=%s,uid=" UID_FMT ",gid=" GID_FMT, pp, uid, gid) < 0) + return -ENOMEM; + + r = sd_bus_start(bus); + if (r < 0) + return r; + + *ret_bus = TAKE_PTR(bus); + return 0; +} + +int bus_set_address_capsule_bus(sd_bus *bus, const char *capsule, int *ret_pin_fd) { + _cleanup_free_ char *pp = NULL; + _cleanup_close_ int inode_fd = -EBADF; + uid_t uid; + gid_t gid; + int r; + + assert(bus); + assert(capsule); + assert(ret_pin_fd); + + r = capsule_name_is_valid(capsule); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + inode_fd = pin_capsule_socket(capsule, "bus", &uid, &gid); + if (inode_fd < 0) + return inode_fd; + + pp = bus_address_escape(FORMAT_PROC_FD_PATH(inode_fd)); + if (!pp) + return -ENOMEM; + + if (asprintf(&bus->address, "unix:path=%s,uid=" UID_FMT ",gid=" GID_FMT, pp, uid, gid) < 0) + return -ENOMEM; + + *ret_pin_fd = TAKE_FD(inode_fd); /* This fd must be kept pinned until the connection has been established */ + return 0; +} + +int bus_connect_capsule_bus(const char *capsule, sd_bus **ret_bus) { + 
_cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int inode_fd = -EBADF; + int r; + + assert(capsule); + assert(ret_bus); + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + r = bus_set_address_capsule_bus(bus, capsule, &inode_fd); + if (r < 0) + return r; + + r = sd_bus_set_bus_client(bus, true); + if (r < 0) + return r; + + r = sd_bus_start(bus); + if (r < 0) + return r; + + *ret_bus = TAKE_PTR(bus); + return 0; +} + int bus_connect_transport( BusTransport transport, const char *host, @@ -281,12 +419,10 @@ int bus_connect_transport( assert(transport < _BUS_TRANSPORT_MAX); assert(ret); - assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL); - assert_return(transport != BUS_TRANSPORT_REMOTE || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); - switch (transport) { case BUS_TRANSPORT_LOCAL: + assert_return(!host, -EINVAL); switch (runtime_scope) { @@ -308,11 +444,12 @@ int bus_connect_transport( break; case BUS_TRANSPORT_REMOTE: + assert_return(runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + r = sd_bus_open_system_remote(&bus, host); break; case BUS_TRANSPORT_MACHINE: - switch (runtime_scope) { case RUNTIME_SCOPE_USER: @@ -329,6 +466,12 @@ int bus_connect_transport( break; + case BUS_TRANSPORT_CAPSULE: + assert_return(runtime_scope == RUNTIME_SCOPE_USER, -EINVAL); + + r = bus_connect_capsule_bus(host, &bus); + break; + default: assert_not_reached(); } @@ -343,28 +486,32 @@ int bus_connect_transport( return 0; } -int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus) { +int bus_connect_transport_systemd( + BusTransport transport, + const char *host, + RuntimeScope runtime_scope, + sd_bus **ret_bus) { + assert(transport >= 0); assert(transport < _BUS_TRANSPORT_MAX); - assert(bus); - - assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL); - assert_return(transport == BUS_TRANSPORT_LOCAL || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + 
assert(ret_bus); switch (transport) { case BUS_TRANSPORT_LOCAL: + assert_return(!host, -EINVAL); + switch (runtime_scope) { case RUNTIME_SCOPE_USER: - return bus_connect_user_systemd(bus); + return bus_connect_user_systemd(ret_bus); case RUNTIME_SCOPE_SYSTEM: if (sd_booted() <= 0) /* Print a friendly message when the local system is actually not running systemd as PID 1. */ return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "System has not been booted with systemd as init system (PID 1). Can't operate."); - return bus_connect_system_systemd(bus); + return bus_connect_system_systemd(ret_bus); default: assert_not_reached(); @@ -373,10 +520,16 @@ int bus_connect_transport_systemd(BusTransport transport, const char *host, Runt break; case BUS_TRANSPORT_REMOTE: - return sd_bus_open_system_remote(bus, host); + assert_return(runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + return sd_bus_open_system_remote(ret_bus, host); case BUS_TRANSPORT_MACHINE: - return sd_bus_open_system_machine(bus, host); + assert_return(runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + return sd_bus_open_system_machine(ret_bus, host); + + case BUS_TRANSPORT_CAPSULE: + assert_return(runtime_scope == RUNTIME_SCOPE_USER, -EINVAL); + return bus_connect_capsule_systemd(host, ret_bus); default: assert_not_reached(); diff --git a/src/shared/bus-util.h b/src/shared/bus-util.h index d55665f502..9c6f01d8eb 100644 --- a/src/shared/bus-util.h +++ b/src/shared/bus-util.h @@ -21,6 +21,7 @@ typedef enum BusTransport { BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_REMOTE, BUS_TRANSPORT_MACHINE, + BUS_TRANSPORT_CAPSULE, _BUS_TRANSPORT_MAX, _BUS_TRANSPORT_INVALID = -EINVAL, } BusTransport; @@ -36,8 +37,12 @@ bool bus_error_is_unknown_service(const sd_bus_error *error); int bus_check_peercred(sd_bus *c); +int bus_set_address_capsule_bus(sd_bus *bus, const char *capsule, int *ret_pin_fd); + int bus_connect_system_systemd(sd_bus **ret_bus); int bus_connect_user_systemd(sd_bus **ret_bus); +int 
bus_connect_capsule_systemd(const char *capsule, sd_bus **ret_bus); +int bus_connect_capsule_bus(const char *capsule, sd_bus **ret_bus); int bus_connect_transport(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus); int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus); From 56cb74c3cd1358d7d0b3f613feaf2eeab601a6bd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 26 Oct 2023 09:19:32 +0200 Subject: [PATCH 05/10] systemctl: allow connecting to capsule instances with --capsule=/-C --- src/systemctl/systemctl-start-unit.c | 23 +++++++++++++++++++---- src/systemctl/systemctl-util.c | 2 +- src/systemctl/systemctl.c | 18 +++++++++++++++++- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/systemctl/systemctl-start-unit.c b/src/systemctl/systemctl-start-unit.c index c844f7e1e7..de8873caaf 100644 --- a/src/systemctl/systemctl-start-unit.c +++ b/src/systemctl/systemctl-start-unit.c @@ -255,14 +255,29 @@ static const char** make_extra_args(const char *extra_args[static 4]) { if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) extra_args[n++] = "--user"; - if (arg_transport == BUS_TRANSPORT_REMOTE) { + switch (arg_transport) { + + case BUS_TRANSPORT_REMOTE: extra_args[n++] = "-H"; extra_args[n++] = arg_host; - } else if (arg_transport == BUS_TRANSPORT_MACHINE) { + break; + + case BUS_TRANSPORT_MACHINE: extra_args[n++] = "-M"; extra_args[n++] = arg_host; - } else - assert(arg_transport == BUS_TRANSPORT_LOCAL); + break; + + case BUS_TRANSPORT_CAPSULE: + extra_args[n++] = "-C"; + extra_args[n++] = arg_host; + break; + + case BUS_TRANSPORT_LOCAL: + break; + + default: + assert_not_reached(); + } extra_args[n] = NULL; return extra_args; diff --git a/src/systemctl/systemctl-util.c b/src/systemctl/systemctl-util.c index c3da750d64..75c6c547f7 100644 --- a/src/systemctl/systemctl-util.c +++ b/src/systemctl/systemctl-util.c @@ -42,7 +42,7 @@ int acquire_bus(BusFocus 
focus, sd_bus **ret) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--global is not supported for this operation."); /* We only go directly to the manager, if we are using a local transport */ - if (arg_transport != BUS_TRANSPORT_LOCAL) + if (!IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_CAPSULE)) focus = BUS_FULL; if (getenv_bool("SYSTEMCTL_FORCE_BUS") > 0) diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index 5241af5bfa..00452cac2c 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -18,6 +18,7 @@ #include "path-util.h" #include "pretty-print.h" #include "process-util.h" +#include "capsule-util.h" #include "reboot-util.h" #include "rlimit-util.h" #include "sigbus.h" @@ -63,6 +64,7 @@ #include "systemctl.h" #include "terminal-util.h" #include "time-util.h" +#include "user-util.h" #include "verbs.h" #include "virt.h" @@ -262,6 +264,7 @@ static int systemctl_help(void) { " --version Show package version\n" " --system Connect to system manager\n" " --user Connect to user service manager\n" + " -C --capsule=NAME Connect to service manager of specified capsule\n" " -H --host=[USER@]HOST Operate on remote host\n" " -M --machine=CONTAINER Operate on a local container\n" " -t --type=TYPE List units of a particular type\n" @@ -490,6 +493,7 @@ static int systemctl_parse_argv(int argc, char *argv[]) { { "user", no_argument, NULL, ARG_USER }, { "system", no_argument, NULL, ARG_SYSTEM }, { "global", no_argument, NULL, ARG_GLOBAL }, + { "capsule", required_argument, NULL, 'C' }, { "wait", no_argument, NULL, ARG_WAIT }, { "no-block", no_argument, NULL, ARG_NO_BLOCK }, { "legend", required_argument, NULL, ARG_LEGEND }, @@ -544,7 +548,7 @@ static int systemctl_parse_argv(int argc, char *argv[]) { /* We default to allowing interactive authorization only in systemctl (not in the legacy commands) */ arg_ask_password = true; - while ((c = getopt_long(argc, argv, "ht:p:P:alqfs:H:M:n:o:iTr.::", options, NULL)) >= 0) + while 
((c = getopt_long(argc, argv, "hC:t:p:P:alqfs:H:M:n:o:iTr.::", options, NULL)) >= 0) switch (c) { @@ -679,6 +683,18 @@ static int systemctl_parse_argv(int argc, char *argv[]) { arg_runtime_scope = RUNTIME_SCOPE_GLOBAL; break; + case 'C': + r = capsule_name_is_valid(optarg); + if (r < 0) + return log_error_errno(r, "Unable to validate capsule name '%s': %m", optarg); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid capsule name: %s", optarg); + + arg_host = optarg; + arg_transport = BUS_TRANSPORT_CAPSULE; + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + case ARG_WAIT: arg_wait = true; break; From 759b3c082d463a488235592df45cbebefbe1ad5c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 26 Oct 2023 09:19:04 +0200 Subject: [PATCH 06/10] run: allow connecting to capsule instances with --capsule=/-C --- src/run/run.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/run/run.c b/src/run/run.c index c0c65b2ea1..e4b89d8475 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -17,11 +17,14 @@ #include "bus-unit-util.h" #include "bus-wait-for-jobs.h" #include "calendarspec.h" +#include "capsule-util.h" +#include "chase.h" #include "env-util.h" #include "escape.h" #include "exit-status.h" #include "fd-util.h" #include "format-util.h" +#include "fs-util.h" #include "hostname-util.h" #include "main-func.h" #include "parse-argument.h" @@ -35,6 +38,7 @@ #include "special.h" #include "strv.h" #include "terminal-util.h" +#include "uid-classification.h" #include "unit-def.h" #include "unit-name.h" #include "user-util.h" @@ -265,6 +269,7 @@ static int parse_argv(int argc, char *argv[]) { { "version", no_argument, NULL, ARG_VERSION }, { "user", no_argument, NULL, ARG_USER }, { "system", no_argument, NULL, ARG_SYSTEM }, + { "capsule", required_argument, NULL, 'C' }, { "scope", no_argument, NULL, ARG_SCOPE }, { "unit", required_argument, NULL, 'u' }, { "description", 
required_argument, NULL, ARG_DESCRIPTION }, @@ -317,7 +322,7 @@ static int parse_argv(int argc, char *argv[]) { /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ optind = 0; - while ((c = getopt_long(argc, argv, "+hrH:M:E:p:tPqGdSu:", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hrC:H:M:E:p:tPqGdSu:", options, NULL)) >= 0) switch (c) { @@ -339,6 +344,18 @@ static int parse_argv(int argc, char *argv[]) { arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; break; + case 'C': + r = capsule_name_is_valid(optarg); + if (r < 0) + return log_error_errno(r, "Unable to validate capsule name '%s': %m", optarg); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid capsule name: %s", optarg); + + arg_host = optarg; + arg_transport = BUS_TRANSPORT_CAPSULE; + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + case ARG_SCOPE: arg_scope = true; break; @@ -1603,6 +1620,28 @@ static void set_window_title(PTYForward *f) { (void) pty_forward_set_title_prefix(f, dot); } +static int chown_to_capsule(const char *path, const char *capsule) { + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + assert(capsule); + + p = path_join("/run/capsules/", capsule); + if (!p) + return -ENOMEM; + + struct stat st; + r = chase_and_stat(p, /* root= */ NULL, CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS, /* ret_path= */ NULL, &st); + if (r < 0) + return r; + + if (uid_is_system(st.st_uid) || gid_is_system(st.st_gid)) /* paranoid safety check */ + return -EPERM; + + return chmod_and_chown(path, 0600, st.st_uid, st.st_gid); +} + static int start_transient_service(sd_bus *bus) { _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; @@ -1615,7 +1654,7 @@ static int start_transient_service(sd_bus *bus) { if (arg_stdio == ARG_STDIO_PTY) { - if (arg_transport == 
BUS_TRANSPORT_LOCAL) { + if (IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_CAPSULE)) { master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); if (master < 0) return log_error_errno(errno, "Failed to acquire pseudo tty: %m"); @@ -1624,6 +1663,14 @@ static int start_transient_service(sd_bus *bus) { if (r < 0) return log_error_errno(r, "Failed to determine tty name: %m"); + if (arg_transport == BUS_TRANSPORT_CAPSULE) { + /* If we are in capsule mode, we must give the capsule UID/GID access to the PTY we just allocated first. */ + + r = chown_to_capsule(pty_path, arg_host); + if (r < 0) + return log_error_errno(r, "Failed to chown tty to capsule UID/GID: %m"); + } + if (unlockpt(master) < 0) return log_error_errno(errno, "Failed to unlock tty: %m"); @@ -2311,7 +2358,7 @@ static int run(int argc, char* argv[]) { * limited direct connection */ if (arg_wait || arg_stdio != ARG_STDIO_NONE || - (arg_runtime_scope == RUNTIME_SCOPE_USER && arg_transport != BUS_TRANSPORT_LOCAL)) + (arg_runtime_scope == RUNTIME_SCOPE_USER && !IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_CAPSULE))) r = bus_connect_transport(arg_transport, arg_host, arg_runtime_scope, &bus); else r = bus_connect_transport_systemd(arg_transport, arg_host, arg_runtime_scope, &bus); From 00431b2b66cb59540deda4ea018170a289673585 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 7 Nov 2023 18:19:07 +0100 Subject: [PATCH 07/10] busctl: teach busctl a --capsule=/-C switch too --- src/busctl/busctl.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/busctl/busctl.c b/src/busctl/busctl.c index 9c750fcd53..ec00ed54a2 100644 --- a/src/busctl/busctl.c +++ b/src/busctl/busctl.c @@ -28,6 +28,7 @@ #include "parse-util.h" #include "path-util.h" #include "pretty-print.h" +#include "capsule-util.h" #include "runtime-scope.h" #include "set.h" #include "sort-util.h" @@ -72,6 +73,7 @@ static int json_transform_message(sd_bus_message *m, JsonVariant 
**ret); static int acquire_bus(bool set_monitor, sd_bus **ret) { _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int pin_fd = -EBADF; int r; r = sd_bus_new(&bus); @@ -138,10 +140,13 @@ static int acquire_bus(bool set_monitor, sd_bus **ret) { r = bus_set_address_machine(bus, arg_runtime_scope, arg_host); break; + case BUS_TRANSPORT_CAPSULE: + r = bus_set_address_capsule_bus(bus, arg_host, &pin_fd); + break; + default: assert_not_reached(); } - if (r < 0) return bus_log_address_error(r, arg_transport); @@ -2385,6 +2390,7 @@ static int parse_argv(int argc, char *argv[]) { { "match", required_argument, NULL, ARG_MATCH }, { "host", required_argument, NULL, 'H' }, { "machine", required_argument, NULL, 'M' }, + { "capsule", required_argument, NULL, 'C' }, { "size", required_argument, NULL, ARG_SIZE }, { "list", no_argument, NULL, ARG_LIST }, { "quiet", no_argument, NULL, 'q' }, @@ -2406,7 +2412,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "hH:M:qjl", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "hH:M:C:J:qjl", options, NULL)) >= 0) switch (c) { @@ -2490,6 +2496,17 @@ static int parse_argv(int argc, char *argv[]) { arg_host = optarg; break; + case 'C': + r = capsule_name_is_valid(optarg); + if (r < 0) + return log_error_errno(r, "Unable to validate capsule name '%s': %m", optarg); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid capsule name: %s", optarg); + + arg_host = optarg; + arg_transport = BUS_TRANSPORT_CAPSULE; + break; + case 'q': arg_quiet = true; break; From e8fd555471322829d0efcdc8871c80438cd0a75c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 3 Jan 2024 19:29:23 +0100 Subject: [PATCH 08/10] man: document the new concepts --- man/busctl.xml | 1 + man/capsule@.service.xml | 118 ++++++++++++++++++++++++++++++++++++ man/rules/meson.build | 1 + man/systemctl.xml | 1 + man/systemd-run.xml | 1 + 
man/systemd.special.xml | 66 ++++++++++++++------ man/user-system-options.xml | 11 ++++ man/user@.service.xml | 1 + 8 files changed, 180 insertions(+), 20 deletions(-) create mode 100644 man/capsule@.service.xml diff --git a/man/busctl.xml b/man/busctl.xml index 1add61728b..8cb9a4bfa5 100644 --- a/man/busctl.xml +++ b/man/busctl.xml @@ -449,6 +449,7 @@ + diff --git a/man/capsule@.service.xml b/man/capsule@.service.xml new file mode 100644 index 0000000000..d20b1dd9a3 --- /dev/null +++ b/man/capsule@.service.xml @@ -0,0 +1,118 @@ + + + + + + + capsule@.service + systemd + + + + capsule@.service + 5 + + + + capsule@.service + System unit for the capsule service manager + + + + capsule@NAME.service + + + + Description + + Service managers for capsules run in + capsule@NAME.service system units, with the capsule name as the + instance identifier. Capsules are a way to run additional instances of the service manager, under dynamic + user IDs, i.e. UIDs that are allocated when the capsule service manager is started, and released when it + is stopped. + + In many ways capsule@.service is similar to the per-user + user@.service service manager, but there are a few important distinctions: + + + The capsule service manager utilizes DynamicUser= (see + systemd.exec5) to + allocate a new UID dynamically on invocation. The user name is automatically generated from the capsule + name, by prefixing c-. The UID is released when the service is terminated. The user + service manager on the other hand operates under a statically allocated user ID that must be + pre-existing, before the user service manager is invoked. + + User service managers register themselves with pam8, capsule + service managers do not. + + User service managers typically read their configuration from a + $HOME directory below /home/, capsule service managers from a + $HOME directory below /var/lib/capsules/. 
+ + User service managers are collectively contained in the user.slice + unit, capsule service managers in capsule.slice. Also see + systemd.special7. + + User service managers start the user unit default.target + initially. Capsule service managers invoke the user unit capsule@.target + instead. + + + The capsule service manager and the capsule's bus broker can be reached via the + --capsule= switch to + systemctl1, + systemd-run1 and + busctl1. + + New capsules can be started via a simple systemctl start + capsule@NAME.service command, and stopped via systemctl + stop capsule@NAME.service. Starting a capsule will implicitly create + a home directory /var/lib/capsules/NAME/, if missing. A + runtime directory is created as /run/capsules/NAME/. To + remove these resources use systemctl clean capsule@NAME.service, + for example with the switch. + + The capsule@.service unit invokes a systemd --user + service manager process. This means unit files are looked for according to the same rules as for regular user + service managers, for example in + /var/lib/capsules/NAME/.config/systemd/user/. + + Capsule names may be chosen freely by the user, however, they must be suitable as UNIX filenames + (i.e. 255 characters max, and contain no /), and when prefixed with + p- be suitable as a user name matching strict POSIX rules, see User/Group Name Syntax for details. 
+ + + + Examples + + Create a new capsule, invoke two programs in it (one interactively), terminate it, and clean everything up + + # systemctl start capsule@tatze.service +# systemd-run --capsule=tatze --unit=sleeptest.service sleep 999 +# systemctl --capsule=tatze status sleeptest.service +# systemd-run -t --capsule=tatze bash +# systemctl --capsule=tatze stop sleeptest.service +# systemctl stop capsule@tatze.service +# systemctl clean --all capsule@tatze.service + + + + + See Also + + systemd1, + user@.service5, + systemd.service5, + systemd.slice5, + systemd.exec5, + systemd.special7, + systemctl1, + systemd-run1, + busctl1, + pam8 + + + diff --git a/man/rules/meson.build b/man/rules/meson.build index 1f07e606c9..f1ac6288ca 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -8,6 +8,7 @@ manpages = [ ['bootctl', '1', [], ''], ['bootup', '7', [], ''], ['busctl', '1', [], ''], + ['capsule@.service', '5', [], ''], ['coredump.conf', '5', ['coredump.conf.d'], 'ENABLE_COREDUMP'], ['coredumpctl', '1', [], 'ENABLE_COREDUMP'], ['crypttab', '5', [], 'HAVE_LIBCRYPTSETUP'], diff --git a/man/systemctl.xml b/man/systemctl.xml index a8c12b2f35..ce560bf2e5 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -2813,6 +2813,7 @@ EOF + diff --git a/man/systemd-run.xml b/man/systemd-run.xml index e891920c63..51041a39f0 100644 --- a/man/systemd-run.xml +++ b/man/systemd-run.xml @@ -517,6 +517,7 @@ + diff --git a/man/systemd.special.xml b/man/systemd.special.xml index 988b7175ba..a8717d65c4 100644 --- a/man/systemd.special.xml +++ b/man/systemd.special.xml @@ -96,9 +96,10 @@ umount.target, usb-gadget.target, -.slice, + capsule.slice, + machine.slice, system.slice, user.slice, - machine.slice, -.mount, dbus.service, dbus.socket, @@ -1291,18 +1292,39 @@ -.slice - The root slice is the root of the slice hierarchy. It usually does not contain - units directly, but may be used to set defaults for the whole tree. + The root slice is the root of the slice hierarchy. 
It usually does not contain units + directly, but may be used to set defaults for the whole tree. + + machine.slice + + By default, all virtual machines and containers registered with + systemd-machined are found in this slice. This is pulled in by + systemd-machined.service. + + + + + + + capsule.slice + + By default, all capsules encapsulated in capsule@.service are found in + this slice. + + + + + system.slice - By default, all system services started by - systemd are found in this slice. + By default, all system services started by systemd are found in this + slice. @@ -1320,17 +1342,6 @@ - - machine.slice - - By default, all virtual machines and containers - registered with systemd-machined are - found in this slice. This is pulled in by - systemd-machined.service. - - - - @@ -1348,16 +1359,31 @@ default.target - This is the main target of the user session, started by default. Various services that - compose the normal user session should be pulled into this target. In this regard, - default.target is similar to multi-user.target in the - system instance, but it is a real unit, not an alias. + This is the main target of the user service manager, started by default when the service + manager is invoked. Various services that compose the normal user session should be pulled into + this target. In this regard, default.target is similar to + multi-user.target in the system instance, but it is a real unit, not an + alias. + + + capsule@.target + + This is the main target of capsule service managers, started by default, instantiated with + the capsule name. This may be used to define different sets of units that are started for + different capsules via generic unit definitions. For details about capsules see + capsule@.service5. 
+ + + + + + In addition, the following units are available which have definitions similar to their system counterparts: exit.target, diff --git a/man/user-system-options.xml b/man/user-system-options.xml index 952ac18a4f..b7d7882171 100644 --- a/man/user-system-options.xml +++ b/man/user-system-options.xml @@ -55,4 +55,15 @@ implied. + + + + + + + Execute operation on a capsule. Specify a capsule name to connect to. See + capsule@.service5 for + details about capsules. + + diff --git a/man/user@.service.xml b/man/user@.service.xml index 819fd6feac..ba57f1b331 100644 --- a/man/user@.service.xml +++ b/man/user@.service.xml @@ -188,6 +188,7 @@ TasksMax=33% systemd.resource-control5 systemd.exec5 systemd.special7 + capsule@.service5 pam8 From a037f2eb9b9a4aff33ff44217a68e77bb14fdec8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 8 Nov 2023 09:48:42 +0100 Subject: [PATCH 09/10] test: add integration test for capsules --- test/units/testsuite-74.capsule.sh | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100755 test/units/testsuite-74.capsule.sh diff --git a/test/units/testsuite-74.capsule.sh b/test/units/testsuite-74.capsule.sh new file mode 100755 index 0000000000..e7b5c87747 --- /dev/null +++ b/test/units/testsuite-74.capsule.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +# shellcheck disable=SC2235 +set -eux +set -o pipefail + +at_exit() { + set +e + systemctl --no-block stop capsule@foobar.service + rm -rf /run/capsules/foobar + rm -rf /var/lib/capsules/foobar + rm -f /run/systemd/system/capsule@.service.d/99-asan.conf +} + +trap at_exit EXIT + +# Appease ASan, since the capsule@.service uses DynamicUser=yes +systemctl edit --runtime --stdin capsule@.service --drop-in=99-asan.conf < Date: Tue, 7 Nov 2023 22:37:21 +0100 Subject: [PATCH 10/10] update TODO --- TODO | 6 ------ 1 file changed, 6 deletions(-) diff --git a/TODO b/TODO index 5fd4fdcf69..10448b3566 100644 --- a/TODO +++ 
b/TODO @@ -356,12 +356,6 @@ Features: policy from currently booted kernel/event log, to close gap for first boot for pre-built images -* add a new systemd-project@.service that is very similar to user@.service but - uses DynamicUser=1 and no PAMName= to invoke an unprivileged somewhat - light-weight service manager. Use HOME=/var/lib/systemd/projects/%i as home - dir. Similar for $XDG_RUNTIME_DIR. Start project@%i.target. Use LogField= to - add a field identifying the project. - * in sd-boot and sd-stub measure the SMBIOS vendor strings to some PCR (at least some subset of them that look like systemd stuff), because apparently some firmware does not, but systemd honours it. avoid duplicate measurement