diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index c777fa75de..33486203a6 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -201,3 +201,25 @@ can be found under various directories such as `factory/`, `modprobe.d/`, `netwo `tools/`, `coccinelle/`, `.github/`, `.semaphore/`, `.mkosi/` host various utilities and scripts that are used by maintainers and developers. They are not shipped or installed. + +# Service Manager Overview + +The Service Manager takes configuration in the form of unit files, credentials, +kernel command line options and D-Bus commands, and based on those manages the +system and spawns other processes. It runs in system mode as PID1, and in user +mode with one instance per user session. + +When starting a unit requires forking a new process, configuration for the new +process will be serialized and passed over to the new process, created via a +posix_spawn() call. This is done in order to avoid excessive processing after +a fork() but before an exec(), which is against glibc's best practices and can +also result in a copy-on-write trap. The new process will start as the +`systemd-executor` binary, which will deserialize the configuration and apply +all the options (sandboxing, namespacing, cgroup, etc.) before exec'ing the +configured executable. 
+ +``` + ┌──────┐posix_spawn() ┌───────────┐execve() ┌────────┐ + │ PID1 ├─────────────►│sd-executor├────────►│program │ + └──────┘ (memfd) └───────────┘ └────────┘ +``` diff --git a/meson.build b/meson.build index 1517065db6..5b6b928276 100644 --- a/meson.build +++ b/meson.build @@ -225,6 +225,7 @@ conf.set_quoted('SYSCONF_DIR', sysconfdir) conf.set_quoted('SYSCTL_DIR', sysctldir) conf.set_quoted('SYSTEMCTL_BINARY_PATH', bindir / 'systemctl') conf.set_quoted('SYSTEMD_BINARY_PATH', libexecdir / 'systemd') +conf.set_quoted('SYSTEMD_EXECUTOR_BINARY_PATH', libexecdir / 'systemd-executor') conf.set_quoted('SYSTEMD_CATALOG_DIR', catalogdir) conf.set_quoted('SYSTEMD_CGROUPS_AGENT_PATH', libexecdir / 'systemd-cgroups-agent') conf.set_quoted('SYSTEMD_CRYPTSETUP_PATH', bindir / 'systemd-cryptsetup') diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 625816d9cf..80ea7e7ffa 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -36,7 +36,7 @@ typedef enum CGroupController { CGROUP_CONTROLLER_BPF_SOCKET_BIND, CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES, /* The BPF hook implementing RestrictFileSystems= is not defined here. - * It's applied as late as possible in exec_child() so we don't block + * It's applied as late as possible in exec_invoke() so we don't block * our own unit setup code. 
*/ _CGROUP_CONTROLLER_MAX, diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 2cab0d44ac..dc349afecf 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -28,7 +28,7 @@ DEFINE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user); -static DynamicUser* dynamic_user_free(DynamicUser *d) { +DynamicUser* dynamic_user_free(DynamicUser *d) { if (!d) return NULL; @@ -850,3 +850,12 @@ DynamicCreds* dynamic_creds_destroy(DynamicCreds *creds) { return mfree(creds); } + +void dynamic_creds_done(DynamicCreds *creds) { + if (!creds) + return; + + if (creds->group != creds->user) + dynamic_user_free(creds->group); + creds->group = creds->user = dynamic_user_free(creds->user); +} diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h index 679c588a76..e86ee02796 100644 --- a/src/core/dynamic-user.h +++ b/src/core/dynamic-user.h @@ -28,6 +28,7 @@ struct DynamicUser { int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds); int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds); void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret); +DynamicUser* dynamic_user_free(DynamicUser *d); void dynamic_user_vacuum(Manager *m, bool close_user); int dynamic_user_current(DynamicUser *d, uid_t *ret); @@ -39,6 +40,7 @@ int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *ui DynamicCreds *dynamic_creds_unref(DynamicCreds *creds); DynamicCreds *dynamic_creds_destroy(DynamicCreds *creds); +void dynamic_creds_done(DynamicCreds *creds); DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_unref); DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_destroy); diff --git a/src/core/execute.c b/src/core/execute.c index c13024b1f8..ff9560665c 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -39,6 +39,7 @@ #include "argv-util.h" #include "async.h" #include "barrier.h" +#include "bpf-dlopen.h" #include "bpf-lsm.h" #include "btrfs-util.h" #include 
"cap-list.h" @@ -56,6 +57,7 @@ #include "escape.h" #include "exec-credential.h" #include "execute.h" +#include "execute-serialize.h" #include "exit-status.h" #include "fd-util.h" #include "fileio.h" @@ -85,6 +87,7 @@ #include "seccomp-util.h" #include "securebits-util.h" #include "selinux-util.h" +#include "serialize.h" #include "signal-util.h" #include "smack-util.h" #include "socket-util.h" @@ -1789,6 +1792,8 @@ static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) #if HAVE_LIBBPF static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) { + int r; + assert(c); assert(p); @@ -1801,6 +1806,11 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters return 0; } + /* We are in a new binary, so dl-open again */ + r = dlopen_bpf(); + if (r < 0) + return r; + return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list); } #endif @@ -4062,7 +4072,7 @@ static bool exec_context_shall_confirm_spawn(const ExecContext *context) { static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); -static int exec_child( +int exec_invoke( const ExecCommand *command, const ExecContext *context, ExecParameters *params, @@ -4117,6 +4127,8 @@ static int exec_child( assert(command->path); assert(!strv_isempty(command->argv)); + LOG_CONTEXT_PUSH_EXEC(context, params); + if (context->std_input == EXEC_INPUT_SOCKET || context->std_output == EXEC_OUTPUT_SOCKET || context->std_error == EXEC_OUTPUT_SOCKET) { @@ -5283,7 +5295,6 @@ static int exec_child( return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable); } - int exec_spawn(Unit *unit, ExecCommand *command, const ExecContext *context, @@ -5292,12 +5303,16 @@ int exec_spawn(Unit *unit, const CGroupContext 
*cgroup_context, pid_t *ret) { - _cleanup_free_ char *subcgroup_path = NULL; + char serialization_fd_number[DECIMAL_STR_MAX(int) + 1]; + _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_fclose_ FILE *f = NULL; pid_t pid; int r; assert(unit); assert(unit->manager); + assert(unit->manager->executor_fd >= 0); assert(command); assert(context); assert(ret); @@ -5333,35 +5348,56 @@ int exec_spawn(Unit *unit, } } - pid = fork(); - if (pid < 0) - return log_unit_error_errno(unit, errno, "Failed to fork: %m"); + /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the + * child's memory.max, serialize all the state needed to start the unit, and pass it to the + * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec + * and ensure all memory is shared. The child immediately execs the new binary so the delay should + * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the + * target cgroup. 
*/ - if (pid == 0) { - int exit_status; + r = open_serialization_file("sd-executor-state", &f); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to open serialization stream: %m"); - r = exec_child(command, - context, - params, - runtime, - cgroup_context, - &exit_status); + fdset = fdset_new(); + if (!fdset) + return log_oom(); - if (r < 0) { - const char *status = ASSERT_PTR( - exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD)); + r = exec_serialize_invocation(f, fdset, context, command, params, runtime, cgroup_context); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m"); - log_unit_struct_errno(unit, LOG_ERR, r, - "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, - LOG_UNIT_INVOCATION_ID(unit), - LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m", - status, command->path), - "EXECUTABLE=%s", command->path); - } else - assert(exit_status == EXIT_SUCCESS); + if (fseeko(f, 0, SEEK_SET) == (off_t) -1) + return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m"); - _exit(exit_status); - } + r = fd_cloexec(fileno(f), false); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialization fd: %m"); + + r = fdset_cloexec(fdset, false); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m"); + + r = log_level_to_string_alloc(log_get_max_level(), &log_level); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m"); + + r = fd_get_path(unit->manager->executor_fd, &executor_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m"); + + xsprintf(serialization_fd_number, "%i", fileno(f)); + + /* The executor binary is pinned, to avoid compatibility problems during upgrades. 
*/ + r = posix_spawn_wrapper(FORMAT_PROC_FD_PATH(unit->manager->executor_fd), + STRV_MAKE(executor_path, + "--deserialize", serialization_fd_number, + "--log-level", log_level, + "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))), + environ, + &pid); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to spawn executor: %m"); log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); @@ -5563,7 +5599,7 @@ int exec_context_destroy_mount_ns_dir(Unit *u) { return 0; } -static void exec_command_done(ExecCommand *c) { +void exec_command_done(ExecCommand *c) { assert(c); c->path = mfree(c->path); @@ -6679,9 +6715,9 @@ static char *destroy_tree(char *path) { return mfree(path); } -static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { +void exec_shared_runtime_done(ExecSharedRuntime *rt) { if (!rt) - return NULL; + return; if (rt->manager) (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id); @@ -6691,6 +6727,11 @@ static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { rt->var_tmp_dir = mfree(rt->var_tmp_dir); safe_close_pair(rt->netns_storage_socket); safe_close_pair(rt->ipcns_storage_socket); +} + +static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { + exec_shared_runtime_done(rt); + return mfree(rt); } @@ -7216,6 +7257,14 @@ ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) { return exec_runtime_free(rt); } +void exec_runtime_clear(ExecRuntime *rt) { + if (!rt) + return; + + safe_close_pair(rt->ephemeral_storage_socket); + rt->ephemeral_copy = mfree(rt->ephemeral_copy); +} + void exec_params_clear(ExecParameters *p) { if (!p) return; @@ -7230,6 +7279,37 @@ void exec_params_clear(ExecParameters *p) { p->unit_id = mfree(p->unit_id); p->invocation_id = SD_ID128_NULL; p->invocation_id_string[0] = '\0'; + p->confirm_spawn = mfree(p->confirm_spawn); +} + +void exec_params_serialized_done(ExecParameters *p) { + if (!p) + return; + + for (size_t 
i = 0; p->fds && i < p->n_socket_fds + p->n_storage_fds; i++) + p->fds[i] = safe_close(p->fds[i]); + + p->cgroup_path = mfree(p->cgroup_path); + + p->prefix = strv_free(p->prefix); + p->received_credentials_directory = mfree(p->received_credentials_directory); + p->received_encrypted_credentials_directory = mfree(p->received_encrypted_credentials_directory); + + for (size_t i = 0; p->idle_pipe && i < 4; i++) + p->idle_pipe[i] = safe_close(p->idle_pipe[i]); + p->idle_pipe = mfree(p->idle_pipe); + + p->stdin_fd = safe_close(p->stdin_fd); + p->stdout_fd = safe_close(p->stdout_fd); + p->stderr_fd = safe_close(p->stderr_fd); + + p->notify_socket = mfree(p->notify_socket); + + open_file_free_many(&p->open_files); + + p->fallback_smack_process_label = mfree(p->fallback_smack_process_label); + + exec_params_clear(p); } void exec_directory_done(ExecDirectory *d) { diff --git a/src/core/execute.h b/src/core/execute.h index 25f8531d44..f315044582 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -471,6 +471,13 @@ struct ExecParameters { #include "unit.h" #include "dynamic-user.h" +int exec_invoke(const ExecCommand *command, + const ExecContext *context, + ExecParameters *params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + int *exit_status); + int exec_spawn(Unit *unit, ExecCommand *command, const ExecContext *context, @@ -479,6 +486,7 @@ int exec_spawn(Unit *unit, const CGroupContext *cgroup_context, pid_t *ret); +void exec_command_done(ExecCommand *c); void exec_command_done_array(ExecCommand *c, size_t n); ExecCommand* exec_command_free_list(ExecCommand *c); void exec_command_free_array(ExecCommand **c, size_t n); @@ -524,15 +532,18 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_unref); int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds); int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds); int exec_shared_runtime_deserialize_one(Manager *m, const char 
*value, FDSet *fds); +void exec_shared_runtime_done(ExecSharedRuntime *rt); void exec_shared_runtime_vacuum(Manager *m); int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); ExecRuntime* exec_runtime_free(ExecRuntime *rt); DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free); ExecRuntime* exec_runtime_destroy(ExecRuntime *rt); +void exec_runtime_clear(ExecRuntime *rt); void exec_params_clear(ExecParameters *p); void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix); +void exec_params_serialized_done(ExecParameters *p); bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c); diff --git a/src/core/executor.c b/src/core/executor.c new file mode 100644 index 0000000000..0f154ea347 --- /dev/null +++ b/src/core/executor.c @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> +#include <unistd.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "build.h" +#include "execute-serialize.h" +#include "execute.h" +#include "exit-status.h" +#include "fdset.h" +#include "fd-util.h" +#include "fileio.h" +#include "getopt-defs.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "static-destruct.h" + +static FILE* arg_serialization = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_serialization, fclosep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%sSandbox and execute processes.%s\n\n" + " -h --help Show this help and exit\n" + " --version Print version string and exit\n" + " --log-target=TARGET Set log target (console, journal,\n" + " journal-or-kmsg,\n" + " kmsg, null)\n" + " --log-level=LEVEL Set log level (debug, info, notice,\n" + " warning, err, crit,\n" + " alert, emerg)\n" + " --log-color=BOOL Highlight important messages\n" + " --log-location=BOOL Include code location 
in messages\n" + " --log-time=BOOL Prefix messages with current time\n" + " --deserialize=FD Deserialize process config from FD\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + COMMON_GETOPT_ARGS, + ARG_VERSION, + ARG_DESERIALIZE, + }; + + static const struct option options[] = { + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, + { "log-color", required_argument, NULL, ARG_LOG_COLOR }, + { "log-location", required_argument, NULL, ARG_LOG_LOCATION }, + { "log-time", required_argument, NULL, ARG_LOG_TIME }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "deserialize", required_argument, NULL, ARG_DESERIALIZE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); + + break; + + case ARG_LOG_COLOR: + r = log_show_color_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log color setting \"%s\": %m", + optarg); + + break; + + case ARG_LOG_LOCATION: + r = log_show_location_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log location setting \"%s\": %m", + optarg); + + break; + + case ARG_LOG_TIME: + r = log_show_time_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log time setting \"%s\": %m", + optarg); + + break; + + case 
ARG_DESERIALIZE: { + FILE *f; + int fd; + + fd = parse_fd(optarg); + if (fd < 0) + return log_error_errno( + fd, + "Failed to parse serialization fd \"%s\": %m", + optarg); + + r = fd_cloexec(fd, /* cloexec= */ true); + if (r < 0) + return log_error_errno( + r, + "Failed to set serialization fd \"%s\" to close-on-exec: %m", + optarg); + + f = fdopen(fd, "r"); + if (!f) + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); + + safe_fclose(arg_serialization); + arg_serialization = f; + + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (!arg_serialization) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No serialization fd specified."); + + return 1 /* work to do */; +} + +int main(int argc, char *argv[]) { + _cleanup_fdset_free_ FDSet *fdset = NULL; + int exit_status = EXIT_SUCCESS, r; + _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {}; + _cleanup_(exec_context_done) ExecContext context = {}; + _cleanup_(exec_command_done) ExecCommand command = {}; + _cleanup_(exec_params_serialized_done) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0); + _cleanup_(exec_shared_runtime_done) ExecSharedRuntime shared = { + .netns_storage_socket = PIPE_EBADF, + .ipcns_storage_socket = PIPE_EBADF, + }; + _cleanup_(dynamic_creds_done) DynamicCreds dynamic_creds = {}; + _cleanup_(exec_runtime_clear) ExecRuntime runtime = { + .ephemeral_storage_socket = PIPE_EBADF, + .shared = &shared, + .dynamic_creds = &dynamic_creds, + }; + + exec_context_init(&context); + cgroup_context_init(&cgroup_context); + + /* We might be starting the journal itself, we'll be told by the caller what to do */ + log_set_always_reopen_console(true); + log_set_prohibit_ipc(true); + log_setup(); + + r = fdset_new_fill(/* filter_cloexec= */ 0, &fdset); + if (r < 0) + return log_error_errno(r, "Failed to create fd set: %m"); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; /* error, or nothing left to do (e.g. --help was handled) */ + + /* Now try again if we were told 
it's fine to use a different target */ + if (log_get_target() != LOG_TARGET_KMSG) { + log_set_prohibit_ipc(false); + log_open(); + } + + r = fdset_remove(fdset, fileno(arg_serialization)); /* the serialization stream itself is not one of the passed fds */ + if (r < 0) + return log_error_errno(r, "Failed to remove serialization fd from fd set: %m"); + + r = exec_deserialize_invocation(arg_serialization, + fdset, + &context, + &command, + &params, + &runtime, + &cgroup_context); + if (r < 0) + return log_error_errno(r, "Failed to deserialize: %m"); + + arg_serialization = safe_fclose(arg_serialization); + fdset = fdset_free(fdset); + + r = exec_invoke(&command, + &context, + &params, + &runtime, + &cgroup_context, + &exit_status); + if (r < 0) { + const char *status = ASSERT_PTR( + exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD)); + + log_exec_struct_errno(&context, &params, LOG_ERR, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_EXEC_INVOCATION_ID(&params), + LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m", + status, command.path), + "EXECUTABLE=%s", command.path); + } else + assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */ + + return exit_status; +} diff --git a/src/core/fuzz-manager-serialize.c b/src/core/fuzz-manager-serialize.c index cbc89f5737..0e4bfa4484 100644 --- a/src/core/fuzz-manager-serialize.c +++ b/src/core/fuzz-manager-serialize.c @@ -24,7 +24,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { log_set_target(LOG_TARGET_NULL); } - assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL, &m) >= 0); + assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0); /* Set log overrides as well to make it harder for a serialization file * to switch log levels/targets during fuzzing */ manager_override_log_level(m, log_get_max_level()); diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c index a11d6b53b5..7b738062b5 100644 --- a/src/core/fuzz-unit-file.c +++ 
b/src/core/fuzz-unit-file.c @@ -65,7 +65,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { if (!getenv("SYSTEMD_LOG_LEVEL")) log_set_max_level(LOG_CRIT); - assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL, &m) >= 0); + assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0); name = strjoina("a.", unit_type_to_string(t)); assert_se(unit_new_for_name(m, unit_vtable[t]->object_size, name, &u) >= 0); diff --git a/src/core/manager.c b/src/core/manager.c index 6042bda239..9307a13a79 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -921,6 +921,8 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, .interval = 10 * USEC_PER_MINUTE, .burst = 10, }, + + .executor_fd = -EBADF, }; unit_defaults_init(&m->defaults, runtime_scope); @@ -1039,6 +1041,42 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, if (r < 0 && r != -EEXIST) return r; + + m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH); + if (m->executor_fd < 0) + return log_warning_errno(errno, + "Failed to open executor binary '%s': %m", + SYSTEMD_EXECUTOR_BINARY_PATH); + } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) { + _cleanup_free_ char *self_exe = NULL, *executor_path = NULL; + _cleanup_close_ int self_dir_fd = -EBADF; + int level = LOG_DEBUG; + + /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the + * build directory. Fallback to working directory and then the installation path. 
*/ + r = readlink_and_make_absolute("/proc/self/exe", &self_exe); + if (r < 0) + return r; + + self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_DIRECTORY, 0); + if (self_dir_fd < 0) + return self_dir_fd; /* propagate open_parent()'s own error code rather than a possibly unrelated errno */ + + m->executor_fd = openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH); + if (m->executor_fd < 0 && errno == ENOENT) + m->executor_fd = openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH); + if (m->executor_fd < 0 && errno == ENOENT) { + m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH); + level = LOG_WARNING; /* Tests should normally use local builds */ + } + if (m->executor_fd < 0) + return -errno; + + r = fd_get_path(m->executor_fd, &executor_path); + if (r < 0) + return r; + + log_full(level, "Using systemd-executor binary from '%s'", executor_path); } /* Note that we do not set up the notify fd here. We do that after deserialization, @@ -1701,6 +1739,8 @@ Manager* manager_free(Manager *m) { lsm_bpf_destroy(m->restrict_fs); #endif + safe_close(m->executor_fd); + return mfree(m); } @@ -4956,6 +4996,17 @@ void unit_defaults_done(UnitDefaults *defaults) { rlimit_free_all(defaults->rlimit); } +LogTarget manager_get_executor_log_target(Manager *m) { + assert(m); + + /* If journald is not available tell sd-executor to go to kmsg, as it might be starting journald */ + + if (manager_journal_is_running(m)) + return log_get_target(); + + return LOG_TARGET_KMSG; +} + static const char *const manager_state_table[_MANAGER_STATE_MAX] = { [MANAGER_INITIALIZING] = "initializing", [MANAGER_STARTING] = "starting", diff --git a/src/core/manager.h b/src/core/manager.h index 4595b1b686..6321a353a5 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -145,6 +145,7 @@ typedef enum ManagerTestRunFlags { MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */ MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */ MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4, /* run while ignoring dependencies */ + 
MANAGER_TEST_DONT_OPEN_EXECUTOR = 1 << 5, /* avoid trying to load sd-executor */ MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS, } ManagerTestRunFlags; @@ -496,6 +497,10 @@ struct Manager { /* For NFTSet= */ FirewallContext *fw_ctx; + + /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have + * serialization/deserialization compatibility issues during upgrades. */ + int executor_fd; }; static inline usec_t manager_default_timeout_abort_usec(Manager *m) { @@ -628,6 +633,8 @@ void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout); int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor); int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor); +LogTarget manager_get_executor_log_target(Manager *m); + const char* oom_policy_to_string(OOMPolicy i) _const_; OOMPolicy oom_policy_from_string(const char *s) _pure_; diff --git a/src/core/meson.build b/src/core/meson.build index 8c3fd76922..0508254d9a 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -144,6 +144,10 @@ systemd_sources = files( 'crash-handler.c', ) +systemd_executor_sources = files( + 'executor.c', +) + executables += [ libexec_template + { 'name' : 'systemd', @@ -156,6 +160,17 @@ executables += [ ], 'dependencies' : libseccomp, }, + libexec_template + { + 'name' : 'systemd-executor', + 'public' : true, + 'sources' : systemd_executor_sources, + 'include_directories' : core_includes, + 'link_with' : [ + libcore, + libshared, + ], + 'dependencies' : libseccomp, + }, fuzz_template + { 'sources' : files('fuzz-unit-file.c'), 'link_with' : [ diff --git a/src/core/unit.c b/src/core/unit.c index b71c21580e..aa809843f7 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5357,6 +5357,7 @@ int unit_acquire_invocation_id(Unit *u) { } int unit_set_exec_params(Unit *u, ExecParameters *p) { + const char *confirm_spawn; int r; assert(u); 
@@ -5369,7 +5370,13 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { p->runtime_scope = u->manager->runtime_scope; - p->confirm_spawn = (char *)manager_get_confirm_spawn(u->manager); + confirm_spawn = manager_get_confirm_spawn(u->manager); + if (confirm_spawn) { + p->confirm_spawn = strdup(confirm_spawn); + if (!p->confirm_spawn) + return -ENOMEM; + } + p->cgroup_supported = u->manager->cgroup_supported; p->prefix = u->manager->prefix; SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager)); diff --git a/test/units/testsuite-55.sh b/test/units/testsuite-55.sh index c6258dead6..ffa9af2f6a 100755 --- a/test/units/testsuite-55.sh +++ b/test/units/testsuite-55.sh @@ -68,6 +68,12 @@ if systemctl is-active systemd-oomd.service; then systemctl restart systemd-oomd.service fi +# Ensure that we can start services even with a very low hard memory cap without oom-kills, but skip under +# sanitizers as they balloon memory usage. +if ! [[ -v ASAN_OPTIONS || -v UBSAN_OPTIONS ]]; then + systemd-run -t -p MemoryMax=10M -p MemorySwapMax=0 -p MemoryZSwapMax=0 /bin/true +fi + systemctl start testsuite-55-testchill.service systemctl start testsuite-55-testbloat.service