diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 713782b859..c6b027c58f 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -734,6 +734,31 @@ capabilities are passed using the --capabilities=. + + + + Sets the specified POSIX resource limit for the container payload. Expects an assignment of the + form + LIMIT=SOFT:HARD + or LIMIT=VALUE, where + LIMIT should refer to a resource limit type, such as + RLIMIT_NOFILE or RLIMIT_NICE. The SOFT and + HARD fields should refer to the numeric soft and hard resource limit values. If the + second form is used, VALUE may specifiy a value that is used both as soft and hard + limit. In place of a numeric value the special string infinity may be used to turn off + resource limiting for the specific type of resource. This command line option may be used multiple times to + control limits on multiple limit types. If used multiple times for the same limit type, the last last use + wins. For details about resource limits see setrlimit2. By default + resource limits for the container's init process (PID 1) are set to the same values the Linux kernel originally + passed to the host init system. Note that some resource limits are enforced on resources counted per user, in + particular RLIMIT_NPROC. This means that unless user namespacing is deployed + (i.e. is used, see above), any limits set will be applied to the resource + usage of the same user on all local containers as well as the host. This means particular care needs to be + taken with these limits as they might be triggered by possibly less trusted code. Example: + --rlimit=RLIMIT_NOFILE=8192:16384. + + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index b5c60a33e0..6bd7b33b34 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -278,6 +278,30 @@ details. + + LimitCPU= + LimitFSIZE= + LimitDATA= + LimitSTACK= + LimitCORE= + LimitRSS= + LimitNOFILE= + LimitAS= + LimitNPROC= + LimitMEMLOCK= + LimitLOCKS= + LimitSIGPENDING= + LimitMSGQUEUE= + LimitNICE= + LimitRTPRIO= + LimitRTTIME= + + Configures various types of resource limits applied to containers. This is equivalent to the + command line switch, and takes the same arguments. See + systemd-nspawn1 for + details. + + diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index ea66971fac..58184d412d 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -18,35 +18,51 @@ struct ConfigPerfItem; %struct-type %includes %% -Exec.Boot, config_parse_boot, 0, 0 -Exec.ProcessTwo, config_parse_pid2, 0, 0 -Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) -Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) -Exec.User, config_parse_string, 0, offsetof(Settings, user) -Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) -Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) -Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) -Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) -Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) -Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) -Exec.PivotRoot, config_parse_pivot_root, 0, 0 -Exec.PrivateUsers, config_parse_private_users, 0, 0 -Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) -Exec.SystemCallFilter, config_parse_syscall_filter,0, 0, -Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) -Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) -Files.Bind, config_parse_bind, 0, 0 -Files.BindReadOnly, config_parse_bind, 1, 0 -Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 -Files.Overlay, config_parse_overlay, 0, 0 -Files.OverlayReadOnly, config_parse_overlay, 1, 0 -Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) -Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) -Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) -Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) -Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan) -Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) -Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 -Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) -Network.Zone, config_parse_network_zone, 0, 0 -Network.Port, config_parse_expose_port, 0, 0 +Exec.Boot, config_parse_boot, 0, 0 +Exec.ProcessTwo, config_parse_pid2, 0, 0 +Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) +Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) +Exec.User, config_parse_string, 0, offsetof(Settings, user) +Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) +Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) +Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) +Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) +Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) +Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PivotRoot, config_parse_pivot_root, 0, 0 +Exec.PrivateUsers, config_parse_private_users, 0, 0 +Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) +Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0, +Exec.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof(Settings, rlimit) +Exec.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof(Settings, rlimit) +Exec.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof(Settings, rlimit) +Exec.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof(Settings, rlimit) +Exec.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof(Settings, rlimit) +Exec.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof(Settings, rlimit) +Exec.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof(Settings, rlimit) +Exec.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof(Settings, rlimit) +Exec.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof(Settings, rlimit) +Exec.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof(Settings, rlimit) +Exec.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof(Settings, rlimit) +Exec.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof(Settings, rlimit) +Exec.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof(Settings, rlimit) +Exec.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof(Settings, rlimit) +Exec.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof(Settings, rlimit) +Exec.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof(Settings, rlimit) +Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) +Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) +Files.Bind, config_parse_bind, 0, 0 +Files.BindReadOnly, config_parse_bind, 1, 0 +Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.Overlay, config_parse_overlay, 0, 0 +Files.OverlayReadOnly, config_parse_overlay, 1, 0 +Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) +Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) +Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) +Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) +Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan) +Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) +Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 +Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) +Network.Zone, config_parse_network_zone, 0, 0 +Network.Port, config_parse_expose_port, 0, 0 diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 487514d40a..19bf1d4b94 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -12,6 +12,7 @@ #include "nspawn-settings.h" #include "parse-util.h" #include "process-util.h" +#include "rlimit-util.h" #include "socket-util.h" #include "string-util.h" #include "strv.h" @@ -80,6 +81,7 @@ Settings* settings_free(Settings *s) { free(s->working_directory); strv_free(s->syscall_whitelist); strv_free(s->syscall_blacklist); + rlimit_free_all(s->rlimit); strv_free(s->network_interfaces); strv_free(s->network_macvlan); diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 731db87260..0e6ce61217 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -32,24 +32,26 @@ typedef enum UserNamespaceMode { } UserNamespaceMode; typedef enum SettingsMask { - SETTING_START_MODE = 1 << 0, - SETTING_ENVIRONMENT = 1 << 1, - SETTING_USER = 1 << 2, - SETTING_CAPABILITY = 1 << 3, - SETTING_KILL_SIGNAL = 1 << 4, - SETTING_PERSONALITY = 1 << 5, - SETTING_MACHINE_ID = 1 << 6, - SETTING_NETWORK = 1 << 7, - SETTING_EXPOSE_PORTS = 1 << 8, - SETTING_READ_ONLY = 1 << 9, - SETTING_VOLATILE_MODE = 1 << 10, - SETTING_CUSTOM_MOUNTS = 1 << 11, - SETTING_WORKING_DIRECTORY = 1 << 12, - SETTING_USERNS = 1 << 13, - SETTING_NOTIFY_READY = 1 << 14, - SETTING_PIVOT_ROOT = 1 << 15, - SETTING_SYSCALL_FILTER = 1 << 16, - _SETTINGS_MASK_ALL = (1 << 17) -1 + SETTING_START_MODE = UINT64_C(1) << 0, + SETTING_ENVIRONMENT = UINT64_C(1) << 1, + SETTING_USER = UINT64_C(1) << 2, + SETTING_CAPABILITY = UINT64_C(1) << 3, + SETTING_KILL_SIGNAL = UINT64_C(1) << 4, + SETTING_PERSONALITY = UINT64_C(1) << 5, + SETTING_MACHINE_ID = UINT64_C(1) << 6, + SETTING_NETWORK = UINT64_C(1) << 7, + SETTING_EXPOSE_PORTS = UINT64_C(1) << 8, + SETTING_READ_ONLY = UINT64_C(1) << 9, + SETTING_VOLATILE_MODE = UINT64_C(1) << 10, + SETTING_CUSTOM_MOUNTS = UINT64_C(1) << 11, + SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12, + SETTING_USERNS = UINT64_C(1) << 13, + SETTING_NOTIFY_READY = UINT64_C(1) << 14, + SETTING_PIVOT_ROOT = UINT64_C(1) << 15, + SETTING_SYSCALL_FILTER = UINT64_C(1) << 16, + SETTING_RLIMIT_FIRST = UINT64_C(1) << 17, /* we define one bit per resource limit here */ + SETTING_RLIMIT_LAST = UINT64_C(1) << (17 + _RLIMIT_MAX - 1), + _SETTINGS_MASK_ALL = (UINT64_C(1) << (17 + _RLIMIT_MAX)) } SettingsMask; typedef struct Settings { @@ -71,6 +73,7 @@ typedef struct Settings { bool notify_ready; char **syscall_whitelist; char **syscall_blacklist; + struct rlimit *rlimit[_RLIMIT_MAX]; /* [Image] */ int read_only; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 12eaa6c0d7..8ba8e73bf7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -81,6 +81,7 @@ #include "ptyfwd.h" #include "random-util.h" #include "raw-clone.h" +#include "rlimit-util.h" #include "rm-rf.h" #include "selinux-util.h" #include "signal-util.h" @@ -200,6 +201,7 @@ static void *arg_root_hash = NULL; static size_t arg_root_hash_size = 0; static char **arg_syscall_whitelist = NULL; static char **arg_syscall_blacklist = NULL; +static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {}; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -264,6 +266,7 @@ static void help(void) { " --drop-capability=CAP Drop the specified capability from the default set\n" " --system-call-filter=LIST|~LIST\n" " Permit/prohibit specific system calls\n" + " --rlimit=NAME=LIMIT Set a resource limit for the payload\n" " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n" " host, try-guest, try-host\n" @@ -439,6 +442,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NOTIFY_READY, ARG_ROOT_HASH, ARG_SYSTEM_CALL_FILTER, + ARG_RLIMIT, }; static const struct option options[] = { @@ -492,6 +496,7 @@ static int parse_argv(int argc, char *argv[]) { { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, + { "rlimit", required_argument, NULL, ARG_RLIMIT }, {} }; @@ -1094,6 +1099,41 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_RLIMIT: { + const char *eq; + char *name; + int rl; + + eq = strchr(optarg, '='); + if (!eq) { + log_error("--rlimit= expects an '=' assignment."); + return -EINVAL; + } + + name = strndup(optarg, eq - optarg); + if (!name) + return log_oom(); + + rl = rlimit_from_string_harder(name); + if (rl < 0) { + log_error("Unknown resource limit: %s", name); + return -EINVAL; + } + + if (!arg_rlimit[rl]) { + arg_rlimit[rl] = new0(struct rlimit, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]); + if (r < 0) + return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1); + + arg_settings_mask |= SETTING_RLIMIT_FIRST << rl; + break; + } + case '?': return -EINVAL; @@ -2282,7 +2322,6 @@ static int inner_child( NULL }; const char *exec_target; - _cleanup_strv_free_ char **env_use = NULL; int r; @@ -2559,10 +2598,10 @@ static int outer_child( FDSet *fds, int netns_fd) { + _cleanup_close_ int fd = -1; + int r, which_failed; pid_t pid; ssize_t l; - int r; - _cleanup_close_ int fd = -1; assert(barrier); assert(directory); @@ -2805,6 +2844,10 @@ static int outer_child( if (fd < 0) return fd; + r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed); + if (r < 0) + return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); @@ -3046,7 +3089,7 @@ static int load_settings(void) { _cleanup_fclose_ FILE *f = NULL; _cleanup_free_ char *p = NULL; const char *fn, *i; - int r; + int r, rl; /* If all settings are masked, there's no point in looking for * the settings file */ @@ -3256,6 +3299,21 @@ static int load_settings(void) { } } + for (rl = 0; rl < _RLIMIT_MAX; rl ++) { + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl))) + continue; + + if (!settings->rlimit[rl]) + continue; + + if (!arg_settings_trusted) { + log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), p); + continue; + } + + free_and_replace(arg_rlimit[rl], settings->rlimit[rl]); + } + return 0; } @@ -3767,6 +3825,71 @@ static int run(int master, return 1; /* loop again */ } +static int initialize_rlimits(void) { + + /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload + * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and + * container execution environments. */ + + static const struct rlimit kernel_defaults[_RLIMIT_MAX] = { + [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_CORE] = { 0, RLIM_INFINITY }, + [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_MEMLOCK] = { 65536, 65536 }, + [RLIMIT_MSGQUEUE] = { 819200, 819200 }, + [RLIMIT_NICE] = { 0, 0 }, + [RLIMIT_NOFILE] = { 1024, 4096 }, + [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_RTPRIO] = { 0, 0 }, + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_STACK] = { 8388608, RLIM_INFINITY }, + + /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of + * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them + * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original + * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note + * that PID 1 changes a number of other resource limits during early initialization which is why we + * don't read the other limits from PID 1 but prefer the static table above. */ + }; + + int rl; + + for (rl = 0; rl < _RLIMIT_MAX; rl++) { + + /* Let's only fill in what the user hasn't explicitly configured anyway */ + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) { + const struct rlimit *v; + struct rlimit buffer; + + if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) { + /* For these two let's read the limits off PID 1. See above for an explanation. */ + + if (prlimit(1, rl, NULL, &buffer) < 0) + return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl)); + + v = &buffer; + } else + v = kernel_defaults + rl; + + arg_rlimit[rl] = newdup(struct rlimit, v, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *k = NULL; + + (void) rlimit_format(arg_rlimit[rl], &k); + log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k); + } + } + + return 0; +} + int main(int argc, char *argv[]) { _cleanup_free_ char *console = NULL; @@ -3799,6 +3922,10 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + r = initialize_rlimits(); + if (r < 0) + goto finish; + r = determine_names(); if (r < 0) goto finish; @@ -4161,6 +4288,7 @@ finish: custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); expose_port_free_all(arg_expose_ports); free(arg_root_hash); + rlimit_free_all(arg_rlimit); return r < 0 ? EXIT_FAILURE : ret; }