core: add DelegateSubgroup= setting

This implements a minimal subset of #24961, but in a lot more
restrictive way: we only allow one level of subcgroup (as that's enough
to address the no-processes in inner cgroups rule), and does not change
anything about threaded cgroup logic or similar, or make any of this new
behaviour mandatory.

All this does is this: all non-control processes we invoke for a unit
we'll invoke in a subgroup by the specified name.

We'll later port all our current services that use cgroup delegation
over to this, i.e. user@.service, systemd-nspawn@.service and
systemd-udevd.service.
This commit is contained in:
Lennart Poettering
2023-04-21 18:22:35 +02:00
parent 1a56b0c05d
commit a8b993dc11
11 changed files with 168 additions and 15 deletions

View File

@@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a
means that your service code should have moved itself further down the cgroup
tree by the time it notifies the service manager about start-up readiness, so
that the service's main cgroup is definitely an inner node by the time the
service manager might start `ExecStartPost=`.)
service manager might start `ExecStartPost=`. Starting with systemd 254 you may
also use `DelegateSubgroup=` to let the service manager put your initial
service process into a subgroup right away.)
(Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —,
then you should do that *two* levels down from the main service cgroup your

View File

@@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
memory controller is reached. It will take into consideration limits on all parent slices, other than the
limits set on the unit itself.</para>
<para><varname>DelegateSubgroup</varname> contains the cgroup subgroup to place invoked unit processes
in. As configured by the option of the same name in unit files. This is set to the empty string when it
does not apply or no subgroup has been configured.</para>
<para><varname>RuntimeDirectorySymlink</varname>, <varname>StateDirectorySymlink</varname>,
<varname>CacheDirectorySymlink</varname> and <varname>LogsDirectorySymlink</varname> respectively
implement the destination parameter of the unit files settings <varname>RuntimeDirectory</varname>,
@@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DelegateSubgroup = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>

View File

@@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control
<term><varname>Delegate=</varname></term>
<listitem>
<para>Turns on delegation of further resource control partitioning to processes of the unit. Units where this
is enabled may create and manage their own private subhierarchy of control groups below the control group of
the unit itself. For unprivileged services (i.e. those using the <varname>User=</varname> setting) the unit's
control group will be made accessible to the relevant user.</para>
<para>Turns on delegation of further resource control partitioning to processes of the unit. Units
where this is enabled may create and manage their own private subhierarchy of control groups below
the control group of the unit itself. For unprivileged services (i.e. those using the
<varname>User=</varname> setting) the unit's control group will be made accessible to the relevant
user.</para>
<para>When enabled the service manager will refrain from manipulating control groups or moving
processes below the unit's control group, so that a clear concept of ownership is established: the
@@ -1188,6 +1189,29 @@ DeviceAllow=/dev/loop-control
</listitem>
</varlistentry>
<varlistentry>
<term><varname>DelegateSubgroup=</varname></term>
<listitem>
<para>Place unit processes in the specified subgroup of the unit's control group. Takes a valid
control group name (not a path!) as parameter, or an empty string to turn this feature
off. Defaults to off. The control group name must be usable as filename and avoid conflicts with
the kernel's control group attribute files (i.e. <filename>cgroup.procs</filename> is not an
acceptable name, since the kernel exposes a native control group attribute file by that name). This
option has no effect unless control group delegation is turned on via <varname>Delegate=</varname>,
see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to
<varname>ExecStart=</varname>, but not for <varname>ExecReload=</varname> and similar. If
delegation is enabled, the latter are always placed inside a subgroup named
<filename>.control</filename>. The specified subgroup is automatically created (and potentially
ownership is passed to the unit's configured user/group) when a process is started in it.</para>
<para>This option is useful to avoid manually moving the invoked process into a subgroup after it
has been started. Since no processes should live in inner nodes of the control group tree it's
almost always necessary to run the main ("supervising") process of a unit that has delegation
turned on in a subgroup.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>DisableControllers=</varname></term>

View File

@@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) {
cpu_set_reset(&c->startup_cpuset_cpus);
cpu_set_reset(&c->cpuset_mems);
cpu_set_reset(&c->startup_cpuset_mems);
c->delegate_subgroup = mfree(c->delegate_subgroup);
}
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
@@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, managed_oom_preference_to_string(c->moom_preference),
prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch));
if (c->delegate_subgroup)
fprintf(f, "%sDelegateSubgroup: %s\n",
prefix, c->delegate_subgroup);
if (c->memory_pressure_threshold_usec != USEC_INFINITY)
fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));

View File

@@ -133,6 +133,7 @@ struct CGroupContext {
bool delegate;
CGroupMask delegate_controllers;
CGroupMask disable_controllers;
char *delegate_subgroup;
/* For unified hierarchy */
uint64_t cpu_weight;

View File

@@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
@@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property(
return 1;
} else if (streq(name, "DelegateSubgroup")) {
const char *s;
if (!UNIT_VTABLE(u)->can_delegate)
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
r = sd_bus_message_read(message, "s", &s);
if (r < 0)
return r;
if (!isempty(s) && cg_needs_escape(s))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
if (isempty(s))
c->delegate_subgroup = mfree(c->delegate_subgroup);
else {
r = free_and_strdup_warn(&c->delegate_subgroup, s);
if (r < 0)
return r;
}
unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
}
return 1;
} else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
CGroupMask mask = 0;

View File

@@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
return 0;
}
static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
bool using_subcgroup;
static int exec_parameters_get_cgroup_path(
const ExecParameters *params,
const CGroupContext *c,
char **ret) {
const char *subgroup = NULL;
char *p;
assert(params);
@@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **
* this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
* flag, which is only passed for the former statements, not for the latter. */
using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
if (using_subcgroup)
p = path_join(params->cgroup_path, ".control");
if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
subgroup = ".control";
else
subgroup = c->delegate_subgroup;
}
if (subgroup)
p = path_join(params->cgroup_path, subgroup);
else
p = strdup(params->cgroup_path);
if (!p)
return -ENOMEM;
*ret = p;
return using_subcgroup;
return !!subgroup;
}
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
@@ -4705,7 +4715,7 @@ static int exec_child(
if (params->cgroup_path) {
_cleanup_free_ char *p = NULL;
r = exec_parameters_get_cgroup_path(params, &p);
r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
@@ -4880,11 +4890,26 @@ static int exec_child(
* touch a single hierarchy too. */
if (params->flags & EXEC_CGROUP_DELEGATE) {
_cleanup_free_ char *p = NULL;
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
}
r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
}
if (r > 0) {
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
}
}
}
if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
@@ -5635,13 +5660,13 @@ int exec_spawn(Unit *unit,
log_command_line(unit, "About to execute", command->path, command->argv);
if (params->cgroup_path) {
r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
if (r > 0) { /* We are using a child cgroup */
r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
/* Normally we would not propagate the xattrs to children but since we created this
* sub-cgroup internally we should do it. */

View File

@@ -237,6 +237,7 @@
{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting)
{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max)
{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context)
{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context)
{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context)
{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting)
{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow)

View File

@@ -4029,6 +4029,42 @@ int config_parse_delegate(
return 0;
}
int config_parse_delegate_subgroup(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
CGroupContext *c = ASSERT_PTR(data);
UnitType t;
t = unit_name_to_type(unit);
assert(t >= 0);
if (!unit_vtable[t]->can_delegate) {
log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
return 0;
}
if (isempty(rvalue)) {
c->delegate_subgroup = mfree(c->delegate_subgroup);
return 0;
}
if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */
log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
return 0;
}
return free_and_strdup_warn(&c->delegate_subgroup, rvalue);
}
int config_parse_managed_oom_mode(
const char *unit,
const char *filename,

View File

@@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);

View File

@@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
"ManagedOOMSwap",
"ManagedOOMMemoryPressure",
"ManagedOOMPreference",
"MemoryPressureWatch"))
"MemoryPressureWatch",
"DelegateSubgroup"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {