diff --git a/docs/CGROUP_DELEGATION.md b/docs/CGROUP_DELEGATION.md
index f5509fb833..4210a75767 100644
--- a/docs/CGROUP_DELEGATION.md
+++ b/docs/CGROUP_DELEGATION.md
@@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a
means that your service code should have moved itself further down the cgroup
tree by the time it notifies the service manager about start-up readiness, so
that the service's main cgroup is definitely an inner node by the time the
-service manager might start `ExecStartPost=`.)
+service manager might start `ExecStartPost=`. Starting with systemd 254 you may
+also use `DelegateSubgroup=` to let the service manager put your initial
+service process into a subgroup right away.)
(Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —,
then you should do that *two* levels down from the main service cgroup your
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index e462c60636..f2e892671a 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
memory controller is reached. It will take into consideration limits on all parent slices, other than the
limits set on the unit itself.
+ DelegateSubgroup contains the cgroup subgroup to place invoked unit processes
+ in. As configured by the option of the same name in unit files. This is set to the empty string when it
+ does not apply or no subgroup has been configured.
+
RuntimeDirectorySymlink, StateDirectorySymlink,
CacheDirectorySymlink and LogsDirectorySymlink respectively
implement the destination parameter of the unit files settings RuntimeDirectory,
@@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
+
+
@@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
+
+
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
index f4e4a492a0..610c11feb3 100644
--- a/man/systemd.resource-control.xml
+++ b/man/systemd.resource-control.xml
@@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control
Delegate=
- Turns on delegation of further resource control partitioning to processes of the unit. Units where this
- is enabled may create and manage their own private subhierarchy of control groups below the control group of
- the unit itself. For unprivileged services (i.e. those using the User= setting) the unit's
- control group will be made accessible to the relevant user.
+ Turns on delegation of further resource control partitioning to processes of the unit. Units
+ where this is enabled may create and manage their own private subhierarchy of control groups below
+ the control group of the unit itself. For unprivileged services (i.e. those using the
+ User= setting) the unit's control group will be made accessible to the relevant
+ user.
When enabled the service manager will refrain from manipulating control groups or moving
processes below the unit's control group, so that a clear concept of ownership is established: the
@@ -1188,6 +1189,29 @@ DeviceAllow=/dev/loop-control
+
+ DelegateSubgroup=
+
+
+ Place unit processes in the specified subgroup of the unit's control group. Takes a valid
+ control group name (not a path!) as parameter, or an empty string to turn this feature
+ off. Defaults to off. The control group name must be usable as filename and avoid conflicts with
+ the kernel's control group attribute files (i.e. cgroup.procs is not an
+ acceptable name, since the kernel exposes a native control group attribute file by that name). This
+ option has no effect unless control group delegation is turned on via Delegate=,
+ see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to
+ ExecStart=, but not for ExecReload= and similar. If
+ delegation is enabled, the latter are always placed inside a subgroup named
+ .control. The specified subgroup is automatically created (and potentially
+ ownership is passed to the unit's configured user/group) when a process is started in it.
+
+ This option is useful to avoid manually moving the invoked process into a subgroup after it
+ has been started. Since no processes should live in inner nodes of the control group tree it's
+ almost always necessary to run the main ("supervising") process of a unit that has delegation
+ turned on in a subgroup.
+
+
+
DisableControllers=
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index cd48183f7a..4ec5dfa587 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) {
cpu_set_reset(&c->startup_cpuset_cpus);
cpu_set_reset(&c->cpuset_mems);
cpu_set_reset(&c->startup_cpuset_mems);
+
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
}
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
@@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, managed_oom_preference_to_string(c->moom_preference),
prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch));
+ if (c->delegate_subgroup)
+ fprintf(f, "%sDelegateSubgroup: %s\n",
+ prefix, c->delegate_subgroup);
+
if (c->memory_pressure_threshold_usec != USEC_INFINITY)
fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index d445ea1e8d..bbbf9408cc 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -133,6 +133,7 @@ struct CGroupContext {
bool delegate;
CGroupMask delegate_controllers;
CGroupMask disable_controllers;
+ char *delegate_subgroup;
/* For unified hierarchy */
uint64_t cpu_weight;
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index 3a02fcbdb1..682ad5edd5 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+ SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
@@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property(
return 1;
+ } else if (streq(name, "DelegateSubgroup")) {
+ const char *s;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s) && cg_needs_escape(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (isempty(s))
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ else {
+ r = free_and_strdup_warn(&c->delegate_subgroup, s);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
+ }
+
+ return 1;
+
} else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
CGroupMask mask = 0;
diff --git a/src/core/execute.c b/src/core/execute.c
index 60f7a6439c..da7a37c187 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
return 0;
}
-static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
- bool using_subcgroup;
+static int exec_parameters_get_cgroup_path(
+ const ExecParameters *params,
+ const CGroupContext *c,
+ char **ret) {
+
+ const char *subgroup = NULL;
char *p;
assert(params);
@@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **
* this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
* flag, which is only passed for the former statements, not for the latter. */
- using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
- if (using_subcgroup)
- p = path_join(params->cgroup_path, ".control");
+ if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
+ if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+ subgroup = ".control";
+ else
+ subgroup = c->delegate_subgroup;
+ }
+
+ if (subgroup)
+ p = path_join(params->cgroup_path, subgroup);
else
p = strdup(params->cgroup_path);
if (!p)
return -ENOMEM;
*ret = p;
- return using_subcgroup;
+ return !!subgroup;
}
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
@@ -4705,7 +4715,7 @@ static int exec_child(
if (params->cgroup_path) {
_cleanup_free_ char *p = NULL;
- r = exec_parameters_get_cgroup_path(params, &p);
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
@@ -4880,11 +4890,26 @@ static int exec_child(
* touch a single hierarchy too. */
if (params->flags & EXEC_CGROUP_DELEGATE) {
+ _cleanup_free_ char *p = NULL;
+
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
}
+
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+ }
+ if (r > 0) {
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
+ }
+ }
}
if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
@@ -5635,13 +5660,13 @@ int exec_spawn(Unit *unit,
log_command_line(unit, "About to execute", command->path, command->argv);
if (params->cgroup_path) {
- r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
if (r > 0) { /* We are using a child cgroup */
r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
if (r < 0)
- return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+ return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
/* Normally we would not propagate the xattrs to children but since we created this
* sub-cgroup internally we should do it. */
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index 8a2823b075..110bccb7ad 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -237,6 +237,7 @@
{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting)
{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max)
{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context)
{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context)
{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting)
{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 581a051d46..ebfe98d7cc 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -4029,6 +4029,42 @@ int config_parse_delegate(
return 0;
}
+int config_parse_delegate_subgroup(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = ASSERT_PTR(data);
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t >= 0);
+
+ if (!unit_vtable[t]->can_delegate) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
+ return 0;
+ }
+
+ if (isempty(rvalue)) {
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ return 0;
+ }
+
+ if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&c->delegate_subgroup, rvalue);
+}
+
int config_parse_managed_oom_mode(
const char *unit,
const char *filename,
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index a38d697338..59f02a3207 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index ebbd1f7f28..a321179609 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
"ManagedOOMSwap",
"ManagedOOMMemoryPressure",
"ManagedOOMPreference",
- "MemoryPressureWatch"))
+ "MemoryPressureWatch",
+ "DelegateSubgroup"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {