diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index f9aa67739f..01d8f659d5 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2028,6 +2028,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b DefaultDependencies = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b SurviveFinalKillSignal = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s OnSuccessJobMode = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s OnFailureJobMode = '...';
@@ -2142,6 +2144,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -2354,6 +2358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -11613,6 +11619,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
AccessSELinuxContext and
ActivationDetails were added in version 252.
QueueSignal() was added in version 254.
+ SurviveFinalKillSignal was added in version 255.
Service Unit Objects
diff --git a/man/systemd-soft-reboot.service.xml b/man/systemd-soft-reboot.service.xml
index f7908d3f03..1de2fbc5f3 100644
--- a/man/systemd-soft-reboot.service.xml
+++ b/man/systemd-soft-reboot.service.xml
@@ -96,12 +96,41 @@
The /run/ file system remains mounted and populated and may be
used to pass state information between such userspace reboot cycles.
- Service processes may continue to run over the transition, if they are placed in
- services that remain active until the very end of shutdown (which again is achieved via
- DefaultDependencies=no). They must also be set up to avoid being killed by the
- aforementioned SIGTERM spree (as per systemd and Storage Daemons for the Root File
- System).
+ Service processes may continue to run over the transition, past soft-reboot and into
+ the next session, if they are placed in services that remain active until the very end of shutdown
+ (which again is achieved via DefaultDependencies=no). They must also be set up to
+ avoid being killed by the aforementioned SIGTERM and SIGKILL
+ via SurviveFinalKillSignal=yes, and also be configured to avoid being stopped on
+ isolate via IgnoreOnIsolate=yes. They also have to be configured to be stopped on
+ normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after
+ basic.target to ensure correct ordering on boot. Note that in case any new or
+ custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will
+ also have to be configured manually for correct ordering and conflicting. For example:
+
+ [Unit]
+Description=My surviving service
+SurviveFinalKillSignal=yes
+IgnoreOnIsolate=yes
+DefaultDependencies=no
+After=basic.target
+Conflicts=reboot.target
+Before=reboot.target
+Conflicts=kexec.target
+Before=kexec.target
+Conflicts=poweroff.target
+Before=poweroff.target
+Conflicts=halt.target
+Before=halt.target
+Conflicts=rescue.target
+Before=rescue.target
+Conflicts=emergency.target
+Before=emergency.target
+
+[Service]
+Type=oneshot
+ExecStart=sleep infinity
+
+ File system mounts may remain mounted during the transition, and complex storage
attached, if configured to remain until the very end of the shutdown process. (Also achieved via
diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml
index fa867dba1a..3447391c9e 100644
--- a/man/systemd.unit.xml
+++ b/man/systemd.unit.xml
@@ -1023,6 +1023,20 @@
+
+ SurviveFinalKillSignal=
+
+ Takes a boolean argument. Defaults to no. If yes,
+ processes belonging to this unit will not be sent the final SIGTERM and
+ SIGKILL signals during the final phase of the system shutdown process.
+ This functionality replaces the older mechanism that allowed a program to set
+ argv[0][0] = '@' as described at
+ systemd and Storage Daemons for the Root File
+ System, which however continues to be supported.
+
+
+
+
CollectMode=
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 32c78a449b..e8f8ddc244 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -888,6 +888,7 @@ int cgroup_log_xattr_apply(Unit *u, const char *cgroup_path) {
static void cgroup_xattr_apply(Unit *u) {
bool b;
+ int r;
assert(u);
@@ -921,6 +922,32 @@ static void cgroup_xattr_apply(Unit *u) {
else
unit_remove_xattr_graceful(u, NULL, xn);
}
+
+ if (u->survive_final_kill_signal) {
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+ u->cgroup_path,
+ "user.survive_final_kill_signal",
+ "1",
+ 1,
+ /* flags= */ 0);
+ /* user xattr support was added in kernel v5.7 */
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+ u->cgroup_path,
+ "trusted.survive_final_kill_signal",
+ "1",
+ 1,
+ /* flags= */ 0);
+ if (r < 0)
+ log_unit_debug_errno(u,
+ r,
+ "Failed to set 'survive_final_kill_signal' xattr on control "
+ "group %s, ignoring: %m",
+ empty_to_root(u->cgroup_path));
+ } else {
+ unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "user.survive_final_kill_signal");
+ unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "trusted.survive_final_kill_signal");
+ }
}
static int lookup_block_device(const char *p, dev_t *ret) {
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
index e9b446945a..05b80cbf33 100644
--- a/src/core/dbus-unit.c
+++ b/src/core/dbus-unit.c
@@ -921,6 +921,7 @@ const sd_bus_vtable bus_unit_vtable[] = {
SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -2174,6 +2175,9 @@ static int bus_unit_set_transient_property(
if (streq(name, "DefaultDependencies"))
return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
+ if (streq(name, "SurviveFinalKillSignal"))
+ return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
+
if (streq(name, "OnSuccessJobMode"))
return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index 6cdf131975..77a0dce529 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -313,6 +313,7 @@ Unit.RefuseManualStart, config_parse_bool,
Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate)
Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies)
+Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal)
Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode)
Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode)
{# The following is a legacy alias name for compatibility #}
diff --git a/src/core/main.c b/src/core/main.c
index 4ee5e8735f..cbcf3ddeea 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -1766,7 +1766,8 @@ static void finish_remaining_processes(ManagerObjective objective) {
if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
- /* On soft reboot really make sure nothing is left */
+ /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
+ * of units that were configured with SurviveFinalKillSignal=yes. */
if (objective == MANAGER_SOFT_REBOOT)
broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
}
diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c
index f0030d3211..2851002615 100644
--- a/src/core/unit-serialize.c
+++ b/src/core/unit-serialize.c
@@ -826,6 +826,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
"%s\tRefuseManualStart: %s\n"
"%s\tRefuseManualStop: %s\n"
"%s\tDefaultDependencies: %s\n"
+ "%s\tSurviveFinalKillSignal: %s\n"
"%s\tOnSuccessJobMode: %s\n"
"%s\tOnFailureJobMode: %s\n"
"%s\tIgnoreOnIsolate: %s\n",
@@ -833,6 +834,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
prefix, yes_no(u->refuse_manual_start),
prefix, yes_no(u->refuse_manual_stop),
prefix, yes_no(u->default_dependencies),
+ prefix, yes_no(u->survive_final_kill_signal),
prefix, job_mode_to_string(u->on_success_job_mode),
prefix, job_mode_to_string(u->on_failure_job_mode),
prefix, yes_no(u->ignore_on_isolate));
diff --git a/src/core/unit.h b/src/core/unit.h
index f1a80cc891..ee466f351a 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -451,6 +451,9 @@ typedef struct Unit {
/* Create default dependencies */
bool default_dependencies;
+ /* Configure so that the unit survives a system transition without stopping/starting. */
+ bool survive_final_kill_signal;
+
/* Refuse manual starting, allow starting only indirectly via dependency. */
bool refuse_manual_start;
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index f7003df9f0..f88a4f5aab 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -2607,6 +2607,7 @@ static int bus_append_unit_property(sd_bus_message *m, const char *field, const
"RefuseManualStop",
"AllowIsolate",
"IgnoreOnIsolate",
+ "SurviveFinalKillSignal",
"DefaultDependencies"))
return bus_append_parse_boolean(m, field, eq);
diff --git a/src/shared/killall.c b/src/shared/killall.c
index 0b5a6642ec..66acba5b09 100644
--- a/src/shared/killall.c
+++ b/src/shared/killall.c
@@ -11,6 +11,7 @@
#include "alloc-util.h"
#include "constants.h"
#include "dirent-util.h"
+#include "errno-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "initrd-util.h"
@@ -22,10 +23,54 @@
#include "string-util.h"
#include "terminal-util.h"
-static bool ignore_proc(pid_t pid, bool warn_rootfs) {
+static bool argv_has_at(pid_t pid) {
_cleanup_fclose_ FILE *f = NULL;
const char *p;
char c = 0;
+
+ p = procfs_file_alloca(pid, "cmdline");
+ f = fopen(p, "re");
+ if (!f) {
+ log_debug_errno(errno, "Failed to open %s, ignoring: %m", p);
+ return true; /* not really, but has the desired effect */
+ }
+
+ /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
+ * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
+ * actual kernel threads are already filtered out above. */
+ (void) fread(&c, 1, 1, f);
+
+ /* Processes with argv[0][0] = '@' we ignore from the killing spree.
+ *
+ * https://systemd.io/ROOT_STORAGE_DAEMONS */
+ return c == '@';
+}
+
+static bool is_survivor_cgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup_path = NULL;
+ int r;
+
+ r = cg_pid_get_path(/* root= */ NULL, pid, &cgroup_path);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid);
+ return false;
+ }
+
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.survive_final_kill_signal");
+ /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */
+ if (ERRNO_IS_NEG_XATTR_ABSENT(r))
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER,
+ cgroup_path,
+ "trusted.survive_final_kill_signal");
+ if (r < 0)
+ log_debug_errno(r,
+ "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m",
+ cgroup_path);
+
+ return r > 0;
+}
+
+static bool ignore_proc(pid_t pid, bool warn_rootfs) {
uid_t uid;
int r;
@@ -38,6 +83,10 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
if (r != 0)
return true; /* also ignore processes where we can't determine this */
+ /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */
+ if (is_survivor_cgroup(pid))
+ return true;
+
r = get_process_uid(pid, &uid);
if (r < 0)
return true; /* not really, but better safe than sorry */
@@ -46,20 +95,7 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
if (uid != 0)
return false;
- p = procfs_file_alloca(pid, "cmdline");
- f = fopen(p, "re");
- if (!f)
- return true; /* not really, but has the desired effect */
-
- /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
- * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
- * actual kernel threads are already filtered out above. */
- (void) fread(&c, 1, 1, f);
-
- /* Processes with argv[0][0] = '@' we ignore from the killing spree.
- *
- * https://systemd.io/ROOT_STORAGE_DAEMONS */
- if (c != '@')
+ if (!argv_has_at(pid))
return false;
if (warn_rootfs &&
diff --git a/test/units/testsuite-82.sh b/test/units/testsuite-82.sh
index d13fe1b76f..24bd976b87 100755
--- a/test/units/testsuite-82.sh
+++ b/test/units/testsuite-82.sh
@@ -20,8 +20,8 @@ if [ -f /run/testsuite82.touch3 ]; then
read -r x <&5
test "$x" = "oinkoink"
- # Check that no service is still around
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# All succeeded, exit cleanly now
@@ -43,8 +43,8 @@ elif [ -f /run/testsuite82.touch2 ]; then
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Check that no service is still around
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# Test that we really are in the new overlayfs root fs
@@ -57,6 +57,9 @@ elif [ -f /run/testsuite82.touch2 ]; then
mount --bind /original-root /run/nextroot
mount
+ # Restart the unit that is not supposed to survive
+ systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
# Now issue the soft reboot. We should be right back soon.
touch /run/testsuite82.touch3
systemctl --no-block soft-reboot
@@ -85,8 +88,8 @@ elif [ -f /run/testsuite82.touch ]; then
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Check that no service survived, regardless of the configuration
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# This time we test the /run/nextroot/ root switching logic. (We synthesize a new rootfs from the old via overlayfs)
@@ -107,6 +110,9 @@ elif [ -f /run/testsuite82.touch ]; then
# Bind our current root into the target so that we later can return to it
mount --bind / /run/nextroot/original-root
+ # Restart the unit that is not supposed to survive
+ systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
# Now issue the soft reboot. We should be right back soon.
touch /run/testsuite82.touch2
systemctl --no-block soft-reboot
@@ -123,23 +129,17 @@ else
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Create a script that can survive the soft reboot by ignoring SIGTERM (we
- # do this instead of the argv[0][0] = '@' thing because that's so hard to
- # do from a shell
- T="/dev/shm/survive-$RANDOM.sh"
- cat >$T <