diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index f60b3ba702..1870ed02bb 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2768,6 +2768,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly t ExecMainStartTimestampMonotonic = ...;
readonly t ExecMainExitTimestamp = ...;
readonly t ExecMainExitTimestampMonotonic = ...;
+ readonly t ExecMainHandoverTimestamp = ...;
+ readonly t ExecMainHandoverTimestampMonotonic = ...;
readonly u ExecMainPID = ...;
readonly i ExecMainCode = ...;
readonly i ExecMainStatus = ...;
@@ -4057,6 +4059,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
+
+
@@ -4706,12 +4712,17 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
ExecMainStartTimestamp, ExecMainStartTimestampMonotonic,
ExecMainExitTimestamp, ExecMainExitTimestampMonotonic,
+ ExecMainHandoverTimestamp, ExecMainHandoverTimestampMonotonic,
ExecMainPID, ExecMainCode, ExecMainStatus
- contain information about the main process of the service as far as it is known. This is often the same
- runtime information that is stored in ExecStart. However, it deviates for
- Type=forking services where the main process of the service is not forked off
- systemd directly. These fields either contain information of the last run of the process or of the
- current running process.
+ contain information about the main process of the service as far as it is known. The
+ ExecMainStartTimestamp timestamps record when the main child process is spawned by
+ the service manager. ExecMainExitTimestamp timestamps record when the main child
+ process exit has been detected by the service manager. ExecMainHandoverTimestamp
+ timestamps record when the service executable is executed by systemd-executor.
+ This is often the same runtime information that is stored in ExecStart=. However,
+ it deviates for Type=forking services where the main process of the service is not
+ forked off systemd directly. These fields either contain information of the last run of the process or
+ of the current running process.
MainPID and ControlPID contain the main and control PID of
the service. The main PID is the current main PID of the service and is 0 when the service currently
@@ -12055,8 +12066,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
MemoryZSwapCurrent were added in version 255.
EffectiveMemoryHigh,
EffectiveMemoryMax,
- EffectiveTasksMax, and
- MemoryZSwapWriteback were added in version 256.
+ EffectiveTasksMax,
+ MemoryZSwapWriteback,
+ ExecMainHandoverTimestampMonotonic, and
+ ExecMainHandoverTimestamp were added in version 256.
Socket Unit Objects
diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h
index 5926bdb4b1..a1205b0997 100644
--- a/src/core/dbus-execute.h
+++ b/src/core/dbus-execute.h
@@ -9,6 +9,7 @@
#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \
SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 32b48ececc..ff0fdbec5f 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -5265,12 +5265,14 @@ int exec_invoke(
log_command_line(context, params, "Executing", executable, final_argv);
if (params->exec_fd >= 0) {
- uint8_t hot = 1;
+ usec_t t = now(CLOCK_MONOTONIC);
/* We have finished with all our initializations. Let's now let the manager know that. From this point
- * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+ * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a
+ * timestamp so that the service manager and users can track the precise moment we handed
+ * over execution of the service to the kernel. */
- if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+ if (write(params->exec_fd, &t, sizeof(t)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
}
@@ -5279,7 +5281,7 @@ int exec_invoke(
r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
if (params->exec_fd >= 0) {
- uint8_t hot = 0;
+ uint64_t hot = 0;
/* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
* that POLLHUP on it no longer means execve() succeeded. */
diff --git a/src/core/execute.c b/src/core/execute.c
index 5a4acd0775..cab74fdcc1 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1866,6 +1866,11 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
"%sStart Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
+ if (dual_timestamp_is_set(&s->handover_timestamp))
+ fprintf(f,
+ "%sHandover Timestamp: %s\n",
+ prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime));
+
if (dual_timestamp_is_set(&s->exit_timestamp))
fprintf(f,
"%sExit Timestamp: %s\n"
diff --git a/src/core/execute.h b/src/core/execute.h
index c41f9cbec2..acca6e3152 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -91,6 +91,7 @@ typedef enum ExecKeyringMode {
struct ExecStatus {
dual_timestamp start_timestamp;
dual_timestamp exit_timestamp;
+ dual_timestamp handover_timestamp;
pid_t pid;
int code; /* as in siginfo_t::si_code */
int status; /* as in siginfo_t::si_status */
@@ -443,7 +444,9 @@ struct ExecParameters {
int stdout_fd;
int stderr_fd;
- /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+ /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It
+ * will also be used to send a timestamp taken as the very last operation before execve, for
+ * tracking purposes. */
int exec_fd;
char *notify_socket;
diff --git a/src/core/service.c b/src/core/service.c
index f6800c3170..366f52f3e0 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -24,6 +24,7 @@
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
+#include "io-util.h"
#include "load-dropin.h"
#include "load-fragment.h"
#include "log.h"
@@ -1675,7 +1676,8 @@ static int service_spawn_internal(
log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
}
- if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+ if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) &&
+ IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) {
r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
if (r < 0)
return r;
@@ -3008,6 +3010,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
(void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
(void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+ (void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp);
if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
(void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
@@ -3292,6 +3295,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
else if (streq(key, "main-exec-status-exit"))
deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+ else if (streq(key, "main-exec-status-handover"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp);
else if (streq(key, "notify-access-override")) {
NotifyAccess notify_access;
@@ -3520,21 +3525,22 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
* the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
* the child signalled us first that it is about to call the execve(). It does so by sending us a simple
* non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
- * sends a zero byte we'll ignore POLLHUP on the fd again. */
+ * sends a zero byte we'll ignore POLLHUP on the fd again.
+ * For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately
+ * before the target executable is execve'd, so that we can accurately track when control is handed over to the
+ * payload. */
for (;;) {
- uint8_t x;
+ uint64_t x = 0;
ssize_t n;
- n = read(fd, &x, sizeof(x));
- if (n < 0) {
- if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
- return 0;
-
- return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
- }
- if (n == 0) { /* EOF → the event we are waiting for */
-
+ n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false);
+ if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
+ return 0;
+ if (n < 0)
+ return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m");
+ if (n == 0) {
+ /* EOF → the event we are waiting for in case of Type=exec */
s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@@ -3550,13 +3556,37 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
return 0;
}
+ if (n == 1) {
+ /* Backward compatibility block: a single byte was read, which means somehow an old
+ * executor before this logic was introduced sent the notification, so there is no
+ * timestamp (reset it). */
- /* A byte was read → this turns on/off the exec fd logic */
- assert(n == sizeof(x));
- s->exec_fd_hot = x;
+ s->exec_fd_hot = x;
+ if (s->state == SERVICE_START)
+ s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
+
+ continue;
+ }
+
+ if (x == 0) {
+ /* If we get x=0 then the execve actually failed, which means the exec fd logic is
+ * now off. Also reset the exec timestamp, given it has no meaning anymore. */
+
+ s->exec_fd_hot = false;
+ if (s->state == SERVICE_START)
+ s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
+ } else {
+ /* A non-zero value was read, which means the exec fd logic is now enabled. Record
+ * the received timestamp so that users can track when control is handed over to the
+ * service payload. */
+
+ s->exec_fd_hot = true;
+ if (s->state == SERVICE_START) {
+ assert_cc(sizeof(uint64_t) == sizeof(usec_t));
+ dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x);
+ }
+ }
}
-
- return 0;
}
static void service_notify_cgroup_empty_event(Unit *u) {
diff --git a/test/units/testsuite-07.exec-timestamps.sh b/test/units/testsuite-07.exec-timestamps.sh
new file mode 100755
index 0000000000..799bc89e06
--- /dev/null
+++ b/test/units/testsuite-07.exec-timestamps.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# Check that timestamps of a Type=notify service are consistent
+
+systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || :
+
+start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service)
+handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service)
+active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service)
+exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service)
+
+[[ $start -le $handover ]]
+[[ $handover -le $active ]]
+[[ $active -le $exit ]]