diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index f60b3ba702..1870ed02bb 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2768,6 +2768,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly t ExecMainStartTimestampMonotonic = ...; readonly t ExecMainExitTimestamp = ...; readonly t ExecMainExitTimestampMonotonic = ...; + readonly t ExecMainHandoverTimestamp = ...; + readonly t ExecMainHandoverTimestampMonotonic = ...; readonly u ExecMainPID = ...; readonly i ExecMainCode = ...; readonly i ExecMainStatus = ...; @@ -4057,6 +4059,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4706,12 +4712,17 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { ExecMainStartTimestamp, ExecMainStartTimestampMonotonic, ExecMainExitTimestamp, ExecMainExitTimestampMonotonic, + ExecMainHandoverTimestamp, ExecMainHandoverTimestampMonotonic, ExecMainPID, ExecMainCode, ExecMainStatus - contain information about the main process of the service as far as it is known. This is often the same - runtime information that is stored in ExecStart. However, it deviates for - Type=forking services where the main process of the service is not forked off - systemd directly. These fields either contain information of the last run of the process or of the - current running process. + contain information about the main process of the service as far as it is known. The + ExecMainStartTimestamp timestamps record when the main child process is spawned by + the service manager. ExecMainExitTimestamp timestamps record when the main child + process exit has been detected by the service manager. ExecMainHandoverTimestamp + timestamps record when the service executable is executed by systemd-executor. + This is often the same runtime information that is stored in ExecStart=. However, + it deviates for Type=forking services where the main process of the service is not + forked off systemd directly. These fields either contain information of the last run of the process or + of the current running process. MainPID and ControlPID contain the main and control PID of the service. The main PID is the current main PID of the service and is 0 when the service currently @@ -12055,8 +12066,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemoryZSwapCurrent were added in version 255. EffectiveMemoryHigh, EffectiveMemoryMax, - EffectiveTasksMax, and - MemoryZSwapWriteback were added in version 256. + EffectiveTasksMax, + MemoryZSwapWriteback, + ExecMainHandoverTimestampMonotonic, and + ExecMainHandoverTimestamp were added in version 256. Socket Unit Objects diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h index 5926bdb4b1..a1205b0997 100644 --- a/src/core/dbus-execute.h +++ b/src/core/dbus-execute.h @@ -9,6 +9,7 @@ #define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \ SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 32b48ececc..ff0fdbec5f 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -5265,12 +5265,14 @@ int exec_invoke( log_command_line(context, params, "Executing", executable, final_argv); if (params->exec_fd >= 0) { - uint8_t hot = 1; + usec_t t = now(CLOCK_MONOTONIC); /* We have finished with all our initializations. Let's now let the manager know that. From this point - * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a + * timestamp so that the service manager and users can track the precise moment we handed + * over execution of the service to the kernel. */ - if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { + if (write(params->exec_fd, &t, sizeof(t)) < 0) { *exit_status = EXIT_EXEC; return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m"); } @@ -5279,7 +5281,7 @@ int exec_invoke( r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); if (params->exec_fd >= 0) { - uint8_t hot = 0; + uint64_t hot = 0; /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager * that POLLHUP on it no longer means execve() succeeded. */ diff --git a/src/core/execute.c b/src/core/execute.c index 5a4acd0775..cab74fdcc1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1866,6 +1866,11 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { "%sStart Timestamp: %s\n", prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime)); + if (dual_timestamp_is_set(&s->handover_timestamp)) + fprintf(f, + "%sHandover Timestamp: %s\n", + prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime)); + if (dual_timestamp_is_set(&s->exit_timestamp)) fprintf(f, "%sExit Timestamp: %s\n" diff --git a/src/core/execute.h b/src/core/execute.h index c41f9cbec2..acca6e3152 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -91,6 +91,7 @@ typedef enum ExecKeyringMode { struct ExecStatus { dual_timestamp start_timestamp; dual_timestamp exit_timestamp; + dual_timestamp handover_timestamp; pid_t pid; int code; /* as in siginfo_t::si_code */ int status; /* as in siginfo_t::si_status */ @@ -443,7 +444,9 @@ struct ExecParameters { int stdout_fd; int stderr_fd; - /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It + * will also be used to send a timestamp taken as the very last operation before execve, for + * tracking purposes. */ int exec_fd; char *notify_socket; diff --git a/src/core/service.c b/src/core/service.c index f6800c3170..366f52f3e0 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -24,6 +24,7 @@ #include "fd-util.h" #include "fileio.h" #include "format-util.h" +#include "io-util.h" #include "load-dropin.h" #include "load-fragment.h" #include "log.h" @@ -1675,7 +1676,8 @@ static int service_spawn_internal( log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); } - if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && + IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) { r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); if (r < 0) return r; @@ -3008,6 +3010,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp); if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); @@ -3292,6 +3295,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); else if (streq(key, "main-exec-status-exit")) deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); + else if (streq(key, "main-exec-status-handover")) + deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp); else if (streq(key, "notify-access-override")) { NotifyAccess notify_access; @@ -3520,21 +3525,22 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless * the child signalled us first that it is about to call the execve(). It does so by sending us a simple * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it - * sends a zero byte we'll ignore POLLHUP on the fd again. */ + * sends a zero byte we'll ignore POLLHUP on the fd again. + * For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately + * before the target executable is execve'd, so that we can accurately track when control is handed over to the + * payload. */ for (;;) { - uint8_t x; + uint64_t x = 0; ssize_t n; - n = read(fd, &x, sizeof(x)); - if (n < 0) { - if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ - return 0; - - return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); - } - if (n == 0) { /* EOF → the event we are waiting for */ - + n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false); + if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + if (n < 0) + return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m"); + if (n == 0) { + /* EOF → the event we are waiting for in case of Type=exec */ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ @@ -3550,13 +3556,37 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev return 0; } + if (n == 1) { + /* Backward compatibility block: a single byte was read, which means somehow an old + * executor before this logic was introduced sent the notification, so there is no + * timestamp (reset it). */ - /* A byte was read → this turns on/off the exec fd logic */ - assert(n == sizeof(x)); - s->exec_fd_hot = x; + s->exec_fd_hot = x; + if (s->state == SERVICE_START) + s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; + + continue; + } + + if (x == 0) { + /* If we get x=0 then the execve actually failed, which means the exec fd logic is + * now off. Also reset the exec timestamp, given it has no meaning anymore. */ + + s->exec_fd_hot = false; + if (s->state == SERVICE_START) + s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; + } else { + /* A non-zero value was read, which means the exec fd logic is now enabled. Record + * the received timestamp so that users can track when control is handed over to the + * service payload. */ + + s->exec_fd_hot = true; + if (s->state == SERVICE_START) { + assert_cc(sizeof(uint64_t) == sizeof(usec_t)); + dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x); + } + } } - - return 0; } static void service_notify_cgroup_empty_event(Unit *u) { diff --git a/test/units/testsuite-07.exec-timestamps.sh b/test/units/testsuite-07.exec-timestamps.sh new file mode 100755 index 0000000000..799bc89e06 --- /dev/null +++ b/test/units/testsuite-07.exec-timestamps.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eux +set -o pipefail + +# Check that timestamps of a Type=notify service are consistent + +systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || : + +start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service) +handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service) +active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service) +exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service) + +[[ $start -le $handover ]] +[[ $handover -le $active ]] +[[ $active -le $exit ]]