namespace: make setup_namespace() less crazy

Let's replace the ridiculous number of arguments with a structure, to
make this function less weird.

No change in behaviour, just some refactoring.
This commit is contained in:
Lennart Poettering
2023-10-10 21:36:50 +02:00
committed by Yu Watanabe
parent 22d7fb6646
commit 79d956db34
5 changed files with 332 additions and 426 deletions

View File

@@ -3138,7 +3138,6 @@ static int apply_mount_namespace(
*extension_dir = NULL, *host_os_release_stage = NULL;
const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
NamespaceInfo ns_info;
bool needs_sandboxing, setup_os_release_symlink;
BindMount *bind_mounts = NULL;
size_t n_bind_mounts = 0;
@@ -3180,10 +3179,9 @@ static int apply_mount_namespace(
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
if (needs_sandboxing) {
/* The runtime struct only contains the parent of the private /tmp,
* which is non-accessible to world users. Inside of it there's a /tmp
* that is sticky, and that's the one we want to use here.
* This does not apply when we are using /run/systemd/empty as fallback. */
/* The runtime struct only contains the parent of the private /tmp, which is non-accessible
* to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
* use here. This does not apply when we are using /run/systemd/empty as fallback. */
if (context->private_tmp && runtime && runtime->shared) {
if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
@@ -3196,39 +3194,10 @@ static int apply_mount_namespace(
else if (runtime->shared->var_tmp_dir)
var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
}
ns_info = (NamespaceInfo) {
.ignore_protect_paths = false,
.private_dev = context->private_devices,
.protect_control_groups = context->protect_control_groups,
.protect_kernel_tunables = context->protect_kernel_tunables,
.protect_kernel_modules = context->protect_kernel_modules,
.protect_kernel_logs = context->protect_kernel_logs,
.protect_hostname = context->protect_hostname,
.mount_apivfs = exec_context_get_effective_mount_apivfs(context),
.protect_home = context->protect_home,
.protect_system = context->protect_system,
.protect_proc = context->protect_proc,
.proc_subset = context->proc_subset,
.private_network = exec_needs_network_namespace(context),
.private_ipc = exec_needs_ipc_namespace(context),
/* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
.mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
};
} else if (!context->dynamic_user && root_dir)
/*
* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
* sandbox info, otherwise enforce it, don't ignore protected paths and
* fail if we are unable to apply the sandbox inside the mount namespace.
*/
ns_info = (NamespaceInfo) {
.ignore_protect_paths = true,
};
else
ns_info = (NamespaceInfo) {};
}
/* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
if (r < 0)
return r;
@@ -3287,44 +3256,81 @@ static int apply_mount_namespace(
return r;
}
r = setup_namespace(
root_dir,
root_image,
context->root_image_options,
context->root_image_policy ?: &image_policy_service,
&ns_info,
read_write_paths,
needs_sandboxing ? context->read_only_paths : NULL,
needs_sandboxing ? context->inaccessible_paths : NULL,
needs_sandboxing ? context->exec_paths : NULL,
needs_sandboxing ? context->no_exec_paths : NULL,
empty_directories,
symlinks,
bind_mounts,
n_bind_mounts,
context->temporary_filesystems,
context->n_temporary_filesystems,
context->mount_images,
context->n_mount_images,
context->mount_image_policy ?: &image_policy_service,
tmp_dir,
var_tmp_dir,
creds_path,
context->log_namespace,
context->mount_propagation_flag,
&verity,
context->extension_images,
context->n_extension_images,
context->extension_image_policy ?: &image_policy_sysext,
context->extension_directories,
propagate_dir,
incoming_dir,
extension_dir,
root_dir || root_image ? params->notify_socket : NULL,
host_os_release_stage,
params->runtime_scope,
error_path);
/* Everything setup_namespace() needs, gathered in one structure. The sandboxing knobs at the end are
 * only taken from the unit's context when needs_sandboxing is set; otherwise they stay at zero. */
NamespaceParameters parameters = {
        .runtime_scope = params->runtime_scope,

        .root_directory = root_dir,
        .root_image = root_image,
        .root_image_options = context->root_image_options,
        .root_image_policy = context->root_image_policy ?: &image_policy_service,

        .read_write_paths = read_write_paths,
        .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
        .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,

        .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
        .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,

        .empty_directories = empty_directories,
        .symlinks = symlinks,

        .bind_mounts = bind_mounts,
        .n_bind_mounts = n_bind_mounts,

        .temporary_filesystems = context->temporary_filesystems,
        .n_temporary_filesystems = context->n_temporary_filesystems,

        .mount_images = context->mount_images,
        .n_mount_images = context->n_mount_images,
        .mount_image_policy = context->mount_image_policy ?: &image_policy_service,

        .tmp_dir = tmp_dir,
        .var_tmp_dir = var_tmp_dir,

        .creds_path = creds_path,
        .log_namespace = context->log_namespace,
        .mount_propagation_flag = context->mount_propagation_flag,

        .verity = &verity,

        .extension_images = context->extension_images,
        .n_extension_images = context->n_extension_images,
        .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
        .extension_directories = context->extension_directories,

        .propagate_dir = propagate_dir,
        .incoming_dir = incoming_dir,
        .extension_dir = extension_dir,
        /* The notify socket is only handed over when a root directory or root image is in use. */
        .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
        .host_os_release_stage = host_os_release_stage,

        /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
         * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
         * sandbox inside the mount namespace. */
        .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,

        .protect_control_groups = needs_sandboxing && context->protect_control_groups,
        .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
        .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
        .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
        .protect_hostname = needs_sandboxing && context->protect_hostname,

        .private_dev = needs_sandboxing && context->private_devices,
        .private_network = needs_sandboxing && exec_needs_network_namespace(context),
        .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),

        .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),

        /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
        .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),

        /* The following four members are enums, not booleans. Select the configured value with ?:
         * instead of combining it with "needs_sandboxing &&": the logical AND only ever yields 0 or
         * 1 and would therefore collapse every setting above 1 into 1. Without sandboxing they are
         * left at 0, i.e. the value a zero-initialized structure carries. */
        .protect_home = needs_sandboxing ? context->protect_home : 0,
        .protect_system = needs_sandboxing ? context->protect_system : 0,
        .protect_proc = needs_sandboxing ? context->protect_proc : 0,
        .proc_subset = needs_sandboxing ? context->proc_subset : 0,
};
r = setup_namespace(&parameters, error_path);
/* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
* that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
* sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@
Copyright © 2016 Djalal Harouni
***/
typedef struct NamespaceInfo NamespaceInfo;
typedef struct NamespaceParameters NamespaceParameters;
typedef struct BindMount BindMount;
typedef struct TemporaryFileSystem TemporaryFileSystem;
typedef struct MountImage MountImage;
@@ -53,24 +53,6 @@ typedef enum ProcSubset {
_PROC_SUBSET_INVALID = -EINVAL,
} ProcSubset;
struct NamespaceInfo {
bool ignore_protect_paths;
bool private_dev;
bool protect_control_groups;
bool protect_kernel_tunables;
bool protect_kernel_modules;
bool protect_kernel_logs;
bool mount_apivfs;
bool protect_hostname;
bool private_network;
bool private_ipc;
bool mount_nosuid;
ProtectHome protect_home;
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
};
struct BindMount {
char *source;
char *destination;
@@ -100,43 +82,77 @@ struct MountImage {
MountImageType type;
};
int setup_namespace(
const char *root_directory,
const char *root_image,
const MountOptions *root_image_options,
const ImagePolicy *root_image_policy,
const NamespaceInfo *ns_info,
char **read_write_paths,
char **read_only_paths,
char **inaccessible_paths,
char **exec_paths,
char **no_exec_paths,
char **empty_directories,
char **symlinks,
const BindMount *bind_mounts,
size_t n_bind_mounts,
const TemporaryFileSystem *temporary_filesystems,
size_t n_temporary_filesystems,
const MountImage *mount_images,
size_t n_mount_images,
const ImagePolicy *mount_image_policy,
const char *tmp_dir,
const char *var_tmp_dir,
const char *creds_path,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
const MountImage *extension_images,
size_t n_extension_images,
const ImagePolicy *extension_image_policy,
char **extension_directories,
const char *propagate_dir,
const char *incoming_dir,
const char *extension_dir,
const char *notify_socket,
const char *host_os_release_stage,
RuntimeScope scope,
char **error_path);
/* Collects every input of setup_namespace() in one structure that is handed over by pointer. The
 * callers visible alongside this declaration fill it with designated initializers, so any member
 * they leave out is zero/NULL. */
struct NamespaceParameters {
RuntimeScope runtime_scope;
/* Root directory / root image selection, plus options and policy for the image. */
const char *root_directory;
const char *root_image;
const MountOptions *root_image_options;
const ImagePolicy *root_image_policy;
/* Path lists. The test caller passes NULL-terminated string arrays cast to char**. */
char **read_write_paths;
char **read_only_paths;
char **inaccessible_paths;
char **exec_paths;
char **no_exec_paths;
char **empty_directories;
char **symlinks;
/* Each array below is paired with an n_* member holding its element count. */
const BindMount *bind_mounts;
size_t n_bind_mounts;
const TemporaryFileSystem *temporary_filesystems;
size_t n_temporary_filesystems;
const MountImage *mount_images;
size_t n_mount_images;
const ImagePolicy *mount_image_policy;
const char *tmp_dir;
const char *var_tmp_dir;
const char *creds_path;
const char *log_namespace;
unsigned long mount_propagation_flag;
/* NOTE(review): the only non-const object pointer here, so the callee may write through it — confirm. */
VeritySettings *verity;
/* Extension images: array plus element count, with their own image policy. */
const MountImage *extension_images;
size_t n_extension_images;
const ImagePolicy *extension_image_policy;
char **extension_directories;
const char *propagate_dir;
const char *incoming_dir;
const char *extension_dir;
const char *notify_socket;
const char *host_os_release_stage;
/* Boolean switches; false unless the caller sets them. */
bool ignore_protect_paths;
bool protect_control_groups;
bool protect_kernel_tunables;
bool protect_kernel_modules;
bool protect_kernel_logs;
bool protect_hostname;
bool private_dev;
bool private_network;
bool private_ipc;
bool mount_apivfs;
bool mount_nosuid;
/* Enum-valued settings: assign the enum constant itself (e.g. PROTECT_PROC_NOACCESS,
 * PROC_SUBSET_PID), never a truth value. */
ProtectHome protect_home;
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
};
/* Sets up the namespace described by *p. The visible callers treat 0 as success and a negative
 * return value as an errno-style failure. error_path may be NULL (the tests pass NULL).
 * NOTE(review): presumably error_path receives the offending path on failure — confirm against
 * the implementation. */
int setup_namespace(const NamespaceParameters *p, char **error_path);
#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"

View File

@@ -149,11 +149,12 @@ TEST(ipcns) {
}
TEST(protect_kernel_logs) {
int r;
pid_t pid;
static const NamespaceInfo ns_info = {
static const NamespaceParameters p = {
.runtime_scope = RUNTIME_SCOPE_SYSTEM,
.protect_kernel_logs = true,
};
pid_t pid;
int r;
if (geteuid() > 0) {
(void) log_tests_skipped("not root");
@@ -175,39 +176,7 @@ TEST(protect_kernel_logs) {
fd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC);
assert_se(fd > 0);
r = setup_namespace(NULL,
NULL,
NULL,
NULL,
&ns_info,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL, 0,
NULL, 0,
NULL, 0,
NULL,
NULL,
NULL,
NULL,
NULL,
0,
NULL,
NULL,
0,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
RUNTIME_SCOPE_SYSTEM,
NULL);
r = setup_namespace(&p, NULL);
assert_se(r == 0);
assert_se(setresuid(UID_NOBODY, UID_NOBODY, UID_NOBODY) >= 0);

View File

@@ -44,13 +44,15 @@ int main(int argc, char *argv[]) {
NULL
};
static const NamespaceInfo ns_info = {
.private_dev = true,
.protect_control_groups = true,
.protect_kernel_tunables = true,
.protect_kernel_modules = true,
.protect_proc = PROTECT_PROC_NOACCESS,
.proc_subset = PROC_SUBSET_PID,
static const BindMount bind_mount = {
.source = (char*) "/usr/bin",
.destination = (char*) "/etc/systemd",
.read_only = true,
};
static const TemporaryFileSystem tmpfs = {
.path = (char*) "/var",
.options = (char*) "ro",
};
char *root_directory;
@@ -76,40 +78,36 @@ int main(int argc, char *argv[]) {
else
log_info("Not chrooted");
r = setup_namespace(root_directory,
NULL,
NULL,
NULL,
&ns_info,
(char **) writable,
(char **) readonly,
(char **) inaccessible,
NULL,
(char **) exec,
(char **) no_exec,
NULL,
&(BindMount) { .source = (char*) "/usr/bin", .destination = (char*) "/etc/systemd", .read_only = true }, 1,
&(TemporaryFileSystem) { .path = (char*) "/var", .options = (char*) "ro" }, 1,
NULL,
0,
NULL,
tmp_dir,
var_tmp_dir,
NULL,
NULL,
0,
NULL,
NULL,
0,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
RUNTIME_SCOPE_SYSTEM,
NULL);
/* Parameters for the namespace under test; members are listed in the order in which
 * struct NamespaceParameters declares them, everything not mentioned stays zero/NULL. */
NamespaceParameters p = {
        .runtime_scope = RUNTIME_SCOPE_SYSTEM,
        .root_directory = root_directory,

        /* Path lists defined as constant string arrays further up. */
        .read_write_paths = (char**) writable,
        .read_only_paths = (char**) readonly,
        .inaccessible_paths = (char**) inaccessible,
        .exec_paths = (char**) exec,
        .no_exec_paths = (char**) no_exec,

        /* Exactly one bind mount and one tmpfs, each taken from a static object. */
        .bind_mounts = &bind_mount,
        .n_bind_mounts = 1,
        .temporary_filesystems = &tmpfs,
        .n_temporary_filesystems = 1,

        .tmp_dir = tmp_dir,
        .var_tmp_dir = var_tmp_dir,

        /* Sandboxing switches exercised by this test. */
        .protect_control_groups = true,
        .protect_kernel_tunables = true,
        .protect_kernel_modules = true,
        .private_dev = true,
        .protect_proc = PROTECT_PROC_NOACCESS,
        .proc_subset = PROC_SUBSET_PID,
};
r = setup_namespace(&p, NULL);
if (r < 0) {
log_error_errno(r, "Failed to set up namespace: %m");