Merge patch series "fs: lockless mntns lookup"

Christian Brauner <brauner@kernel.org> says:

Currently we take the read lock when looking for a mount namespace to
list mounts in. We can make this lockless. The simple search case can
just use a sequence counter to detect concurrent changes to the rbtree.

For walking the list of mount namespaces sequentially via nsfs we keep a
separate rcu list as rb_prev() and rb_next() aren't usable safely with
rcu.

Since creating mount namespaces is a relatively rare event compared with
querying mounts in a foreign mount namespace this is worth it. Once
libmount and systemd pick up this mechanism to list mounts in foreign
mount namespaces this will be used very frequently.

* patches from https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-0-6e3cdaf9b280@kernel.org:
  samples: add test-list-all-mounts
  selftests: remove unneeded include
  selftests: add tests for mntns iteration
  selftests: move nsfs into filesystems subfolder
  fs: simplify rwlock to spinlock
  fs: lockless mntns lookup for nsfs
  rculist: add list_bidir_{del,prev}_rcu()
  fs: lockless mntns rbtree lookup
  fs: add mount namespace to rbtree late
  mount: remove include/nospec.h include

Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-0-6e3cdaf9b280@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner
2024-12-13 20:31:52 +01:00
14 changed files with 540 additions and 80 deletions

View File

@@ -12,11 +12,15 @@ struct mnt_namespace {
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
union {
wait_queue_head_t poll;
struct rcu_head mnt_ns_rcu;
};
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mount namespaces */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -157,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
{
return __lookup_next_mnt_ns(mntns, false);
}
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
{
return __lookup_next_mnt_ns(mntns, true);
}
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
bool previous);
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);

View File

@@ -32,7 +32,6 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/nospec.h>
#include "pnode.h"
#include "internal.h"
@@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
struct mount_kattr {
unsigned int attr_set;
@@ -106,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
u64 seq_b = ns->seq;
if (seq < seq_b)
return -1;
if (seq > seq_b)
return 1;
return 0;
}
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
if (!node)
@@ -124,24 +114,53 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}
static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq;
u64 seq_b = ns_b->seq;
return mnt_ns_cmp(seq_a, ns_b) < 0;
if (seq_a < seq_b)
return -1;
if (seq_a > seq_b)
return 1;
return 0;
}
static inline void mnt_ns_tree_write_lock(void)
{
write_seqlock(&mnt_ns_tree_lock);
}
static inline void mnt_ns_tree_write_unlock(void)
{
write_sequnlock(&mnt_ns_tree_lock);
}
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
guard(write_lock)(&mnt_ns_tree_lock);
rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
struct rb_node *node, *prev;
mnt_ns_tree_write_lock();
node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
/*
* If there's no previous entry simply add it after the
* head and if there is add it after the previous entry.
*/
prev = rb_prev(&ns->mnt_ns_tree_node);
if (!prev)
list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
else
list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
mnt_ns_tree_write_unlock();
WARN_ON_ONCE(node);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
lockdep_assert_not_held(&mnt_ns_tree_lock);
lockdep_assert_not_held(&mnt_ns_tree_lock.lock);
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
@@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
if (!is_anon_ns(ns)) {
guard(write_lock)(&mnt_ns_tree_lock);
mnt_ns_tree_write_lock();
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
list_bidir_del_rcu(&ns->mnt_ns_list);
mnt_ns_tree_write_unlock();
}
mnt_ns_release(ns);
call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}
/*
* Returns the mount namespace which either has the specified id, or has the
 * next smallest id after the specified one.
*/
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
static int mnt_ns_find(const void *key, const struct rb_node *node)
{
struct rb_node *node = mnt_ns_tree.rb_node;
struct mnt_namespace *ret = NULL;
const u64 mnt_ns_id = *(u64 *)key;
const struct mnt_namespace *ns = node_to_mnt_ns(node);
lockdep_assert_held(&mnt_ns_tree_lock);
while (node) {
struct mnt_namespace *n = node_to_mnt_ns(node);
if (mnt_ns_id <= n->seq) {
ret = node_to_mnt_ns(node);
if (mnt_ns_id == n->seq)
break;
node = node->rb_left;
} else {
node = node->rb_right;
}
}
return ret;
if (mnt_ns_id < ns->seq)
return -1;
if (mnt_ns_id > ns->seq)
return 1;
return 0;
}
/*
@@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
* namespace the @namespace_sem must first be acquired. If the namespace has
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
* see that the mount rbtree of the namespace is empty.
*
 * Note the lookup is lockless, protected by a sequence counter. We only
* need to guard against false negatives as false positives aren't
* possible. So if we didn't find a mount namespace and the sequence
* counter has changed we need to retry. If the sequence counter is
* still the same we know the search actually failed.
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
struct mnt_namespace *ns;
struct mnt_namespace *ns;
struct rb_node *node;
unsigned int seq;
guard(read_lock)(&mnt_ns_tree_lock);
ns = mnt_ns_find_id_at(mnt_ns_id);
if (!ns || ns->seq != mnt_ns_id)
return NULL;
guard(rcu)();
do {
seq = read_seqbegin(&mnt_ns_tree_lock);
node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
if (node)
break;
} while (read_seqretry(&mnt_ns_tree_lock, seq));
refcount_inc(&ns->passive);
return ns;
if (!node)
return NULL;
/*
* The last reference count is put with RCU delay so we can
 * unconditionally acquire a reference here.
*/
ns = node_to_mnt_ns(node);
refcount_inc(&ns->passive);
return ns;
}
static inline void lock_mount_hash(void)
@@ -2063,30 +2094,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
return &mnt->ns;
}
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
guard(read_lock)(&mnt_ns_tree_lock);
guard(rcu)();
for (;;) {
struct rb_node *node;
struct list_head *list;
if (previous)
node = rb_prev(&mntns->mnt_ns_tree_node);
list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
else
node = rb_next(&mntns->mnt_ns_tree_node);
if (!node)
list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
if (list_is_head(list, &mnt_ns_list))
return ERR_PTR(-ENOENT);
mntns = node_to_mnt_ns(node);
node = &mntns->mnt_ns_tree_node;
mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
/*
* The last passive reference count is put with RCU
* delay so accessing the mount namespace is not just
* safe but all relevant members are still valid.
*/
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
continue;
/*
* Holding mnt_ns_tree_lock prevents the mount namespace from
* being freed but it may well be on it's deathbed. We want an
* active reference, not just a passive one here as we're
* persisting the mount namespace.
* We need an active reference count as we're persisting
* the mount namespace and it might already be on its
* deathbed.
*/
if (!refcount_inc_not_zero(&mntns->ns.count))
continue;
@@ -3903,6 +3938,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
INIT_LIST_HEAD(&new_ns->mnt_ns_list);
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
@@ -3982,7 +4018,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -3990,6 +4025,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
mnt_ns_tree_add(new_ns);
return new_ns;
}

View File

@@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (usize < MNT_NS_INFO_SIZE_VER0)
return -EINVAL;
if (previous)
mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
else
mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
if (IS_ERR(mnt_ns))
return PTR_ERR(mnt_ns);

View File

@@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
* way, we must not access it directly
*/
#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
/*
* Return the ->prev pointer of a list_head in an rcu safe way. Don't
* access it directly.
*
* Any list traversed with list_bidir_prev_rcu() must never use
* list_del_rcu(). Doing so will poison the ->prev pointer that
* list_bidir_prev_rcu() relies on, which will result in segfaults.
* To prevent these segfaults, use list_bidir_del_rcu() instead
* of list_del_rcu().
*/
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))
/**
* list_tail_rcu - returns the prev pointer of the head of the list
@@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry)
entry->prev = LIST_POISON2;
}
/**
 * list_bidir_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * In contrast to list_del_rcu() doesn't poison the prev pointer thus
 * allowing backwards traversal via list_bidir_prev_rcu().
 *
 * Note: list_empty() on entry does not return true after this because
 * the entry is in a special undefined state that permits RCU-based
 * lockfree reverse traversal. In particular this means that we cannot
 * poison the forward and backwards pointers that may still be used for
 * walking the list.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another list-mutation
 * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
 * this same list. However, it is perfectly legal to run concurrently
 * with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
 * the same list.
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry. Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_bidir_del_rcu(struct list_head *entry)
{
	/* Unlinks neighbours but leaves ->prev/->next unpoisoned for RCU readers. */
	__list_del_entry(entry);
}
/**
* hlist_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list.

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
/test-fsmount
/test-list-all-mounts
/test-statx
/mountinfo

View File

@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
userprogs-always-y += test-fsmount test-statx mountinfo
userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts
userccflags += -I usr/include

View File

@@ -0,0 +1,235 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
#define _GNU_SOURCE
#include <errno.h>
#include <limits.h>
#include <linux/types.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include "../../tools/testing/selftests/pidfd/pidfd.h"
#define die_errno(format, ...) \
do { \
fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \
__LINE__, __func__, ##__VA_ARGS__); \
exit(EXIT_FAILURE); \
} while (0)
/* Get the id for a mount namespace */
#define NS_GET_MNTNS_ID _IO(0xb7, 0x5)
/* Get next mount namespace. */
/* Answer structure for the NS_MNT_GET_* ioctls below. */
struct mnt_ns_info {
	__u32 size;		/* struct size; see MNT_NS_INFO_SIZE_VER0 */
	__u32 nr_mounts;	/* number of mounts in the namespace */
	__u64 mnt_ns_id;	/* unique id of the mount namespace */
};
#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
/* Get information about namespace. */
#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
/* Get next namespace. */
#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
#ifndef __NR_listmount
#define __NR_listmount 458
#endif
#ifndef __NR_statmount
#define __NR_statmount 457
#endif
/* @mask bits for statmount(2) */
#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */
#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */
#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */
#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */
#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
/*
 * Buffer filled by statmount(2). String-valued fields (fs_type, mnt_root,
 * mnt_point, mnt_opts) are byte offsets into the trailing str[] array,
 * as used below via e.g. stmnt->str + stmnt->fs_type.
 */
struct statmount {
	__u32 size;		/* total size of the filled-in buffer */
	__u32 mnt_opts;		/* offset of mount options string in str[] */
	__u64 mask;		/* STATMOUNT_* bits actually returned */
	__u32 sb_dev_major;
	__u32 sb_dev_minor;
	__u64 sb_magic;
	__u32 sb_flags;
	__u32 fs_type;		/* offset of filesystem type string in str[] */
	__u64 mnt_id;		/* unique mount id */
	__u64 mnt_parent_id;	/* unique id of the parent mount */
	__u32 mnt_id_old;
	__u32 mnt_parent_id_old;
	__u64 mnt_attr;
	__u64 mnt_propagation;
	__u64 mnt_peer_group;
	__u64 mnt_master;
	__u64 propagate_from;
	__u32 mnt_root;		/* offset of root-of-mount path in str[] */
	__u32 mnt_point;	/* offset of mount point path in str[] */
	__u64 mnt_ns_id;	/* id of the mount namespace the mount lives in */
	__u64 __spare2[49];
	char str[];		/* variable-length string area */
};
/* Request argument shared by statmount(2) and listmount(2). */
struct mnt_id_req {
	__u32 size;		/* sizeof this struct; versioned, see MNT_ID_REQ_SIZE_VER1 */
	__u32 spare;
	__u64 mnt_id;		/* mount to operate on (LSMT_ROOT for the root mount) */
	__u64 param;		/* statmount(): STATMOUNT_* mask; listmount(): last seen mnt id */
	__u64 mnt_ns_id;	/* mount namespace to look in */
};
#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
#define LSMT_ROOT 0xffffffffffffffff /* root mount */
/*
 * Thin wrapper around the raw statmount(2) syscall: builds the versioned
 * request structure and forwards the user-supplied buffer unchanged.
 * Returns the raw syscall result (0 on success, -1 with errno on failure).
 */
static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
		       struct statmount *stmnt, size_t bufsize,
		       unsigned int flags)
{
	/* Designated initializer zeroes all unset members, including spare. */
	struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER1 };

	req.mnt_id	= mnt_id;
	req.param	= mask;
	req.mnt_ns_id	= mnt_ns_id;

	return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
}
/*
 * Call statmount(2) with an automatically grown buffer.
 *
 * Starts at 32KiB and doubles on EOVERFLOW until the result fits or the
 * size would reach UINT_MAX / 2. Returns a malloc'd buffer the caller
 * must free(), or NULL on allocation/syscall failure.
 */
static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id,
				       __u64 mask, unsigned int flags)
{
	size_t cap = 1 << 15;
	struct statmount *buf = NULL;

	while (cap < UINT_MAX / 2) {
		struct statmount *grown = realloc(buf, cap);

		if (!grown)
			break;
		buf = grown;

		if (!__statmount(mnt_id, mnt_ns_id, mask, buf, cap, flags))
			return buf;
		/* Anything other than "buffer too small" is a hard failure. */
		if (errno != EOVERFLOW)
			break;
		cap <<= 1;
	}

	free(buf);
	return NULL;
}
static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id,
__u64 list[], size_t num, unsigned int flags)
{
struct mnt_id_req req = {
.size = MNT_ID_REQ_SIZE_VER1,
.mnt_id = mnt_id,
.param = last_mnt_id,
.mnt_ns_id = mnt_ns_id,
};
return syscall(__NR_listmount, &req, list, num, flags);
}
/*
 * List every mount in every mount namespace reachable from our own:
 * resolve our mntns through a pidfd, then walk the namespaces with
 * NS_MNT_GET_NEXT, calling listmount(2)/statmount(2) in each.
 *
 * Fixes vs. the original: the error message after the NS_MNT_GET_INFO
 * ioctl wrongly named NS_GET_MNTNS_ID; the inner loop compared a size_t
 * against the ssize_t mount count; the "goto next" was simply a continue.
 */
int main(int argc, char *argv[])
{
#define LISTMNT_BUFFER 10
	__u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
	int ret, pidfd, fd_mntns;
	struct mnt_ns_info info = {};

	/* Get an fd for our own mount namespace via a pidfd. */
	pidfd = sys_pidfd_open(getpid(), 0);
	if (pidfd < 0)
		die_errno("pidfd_open failed");

	fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
	if (fd_mntns < 0)
		die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");

	ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
	if (ret < 0)
		die_errno("ioctl(NS_MNT_GET_INFO) failed");

	printf("Listing %u mounts for mount namespace %llu\n",
	       info.nr_mounts, info.mnt_ns_id);
	for (;;) {
		ssize_t nr_mounts;

		/* Page through the namespace, LISTMNT_BUFFER ids at a time. */
		nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
					  info.mnt_ns_id, list, LISTMNT_BUFFER,
					  0);
		if (nr_mounts <= 0) {
			int fd_mntns_next;

			printf("Finished listing %u mounts for mount namespace %llu\n\n",
			       info.nr_mounts, info.mnt_ns_id);
			/* Advance to the next namespace; ENOENT means we're done. */
			fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info);
			if (fd_mntns_next < 0) {
				if (errno == ENOENT) {
					printf("Finished listing all mount namespaces\n");
					exit(0);
				}
				die_errno("ioctl(NS_MNT_GET_NEXT) failed");
			}
			close(fd_mntns);
			fd_mntns = fd_mntns_next;
			last_mnt_id = 0;
			printf("Listing %u mounts for mount namespace %llu\n",
			       info.nr_mounts, info.mnt_ns_id);
			continue;
		}

		/* nr_mounts > 0 here, so ssize_t avoids a sign-compare warning. */
		for (ssize_t cur = 0; cur < nr_mounts; cur++) {
			struct statmount *stmnt;

			last_mnt_id = list[cur];
			stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id,
					      STATMOUNT_SB_BASIC |
					      STATMOUNT_MNT_BASIC |
					      STATMOUNT_MNT_ROOT |
					      STATMOUNT_MNT_POINT |
					      STATMOUNT_MNT_NS_ID |
					      STATMOUNT_MNT_OPTS |
					      STATMOUNT_FS_TYPE, 0);
			if (!stmnt) {
				printf("Failed to statmount(%llu) in mount namespace(%llu)\n",
				       last_mnt_id, info.mnt_ns_id);
				continue;
			}
			printf("mnt_id:\t\t%llu\nmnt_parent_id:\t%llu\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n",
			       stmnt->mnt_id,
			       stmnt->mnt_parent_id,
			       stmnt->str + stmnt->fs_type,
			       stmnt->str + stmnt->mnt_root,
			       stmnt->str + stmnt->mnt_point,
			       stmnt->str + stmnt->mnt_opts);
			free(stmnt);
		}
	}

	exit(0);
}

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
owner
pidns
iterate_mntns

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
TEST_GEN_PROGS := owner pidns
TEST_GEN_PROGS := owner pidns iterate_mntns
CFLAGS := -Wall -Werror
include ../lib.mk
include ../../lib.mk

View File

@@ -0,0 +1,149 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <unistd.h>
#include "../../kselftest_harness.h"
#define MNT_NS_COUNT 11
#define MNT_NS_LAST_INDEX 10
/* Answer structure for the NS_MNT_GET_* ioctls below. */
struct mnt_ns_info {
	__u32 size;		/* struct size; see MNT_NS_INFO_SIZE_VER0 */
	__u32 nr_mounts;	/* number of mounts in the namespace */
	__u64 mnt_ns_id;	/* unique id of the mount namespace */
};
#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
/* Get information about namespace. */
#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
/* Get next namespace. */
#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
FIXTURE(iterate_mount_namespaces) {
	int fd_mnt_ns[MNT_NS_COUNT];	/* one fd per mount namespace created in setup */
	__u64 mnt_ns_id[MNT_NS_COUNT];	/* namespace id matching each fd above */
};
/* Create MNT_NS_COUNT fresh mount namespaces and record their fds and ids. */
FIXTURE_SETUP(iterate_mount_namespaces)
{
	/* Mark every slot unused so teardown can skip fds we never opened. */
	for (int i = 0; i < MNT_NS_COUNT; i++)
		self->fd_mnt_ns[i] = -EBADF;

	/*
	 * Creating a new user namespace lets us guarantee that we only see
	 * mount namespaces that we did actually create.
	 */
	ASSERT_EQ(unshare(CLONE_NEWUSER), 0);

	for (int i = 0; i < MNT_NS_COUNT; i++) {
		struct mnt_ns_info info = {};

		/* Each unshare(CLONE_NEWNS) creates one more mount namespace. */
		ASSERT_EQ(unshare(CLONE_NEWNS), 0);
		self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
		ASSERT_GE(self->fd_mnt_ns[i], 0);
		/* Remember the id so the tests below can verify iteration order. */
		ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0);
		self->mnt_ns_id[i] = info.mnt_ns_id;
	}
}
/* Close every namespace fd the setup managed to open. */
FIXTURE_TEARDOWN(iterate_mount_namespaces)
{
	for (int i = 0; i < MNT_NS_COUNT; i++) {
		/* Slots still holding -EBADF were never opened. */
		if (self->fd_mnt_ns[i] < 0)
			continue;
		ASSERT_EQ(close(self->fd_mnt_ns[i]), 0);
	}
}
/* Walk forward from the first created mntns until NS_MNT_GET_NEXT hits ENOENT. */
TEST_F(iterate_mount_namespaces, iterate_all_forward)
{
	int fd_mnt_ns_cur, count = 0;

	/* Duplicate the fixture fd so the loop never closes fd_mnt_ns[0] itself. */
	fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC);
	ASSERT_GE(fd_mnt_ns_cur, 0);

	for (;; count++) {
		struct mnt_ns_info info = {};
		int fd_mnt_ns_next;

		/* ENOENT signals the end of the sequential namespace list. */
		fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
		if (fd_mnt_ns_next < 0 && errno == ENOENT)
			break;
		ASSERT_GE(fd_mnt_ns_next, 0);
		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
		fd_mnt_ns_cur = fd_mnt_ns_next;
	}
	/* Starting at index 0, exactly MNT_NS_LAST_INDEX forward steps remain. */
	ASSERT_EQ(count, MNT_NS_LAST_INDEX);
}
/* Walk backward from the last created mntns until NS_MNT_GET_PREV hits ENOENT. */
TEST_F(iterate_mount_namespaces, iterate_all_backwards)
{
	int fd_mnt_ns_cur, count = 0;

	/* Duplicate the fixture fd so the loop never closes the fixture's copy. */
	fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC);
	ASSERT_GE(fd_mnt_ns_cur, 0);

	for (;; count++) {
		struct mnt_ns_info info = {};
		int fd_mnt_ns_prev;

		/* ENOENT signals we walked past the head of the namespace list. */
		fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
		if (fd_mnt_ns_prev < 0 && errno == ENOENT)
			break;
		ASSERT_GE(fd_mnt_ns_prev, 0);
		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
		fd_mnt_ns_cur = fd_mnt_ns_prev;
	}
	/* Starting at the last index, exactly MNT_NS_LAST_INDEX backward steps remain. */
	ASSERT_EQ(count, MNT_NS_LAST_INDEX);
}
/* Iterate forward and verify the ids come back in creation order. */
TEST_F(iterate_mount_namespaces, iterate_forward)
{
	int fd_mnt_ns_cur;

	/* Enter the first namespace so iteration starts from a known point. */
	ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0);

	fd_mnt_ns_cur = self->fd_mnt_ns[0];
	for (int i = 1; i < MNT_NS_COUNT; i++) {
		struct mnt_ns_info info = {};
		int fd_mnt_ns_next;

		fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
		ASSERT_GE(fd_mnt_ns_next, 0);
		/*
		 * NOTE(review): on the first iteration this closes the
		 * fixture's fd_mnt_ns[0], which teardown closes again —
		 * verify this double close is intended.
		 */
		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
		fd_mnt_ns_cur = fd_mnt_ns_next;
		/* Namespaces must be returned in creation order. */
		ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
	}
}
/* Iterate backward and verify the ids come back in reverse creation order. */
TEST_F(iterate_mount_namespaces, iterate_backward)
{
	int fd_mnt_ns_cur;

	/* Enter the last namespace so iteration starts from a known point. */
	ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0);

	fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX];
	for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) {
		struct mnt_ns_info info = {};
		int fd_mnt_ns_prev;

		fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
		ASSERT_GE(fd_mnt_ns_prev, 0);
		/*
		 * NOTE(review): on the first iteration this closes the
		 * fixture's fd_mnt_ns[MNT_NS_LAST_INDEX], which teardown
		 * closes again — verify this double close is intended.
		 */
		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
		fd_mnt_ns_cur = fd_mnt_ns_prev;
		/* Namespaces must be returned in reverse creation order. */
		ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
	}
}
TEST_HARNESS_MAIN

View File

@@ -12,7 +12,6 @@
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>