mirror of
https://github.com/Dasharo/linux.git
synced 2026-03-06 15:25:10 -08:00
Merge tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull thread management updates from Christian Brauner:
"Sargun Dhillon over the last cycle has worked on the pidfd_getfd()
syscall.
This syscall allows for the retrieval of file descriptors of a process
based on its pidfd. A task needs to have ptrace_may_access()
permissions with PTRACE_MODE_ATTACH_REALCREDS (suggested by Oleg and
Andy) on the target.
One of the main use-cases is in combination with seccomp's user
notification feature. As a reminder, seccomp's user notification
feature was made available in v5.0. It allows a task to retrieve a
file descriptor for its seccomp filter. The file descriptor is usually
handed off to a more privileged supervising process. The supervisor can
then listen for syscall events caught by the seccomp filter of the
supervisee and perform actions in lieu of the supervisee, usually
emulating syscalls. pidfd_getfd() is needed to expand its uses.
There are currently two major users that wait on pidfd_getfd() and one
future user:
- Netflix, Sargun said, is working on a service mesh where users
should be able to connect to a dns-based VIP. When a user connects
to e.g. 1.2.3.4:80 that runs e.g. service "foo" they will be
redirected to an envoy process. This service mesh uses seccomp user
notifications and pidfd to intercept all connect calls and instead
of connecting them to 1.2.3.4:80 connects them to e.g.
127.0.0.1:8080.
- LXD uses the seccomp notifier heavily to intercept and emulate
mknod() and mount() syscalls for unprivileged containers/processes.
With pidfd_getfd() more use-cases, e.g. bridging socket connections,
will be possible.
- The patchset has also seen some interest from the browser corner.
Right now, Firefox is using a SECCOMP_RET_TRAP sandbox managed by a
broker process. In the future glibc will start blocking all signals
during dlopen() rendering this type of sandbox impossible. Hence,
in the future Firefox will switch to a seccomp-user-notification
based sandbox which also makes use of file descriptor retrieval.
The thread for this can be found at
https://sourceware.org/ml/libc-alpha/2019-12/msg00079.html
With pidfd_getfd() it is e.g. possible to bridge socket connections
for the supervisee (binding to a privileged port) and to take actions
on file descriptors on behalf of the supervisee in general.
Sargun's first version was using an ioctl on pidfds but various people
pushed for it to be a proper syscall which he duly implemented as
well over various review cycles. Selftests are of course included.
I've also added instructions on how to deal with merge conflicts below.
There's also a small fix coming from the kernel mentee project to
correctly annotate struct sighand_struct with __rcu to fix various
sparse warnings. We've received a few more such fixes and even though
they are mostly trivial I've decided to postpone them until after -rc1
since they came in rather late and I don't want to risk introducing
build warnings.
Finally, there's a new prctl() command PR_{G,S}ET_IO_FLUSHER which is
needed to avoid allocation recursions triggerable by storage drivers
that have userspace parts that run in the IO path (e.g. dm-multipath,
iscsi, etc). These allocation recursions deadlock the device.
The new prctl() allows such privileged userspace components to avoid
allocation recursions by setting the PF_MEMALLOC_NOIO and
PF_LESS_THROTTLE flags. The patch carries the necessary acks from the
relevant maintainers and is routed here as part of prctl()
thread-management."
* tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim
sched.h: Annotate sighand_struct with __rcu
test: Add test for pidfd getfd
arch: wire up pidfd_getfd syscall
pid: Implement pidfd_getfd syscall
vfs, fdtable: Add fget_task helper
This commit is contained in:
@@ -476,3 +476,4 @@
|
||||
544 common pidfd_open sys_pidfd_open
|
||||
# 545 reserved for clone3
|
||||
547 common openat2 sys_openat2
|
||||
548 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -450,3 +450,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 common clone3 sys_clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
|
||||
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
|
||||
|
||||
#define __NR_compat_syscalls 438
|
||||
#define __NR_compat_syscalls 439
|
||||
#endif
|
||||
|
||||
#define __ARCH_WANT_SYS_CLONE
|
||||
|
||||
@@ -881,6 +881,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
|
||||
__SYSCALL(__NR_clone3, sys_clone3)
|
||||
#define __NR_openat2 437
|
||||
__SYSCALL(__NR_openat2, sys_openat2)
|
||||
#define __NR_pidfd_getfd 438
|
||||
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
|
||||
|
||||
/*
|
||||
* Please add new compat syscalls above this comment and update
|
||||
|
||||
@@ -357,3 +357,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
# 435 reserved for clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -436,3 +436,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 common clone3 __sys_clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -442,3 +442,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 common clone3 sys_clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -375,3 +375,4 @@
|
||||
434 n32 pidfd_open sys_pidfd_open
|
||||
435 n32 clone3 __sys_clone3
|
||||
437 n32 openat2 sys_openat2
|
||||
438 n32 pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -351,3 +351,4 @@
|
||||
434 n64 pidfd_open sys_pidfd_open
|
||||
435 n64 clone3 __sys_clone3
|
||||
437 n64 openat2 sys_openat2
|
||||
438 n64 pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -424,3 +424,4 @@
|
||||
434 o32 pidfd_open sys_pidfd_open
|
||||
435 o32 clone3 __sys_clone3
|
||||
437 o32 openat2 sys_openat2
|
||||
438 o32 pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -434,3 +434,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 common clone3 sys_clone3_wrapper
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -518,3 +518,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 nospu clone3 ppc_clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -439,3 +439,4 @@
|
||||
434 common pidfd_open sys_pidfd_open sys_pidfd_open
|
||||
435 common clone3 sys_clone3 sys_clone3
|
||||
437 common openat2 sys_openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -439,3 +439,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
# 435 reserved for clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -482,3 +482,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
# 435 reserved for clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
@@ -441,3 +441,4 @@
|
||||
434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
|
||||
435 i386 clone3 sys_clone3 __ia32_sys_clone3
|
||||
437 i386 openat2 sys_openat2 __ia32_sys_openat2
|
||||
438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd
|
||||
|
||||
@@ -358,6 +358,7 @@
|
||||
434 common pidfd_open __x64_sys_pidfd_open
|
||||
435 common clone3 __x64_sys_clone3/ptregs
|
||||
437 common openat2 __x64_sys_openat2
|
||||
438 common pidfd_getfd __x64_sys_pidfd_getfd
|
||||
|
||||
#
|
||||
# x32-specific system call numbers start at 512 to avoid cache impact
|
||||
|
||||
@@ -407,3 +407,4 @@
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
435 common clone3 sys_clone3
|
||||
437 common openat2 sys_openat2
|
||||
438 common pidfd_getfd sys_pidfd_getfd
|
||||
|
||||
22
fs/file.c
22
fs/file.c
@@ -708,9 +708,9 @@ void do_close_on_exec(struct files_struct *files)
|
||||
spin_unlock(&files->file_lock);
|
||||
}
|
||||
|
||||
static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
|
||||
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
|
||||
fmode_t mask, unsigned int refs)
|
||||
{
|
||||
struct files_struct *files = current->files;
|
||||
struct file *file;
|
||||
|
||||
rcu_read_lock();
|
||||
@@ -731,6 +731,12 @@ loop:
|
||||
return file;
|
||||
}
|
||||
|
||||
static inline struct file *__fget(unsigned int fd, fmode_t mask,
|
||||
unsigned int refs)
|
||||
{
|
||||
return __fget_files(current->files, fd, mask, refs);
|
||||
}
|
||||
|
||||
struct file *fget_many(unsigned int fd, unsigned int refs)
|
||||
{
|
||||
return __fget(fd, FMODE_PATH, refs);
|
||||
@@ -748,6 +754,18 @@ struct file *fget_raw(unsigned int fd)
|
||||
}
|
||||
EXPORT_SYMBOL(fget_raw);
|
||||
|
||||
struct file *fget_task(struct task_struct *task, unsigned int fd)
|
||||
{
|
||||
struct file *file = NULL;
|
||||
|
||||
task_lock(task);
|
||||
if (task->files)
|
||||
file = __fget_files(task->files, fd, 0, 1);
|
||||
task_unlock(task);
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
|
||||
*
|
||||
|
||||
@@ -16,6 +16,7 @@ extern void fput(struct file *);
|
||||
extern void fput_many(struct file *, unsigned int);
|
||||
|
||||
struct file_operations;
|
||||
struct task_struct;
|
||||
struct vfsmount;
|
||||
struct dentry;
|
||||
struct inode;
|
||||
@@ -47,6 +48,7 @@ static inline void fdput(struct fd fd)
|
||||
extern struct file *fget(unsigned int fd);
|
||||
extern struct file *fget_many(unsigned int fd, unsigned int refs);
|
||||
extern struct file *fget_raw(unsigned int fd);
|
||||
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
|
||||
extern unsigned long __fdget(unsigned int fd);
|
||||
extern unsigned long __fdget_raw(unsigned int fd);
|
||||
extern unsigned long __fdget_pos(unsigned int fd);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user