mirror of
https://github.com/armbian/linux-cix.git
synced 2026-01-06 12:30:45 -08:00
Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
"Here are the io_uring changes for this merge window. Light on new
features this time around (just splice + buffer selection), lots of
cleanups, fixes, and improvements to existing support. In particular,
this contains:
- Cleanup fixed file update handling for stack fallback (Hillf)
- Re-work of how pollable async IO is handled; we no longer require
thread offload to handle that. Instead we rely on poll to drive
this, with task_work execution.
- In conjunction with the above, allow expendable buffer selection,
so that poll+recv (for example) no longer has to be a split
operation.
- Make sure we honor RLIMIT_FSIZE for buffered writes
- Add support for splice (Pavel)
- Linked work inheritance fixes and optimizations (Pavel)
- Async work fixes and cleanups (Pavel)
- Improve io-wq locking (Pavel)
- Hashed link write improvements (Pavel)
- SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"
* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
io_uring: cleanup io_alloc_async_ctx()
io_uring: fix missing 'return' in comment
io-wq: handle hashed writes in chains
io-uring: drop 'free_pfile' in struct io_file_put
io-uring: drop completion when removing file
io_uring: Fix ->data corruption on re-enqueue
io-wq: close cancel gap for hashed linked work
io_uring: make spdxcheck.py happy
io_uring: honor original task RLIMIT_FSIZE
io-wq: hash dependent work
io-wq: split hashing and enqueueing
io-wq: don't resched if there is no work
io-wq: remove duplicated cancel code
io_uring: fix truncated async read/readv and write/writev retry
io_uring: dual license io_uring.h uapi header
io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
io_uring: Fix unused function warnings
io_uring: add end-of-bits marker and build time verify it
io_uring: provide means of removing buffers
io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
...
This commit is contained in:
372
fs/io-wq.c
372
fs/io-wq.c
File diff suppressed because it is too large
Load Diff
65
fs/io-wq.h
65
fs/io-wq.h
@@ -5,10 +5,8 @@ struct io_wq;
|
||||
|
||||
enum {
|
||||
IO_WQ_WORK_CANCEL = 1,
|
||||
IO_WQ_WORK_HAS_MM = 2,
|
||||
IO_WQ_WORK_HASHED = 4,
|
||||
IO_WQ_WORK_UNBOUND = 32,
|
||||
IO_WQ_WORK_CB = 128,
|
||||
IO_WQ_WORK_NO_CANCEL = 256,
|
||||
IO_WQ_WORK_CONCURRENT = 512,
|
||||
|
||||
@@ -30,6 +28,18 @@ struct io_wq_work_list {
|
||||
struct io_wq_work_node *last;
|
||||
};
|
||||
|
||||
/*
 * wq_list_add_after - link @node into @list immediately after @pos.
 * @node: entry to insert
 * @pos:  entry that will precede @node; dereferenced unconditionally
 * @list: list whose tail pointer is updated when @pos was the last entry
 *
 * Only @list->last is ever written here; @list->first is left untouched.
 */
static inline void wq_list_add_after(struct io_wq_work_node *node,
|
||||
struct io_wq_work_node *pos,
|
||||
struct io_wq_work_list *list)
|
||||
{
|
||||
/* remember the old successor before pos->next is overwritten */
struct io_wq_work_node *next = pos->next;
|
||||
|
||||
pos->next = node;
|
||||
node->next = next;
|
||||
/* pos had no successor, so node is the new tail */
if (!next)
|
||||
list->last = node;
|
||||
}
|
||||
|
||||
static inline void wq_list_add_tail(struct io_wq_work_node *node,
|
||||
struct io_wq_work_list *list)
|
||||
{
|
||||
@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
|
||||
}
|
||||
}
|
||||
|
||||
static inline void wq_node_del(struct io_wq_work_list *list,
|
||||
/*
 * wq_list_cut - unlink everything after @prev up to and including @last.
 * @list: list being modified
 * @last: final entry of the run being removed
 * @prev: entry preceding the run, or NULL when the run starts at the head
 *
 * The predecessor link (@prev->next, or @list->first when @prev is NULL) is
 * redirected to @last->next, and @last->next is cleared so the removed run
 * is NULL-terminated. If @last was the tail, @list->last falls back to @prev.
 */
static inline void wq_list_cut(struct io_wq_work_list *list,
|
||||
struct io_wq_work_node *last,
|
||||
struct io_wq_work_node *prev)
|
||||
{
|
||||
/* first in the list, if prev==NULL */
|
||||
if (!prev)
|
||||
WRITE_ONCE(list->first, last->next);
|
||||
else
|
||||
prev->next = last->next;
|
||||
|
||||
/* removing the tail: the new tail is prev (NULL when the list empties) */
if (last == list->last)
|
||||
list->last = prev;
|
||||
/* detach the removed run from whatever remains in the list */
last->next = NULL;
|
||||
}
|
||||
|
||||
/*
 * wq_list_del - remove the single entry @node from @list.
 * @list: list being modified
 * @node: entry to remove
 * @prev: entry preceding @node, or NULL when @node is the head
 *
 * NOTE(review): this text comes from a diff view that shows removed and
 * added lines without +/- markers. The statements before the wq_list_cut()
 * call duplicate what wq_list_cut() itself does; presumably they are the
 * removed pre-change body and only the wq_list_cut() call remains after the
 * change -- confirm against the tree. Read literally, running both would
 * clear node->next first and then have wq_list_cut() copy that NULL into
 * prev->next (or list->first), dropping every entry that followed @node.
 */
static inline void wq_list_del(struct io_wq_work_list *list,
|
||||
struct io_wq_work_node *node,
|
||||
struct io_wq_work_node *prev)
|
||||
{
|
||||
if (node == list->first)
|
||||
WRITE_ONCE(list->first, node->next);
|
||||
if (node == list->last)
|
||||
list->last = prev;
|
||||
if (prev)
|
||||
prev->next = node->next;
|
||||
node->next = NULL;
|
||||
/* single-entry removal expressed as a cut whose run is just @node */
wq_list_cut(list, node, prev);
|
||||
}
|
||||
|
||||
#define wq_list_for_each(pos, prv, head) \
|
||||
@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
|
||||
} while (0)
|
||||
|
||||
struct io_wq_work {
|
||||
union {
|
||||
struct io_wq_work_node list;
|
||||
void *data;
|
||||
};
|
||||
struct io_wq_work_node list;
|
||||
void (*func)(struct io_wq_work **);
|
||||
struct files_struct *files;
|
||||
struct mm_struct *mm;
|
||||
@@ -83,14 +99,20 @@ struct io_wq_work {
|
||||
*(work) = (struct io_wq_work){ .func = _func }; \
|
||||
} while (0) \
|
||||
|
||||
typedef void (get_work_fn)(struct io_wq_work *);
|
||||
typedef void (put_work_fn)(struct io_wq_work *);
|
||||
/*
 * wq_next_work - return the work item linked after @work.
 * @work: current work item; its embedded list node is followed
 *
 * Returns NULL when @work->list.next is NULL, otherwise the io_wq_work
 * that embeds the next list node.
 */
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
|
||||
{
|
||||
if (!work->list.next)
|
||||
return NULL;
|
||||
|
||||
/* convert the embedded node pointer back to its containing work item */
return container_of(work->list.next, struct io_wq_work, list);
|
||||
}
|
||||
|
||||
typedef void (free_work_fn)(struct io_wq_work *);
|
||||
|
||||
/*
 * Setup data handed to io_wq_create() / io_wq_get(): a user_struct pointer
 * plus per-work callbacks.
 *
 * NOTE(review): this text comes from a diff view without +/- markers, so
 * the callback members below may mix removed and added lines -- presumably
 * get_work/put_work are the old pair and free_work their replacement;
 * confirm against the tree.
 */
struct io_wq_data {
|
||||
struct user_struct *user;
|
||||
|
||||
get_work_fn *get_work;
|
||||
put_work_fn *put_work;
|
||||
free_work_fn *free_work;
|
||||
};
|
||||
|
||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
|
||||
@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
|
||||
void io_wq_destroy(struct io_wq *wq);
|
||||
|
||||
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
|
||||
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
|
||||
void io_wq_hash_work(struct io_wq_work *work, void *val);
|
||||
|
||||
/*
 * io_wq_is_hashed - test whether @work has the IO_WQ_WORK_HASHED flag set.
 *
 * The masked flag value is converted to bool by the return type, so any
 * non-zero result reads as true.
 */
static inline bool io_wq_is_hashed(struct io_wq_work *work)
|
||||
{
|
||||
return work->flags & IO_WQ_WORK_HASHED;
|
||||
}
|
||||
|
||||
void io_wq_cancel_all(struct io_wq *wq);
|
||||
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
|
||||
|
||||
2027
fs/io_uring.c
2027
fs/io_uring.c
File diff suppressed because it is too large
Load Diff
@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
|
||||
/*
|
||||
* Determine where to splice to/from.
|
||||
*/
|
||||
static long do_splice(struct file *in, loff_t __user *off_in,
|
||||
struct file *out, loff_t __user *off_out,
|
||||
size_t len, unsigned int flags)
|
||||
long do_splice(struct file *in, loff_t __user *off_in,
|
||||
struct file *out, loff_t __user *off_out,
|
||||
size_t len, unsigned int flags)
|
||||
{
|
||||
struct pipe_inode_info *ipipe;
|
||||
struct pipe_inode_info *opipe;
|
||||
|
||||
@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
|
||||
struct user_msghdr __user *umsg, unsigned flags,
|
||||
struct sockaddr __user **uaddr,
|
||||
struct iovec **iov);
|
||||
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec __user **uiov, size_t *nsegs);
|
||||
|
||||
/* helpers which do the actual work for syscalls */
|
||||
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
|
||||
|
||||
@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
|
||||
struct pipe_buffer *);
|
||||
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
|
||||
splice_direct_actor *);
|
||||
extern long do_splice(struct file *in, loff_t __user *off_in,
|
||||
struct file *out, loff_t __user *off_out,
|
||||
size_t len, unsigned int flags);
|
||||
|
||||
/*
|
||||
* for dynamic pipe sizing
|
||||
|
||||
@@ -38,6 +38,9 @@ struct compat_cmsghdr {
|
||||
#define compat_mmsghdr mmsghdr
|
||||
#endif /* defined(CONFIG_COMPAT) */
|
||||
|
||||
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
|
||||
compat_size_t *len);
|
||||
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
|
||||
struct sockaddr __user **, struct iovec **);
|
||||
struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
|
||||
|
||||
@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
__entry->force_nonblock, __entry->sq_thread)
|
||||
);
|
||||
|
||||
/*
 * io_uring_poll_arm - tracepoint recording five values for one request.
 *
 * @ctx:       pointer printed as the "ring" field
 * @opcode:    printed as "op" (decimal)
 * @user_data: printed as "data" (hex, widened to unsigned long long)
 * @mask:      printed as "mask" (hex)
 * @events:    printed as "events" (hex)
 *
 * All five arguments are copied verbatim into the trace entry.
 */
TRACE_EVENT(io_uring_poll_arm,
|
||||
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
|
||||
|
||||
TP_ARGS(ctx, opcode, user_data, mask, events),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
__field( int, mask )
|
||||
__field( int, events )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
__entry->mask = mask;
|
||||
__entry->events = events;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data,
|
||||
__entry->mask, __entry->events)
|
||||
);
|
||||
|
||||
/*
 * io_uring_poll_wake - tracepoint recording four values for one request.
 *
 * @ctx:       pointer printed as the "ring" field
 * @opcode:    printed as "op" (decimal)
 * @user_data: printed as "data" (hex, widened to unsigned long long)
 * @mask:      printed as "mask" (hex)
 *
 * All four arguments are copied verbatim into the trace entry.
 */
TRACE_EVENT(io_uring_poll_wake,
|
||||
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
|
||||
|
||||
TP_ARGS(ctx, opcode, user_data, mask),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
__field( int, mask )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
__entry->mask = mask;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data,
|
||||
__entry->mask)
|
||||
);
|
||||
|
||||
/*
 * io_uring_task_add - tracepoint recording four values for one request.
 *
 * @ctx:       pointer printed as the "ring" field
 * @opcode:    printed as "op" (decimal)
 * @user_data: printed as "data" (hex, widened to unsigned long long)
 * @mask:      printed as "mask" (hex digits, no "0x" prefix in this event's
 *             format string, unlike the other events that print a mask)
 *
 * All four arguments are copied verbatim into the trace entry.
 */
TRACE_EVENT(io_uring_task_add,
|
||||
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
|
||||
|
||||
TP_ARGS(ctx, opcode, user_data, mask),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
__field( int, mask )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
__entry->mask = mask;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data,
|
||||
__entry->mask)
|
||||
);
|
||||
|
||||
/*
 * io_uring_task_run - tracepoint recording three values for one request.
 *
 * @ctx:       pointer printed as the "ring" field
 * @opcode:    printed as "op" (decimal)
 * @user_data: printed as "data" (hex, widened to unsigned long long)
 *
 * All three arguments are copied verbatim into the trace entry.
 */
TRACE_EVENT(io_uring_task_run,
|
||||
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
|
||||
|
||||
TP_ARGS(ctx, opcode, user_data),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, op %d, data 0x%llx",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_IO_URING_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
||||
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
|
||||
/*
|
||||
* Header file for the io_uring interface.
|
||||
*
|
||||
@@ -23,7 +23,10 @@ struct io_uring_sqe {
|
||||
__u64 off; /* offset into file */
|
||||
__u64 addr2;
|
||||
};
|
||||
__u64 addr; /* pointer to buffer or iovecs */
|
||||
union {
|
||||
__u64 addr; /* pointer to buffer or iovecs */
|
||||
__u64 splice_off_in;
|
||||
};
|
||||
__u32 len; /* buffer size or number of iovecs */
|
||||
union {
|
||||
__kernel_rwf_t rw_flags;
|
||||
@@ -37,14 +40,21 @@ struct io_uring_sqe {
|
||||
__u32 open_flags;
|
||||
__u32 statx_flags;
|
||||
__u32 fadvise_advice;
|
||||
__u32 splice_flags;
|
||||
};
|
||||
__u64 user_data; /* data to be passed back at completion time */
|
||||
union {
|
||||
struct {
|
||||
/* index into fixed buffers, if used */
|
||||
__u16 buf_index;
|
||||
/* pack this to avoid bogus arm OABI complaints */
|
||||
union {
|
||||
/* index into fixed buffers, if used */
|
||||
__u16 buf_index;
|
||||
/* for grouped buffer selection */
|
||||
__u16 buf_group;
|
||||
} __attribute__((packed));
|
||||
/* personality to use, if used */
|
||||
__u16 personality;
|
||||
__s32 splice_fd_in;
|
||||
};
|
||||
__u64 __pad2[3];
|
||||
};
|
||||
@@ -56,6 +66,7 @@ enum {
|
||||
IOSQE_IO_LINK_BIT,
|
||||
IOSQE_IO_HARDLINK_BIT,
|
||||
IOSQE_ASYNC_BIT,
|
||||
IOSQE_BUFFER_SELECT_BIT,
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -71,6 +82,8 @@ enum {
|
||||
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
|
||||
/* always go async */
|
||||
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
|
||||
/* select buffer from sqe->buf_group */
|
||||
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
|
||||
|
||||
/*
|
||||
* io_uring_setup() flags
|
||||
@@ -113,6 +126,9 @@ enum {
|
||||
IORING_OP_RECV,
|
||||
IORING_OP_OPENAT2,
|
||||
IORING_OP_EPOLL_CTL,
|
||||
IORING_OP_SPLICE,
|
||||
IORING_OP_PROVIDE_BUFFERS,
|
||||
IORING_OP_REMOVE_BUFFERS,
|
||||
|
||||
/* this goes last, obviously */
|
||||
IORING_OP_LAST,
|
||||
@@ -128,6 +144,12 @@ enum {
|
||||
*/
|
||||
#define IORING_TIMEOUT_ABS (1U << 0)
|
||||
|
||||
/*
|
||||
* sqe->splice_flags
|
||||
* extends splice(2) flags
|
||||
*/
|
||||
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
|
||||
|
||||
/*
|
||||
* IO completion data structure (Completion Queue Entry)
|
||||
*/
|
||||
@@ -137,6 +159,17 @@ struct io_uring_cqe {
|
||||
__u32 flags;
|
||||
};
|
||||
|
||||
/*
|
||||
* cqe->flags
|
||||
*
|
||||
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
|
||||
*/
|
||||
#define IORING_CQE_F_BUFFER (1U << 0)
|
||||
|
||||
/*
 * Bit position at which the upper 16 bits of a 32-bit cqe->flags begin.
 * Presumably the shift used to extract the buffer ID that
 * IORING_CQE_F_BUFFER says is stored there -- confirm against the users.
 */
enum {
|
||||
IORING_CQE_BUFFER_SHIFT = 16,
|
||||
};
|
||||
|
||||
/*
|
||||
* Magic offsets for the application to mmap the data it needs
|
||||
*/
|
||||
@@ -204,6 +237,7 @@ struct io_uring_params {
|
||||
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
|
||||
#define IORING_FEAT_RW_CUR_POS (1U << 3)
|
||||
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
|
||||
#define IORING_FEAT_FAST_POLL (1U << 5)
|
||||
|
||||
/*
|
||||
* io_uring_register(2) opcodes and arguments
|
||||
|
||||
@@ -97,16 +97,26 @@ void task_work_run(void)
|
||||
* work->func() can do task_work_add(), do not set
|
||||
* work_exited unless the list is empty.
|
||||
*/
|
||||
raw_spin_lock_irq(&task->pi_lock);
|
||||
do {
|
||||
head = NULL;
|
||||
work = READ_ONCE(task->task_works);
|
||||
head = !work && (task->flags & PF_EXITING) ?
|
||||
&work_exited : NULL;
|
||||
if (!work) {
|
||||
if (task->flags & PF_EXITING)
|
||||
head = &work_exited;
|
||||
else
|
||||
break;
|
||||
}
|
||||
} while (cmpxchg(&task->task_works, work, head) != work);
|
||||
raw_spin_unlock_irq(&task->pi_lock);
|
||||
|
||||
if (!work)
|
||||
break;
|
||||
/*
|
||||
* Synchronize with task_work_cancel(). It can not remove
|
||||
* the first entry == work, cmpxchg(task_works) must fail.
|
||||
* But it can remove another entry from the ->next list.
|
||||
*/
|
||||
raw_spin_lock_irq(&task->pi_lock);
|
||||
raw_spin_unlock_irq(&task->pi_lock);
|
||||
|
||||
do {
|
||||
next = work->next;
|
||||
|
||||
30
net/compat.c
30
net/compat.c
@@ -33,10 +33,10 @@
|
||||
#include <linux/uaccess.h>
|
||||
#include <net/compat.h>
|
||||
|
||||
int get_compat_msghdr(struct msghdr *kmsg,
|
||||
struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec **iov)
|
||||
int __get_compat_msghdr(struct msghdr *kmsg,
|
||||
struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
compat_uptr_t *ptr, compat_size_t *len)
|
||||
{
|
||||
struct compat_msghdr msg;
|
||||
ssize_t err;
|
||||
@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
|
||||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
*ptr = msg.msg_iov;
|
||||
*len = msg.msg_iovlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = compat_import_iovec(save_addr ? READ : WRITE,
|
||||
compat_ptr(msg.msg_iov), msg.msg_iovlen,
|
||||
UIO_FASTIOV, iov, &kmsg->msg_iter);
|
||||
/*
 * get_compat_msghdr - fill @kmsg from a compat user msghdr and import its
 * iovec array.
 * @kmsg:      kernel msghdr to populate; its msg_iter receives the iovecs
 * @umsg:      user-space compat msghdr, passed through to the helper
 * @save_addr: passed through to the helper; also selects the transfer
 *             direction (READ when non-NULL, WRITE when NULL)
 * @iov:       passed to compat_import_iovec() for the imported array
 *
 * Split into two steps: __get_compat_msghdr() does the header work and
 * reports the user iovec pointer and count, then compat_import_iovec()
 * imports them.
 *
 * Returns 0 on success. A non-zero result from __get_compat_msghdr() or a
 * negative result from compat_import_iovec() is returned unchanged.
 */
int get_compat_msghdr(struct msghdr *kmsg,
|
||||
struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec **iov)
|
||||
{
|
||||
compat_uptr_t ptr;
|
||||
compat_size_t len;
|
||||
ssize_t err;
|
||||
|
||||
/* ptr/len receive the user iovec pointer and count from the helper */
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
|
||||
len, UIO_FASTIOV, iov, &kmsg->msg_iter);
|
||||
/* a non-negative import result is collapsed to 0 */
return err < 0 ? err : 0;
|
||||
}
|
||||
|
||||
|
||||
25
net/socket.c
25
net/socket.c
@@ -2228,10 +2228,10 @@ struct used_address {
|
||||
unsigned int name_len;
|
||||
};
|
||||
|
||||
static int copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec **iov)
|
||||
int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec __user **uiov, size_t *nsegs)
|
||||
{
|
||||
struct user_msghdr msg;
|
||||
ssize_t err;
|
||||
@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
*uiov = msg.msg_iov;
|
||||
*nsegs = msg.msg_iovlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec **iov)
|
||||
{
|
||||
struct user_msghdr msg;
|
||||
ssize_t err;
|
||||
|
||||
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
|
||||
&msg.msg_iovlen);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = import_iovec(save_addr ? READ : WRITE,
|
||||
msg.msg_iov, msg.msg_iovlen,
|
||||
|
||||
Reference in New Issue
Block a user