for-5.6/io_uring-vfs-2020-01-29

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl4yEegQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpn5ZD/4/WlXs2cUDgg1C65bzZFO4qvevm+VkXmsk
 GbyrnFstRekvSH01/ZQxlyDVKS8Wux0XIJ6OArCh1047LvL1bEE5dvOW5iIiwa/r
 grjQuwFAzIPsE2fgcAO17BKIUzq2Z96+hwDzH7dw0i32yBuLvNmY/1SxcCHKfPut
 uzGyp7t3/2dIHbpWILRndMYe0O9j9ubmOMvKyKTwy723yDEafsUoqu2mlpigzTq4
 2i+DbYBIAd8qmLqG/m3e+vOt9xodJ2Q0hlO+v6DcP2SKXU64Hb/N98HadR//aWP9
 41DBXqs+dvDBcu3Jxb80PFUTiOQZECJivkns5cNcjuSXmNkOuQhDQR5K372AHmR9
 m6e6FSBxwej8HselAZCI6yu9uBKd0i+MM4FnFs/O73QGYx2ayXsEXp/Jad9xiYgW
 pC5XJTSqJQhPE0AYYEOzHPPcBLBcpvXHkvmGKdjkNb8OLhhgh2S/YG0DNC+8ABXr
 j1uIe/n3kJEEmOanUyiitGyLmDq+mXd7aCVKJL/J0KiGD8Gkc1avAZ1ZrTQgjujY
 FqqBFawO8gv3g0L4WMI8JI+HJGMnA488obet6UKm9+l/Z/urEpXzDAKf/W/vnx2B
 LD0FSA0bCh1tyO6JU+avFwHlwShtV7/rx/OhrmCK7CCYKtZCA2IEctxyr8U+PBIv
 DtwIMTYTsA==
 =ZZUI
 -----END PGP SIGNATURE-----

Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Support for various new opcodes (fallocate, openat, close, statx,
   fadvise, madvise, openat2, non-vectored read/write, send/recv, and
   epoll_ctl)

 - Faster ring quiesce for fileset updates

 - Optimizations for overflow condition checking

 - Support for max-sized clamping

 - Support for probing what opcodes are supported

 - Support for io-wq backend sharing between "sibling" rings

 - Support for registering personalities

 - Lots of little fixes and improvements

* tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: add support for epoll_ctl(2)
  eventpoll: support non-blocking do_epoll_ctl() calls
  eventpoll: abstract out epoll_ctl() handler
  io_uring: fix linked command file table usage
  io_uring: support using a registered personality for commands
  io_uring: allow registering credentials
  io_uring: add io-wq workqueue sharing
  io-wq: allow grabbing existing io-wq
  io_uring/io-wq: don't use static creds/mm assignments
  io-wq: make the io_wq ref counted
  io_uring: fix refcounting with batched allocations at OOM
  io_uring: add comment for drain_next
  io_uring: don't attempt to copy iovec for READ/WRITE
  io_uring: honor IOSQE_ASYNC for linked reqs
  io_uring: prep req when do IOSQE_ASYNC
  io_uring: use labeled array init in io_op_defs
  io_uring: optimise sqe-to-req flags translation
  io_uring: remove REQ_F_IO_DRAINED
  io_uring: file switch work needs to get flushed on exit
  io_uring: hide uring_fd in ctx
  ...
This commit is contained in:
Linus Torvalds 2020-01-29 18:53:37 -08:00
commit 896f8d23d0
15 changed files with 2131 additions and 496 deletions

View File

@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
__close_fd_get_file(fd, &twcb->file);
if (twcb->file)
if (twcb->file) {
filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, true);
else
} else {
kfree(twcb);
}
}
static void binder_transaction_buffer_release(struct binder_proc *proc,

View File

@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;
}
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return do_epoll_create(0);
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
bool nonblock)
{
if (!nonblock) {
mutex_lock_nested(mutex, depth);
return 0;
}
if (mutex_trylock(mutex))
return 0;
return -EAGAIN;
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
full_check = 1;
mutex_unlock(&ep->mtx);
mutex_lock(&epmutex);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error) {
out_del:
list_del(&tf.file->f_tfile_llink);
goto error_tgt_fput;
}
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
mutex_lock_nested(&tep->mtx, 1);
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
if (error) {
mutex_unlock(&ep->mtx);
goto out_del;
}
}
}
}
@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, &epds);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
@ -2231,6 +2239,23 @@ error_return:
return error;
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).

View File

@ -642,7 +642,9 @@ out_unlock:
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
* variant of __close_fd that gets a ref on the file for later fput
* variant of __close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return filp_close(file, files);
return 0;
out_unlock:
spin_unlock(&files->file_lock);

View File

@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
long do_faccessat(int dfd, const char __user *filename, int mode);
@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;
/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
int cp_statx(const struct kstat *stat, struct statx __user *buffer);

View File

@ -56,7 +56,8 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
const struct cred *creds;
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
};
@ -109,10 +110,10 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
const struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
refcount_t use_refs;
};
static bool io_worker_get(struct io_worker *worker)
@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
if (worker->creds) {
revert_creds(worker->creds);
worker->creds = NULL;
if (worker->saved_creds) {
revert_creds(worker->saved_creds);
worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
return NULL;
}
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
if (worker->mm) {
unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
if (!work->mm) {
set_fs(KERNEL_DS);
return;
}
if (mmget_not_zero(work->mm)) {
use_mm(work->mm);
if (!worker->mm)
set_fs(USER_DS);
worker->mm = work->mm;
/* hang on to this mm */
work->mm = NULL;
return;
}
/* failed grabbing mm, ensure work gets cancelled */
work->flags |= IO_WQ_WORK_CANCEL;
}
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
const struct cred *old_creds = override_creds(work->creds);
worker->cur_creds = work->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
worker->saved_creds = old_creds;
}
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
@ -438,24 +476,19 @@ next:
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
current->files != work->files) {
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
wq->mm) {
if (mmget_not_zero(wq->mm)) {
use_mm(wq->mm);
set_fs(USER_DS);
worker->mm = wq->mm;
} else {
work->flags |= IO_WQ_WORK_CANCEL;
}
}
if (!worker->creds)
worker->creds = override_creds(wq->creds);
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
/*
@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
if (!atomic_read(&acct->nr_running))
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct);
}
@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return false;
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work == work) {
if (worker->cur_work == work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->done);
/* caller must have already done mmgrab() on this mm */
wq->mm = data->mm;
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = -ENOMEM;
goto err;
}
refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
@ -1078,13 +1113,21 @@ err:
return ERR_PTR(ret);
}
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
}
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
wake_up_process(worker->task);
return false;
}
void io_wq_destroy(struct io_wq *wq)
static void __io_wq_destroy(struct io_wq *wq)
{
int node;
@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
kfree(wq->wqes);
kfree(wq);
}
void io_wq_destroy(struct io_wq *wq)
{
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}

View File

@ -7,11 +7,11 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_NEEDS_USER = 8,
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@ -72,6 +72,8 @@ struct io_wq_work {
};
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
unsigned flags;
};
@ -81,21 +83,22 @@ struct io_wq_work {
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
(work)->mm = NULL; \
(work)->creds = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct mm_struct *mm;
struct user_struct *user;
const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);

File diff suppressed because it is too large Load Diff

View File

@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
static inline struct open_how build_open_how(int flags, umode_t mode)
inline struct open_how build_open_how(int flags, umode_t mode)
{
struct open_how how = {
.flags = flags & VALID_OPEN_FLAGS,
@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
return how;
}
static inline int build_open_flags(const struct open_how *how,
struct open_flags *op)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
int flags = how->flags;
int lookup_flags = 0;

View File

@ -21,6 +21,8 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include "internal.h"
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @inode: Inode to use as the source
@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
}
EXPORT_SYMBOL(vfs_statx_fd);
inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
{
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
return -EINVAL;
*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
*lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
*lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
*lookup_flags |= LOOKUP_EMPTY;
return 0;
}
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
{
struct path path;
int error = -EINVAL;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
unsigned lookup_flags;
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
static noinline_for_stack int
noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
struct statx tmp;

View File

@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock);
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
#else
static inline void eventpoll_init_file(struct file *file) {}

View File

@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,

View File

@ -209,6 +209,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
percpu_ref_get_many(ref, 1);
}
/**
* percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get
* @nr: number of references to get
*
* Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
unsigned long nr)
{
unsigned long __percpu *percpu_count;
bool ret;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_add(*percpu_count, nr);
ret = true;
} else {
ret = atomic_long_add_unless(&ref->count, nr, 0);
}
rcu_read_unlock();
return ret;
}
/**
* percpu_ref_tryget - try to increment a percpu refcount
* @ref: percpu_ref to try-get
@ -220,21 +250,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count;
bool ret;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
ret = true;
} else {
ret = atomic_long_inc_not_zero(&ref->count);
}
rcu_read_unlock();
return ret;
return percpu_ref_tryget_many(ref, 1);
}
/**

View File

@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @opcode: opcode of request
* @user_data: user data associated with the request
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( bool, force_nonblock )
__field( bool, sq_thread )
@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, (unsigned long long) __entry->user_data,
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
);

View File

@ -34,21 +34,43 @@ struct io_uring_sqe {
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
};
__u64 user_data; /* data to be passed back at completion time */
union {
__u16 buf_index; /* index into fixed buffers, if used */
struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* personality to use, if used */
__u16 personality;
};
__u64 __pad2[3];
};
};
enum {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
};
/*
* sqe->flags
*/
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
/* use fixed fileset */
#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/*
* io_uring_setup() flags
@ -57,6 +79,8 @@ struct io_uring_sqe {
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
enum {
IORING_OP_NOP,
@ -76,6 +100,19 @@ enum {
IORING_OP_ASYNC_CANCEL,
IORING_OP_LINK_TIMEOUT,
IORING_OP_CONNECT,
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
IORING_OP_FILES_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
IORING_OP_FADVISE,
IORING_OP_MADVISE,
IORING_OP_SEND,
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
/* this goes last, obviously */
IORING_OP_LAST,
@ -153,7 +190,8 @@ struct io_uring_params {
__u32 sq_thread_cpu;
__u32 sq_thread_idle;
__u32 features;
__u32 resv[4];
__u32 wq_fd;
__u32 resv[3];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};
@ -164,6 +202,8 @@ struct io_uring_params {
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
/*
* io_uring_register(2) opcodes and arguments
@ -175,6 +215,10 @@ struct io_uring_params {
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6
#define IORING_REGISTER_EVENTFD_ASYNC 7
#define IORING_REGISTER_PROBE 8
#define IORING_REGISTER_PERSONALITY 9
#define IORING_UNREGISTER_PERSONALITY 10
struct io_uring_files_update {
__u32 offset;
@ -182,4 +226,21 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
__u8 op;
__u8 resv;
__u16 flags; /* IO_URING_OP_* flags */
__u32 resv2;
};
struct io_uring_probe {
__u8 last_op; /* last opcode supported */
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
};
#endif

View File

@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@ -1141,3 +1141,8 @@ out:
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(start, len_in, behavior);
}