Commit c1ef57a3a3f5e69e98baf89055b423da62791c13

Authored by Linus Torvalds

Merge tag 'io_uring-5.6-2020-02-05' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Some later fixes for io_uring:

   - Small cleanup series from Pavel

   - Belt and suspenders build time check of sqe size and layout
     (Stefan)

   - Addition of ->show_fdinfo() on request of Jann Horn, to aid in
     understanding mapped personalities

   - eventfd recursion/deadlock fix, for both io_uring and aio

   - Fixup for send/recv handling

   - Fixup for double deferral of read/write request

   - Fix for potential double completion event for close request

   - Adjust fadvise advice async/inline behavior

   - Fix for shutdown hang with SQPOLL thread

   - Fix for potential use-after-free of fixed file table"

* tag 'io_uring-5.6-2020-02-05' of git://git.kernel.dk/linux-block:
  io_uring: cleanup fixed file data table references
  io_uring: spin for sq thread to idle on shutdown
  aio: prevent potential eventfd recursion on poll
  io_uring: put the flag changing code in the same spot
  io_uring: iterate req cache backwards
  io_uring: punt even fadvise() WILLNEED to async context
  io_uring: fix sporadic double CQE entry for close
  io_uring: remove extra ->file check
  io_uring: don't map read/write iovec potentially twice
  io_uring: use the proper helpers for io_send/recv
  io_uring: prevent potential eventfd recursion on poll
  eventfd: track eventfd_signal() recursion depth
  io_uring: add BUILD_BUG_ON() to assert the layout of struct io_uring_sqe
  io_uring: add ->show_fdinfo() for the io_uring file descriptor

4 changed files: fs/aio.c, fs/eventfd.c, fs/io_uring.c, include/linux/eventfd.h

fs/aio.c
... ... @@ -1610,6 +1610,14 @@
1610 1610 return 0;
1611 1611 }
1612 1612  
  1613 +static void aio_poll_put_work(struct work_struct *work)
  1614 +{
  1615 + struct poll_iocb *req = container_of(work, struct poll_iocb, work);
  1616 + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
  1617 +
  1618 + iocb_put(iocb);
  1619 +}
  1620 +
1613 1621 static void aio_poll_complete_work(struct work_struct *work)
1614 1622 {
1615 1623 struct poll_iocb *req = container_of(work, struct poll_iocb, work);
... ... @@ -1674,6 +1682,8 @@
1674 1682 list_del_init(&req->wait.entry);
1675 1683  
1676 1684 if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
  1685 + struct kioctx *ctx = iocb->ki_ctx;
  1686 +
1677 1687 /*
1678 1688 * Try to complete the iocb inline if we can. Use
1679 1689 * irqsave/irqrestore because not all filesystems (e.g. fuse)
... ... @@ -1683,8 +1693,14 @@
1683 1693 list_del(&iocb->ki_list);
1684 1694 iocb->ki_res.res = mangle_poll(mask);
1685 1695 req->done = true;
1686   - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
1687   - iocb_put(iocb);
  1696 + if (iocb->ki_eventfd && eventfd_signal_count()) {
  1697 + iocb = NULL;
  1698 + INIT_WORK(&req->work, aio_poll_put_work);
  1699 + schedule_work(&req->work);
  1700 + }
  1701 + spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  1702 + if (iocb)
  1703 + iocb_put(iocb);
1688 1704 } else {
1689 1705 schedule_work(&req->work);
1690 1706 }
fs/eventfd.c
... ... @@ -24,6 +24,8 @@
24 24 #include <linux/seq_file.h>
25 25 #include <linux/idr.h>
26 26  
  27 +DEFINE_PER_CPU(int, eventfd_wake_count);
  28 +
27 29 static DEFINE_IDA(eventfd_ida);
28 30  
29 31 struct eventfd_ctx {
... ... @@ -60,12 +62,25 @@
60 62 {
61 63 unsigned long flags;
62 64  
  65 + /*
  66 + * Deadlock or stack overflow issues can happen if we recurse here
  67 + * through waitqueue wakeup handlers. If the caller uses potentially
  68 + * nested waitqueues with custom wakeup handlers, then it should
  69 + * check eventfd_signal_count() before calling this function. If
  70 + * it returns true, the eventfd_signal() call should be deferred to a
  71 + * safe context.
  72 + */
  73 + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
  74 + return 0;
  75 +
63 76 spin_lock_irqsave(&ctx->wqh.lock, flags);
  77 + this_cpu_inc(eventfd_wake_count);
64 78 if (ULLONG_MAX - ctx->count < n)
65 79 n = ULLONG_MAX - ctx->count;
66 80 ctx->count += n;
67 81 if (waitqueue_active(&ctx->wqh))
68 82 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
  83 + this_cpu_dec(eventfd_wake_count);
69 84 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
70 85  
71 86 return n;
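
The per-CPU eventfd_wake_count counter above means that any code which can end up calling eventfd_signal() from inside a waitqueue wakeup or completion handler should check eventfd_signal_count() first and, if it returns true, defer the signal to a safe context. That is exactly what the fs/aio.c hunk above and the io_uring poll hunk further down do. A minimal kernel-style sketch of the pattern; the names my_completion, my_complete and my_signal_work are illustrative only, not part of this commit:

    #include <linux/kernel.h>
    #include <linux/eventfd.h>
    #include <linux/workqueue.h>

    struct my_completion {
            struct eventfd_ctx      *evfd;
            struct work_struct      signal_work;
    };

    /* Runs from the system workqueue, where no wakeup handler is on the stack. */
    static void my_signal_work(struct work_struct *work)
    {
            struct my_completion *c = container_of(work, struct my_completion,
                                                   signal_work);

            eventfd_signal(c->evfd, 1);
    }

    static void my_complete(struct my_completion *c)
    {
            if (eventfd_signal_count()) {
                    /* Already inside eventfd_signal() on this CPU: defer. */
                    INIT_WORK(&c->signal_work, my_signal_work);
                    schedule_work(&c->signal_work);
            } else {
                    eventfd_signal(c->evfd, 1);
            }
    }
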
fs/io_uring.c
... ... @@ -585,8 +585,7 @@
585 585 * io_kiocb alloc cache
586 586 */
587 587 void *reqs[IO_IOPOLL_BATCH];
588   - unsigned int free_reqs;
589   - unsigned int cur_req;
  588 + unsigned int free_reqs;
590 589  
591 590 /*
592 591 * File reference cache
... ... @@ -754,6 +753,7 @@
754 753 struct io_uring_files_update *ip,
755 754 unsigned nr_args);
756 755 static int io_grab_files(struct io_kiocb *req);
  756 +static void io_ring_file_ref_flush(struct fixed_file_data *data);
757 757  
758 758 static struct kmem_cache *req_cachep;
759 759  
... ... @@ -1020,21 +1020,28 @@
1020 1020  
1021 1021 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1022 1022 {
  1023 + if (!ctx->cq_ev_fd)
  1024 + return false;
1023 1025 if (!ctx->eventfd_async)
1024 1026 return true;
1025 1027 return io_wq_current_is_worker() || in_interrupt();
1026 1028 }
1027 1029  
1028   -static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
  1030 +static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
1029 1031 {
1030 1032 if (waitqueue_active(&ctx->wait))
1031 1033 wake_up(&ctx->wait);
1032 1034 if (waitqueue_active(&ctx->sqo_wait))
1033 1035 wake_up(&ctx->sqo_wait);
1034   - if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
  1036 + if (trigger_ev)
1035 1037 eventfd_signal(ctx->cq_ev_fd, 1);
1036 1038 }
1037 1039  
  1040 +static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
  1041 +{
  1042 + __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
  1043 +}
  1044 +
1038 1045 /* Returns true if there are no backlogged entries after the flush */
1039 1046 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1040 1047 {
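
For reference, the ctx->eventfd_async check above is driven from userspace: registering the eventfd with IORING_REGISTER_EVENTFD_ASYNC (added in the same 5.6 cycle) makes the kernel signal it only for completions posted from async context (io-wq workers or interrupt), while plain IORING_REGISTER_EVENTFD signals for every completion. A rough userspace sketch, assuming UAPI headers recent enough to define both opcodes:

    #include <stdbool.h>
    #include <unistd.h>
    #include <sys/eventfd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /* Create an eventfd and attach it to the ring; returns the eventfd or -1. */
    static int register_ring_eventfd(int ring_fd, bool async_only)
    {
            unsigned int op = async_only ? IORING_REGISTER_EVENTFD_ASYNC
                                         : IORING_REGISTER_EVENTFD;
            int efd = eventfd(0, EFD_CLOEXEC);

            if (efd < 0)
                    return -1;
            if (syscall(__NR_io_uring_register, ring_fd, op, &efd, 1) < 0) {
                    close(efd);
                    return -1;
            }
            return efd;
    }
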
... ... @@ -1183,12 +1190,10 @@
1183 1190 ret = 1;
1184 1191 }
1185 1192 state->free_reqs = ret - 1;
1186   - state->cur_req = 1;
1187   - req = state->reqs[0];
  1193 + req = state->reqs[ret - 1];
1188 1194 } else {
1189   - req = state->reqs[state->cur_req];
1190 1195 state->free_reqs--;
1191   - state->cur_req++;
  1196 + req = state->reqs[state->free_reqs];
1192 1197 }
1193 1198  
1194 1199 got_it:
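
The change above drops cur_req and treats reqs[] as a plain LIFO stack indexed by free_reqs alone; the matching hunk further down lets io_submit_state_end() bulk-free whatever is left straight from state->reqs. A simplified userspace model of the indexing (not the kernel's actual structures):

    #include <stddef.h>

    #define BATCH_SIZE 8

    struct submit_state {
            void *reqs[BATCH_SIZE];
            unsigned int free_reqs;  /* cached entries live in reqs[0..free_reqs-1] */
    };

    /* Hand out the topmost cached entry, or NULL once the cache is empty. */
    static void *state_get_req(struct submit_state *state)
    {
            if (!state->free_reqs)
                    return NULL;  /* the real code bulk-allocates a fresh batch here */
            state->free_reqs--;
            return state->reqs[state->free_reqs];
    }
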
... ... @@ -1855,9 +1860,6 @@
1855 1860 unsigned ioprio;
1856 1861 int ret;
1857 1862  
1858   - if (!req->file)
1859   - return -EBADF;
1860   -
1861 1863 if (S_ISREG(file_inode(req->file)->i_mode))
1862 1864 req->flags |= REQ_F_ISREG;
1863 1865  
... ... @@ -1866,8 +1868,11 @@
1866 1868 req->flags |= REQ_F_CUR_POS;
1867 1869 kiocb->ki_pos = req->file->f_pos;
1868 1870 }
1869   - kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1870 1871 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
  1872 + kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
  1873 + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
  1874 + if (unlikely(ret))
  1875 + return ret;
1871 1876  
1872 1877 ioprio = READ_ONCE(sqe->ioprio);
1873 1878 if (ioprio) {
... ... @@ -1879,10 +1884,6 @@
1879 1884 } else
1880 1885 kiocb->ki_ioprio = get_current_ioprio();
1881 1886  
1882   - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1883   - if (unlikely(ret))
1884   - return ret;
1885   -
1886 1887 /* don't allow async punt if RWF_NOWAIT was requested */
1887 1888 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1888 1889 (req->file->f_flags & O_NONBLOCK))
... ... @@ -2164,10 +2165,12 @@
2164 2165 {
2165 2166 if (!io_op_defs[req->opcode].async_ctx)
2166 2167 return 0;
2167   - if (!req->io && io_alloc_async_ctx(req))
2168   - return -ENOMEM;
  2168 + if (!req->io) {
  2169 + if (io_alloc_async_ctx(req))
  2170 + return -ENOMEM;
2169 2171  
2170   - io_req_map_rw(req, io_size, iovec, fast_iov, iter);
  2172 + io_req_map_rw(req, io_size, iovec, fast_iov, iter);
  2173 + }
2171 2174 req->work.func = io_rw_async;
2172 2175 return 0;
2173 2176 }
... ... @@ -2724,9 +2727,16 @@
2724 2727 struct io_fadvise *fa = &req->fadvise;
2725 2728 int ret;
2726 2729  
2727   - /* DONTNEED may block, others _should_ not */
2728   - if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
2729   - return -EAGAIN;
  2730 + if (force_nonblock) {
  2731 + switch (fa->advice) {
  2732 + case POSIX_FADV_NORMAL:
  2733 + case POSIX_FADV_RANDOM:
  2734 + case POSIX_FADV_SEQUENTIAL:
  2735 + break;
  2736 + default:
  2737 + return -EAGAIN;
  2738 + }
  2739 + }
2730 2740  
2731 2741 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
2732 2742 if (ret < 0)
2733 2743  
2734 2744  
... ... @@ -2837,16 +2847,13 @@
2837 2847 int ret;
2838 2848  
2839 2849 ret = filp_close(req->close.put_file, req->work.files);
2840   - if (ret < 0) {
  2850 + if (ret < 0)
2841 2851 req_set_fail_links(req);
2842   - }
2843 2852 io_cqring_add_event(req, ret);
2844 2853 }
2845 2854  
2846 2855 fput(req->close.put_file);
2847 2856  
2848   - /* we bypassed the re-issue, drop the submission reference */
2849   - io_put_req(req);
2850 2857 io_put_req_find_next(req, &nxt);
2851 2858 if (nxt)
2852 2859 io_wq_assign_next(workptr, nxt);
... ... @@ -2888,7 +2895,13 @@
2888 2895  
2889 2896 eagain:
2890 2897 req->work.func = io_close_finish;
2891   - return -EAGAIN;
  2898 + /*
  2899 + * Do manual async queue here to avoid grabbing files - we don't
  2900 + * need the files, and it'll cause io_close_finish() to close
  2901 + * the file again and cause a double CQE entry for this request
  2902 + */
  2903 + io_queue_async_work(req);
  2904 + return 0;
2892 2905 }
2893 2906  
2894 2907 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
... ... @@ -3083,7 +3096,8 @@
3083 3096 else if (force_nonblock)
3084 3097 flags |= MSG_DONTWAIT;
3085 3098  
3086   - ret = __sys_sendmsg_sock(sock, &msg, flags);
  3099 + msg.msg_flags = flags;
  3100 + ret = sock_sendmsg(sock, &msg);
3087 3101 if (force_nonblock && ret == -EAGAIN)
3088 3102 return -EAGAIN;
3089 3103 if (ret == -ERESTARTSYS)
... ... @@ -3109,6 +3123,7 @@
3109 3123  
3110 3124 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3111 3125 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
  3126 + sr->len = READ_ONCE(sqe->len);
3112 3127  
3113 3128 if (!io || req->opcode == IORING_OP_RECV)
3114 3129 return 0;
... ... @@ -3227,7 +3242,7 @@
3227 3242 else if (force_nonblock)
3228 3243 flags |= MSG_DONTWAIT;
3229 3244  
3230   - ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
  3245 + ret = sock_recvmsg(sock, &msg, flags);
3231 3246 if (force_nonblock && ret == -EAGAIN)
3232 3247 return -EAGAIN;
3233 3248 if (ret == -ERESTARTSYS)
... ... @@ -3561,6 +3576,14 @@
3561 3576 __io_poll_flush(req->ctx, nodes);
3562 3577 }
3563 3578  
  3579 +static void io_poll_trigger_evfd(struct io_wq_work **workptr)
  3580 +{
  3581 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
  3582 +
  3583 + eventfd_signal(req->ctx->cq_ev_fd, 1);
  3584 + io_put_req(req);
  3585 +}
  3586 +
3564 3587 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3565 3588 void *key)
3566 3589 {
... ... @@ -3586,14 +3609,22 @@
3586 3609  
3587 3610 if (llist_empty(&ctx->poll_llist) &&
3588 3611 spin_trylock_irqsave(&ctx->completion_lock, flags)) {
  3612 + bool trigger_ev;
  3613 +
3589 3614 hash_del(&req->hash_node);
3590 3615 io_poll_complete(req, mask, 0);
3591   - req->flags |= REQ_F_COMP_LOCKED;
3592   - io_put_req(req);
3593   - spin_unlock_irqrestore(&ctx->completion_lock, flags);
3594 3616  
3595   - io_cqring_ev_posted(ctx);
3596   - req = NULL;
  3617 + trigger_ev = io_should_trigger_evfd(ctx);
  3618 + if (trigger_ev && eventfd_signal_count()) {
  3619 + trigger_ev = false;
  3620 + req->work.func = io_poll_trigger_evfd;
  3621 + } else {
  3622 + req->flags |= REQ_F_COMP_LOCKED;
  3623 + io_put_req(req);
  3624 + req = NULL;
  3625 + }
  3626 + spin_unlock_irqrestore(&ctx->completion_lock, flags);
  3627 + __io_cqring_ev_posted(ctx, trigger_ev);
3597 3628 } else {
3598 3629 req->result = mask;
3599 3630 req->llist_node.next = NULL;
... ... @@ -4815,8 +4846,7 @@
4815 4846 blk_finish_plug(&state->plug);
4816 4847 io_file_put(state);
4817 4848 if (state->free_reqs)
4818   - kmem_cache_free_bulk(req_cachep, state->free_reqs,
4819   - &state->reqs[state->cur_req]);
  4849 + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
4820 4850 }
4821 4851  
4822 4852 /*
... ... @@ -5041,7 +5071,8 @@
5041 5071 * reap events and wake us up.
5042 5072 */
5043 5073 if (inflight ||
5044   - (!time_after(jiffies, timeout) && ret != -EBUSY)) {
  5074 + (!time_after(jiffies, timeout) && ret != -EBUSY &&
  5075 + !percpu_ref_is_dying(&ctx->refs))) {
5045 5076 cond_resched();
5046 5077 continue;
5047 5078 }
... ... @@ -5231,15 +5262,10 @@
5231 5262 if (!data)
5232 5263 return -ENXIO;
5233 5264  
5234   - /* protect against inflight atomic switch, which drops the ref */
5235   - percpu_ref_get(&data->refs);
5236   - /* wait for existing switches */
5237   - flush_work(&data->ref_work);
5238 5265 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
5239   - wait_for_completion(&data->done);
5240   - percpu_ref_put(&data->refs);
5241   - /* flush potential new switch */
5242 5266 flush_work(&data->ref_work);
  5267 + wait_for_completion(&data->done);
  5268 + io_ring_file_ref_flush(data);
5243 5269 percpu_ref_exit(&data->refs);
5244 5270  
5245 5271 __io_sqe_files_unregister(ctx);
... ... @@ -5477,14 +5503,11 @@
5477 5503 struct completion *done;
5478 5504 };
5479 5505  
5480   -static void io_ring_file_ref_switch(struct work_struct *work)
  5506 +static void io_ring_file_ref_flush(struct fixed_file_data *data)
5481 5507 {
5482 5508 struct io_file_put *pfile, *tmp;
5483   - struct fixed_file_data *data;
5484 5509 struct llist_node *node;
5485 5510  
5486   - data = container_of(work, struct fixed_file_data, ref_work);
5487   -
5488 5511 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5489 5512 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5490 5513 io_ring_file_put(data->ctx, pfile->file);
... ... @@ -5494,7 +5517,14 @@
5494 5517 kfree(pfile);
5495 5518 }
5496 5519 }
  5520 +}
5497 5521  
  5522 +static void io_ring_file_ref_switch(struct work_struct *work)
  5523 +{
  5524 + struct fixed_file_data *data;
  5525 +
  5526 + data = container_of(work, struct fixed_file_data, ref_work);
  5527 + io_ring_file_ref_flush(data);
5498 5528 percpu_ref_get(&data->refs);
5499 5529 percpu_ref_switch_to_percpu(&data->refs);
5500 5530 }
... ... @@ -5505,8 +5535,14 @@
5505 5535  
5506 5536 data = container_of(ref, struct fixed_file_data, refs);
5507 5537  
5508   - /* we can't safely switch from inside this context, punt to wq */
5509   - queue_work(system_wq, &data->ref_work);
  5538 + /*
  5539 + * We can't safely switch from inside this context, punt to wq. If
  5540 + * the table ref is going away, the table is being unregistered.
  5541 + * Don't queue up the async work for that case, the caller will
  5542 + * handle it.
  5543 + */
  5544 + if (!percpu_ref_is_dying(&data->refs))
  5545 + queue_work(system_wq, &data->ref_work);
5510 5546 }
5511 5547  
5512 5548 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
... ... @@ -6295,6 +6331,16 @@
6295 6331 percpu_ref_kill(&ctx->refs);
6296 6332 mutex_unlock(&ctx->uring_lock);
6297 6333  
  6334 + /*
  6335 + * Wait for sq thread to idle, if we have one. It won't spin on new
  6336 + * work after we've killed the ctx ref above. This is important to do
  6337 + * before we cancel existing commands, as the thread could otherwise
  6338 + * be queueing new work post that. If that's work we need to cancel,
  6339 + * it could cause shutdown to hang.
  6340 + */
  6341 + while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
  6342 + cpu_relax();
  6343 +
6298 6344 io_kill_timeouts(ctx);
6299 6345 io_poll_remove_all(ctx);
6300 6346  
... ... @@ -6501,6 +6547,80 @@
6501 6547 return submitted ? submitted : ret;
6502 6548 }
6503 6549  
  6550 +static int io_uring_show_cred(int id, void *p, void *data)
  6551 +{
  6552 + const struct cred *cred = p;
  6553 + struct seq_file *m = data;
  6554 + struct user_namespace *uns = seq_user_ns(m);
  6555 + struct group_info *gi;
  6556 + kernel_cap_t cap;
  6557 + unsigned __capi;
  6558 + int g;
  6559 +
  6560 + seq_printf(m, "%5d\n", id);
  6561 + seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
  6562 + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
  6563 + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
  6564 + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
  6565 + seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
  6566 + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
  6567 + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
  6568 + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
  6569 + seq_puts(m, "\n\tGroups:\t");
  6570 + gi = cred->group_info;
  6571 + for (g = 0; g < gi->ngroups; g++) {
  6572 + seq_put_decimal_ull(m, g ? " " : "",
  6573 + from_kgid_munged(uns, gi->gid[g]));
  6574 + }
  6575 + seq_puts(m, "\n\tCapEff:\t");
  6576 + cap = cred->cap_effective;
  6577 + CAP_FOR_EACH_U32(__capi)
  6578 + seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
  6579 + seq_putc(m, '\n');
  6580 + return 0;
  6581 +}
  6582 +
  6583 +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
  6584 +{
  6585 + int i;
  6586 +
  6587 + mutex_lock(&ctx->uring_lock);
  6588 + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
  6589 + for (i = 0; i < ctx->nr_user_files; i++) {
  6590 + struct fixed_file_table *table;
  6591 + struct file *f;
  6592 +
  6593 + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
  6594 + f = table->files[i & IORING_FILE_TABLE_MASK];
  6595 + if (f)
  6596 + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
  6597 + else
  6598 + seq_printf(m, "%5u: <none>\n", i);
  6599 + }
  6600 + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
  6601 + for (i = 0; i < ctx->nr_user_bufs; i++) {
  6602 + struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
  6603 +
  6604 + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
  6605 + (unsigned int) buf->len);
  6606 + }
  6607 + if (!idr_is_empty(&ctx->personality_idr)) {
  6608 + seq_printf(m, "Personalities:\n");
  6609 + idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
  6610 + }
  6611 + mutex_unlock(&ctx->uring_lock);
  6612 +}
  6613 +
  6614 +static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
  6615 +{
  6616 + struct io_ring_ctx *ctx = f->private_data;
  6617 +
  6618 + if (percpu_ref_tryget(&ctx->refs)) {
  6619 + __io_uring_show_fdinfo(ctx, m);
  6620 + percpu_ref_put(&ctx->refs);
  6621 + }
  6622 +}
  6623 +
6504 6624 static const struct file_operations io_uring_fops = {
6505 6625 .release = io_uring_release,
6506 6626 .flush = io_uring_flush,
... ... @@ -6511,6 +6631,7 @@
6511 6631 #endif
6512 6632 .poll = io_uring_poll,
6513 6633 .fasync = io_uring_fasync,
  6634 + .show_fdinfo = io_uring_show_fdinfo,
6514 6635 };
6515 6636  
6516 6637 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
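
With .show_fdinfo wired up, the new state dump is readable through procfs. A small userspace sketch that prints it for a ring fd returned by io_uring_setup() (error handling trimmed):

    #include <stdio.h>

    static void dump_ring_fdinfo(int ring_fd)
    {
            char path[64], line[256];
            FILE *f;

            snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring_fd);
            f = fopen(path, "r");
            if (!f)
                    return;
            /* Includes the UserFiles/UserBufs/Personalities lines added above. */
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
    }
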
... ... @@ -6963,6 +7084,39 @@
6963 7084  
6964 7085 static int __init io_uring_init(void)
6965 7086 {
  7087 +#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
  7088 + BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
  7089 + BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
  7090 +} while (0)
  7091 +
  7092 +#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
  7093 + __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
  7094 + BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
  7095 + BUILD_BUG_SQE_ELEM(0, __u8, opcode);
  7096 + BUILD_BUG_SQE_ELEM(1, __u8, flags);
  7097 + BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
  7098 + BUILD_BUG_SQE_ELEM(4, __s32, fd);
  7099 + BUILD_BUG_SQE_ELEM(8, __u64, off);
  7100 + BUILD_BUG_SQE_ELEM(8, __u64, addr2);
  7101 + BUILD_BUG_SQE_ELEM(16, __u64, addr);
  7102 + BUILD_BUG_SQE_ELEM(24, __u32, len);
  7103 + BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
  7104 + BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
  7105 + BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
  7106 + BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
  7107 + BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
  7108 + BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
  7109 + BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
  7110 + BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
  7111 + BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
  7112 + BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
  7113 + BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
  7114 + BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
  7115 + BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
  7116 + BUILD_BUG_SQE_ELEM(32, __u64, user_data);
  7117 + BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
  7118 + BUILD_BUG_SQE_ELEM(42, __u16, personality);
  7119 +
6966 7120 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
6967 7121 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
6968 7122 return 0;
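
Each BUILD_BUG_SQE_ELEM() line above expands to an offset plus size check against struct io_uring_sqe, so any accidental reshuffling of the SQE layout now fails the kernel build. The same invariants can be spot-checked from userspace against the installed UAPI header, for example (illustrative only, C11 static_assert):

    #include <assert.h>
    #include <stddef.h>
    #include <linux/io_uring.h>

    static_assert(sizeof(struct io_uring_sqe) == 64, "sqe must stay 64 bytes");
    static_assert(offsetof(struct io_uring_sqe, opcode) == 0, "opcode at byte 0");
    static_assert(offsetof(struct io_uring_sqe, user_data) == 32, "user_data at byte 32");
    static_assert(offsetof(struct io_uring_sqe, buf_index) == 40, "buf_index at byte 40");
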
include/linux/eventfd.h
... ... @@ -12,6 +12,8 @@
12 12 #include <linux/fcntl.h>
13 13 #include <linux/wait.h>
14 14 #include <linux/err.h>
  15 +#include <linux/percpu-defs.h>
  16 +#include <linux/percpu.h>
15 17  
16 18 /*
17 19 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
... ... @@ -40,6 +42,13 @@
40 42 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
41 43 __u64 *cnt);
42 44  
  45 +DECLARE_PER_CPU(int, eventfd_wake_count);
  46 +
  47 +static inline bool eventfd_signal_count(void)
  48 +{
  49 + return this_cpu_read(eventfd_wake_count);
  50 +}
  51 +
43 52 #else /* CONFIG_EVENTFD */
44 53  
45 54 /*
... ... @@ -66,6 +75,11 @@
66 75 wait_queue_entry_t *wait, __u64 *cnt)
67 76 {
68 77 return -ENOSYS;
  78 +}
  79 +
  80 +static inline bool eventfd_signal_count(void)
  81 +{
  82 + return false;
69 83 }
70 84  
71 85 #endif