fs/io_uring.c

  // SPDX-License-Identifier: GPL-2.0
  /*
   * Shared application/kernel submission and completion ring pairs, for
   * supporting fast/efficient IO.
   *
   * A note on the read/write ordering memory barriers that are matched between
   * the application and kernel side.
   *
   * After the application reads the CQ ring tail, it must use an
   * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
   * before writing the tail (using smp_load_acquire to read the tail will
   * do). It also needs a smp_mb() before updating CQ head (ordering the
   * entry load(s) with the head store), pairing with an implicit barrier
   * through a control-dependency in io_get_cqring (smp_store_release to
   * store head will do). Failure to do so could lead to reading invalid
   * CQ entries.
   *
   * Likewise, the application must use an appropriate smp_wmb() before
   * writing the SQ tail (ordering SQ entry stores with the tail store),
   * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
   * to store the tail will do). And it needs a barrier ordering the SQ
   * head load before writing new SQ entries (smp_load_acquire to read
   * head will do).
   *
   * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
   * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
   * updating the SQ tail; a full memory barrier smp_mb() is needed
   * between.
   *
   * Also see the examples in the liburing library:
   *
   *	git://git.kernel.dk/liburing
   *
   * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   * from data shared between the kernel and application. This is done both
   * for ordering purposes, but also to ensure that once a value is loaded from
   * data that the application could potentially modify, it remains stable.
   *
   * Copyright (C) 2018-2019 Jens Axboe
   * Copyright (c) 2018-2019 Christoph Hellwig
   */
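/*
 * Purely illustrative sketch (not part of this file, and not built here):
 * under the ordering rules above, an application might submit and reap
 * entries roughly as follows, assuming hypothetical userspace pointers
 * (sq->tail, sq->array, sq->sqes, cq->head, cq->tail, cq->cqes, the masks)
 * obtained from the IORING_OFF_* mmaps, plus userspace equivalents of the
 * acquire/release helpers:
 *
 *	// submission: fill one SQE, then publish it via the tail
 *	unsigned tail = *sq->tail;
 *	unsigned idx = tail & *sq->mask;
 *	fill_sqe(&sq->sqes[idx]);			// hypothetical helper
 *	sq->array[idx] = idx;				// 1:1 slot mapping
 *	smp_store_release(sq->tail, tail + 1);		// pairs with io_get_sqring()
 *
 *	// completion: consume CQEs, then release the head
 *	unsigned head = *cq->head;
 *	while (head != smp_load_acquire(cq->tail)) {	// pairs with the kernel's tail store
 *		handle_cqe(&cq->cqes[head & *cq->mask]);// hypothetical consumer
 *		head++;
 *	}
 *	smp_store_release(cq->head, head);		// pairs with io_get_cqring()
 */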
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/syscalls.h>
  #include <linux/compat.h>
  #include <linux/refcount.h>
  #include <linux/uio.h>
  
  #include <linux/sched/signal.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/fdtable.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
  #include <linux/mmu_context.h>
  #include <linux/percpu.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>
  #include <linux/kthread.h>
  #include <linux/blkdev.h>
  #include <linux/bvec.h>
  #include <linux/net.h>
  #include <net/sock.h>
  #include <net/af_unix.h>
  #include <net/scm.h>
  #include <linux/anon_inodes.h>
  #include <linux/sched/mm.h>
  #include <linux/uaccess.h>
  #include <linux/nospec.h>
  #include <linux/sizes.h>
  #include <linux/hugetlb.h>
  #include <linux/highmem.h>
  #include <linux/fs_struct.h>
  
  #include <uapi/linux/io_uring.h>
  
  #include "internal.h"
  #define IORING_MAX_ENTRIES	32768
  #define IORING_MAX_FIXED_FILES	1024
  
  struct io_uring {
  	u32 head ____cacheline_aligned_in_smp;
  	u32 tail ____cacheline_aligned_in_smp;
  };
  /*
   * This data is shared with the application through the mmap at offsets
   * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
   *
   * The offsets to the member fields are published through struct
   * io_sqring_offsets when calling io_uring_setup.
   */
  struct io_rings {
  	/*
  	 * Head and tail offsets into the ring; the offsets need to be
  	 * masked to get valid indices.
  	 *
  	 * The kernel controls head of the sq ring and the tail of the cq ring,
  	 * and the application controls tail of the sq ring and the head of the
  	 * cq ring.
  	 */
  	struct io_uring		sq, cq;
  	/*
  	 * Bitmasks to apply to head and tail offsets (constant, equals
  	 * ring_entries - 1)
  	 */
  	u32			sq_ring_mask, cq_ring_mask;
  	/* Ring sizes (constant, power of 2) */
  	u32			sq_ring_entries, cq_ring_entries;
  	/*
  	 * Number of invalid entries dropped by the kernel due to
  	 * invalid index stored in array
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application (i.e. get number of "new events" by comparing to
  	 * cached value).
  	 *
  	 * After a new SQ head value was read by the application this
  	 * counter includes all submissions that were dropped reaching
  	 * the new SQ head (and possibly more).
  	 */
  	u32			sq_dropped;
  	/*
  	 * Runtime flags
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application.
  	 *
  	 * The application needs a full memory barrier before checking
  	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  	 */
  	u32			sq_flags;
  	/*
  	 * Number of completion events lost because the queue was full;
  	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
  	 * the completion queue.
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application (i.e. get number of "new events" by comparing to
  	 * cached value).
  	 *
  	 * As completion events come in out of order this counter is not
  	 * ordered with any other data.
  	 */
  	u32			cq_overflow;
  	/*
  	 * Ring buffer of completion events.
  	 *
  	 * The kernel writes completion events fresh every time they are
  	 * produced, so the application is allowed to modify pending
  	 * entries.
  	 */
  	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
  };
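/*
 * Illustrative note (example numbers, not from the code): with
 * cq_ring_entries == 256 the mask is 255, so a cq tail of 260 refers to
 * slot 260 & 255 == 4 of the cqes[] array above; sq indices are masked
 * the same way with sq_ring_mask.
 */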
  struct io_mapped_ubuf {
  	u64		ubuf;
  	size_t		len;
  	struct		bio_vec *bvec;
  	unsigned int	nr_bvecs;
  };
  struct async_list {
  	spinlock_t		lock;
  	atomic_t		cnt;
  	struct list_head	list;
  
  	struct file		*file;
  	off_t			io_start;
  	size_t			io_len;
  };
  struct io_ring_ctx {
  	struct {
  		struct percpu_ref	refs;
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		unsigned int		flags;
  		bool			compat;
  		bool			account_mem;
  		/*
  		 * Ring buffer of indices into array of io_uring_sqe, which is
  		 * mmapped by the application using the IORING_OFF_SQES offset.
  		 *
  		 * This indirection could e.g. be used to assign fixed
  		 * io_uring_sqe entries to operations and only submit them to
  		 * the queue when needed.
  		 *
  		 * The kernel modifies neither the indices array nor the entries
  		 * array.
  		 */
  		u32			*sq_array;
  		unsigned		cached_sq_head;
  		unsigned		sq_entries;
  		unsigned		sq_mask;
  		unsigned		sq_thread_idle;
  		unsigned		cached_sq_dropped;
  		struct io_uring_sqe	*sq_sqes;
  
  		struct list_head	defer_list;
  		struct list_head	timeout_list;
  	} ____cacheline_aligned_in_smp;
  
  	/* IO offload */
  	struct workqueue_struct	*sqo_wq[2];
  	struct task_struct	*sqo_thread;	/* if using sq thread polling */
  	struct mm_struct	*sqo_mm;
  	wait_queue_head_t	sqo_wait;
  	struct completion	sqo_thread_started;
  
  	struct {
  		unsigned		cached_cq_tail;
  		atomic_t		cached_cq_overflow;
  		unsigned		cq_entries;
  		unsigned		cq_mask;
  		struct wait_queue_head	cq_wait;
  		struct fasync_struct	*cq_fasync;
  		struct eventfd_ctx	*cq_ev_fd;
  		atomic_t		cq_timeouts;
  	} ____cacheline_aligned_in_smp;
  	struct io_rings	*rings;
  	/*
  	 * If used, fixed file set. Writers must ensure that ->refs is dead,
  	 * readers must ensure that ->refs is alive as long as the file* is
  	 * used. Only updated through io_uring_register(2).
  	 */
  	struct file		**user_files;
  	unsigned		nr_user_files;
  	/* if used, fixed mapped user buffers */
  	unsigned		nr_user_bufs;
  	struct io_mapped_ubuf	*user_bufs;
  	struct user_struct	*user;
  	const struct cred	*creds;

  	struct completion	ctx_done;
  
  	struct {
  		struct mutex		uring_lock;
  		wait_queue_head_t	wait;
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		spinlock_t		completion_lock;
  		bool			poll_multi_file;
  		/*
  		 * ->poll_list is protected by the ctx->uring_lock for
  		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
  		 * For SQPOLL, only the single threaded io_sq_thread() will
  		 * manipulate the list, hence no extra locking is needed there.
  		 */
  		struct list_head	poll_list;
  		struct list_head	cancel_list;
  	} ____cacheline_aligned_in_smp;
  	struct async_list	pending_async[2];
  #if defined(CONFIG_UNIX)
  	struct socket		*ring_sock;
  #endif
  };
  
  struct sqe_submit {
  	const struct io_uring_sqe	*sqe;
  	unsigned short			index;
  	u32				sequence;
  	bool				has_user;
  	bool				needs_lock;
  	bool				needs_fixed_file;
  };
  /*
   * First field must be the file pointer in all the
   * iocb unions! See also 'struct kiocb' in <linux/fs.h>
   */
  struct io_poll_iocb {
  	struct file			*file;
  	struct wait_queue_head		*head;
  	__poll_t			events;
  	bool				done;
  	bool				canceled;
  	struct wait_queue_entry		wait;
  };
  struct io_timeout {
  	struct file			*file;
  	struct hrtimer			timer;
  };
  /*
   * NOTE! Each of the iocb union members has the file pointer
   * as the first entry in their struct definition. So you can
   * access the file pointer through any of the sub-structs,
   * or directly as just 'ki_filp' in this struct.
   */
  struct io_kiocb {
  	union {
  		struct file		*file;
  		struct kiocb		rw;
  		struct io_poll_iocb	poll;
  		struct io_timeout	timeout;
  	};
  
  	struct sqe_submit	submit;
  
  	struct io_ring_ctx	*ctx;
  	struct list_head	list;
  	struct list_head	link_list;
  	unsigned int		flags;
  	refcount_t		refs;
  #define REQ_F_NOWAIT		1	/* must not punt to workers */
  #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
  #define REQ_F_FIXED_FILE	4	/* ctx owns file */
  #define REQ_F_SEQ_PREV		8	/* sequential with previous */
  #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
  #define REQ_F_IO_DRAINED	32	/* drain done */
  #define REQ_F_LINK		64	/* linked sqes */
  #define REQ_F_LINK_DONE		128	/* linked sqes done */
  #define REQ_F_FAIL_LINK		256	/* fail rest of links */
  #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
  #define REQ_F_TIMEOUT		1024	/* timeout request */
  #define REQ_F_ISREG		2048	/* regular file */
  #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
  #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
  	u64			user_data;
  	u32			result;
  	u32			sequence;

  	struct fs_struct	*fs;
  	struct work_struct	work;
  };
  
  #define IO_PLUG_THRESHOLD		2
  #define IO_IOPOLL_BATCH			8

  struct io_submit_state {
  	struct blk_plug		plug;
  
  	/*
  	 * io_kiocb alloc cache
  	 */
  	void			*reqs[IO_IOPOLL_BATCH];
  	unsigned		int free_reqs;
  	unsigned		int cur_req;
  
  	/*
  	 * File reference cache
  	 */
  	struct file		*file;
  	unsigned int		fd;
  	unsigned int		has_refs;
  	unsigned int		used_refs;
  	unsigned int		ios_left;
  };
  static void io_sq_wq_submit_work(struct work_struct *work);
  static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
  				 long res);
  static void __io_free_req(struct io_kiocb *req);

  static struct kmem_cache *req_cachep;
  
  static const struct file_operations io_uring_fops;
  
  struct sock *io_uring_get_socket(struct file *file)
  {
  #if defined(CONFIG_UNIX)
  	if (file->f_op == &io_uring_fops) {
  		struct io_ring_ctx *ctx = file->private_data;
  
  		return ctx->ring_sock->sk;
  	}
  #endif
  	return NULL;
  }
  EXPORT_SYMBOL(io_uring_get_socket);
  
  static void io_ring_ctx_ref_free(struct percpu_ref *ref)
  {
  	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
  
  	complete(&ctx->ctx_done);
  }
  
  static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
  {
  	struct io_ring_ctx *ctx;
  	int i;
  
  	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
  	if (!ctx)
  		return NULL;
  	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
  		kfree(ctx);
  		return NULL;
  	}
  
  	ctx->flags = p->flags;
  	init_waitqueue_head(&ctx->cq_wait);
  	init_completion(&ctx->ctx_done);
  	init_completion(&ctx->sqo_thread_started);
  	mutex_init(&ctx->uring_lock);
  	init_waitqueue_head(&ctx->wait);
  	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
  		spin_lock_init(&ctx->pending_async[i].lock);
  		INIT_LIST_HEAD(&ctx->pending_async[i].list);
  		atomic_set(&ctx->pending_async[i].cnt, 0);
  	}
  	spin_lock_init(&ctx->completion_lock);
  	INIT_LIST_HEAD(&ctx->poll_list);
  	INIT_LIST_HEAD(&ctx->cancel_list);
  	INIT_LIST_HEAD(&ctx->defer_list);
  	INIT_LIST_HEAD(&ctx->timeout_list);
  	return ctx;
  }
  static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
  				       struct io_kiocb *req)
  {
  	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
  					+ atomic_read(&ctx->cached_cq_overflow);
  }
  static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
  				     struct io_kiocb *req)
  {
  	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
  		return false;
  	return __io_sequence_defer(ctx, req);
  }
  static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
  	if (req && !io_sequence_defer(ctx, req)) {
  		list_del_init(&req->list);
  		return req;
  	}
  
  	return NULL;
  }
  static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  
  	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
  	if (req) {
  		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
  			return NULL;
  		if (!__io_sequence_defer(ctx, req)) {
  			list_del_init(&req->list);
  			return req;
  		}
  	}
  
  	return NULL;
  }
  static void __io_commit_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;

  	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
  		/* order cqe stores with ring update */
  		smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);

  		if (wq_has_sleeper(&ctx->cq_wait)) {
  			wake_up_interruptible(&ctx->cq_wait);
  			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
  		}
  	}
  }
  static inline void io_queue_async_work(struct io_ring_ctx *ctx,
  				       struct io_kiocb *req)
  {
  	int rw = 0;

  	if (req->submit.sqe) {
  		switch (req->submit.sqe->opcode) {
  		case IORING_OP_WRITEV:
  		case IORING_OP_WRITE_FIXED:
  			rw = !(req->rw.ki_flags & IOCB_DIRECT);
  			break;
  		}
  	}
  
  	queue_work(ctx->sqo_wq[rw], &req->work);
  }
  static void io_kill_timeout(struct io_kiocb *req)
  {
  	int ret;
  
  	ret = hrtimer_try_to_cancel(&req->timeout.timer);
  	if (ret != -1) {
  		atomic_inc(&req->ctx->cq_timeouts);
  		list_del(&req->list);
  		io_cqring_fill_event(req->ctx, req->user_data, 0);
  		__io_free_req(req);
  	}
  }
  
  static void io_kill_timeouts(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req, *tmp;
  
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
  		io_kill_timeout(req);
  	spin_unlock_irq(&ctx->completion_lock);
  }
  static void io_commit_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  	while ((req = io_get_timeout_req(ctx)) != NULL)
  		io_kill_timeout(req);
  	__io_commit_cqring(ctx);
  
  	while ((req = io_get_deferred_req(ctx)) != NULL) {
  		if (req->flags & REQ_F_SHADOW_DRAIN) {
  			/* Just for drain, free it. */
  			__io_free_req(req);
  			continue;
  		}
  		req->flags |= REQ_F_IO_DRAINED;
  		io_queue_async_work(ctx, req);
  	}
  }
  static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;
  	unsigned tail;
  
  	tail = ctx->cached_cq_tail;
  	/*
  	 * writes to the cq entry need to come after reading head; the
  	 * control dependency is enough as we're using WRITE_ONCE to
  	 * fill the cq entry
  	 */
  	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
  		return NULL;
  
  	ctx->cached_cq_tail++;
  	return &rings->cqes[tail & ctx->cq_mask];
  }
  
  static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
  				 long res)
  {
  	struct io_uring_cqe *cqe;
  
  	/*
  	 * If we can't get a cq entry, userspace overflowed the
  	 * submission (by quite a lot). Increment the overflow count in
  	 * the ring.
  	 */
  	cqe = io_get_cqring(ctx);
  	if (cqe) {
  		WRITE_ONCE(cqe->user_data, ki_user_data);
  		WRITE_ONCE(cqe->res, res);
  		WRITE_ONCE(cqe->flags, 0);
  	} else {
  		WRITE_ONCE(ctx->rings->cq_overflow,
  				atomic_inc_return(&ctx->cached_cq_overflow));
  	}
  }
  static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
  {
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
  	if (waitqueue_active(&ctx->sqo_wait))
  		wake_up(&ctx->sqo_wait);
  	if (ctx->cq_ev_fd)
  		eventfd_signal(ctx->cq_ev_fd, 1);
  }
  
  static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
  				long res)
  {
  	unsigned long flags;
  
  	spin_lock_irqsave(&ctx->completion_lock, flags);
  	io_cqring_fill_event(ctx, user_data, res);
  	io_commit_cqring(ctx);
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
  	io_cqring_ev_posted(ctx);
  }
  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
  				   struct io_submit_state *state)
  {
  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
  	struct io_kiocb *req;
  
  	if (!percpu_ref_tryget(&ctx->refs))
  		return NULL;
  	if (!state) {
  		req = kmem_cache_alloc(req_cachep, gfp);
  		if (unlikely(!req))
  			goto out;
  	} else if (!state->free_reqs) {
  		size_t sz;
  		int ret;
  
  		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
  		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
  
  		/*
  		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
  		 * retry single alloc to be on the safe side.
  		 */
  		if (unlikely(ret <= 0)) {
  			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
  			if (!state->reqs[0])
  				goto out;
  			ret = 1;
  		}
  		state->free_reqs = ret - 1;
  		state->cur_req = 1;
  		req = state->reqs[0];
  	} else {
  		req = state->reqs[state->cur_req];
  		state->free_reqs--;
  		state->cur_req++;
  	}
  	req->file = NULL;
  	req->ctx = ctx;
  	req->flags = 0;
  	/* one is dropped after submission, the other at completion */
  	refcount_set(&req->refs, 2);
  	req->result = 0;
  	req->fs = NULL;
  	return req;
  out:
  	percpu_ref_put(&ctx->refs);
  	return NULL;
  }
  static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
  {
  	if (*nr) {
  		kmem_cache_free_bulk(req_cachep, *nr, reqs);
  		percpu_ref_put_many(&ctx->refs, *nr);
  		*nr = 0;
  	}
  }
  static void __io_free_req(struct io_kiocb *req)
  {
  	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
  		fput(req->file);
  	percpu_ref_put(&req->ctx->refs);
  	kmem_cache_free(req_cachep, req);
  }
  static void io_req_link_next(struct io_kiocb *req)
  {
  	struct io_kiocb *nxt;
  
  	/*
	 * The list should never be empty when we are called here. But it could
	 * potentially happen if the chain is messed up; check to be on the
	 * safe side.
  	 */
  	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
  	if (nxt) {
  		list_del(&nxt->list);
  		if (!list_empty(&req->link_list)) {
  			INIT_LIST_HEAD(&nxt->link_list);
  			list_splice(&req->link_list, &nxt->link_list);
  			nxt->flags |= REQ_F_LINK;
  		}
  		nxt->flags |= REQ_F_LINK_DONE;
  		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
  		io_queue_async_work(req->ctx, nxt);
  	}
  }
  
  /*
   * Called if REQ_F_LINK is set, and we fail the head request
   */
  static void io_fail_links(struct io_kiocb *req)
  {
  	struct io_kiocb *link;
  
  	while (!list_empty(&req->link_list)) {
  		link = list_first_entry(&req->link_list, struct io_kiocb, list);
  		list_del(&link->list);
  
  		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
  		__io_free_req(link);
  	}
  }
  
  static void io_free_req(struct io_kiocb *req)
  {
  	/*
  	 * If LINK is set, we have dependent requests in this chain. If we
  	 * didn't fail this request, queue the first one up, moving any other
  	 * dependencies to the next request. In case of failure, fail the rest
  	 * of the chain.
  	 */
  	if (req->flags & REQ_F_LINK) {
  		if (req->flags & REQ_F_FAIL_LINK)
  			io_fail_links(req);
  		else
  			io_req_link_next(req);
  	}
  
  	__io_free_req(req);
  }
  static void io_put_req(struct io_kiocb *req)
  {
  	if (refcount_dec_and_test(&req->refs))
  		io_free_req(req);
  }
  static unsigned io_cqring_events(struct io_rings *rings)
  {
  	/* See comment at the top of this file */
  	smp_rmb();
  	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
  }
  static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;
  
  	/* make sure SQ entry isn't read before tail */
  	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
  }
  /*
   * Find and free completed poll iocbs
   */
  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
  			       struct list_head *done)
  {
  	void *reqs[IO_IOPOLL_BATCH];
  	struct io_kiocb *req;
  	int to_free;

  	to_free = 0;
  	while (!list_empty(done)) {
  		req = list_first_entry(done, struct io_kiocb, list);
  		list_del(&req->list);
  		io_cqring_fill_event(ctx, req->user_data, req->result);
  		(*nr_events)++;
  		if (refcount_dec_and_test(&req->refs)) {
  			/* If we're not using fixed files, we have to pair the
  			 * completion part with the file put. Use regular
  			 * completions for those, only batch free for fixed
  			 * file and non-linked commands.
  			 */
  			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
  			    REQ_F_FIXED_FILE) {
  				reqs[to_free++] = req;
  				if (to_free == ARRAY_SIZE(reqs))
  					io_free_req_many(ctx, reqs, &to_free);
  			} else {
  				io_free_req(req);
  			}
  		}
  	}

  	io_commit_cqring(ctx);
  	io_free_req_many(ctx, reqs, &to_free);
  }
  
  static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
  			long min)
  {
  	struct io_kiocb *req, *tmp;
  	LIST_HEAD(done);
  	bool spin;
  	int ret;
  
  	/*
  	 * Only spin for completions if we don't have multiple devices hanging
  	 * off our complete list, and we're under the requested amount.
  	 */
  	spin = !ctx->poll_multi_file && *nr_events < min;
  
  	ret = 0;
  	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
  		struct kiocb *kiocb = &req->rw;
  
  		/*
  		 * Move completed entries to our local list. If we find a
  		 * request that requires polling, break out and complete
  		 * the done list first, if we have entries there.
  		 */
  		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
  			list_move_tail(&req->list, &done);
  			continue;
  		}
  		if (!list_empty(&done))
  			break;
  
  		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
  		if (ret < 0)
  			break;
  
  		if (ret && spin)
  			spin = false;
  		ret = 0;
  	}
  
  	if (!list_empty(&done))
  		io_iopoll_complete(ctx, nr_events, &done);
  
  	return ret;
  }
  
  /*
 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
   * non-spinning poll check - we'll still enter the driver poll loop, but only
   * as a non-spinning completion check.
   */
  static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
  				long min)
  {
  	while (!list_empty(&ctx->poll_list) && !need_resched()) {
  		int ret;
  
  		ret = io_do_iopoll(ctx, nr_events, min);
  		if (ret < 0)
  			return ret;
  		if (!min || *nr_events >= min)
  			return 0;
  	}
  
  	return 1;
  }
  
  /*
   * We can't just wait for polled events to come to us, we have to actively
   * find and complete them.
   */
  static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
  {
  	if (!(ctx->flags & IORING_SETUP_IOPOLL))
  		return;
  
  	mutex_lock(&ctx->uring_lock);
  	while (!list_empty(&ctx->poll_list)) {
  		unsigned int nr_events = 0;
  
  		io_iopoll_getevents(ctx, &nr_events, 1);
  
  		/*
  		 * Ensure we allow local-to-the-cpu processing to take place,
  		 * in this case we need to ensure that we reap all events.
  		 */
  		cond_resched();
  	}
  	mutex_unlock(&ctx->uring_lock);
  }
  static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
  			   long min)
  {
  	int iters = 0, ret = 0;

  	/*
  	 * We disallow the app entering submit/complete with polling, but we
  	 * still need to lock the ring to prevent racing with polled issue
  	 * that got punted to a workqueue.
  	 */
  	mutex_lock(&ctx->uring_lock);
  	do {
  		int tmin = 0;
  		/*
  		 * Don't enter poll loop if we already have events pending.
  		 * If we do, we can potentially be spinning for commands that
  		 * already triggered a CQE (eg in error).
  		 */
  		if (io_cqring_events(ctx->rings))
  			break;
  
  		/*
  		 * If a submit got punted to a workqueue, we can have the
  		 * application entering polling for a command before it gets
  		 * issued. That app will hold the uring_lock for the duration
  		 * of the poll right here, so we need to take a breather every
  		 * now and then to ensure that the issue has a chance to add
  		 * the poll to the issued list. Otherwise we can spin here
  		 * forever, while the workqueue is stuck trying to acquire the
  		 * very same mutex.
  		 */
  		if (!(++iters & 7)) {
  			mutex_unlock(&ctx->uring_lock);
  			mutex_lock(&ctx->uring_lock);
  		}
  		if (*nr_events < min)
  			tmin = min - *nr_events;
  
  		ret = io_iopoll_getevents(ctx, nr_events, tmin);
  		if (ret <= 0)
  			break;
  		ret = 0;
  	} while (min && !*nr_events && !need_resched());
  	mutex_unlock(&ctx->uring_lock);
  	return ret;
  }
  static void kiocb_end_write(struct io_kiocb *req)
  {
  	/*
  	 * Tell lockdep we inherited freeze protection from submission
  	 * thread.
  	 */
  	if (req->flags & REQ_F_ISREG) {
  		struct inode *inode = file_inode(req->file);

  		__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
  	}
  	file_end_write(req->file);
  }
  
  static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
  {
  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
  	if (kiocb->ki_flags & IOCB_WRITE)
  		kiocb_end_write(req);

  	if ((req->flags & REQ_F_LINK) && res != req->result)
  		req->flags |= REQ_F_FAIL_LINK;
  	io_cqring_add_event(req->ctx, req->user_data, res);
  	io_put_req(req);
  }
  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  {
  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
  	if (kiocb->ki_flags & IOCB_WRITE)
  		kiocb_end_write(req);

  	if ((req->flags & REQ_F_LINK) && res != req->result)
  		req->flags |= REQ_F_FAIL_LINK;
  	req->result = res;
  	if (res != -EAGAIN)
  		req->flags |= REQ_F_IOPOLL_COMPLETED;
  }
  
  /*
   * After the iocb has been issued, it's safe to be found on the poll list.
   * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_iopoll_getevents() thread before the issuer is done
   * accessing the kiocb cookie.
   */
  static void io_iopoll_req_issued(struct io_kiocb *req)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  
  	/*
  	 * Track whether we have multiple files in our lists. This will impact
  	 * how we do polling eventually, not spinning if we're on potentially
  	 * different devices.
  	 */
  	if (list_empty(&ctx->poll_list)) {
  		ctx->poll_multi_file = false;
  	} else if (!ctx->poll_multi_file) {
  		struct io_kiocb *list_req;
  
  		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
  						list);
  		if (list_req->rw.ki_filp != req->rw.ki_filp)
  			ctx->poll_multi_file = true;
  	}
  
  	/*
  	 * For fast devices, IO may have already completed. If it has, add
  	 * it to the front so we find it first.
  	 */
  	if (req->flags & REQ_F_IOPOLL_COMPLETED)
  		list_add(&req->list, &ctx->poll_list);
  	else
  		list_add_tail(&req->list, &ctx->poll_list);
  }
  static void io_file_put(struct io_submit_state *state)
  {
  	if (state->file) {
  		int diff = state->has_refs - state->used_refs;
  
  		if (diff)
  			fput_many(state->file, diff);
  		state->file = NULL;
  	}
  }
  
  /*
   * Get as many references to a file as we have IOs left in this submission,
   * assuming most submissions are for one file, or at least that each file
   * has more than one submission.
   */
  static struct file *io_file_get(struct io_submit_state *state, int fd)
  {
  	if (!state)
  		return fget(fd);
  
  	if (state->file) {
  		if (state->fd == fd) {
  			state->used_refs++;
  			state->ios_left--;
  			return state->file;
  		}
  		io_file_put(state);
  	}
  	state->file = fget_many(fd, state->ios_left);
  	if (!state->file)
  		return NULL;
  
  	state->fd = fd;
  	state->has_refs = state->ios_left;
  	state->used_refs = 1;
  	state->ios_left--;
  	return state->file;
  }
  /*
   * If we tracked the file through the SCM inflight mechanism, we could support
   * any file. For now, just ensure that anything potentially problematic is done
   * inline.
   */
  static bool io_file_supports_async(struct file *file)
  {
  	umode_t mode = file_inode(file)->i_mode;
  
  	if (S_ISBLK(mode) || S_ISCHR(mode))
  		return true;
  	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
  		return true;
  
  	return false;
  }
  static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
  		      bool force_nonblock)
  {
  	const struct io_uring_sqe *sqe = s->sqe;
  	struct io_ring_ctx *ctx = req->ctx;
  	struct kiocb *kiocb = &req->rw;
  	unsigned ioprio;
  	int ret;

  	if (!req->file)
  		return -EBADF;

  	if (S_ISREG(file_inode(req->file)->i_mode))
  		req->flags |= REQ_F_ISREG;
  
  	/*
  	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
  	 * we know to async punt it even if it was opened O_NONBLOCK
  	 */
  	if (force_nonblock && !io_file_supports_async(req->file)) {
  		req->flags |= REQ_F_MUST_PUNT;
  		return -EAGAIN;
  	}

  	kiocb->ki_pos = READ_ONCE(sqe->off);
  	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
  	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
  
  	ioprio = READ_ONCE(sqe->ioprio);
  	if (ioprio) {
  		ret = ioprio_check_cap(ioprio);
  		if (ret)
  			return ret;
  
  		kiocb->ki_ioprio = ioprio;
  	} else
  		kiocb->ki_ioprio = get_current_ioprio();
  
  	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
  	if (unlikely(ret))
  		return ret;
  
  	/* don't allow async punt if RWF_NOWAIT was requested */
  	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
  	    (req->file->f_flags & O_NONBLOCK))
  		req->flags |= REQ_F_NOWAIT;
  
  	if (force_nonblock)
  		kiocb->ki_flags |= IOCB_NOWAIT;

  	if (ctx->flags & IORING_SETUP_IOPOLL) {
  		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
  		    !kiocb->ki_filp->f_op->iopoll)
  			return -EOPNOTSUPP;

  		kiocb->ki_flags |= IOCB_HIPRI;
  		kiocb->ki_complete = io_complete_rw_iopoll;
  		req->result = 0;
  	} else {
  		if (kiocb->ki_flags & IOCB_HIPRI)
  			return -EINVAL;
  		kiocb->ki_complete = io_complete_rw;
  	}
  	return 0;
  }
  
  static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
  {
  	switch (ret) {
  	case -EIOCBQUEUED:
  		break;
  	case -ERESTARTSYS:
  	case -ERESTARTNOINTR:
  	case -ERESTARTNOHAND:
  	case -ERESTART_RESTARTBLOCK:
  		/*
  		 * We can't just restart the syscall, since previously
  		 * submitted sqes may already be in progress. Just fail this
  		 * IO with EINTR.
  		 */
  		ret = -EINTR;
  		/* fall through */
  	default:
  		kiocb->ki_complete(kiocb, ret, 0);
  	}
  }
  static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
  			   const struct io_uring_sqe *sqe,
  			   struct iov_iter *iter)
  {
  	size_t len = READ_ONCE(sqe->len);
  	struct io_mapped_ubuf *imu;
  	unsigned index, buf_index;
  	size_t offset;
  	u64 buf_addr;
  
  	/* attempt to use fixed buffers without having provided iovecs */
  	if (unlikely(!ctx->user_bufs))
  		return -EFAULT;
  
  	buf_index = READ_ONCE(sqe->buf_index);
  	if (unlikely(buf_index >= ctx->nr_user_bufs))
  		return -EFAULT;
  
  	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
  	imu = &ctx->user_bufs[index];
  	buf_addr = READ_ONCE(sqe->addr);
  
  	/* overflow */
  	if (buf_addr + len < buf_addr)
  		return -EFAULT;
  	/* not inside the mapped region */
  	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
  		return -EFAULT;
  
  	/*
  	 * May not be a start of buffer, set size appropriately
  	 * and advance us to the beginning.
  	 */
  	offset = buf_addr - imu->ubuf;
  	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
  
  	if (offset) {
  		/*
  		 * Don't use iov_iter_advance() here, as it's really slow for
  		 * using the latter parts of a big fixed buffer - it iterates
  		 * over each segment manually. We can cheat a bit here, because
  		 * we know that:
  		 *
  		 * 1) it's a BVEC iter, we set it up
  		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
  		 *    first and last bvec
  		 *
  		 * So just find our index, and adjust the iterator afterwards.
  		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
  		 * since we can just skip the first segment, which may not
  		 * be PAGE_SIZE aligned.
  		 */
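		/*
		 * Hypothetical worked example (illustrative only): with 4K
		 * pages and a 4096-byte first bvec, an offset of 3 * 4096 +
		 * 100 gives seg_skip = 1 + ((offset - 4096) >> PAGE_SHIFT)
		 * == 3, so we land on the fourth bvec with iov_offset == 100.
		 */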
  		const struct bio_vec *bvec = imu->bvec;
  
  		if (offset <= bvec->bv_len) {
  			iov_iter_advance(iter, offset);
  		} else {
  			unsigned long seg_skip;
  
  			/* skip first vec */
  			offset -= bvec->bv_len;
  			seg_skip = 1 + (offset >> PAGE_SHIFT);
  
  			iter->bvec = bvec + seg_skip;
  			iter->nr_segs -= seg_skip;
  			iter->count -= bvec->bv_len + offset;
  			iter->iov_offset = offset & ~PAGE_MASK;
  		}
  	}
  	return len;
  }
  static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
  			       const struct sqe_submit *s, struct iovec **iovec,
  			       struct iov_iter *iter)
  {
  	const struct io_uring_sqe *sqe = s->sqe;
  	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
  	size_t sqe_len = READ_ONCE(sqe->len);
  	u8 opcode;
  
  	/*
  	 * We're reading ->opcode for the second time, but the first read
  	 * doesn't care whether it's _FIXED or not, so it doesn't matter
  	 * whether ->opcode changes concurrently. The first read does care
  	 * about whether it is a READ or a WRITE, so we don't trust this read
  	 * for that purpose and instead let the caller pass in the read/write
  	 * flag.
  	 */
  	opcode = READ_ONCE(sqe->opcode);
  	if (opcode == IORING_OP_READ_FIXED ||
  	    opcode == IORING_OP_WRITE_FIXED) {
  		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
  		*iovec = NULL;
  		return ret;
  	}
  
  	if (!s->has_user)
  		return -EFAULT;
  
  #ifdef CONFIG_COMPAT
  	if (ctx->compat)
  		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
  						iovec, iter);
  #endif
  
  	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
  }
  static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
  {
  	if (al->file == kiocb->ki_filp) {
  		off_t start, end;
  
  		/*
  		 * Allow merging if we're anywhere in the range of the same
  		 * page. Generally this happens for sub-page reads or writes,
  		 * and it's beneficial to allow the first worker to bring the
  		 * page in and the piggy backed work can then work on the
  		 * cached page.
  		 */
  		start = al->io_start & PAGE_MASK;
  		end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
  		if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
  			return true;
  	}
  
  	al->file = NULL;
  	return false;
  }
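/*
 * Illustrative example (made-up numbers): with 4K pages, a previously punted
 * request with io_start == 100 and io_len == 50 gives start == 0 and
 * end == 4096 above, so a new kiocb on the same file with ki_pos anywhere in
 * [0, 4096] is considered mergeable.
 */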
  /*
   * Make a note of the last file/offset/direction we punted to async
   * context. We'll use this information to see if we can piggyback a
   * sequential request onto the previous one, if it still hasn't been
   * completed by the async worker.
   */
  static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
  {
  	struct async_list *async_list = &req->ctx->pending_async[rw];
  	struct kiocb *kiocb = &req->rw;
  	struct file *filp = kiocb->ki_filp;
31b515106   Jens Axboe   io_uring: allow w...
1241

6d5d5ac52   Jens Axboe   io_uring: extend ...
1242
  	if (io_should_merge(async_list, kiocb)) {
9310a7ba6   Zhengyuan Liu   io_uring: track i...
1243
  		unsigned long max_bytes;
31b515106   Jens Axboe   io_uring: allow w...
1244
1245
  
  		/* Use 8x RA size as a decent limiter for both reads/writes */
9310a7ba6   Zhengyuan Liu   io_uring: track i...
1246
1247
1248
1249
1250
1251
  		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
  		if (!max_bytes)
  			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
  
  		/* If the max length is exceeded, reset the state */
  		if (async_list->io_len + len <= max_bytes) {
31b515106   Jens Axboe   io_uring: allow w...
1252
  			req->flags |= REQ_F_SEQ_PREV;
9310a7ba6   Zhengyuan Liu   io_uring: track i...
1253
  			async_list->io_len += len;
31b515106   Jens Axboe   io_uring: allow w...
1254
  		} else {
6d5d5ac52   Jens Axboe   io_uring: extend ...
1255
  			async_list->file = NULL;
31b515106   Jens Axboe   io_uring: allow w...
1256
1257
1258
1259
1260
  		}
  	}
  
  	/* New file? Reset state. */
  	if (async_list->file != filp) {
6d5d5ac52   Jens Axboe   io_uring: extend ...
1261
1262
  		async_list->io_start = kiocb->ki_pos;
  		async_list->io_len = len;
31b515106   Jens Axboe   io_uring: allow w...
1263
1264
  		async_list->file = filp;
  	}
31b515106   Jens Axboe   io_uring: allow w...
1265
  }
32960613b   Jens Axboe   io_uring: correct...
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
  /*
   * For files that don't have ->read_iter() and ->write_iter(), handle them
   * by looping over ->read() or ->write() manually.
   */
  static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
  			   struct iov_iter *iter)
  {
  	ssize_t ret = 0;
  
  	/*
  	 * Don't support polled IO through this interface, and we can't
  	 * support non-blocking either. For the latter, this just causes
  	 * the kiocb to be handled from an async context.
  	 */
  	if (kiocb->ki_flags & IOCB_HIPRI)
  		return -EOPNOTSUPP;
  	if (kiocb->ki_flags & IOCB_NOWAIT)
  		return -EAGAIN;
  
  	while (iov_iter_count(iter)) {
f246eedba   Pavel Begunkov   io_uring: fix dea...
1286
  		struct iovec iovec;
32960613b   Jens Axboe   io_uring: correct...
1287
  		ssize_t nr;
f246eedba   Pavel Begunkov   io_uring: fix dea...
1288
1289
1290
1291
1292
1293
1294
1295
1296
  		if (!iov_iter_is_bvec(iter)) {
  			iovec = iov_iter_iovec(iter);
  		} else {
  			/* fixed buffers import bvec */
  			iovec.iov_base = kmap(iter->bvec->bv_page)
  						+ iter->iov_offset;
  			iovec.iov_len = min(iter->count,
  					iter->bvec->bv_len - iter->iov_offset);
  		}
32960613b   Jens Axboe   io_uring: correct...
1297
1298
1299
1300
1301
1302
1303
  		if (rw == READ) {
  			nr = file->f_op->read(file, iovec.iov_base,
  					      iovec.iov_len, &kiocb->ki_pos);
  		} else {
  			nr = file->f_op->write(file, iovec.iov_base,
  					       iovec.iov_len, &kiocb->ki_pos);
  		}
f246eedba   Pavel Begunkov   io_uring: fix dea...
1304
1305
  		if (iov_iter_is_bvec(iter))
  			kunmap(iter->bvec->bv_page);
32960613b   Jens Axboe   io_uring: correct...
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
  		if (nr < 0) {
  			if (!ret)
  				ret = nr;
  			break;
  		}
  		ret += nr;
  		if (nr != iovec.iov_len)
  			break;
  		iov_iter_advance(iter, nr);
  	}
  
  	return ret;
  }
e0c5c576d   Jens Axboe   io_uring: make io...
1319
  static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
8358e3a82   Jens Axboe   io_uring: remove ...
1320
  		   bool force_nonblock)
2b188cc1b   Jens Axboe   Add io_uring IO i...
1321
1322
1323
1324
1325
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct kiocb *kiocb = &req->rw;
  	struct iov_iter iter;
  	struct file *file;
31b515106   Jens Axboe   io_uring: allow w...
1326
  	size_t iov_count;
9d93a3f5a   Jens Axboe   io_uring: punt sh...
1327
  	ssize_t read_size, ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1328

8358e3a82   Jens Axboe   io_uring: remove ...
1329
  	ret = io_prep_rw(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1330
1331
1332
  	if (ret)
  		return ret;
  	file = kiocb->ki_filp;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1333
  	if (unlikely(!(file->f_mode & FMODE_READ)))
09bb83943   Jens Axboe   io_uring: fix fge...
1334
  		return -EBADF;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1335
1336
  
  	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
87e5e6dab   Jens Axboe   uio: make import_...
1337
  	if (ret < 0)
09bb83943   Jens Axboe   io_uring: fix fge...
1338
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1339

9d93a3f5a   Jens Axboe   io_uring: punt sh...
1340
  	read_size = ret;
9e645e110   Jens Axboe   io_uring: add sup...
1341
1342
  	if (req->flags & REQ_F_LINK)
  		req->result = read_size;
31b515106   Jens Axboe   io_uring: allow w...
1343
1344
  	iov_count = iov_iter_count(&iter);
  	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1345
1346
  	if (!ret) {
  		ssize_t ret2;
32960613b   Jens Axboe   io_uring: correct...
1347
1348
1349
1350
  		if (file->f_op->read_iter)
  			ret2 = call_read_iter(file, kiocb, &iter);
  		else
  			ret2 = loop_rw_iter(READ, file, kiocb, &iter);
9d93a3f5a   Jens Axboe   io_uring: punt sh...
1351
1352
1353
1354
1355
1356
1357
1358
  		/*
  		 * In case of a short read, punt to async. This can happen
  		 * if we have data partially cached. Alternatively we can
  		 * return the short read, in which case the application will
  		 * need to issue another SQE and wait for it. That SQE will
  		 * need async punt anyway, so it's more efficient to do it
  		 * here.
  		 */
491381ce0   Jens Axboe   io_uring: fix up ...
1359
1360
1361
  		if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
  		    (req->flags & REQ_F_ISREG) &&
  		    ret2 > 0 && ret2 < read_size)
9d93a3f5a   Jens Axboe   io_uring: punt sh...
1362
1363
  			ret2 = -EAGAIN;
  		/* Catch -EAGAIN return for forced non-blocking submission */
31b515106   Jens Axboe   io_uring: allow w...
1364
  		if (!force_nonblock || ret2 != -EAGAIN) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
1365
  			io_rw_done(kiocb, ret2);
31b515106   Jens Axboe   io_uring: allow w...
1366
1367
1368
1369
1370
1371
1372
  		} else {
  			/*
  			 * If ->needs_lock is true, we're already in async
  			 * context.
  			 */
  			if (!s->needs_lock)
  				io_async_list_note(READ, req, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1373
  			ret = -EAGAIN;
31b515106   Jens Axboe   io_uring: allow w...
1374
  		}
2b188cc1b   Jens Axboe   Add io_uring IO i...
1375
1376
  	}
  	kfree(iovec);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1377
1378
  	return ret;
  }
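  
  /*
   * Example (userspace sketch, assuming liburing and an open fd; error
   * handling omitted): a vectored read that ends up in io_read() above is
   * queued as IORING_OP_READV.
   *
   *	struct io_uring ring;
   *	struct io_uring_cqe *cqe;
   *	char buf[4096];
   *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
   *
   *	io_uring_queue_init(8, &ring, 0);
   *	io_uring_prep_readv(io_uring_get_sqe(&ring), fd, &iov, 1, 0);
   *	io_uring_submit(&ring);
   *	io_uring_wait_cqe(&ring, &cqe);
   *	(cqe->res is the number of bytes read, or a negative errno)
   *	io_uring_cqe_seen(&ring, cqe);
   */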
e0c5c576d   Jens Axboe   io_uring: make io...
1379
  static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
8358e3a82   Jens Axboe   io_uring: remove ...
1380
  		    bool force_nonblock)
2b188cc1b   Jens Axboe   Add io_uring IO i...
1381
1382
1383
1384
1385
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct kiocb *kiocb = &req->rw;
  	struct iov_iter iter;
  	struct file *file;
31b515106   Jens Axboe   io_uring: allow w...
1386
  	size_t iov_count;
87e5e6dab   Jens Axboe   uio: make import_...
1387
  	ssize_t ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1388

8358e3a82   Jens Axboe   io_uring: remove ...
1389
  	ret = io_prep_rw(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1390
1391
  	if (ret)
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1392

2b188cc1b   Jens Axboe   Add io_uring IO i...
1393
1394
  	file = kiocb->ki_filp;
  	if (unlikely(!(file->f_mode & FMODE_WRITE)))
09bb83943   Jens Axboe   io_uring: fix fge...
1395
  		return -EBADF;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1396
1397
  
  	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
87e5e6dab   Jens Axboe   uio: make import_...
1398
  	if (ret < 0)
09bb83943   Jens Axboe   io_uring: fix fge...
1399
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1400

9e645e110   Jens Axboe   io_uring: add sup...
1401
1402
  	if (req->flags & REQ_F_LINK)
  		req->result = ret;
31b515106   Jens Axboe   io_uring: allow w...
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
  	iov_count = iov_iter_count(&iter);
  
  	ret = -EAGAIN;
  	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
  		/* If ->needs_lock is true, we're already in async context. */
  		if (!s->needs_lock)
  			io_async_list_note(WRITE, req, iov_count);
  		goto out_free;
  	}
  
  	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1414
  	if (!ret) {
9bf7933fc   Roman Penyaev   io_uring: offload...
1415
  		ssize_t ret2;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1416
1417
1418
1419
1420
1421
1422
  		/*
  		 * Open-code file_start_write here to grab freeze protection,
  		 * which will be released by another thread in
  		 * io_complete_rw().  Fool lockdep by telling it the lock got
  		 * released so that it doesn't complain about the held lock when
  		 * we return to userspace.
  		 */
491381ce0   Jens Axboe   io_uring: fix up ...
1423
  		if (req->flags & REQ_F_ISREG) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
1424
1425
1426
1427
1428
1429
  			__sb_start_write(file_inode(file)->i_sb,
  						SB_FREEZE_WRITE, true);
  			__sb_writers_release(file_inode(file)->i_sb,
  						SB_FREEZE_WRITE);
  		}
  		kiocb->ki_flags |= IOCB_WRITE;
9bf7933fc   Roman Penyaev   io_uring: offload...
1430

32960613b   Jens Axboe   io_uring: correct...
1431
1432
1433
1434
  		if (file->f_op->write_iter)
  			ret2 = call_write_iter(file, kiocb, &iter);
  		else
  			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
9bf7933fc   Roman Penyaev   io_uring: offload...
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
  		if (!force_nonblock || ret2 != -EAGAIN) {
  			io_rw_done(kiocb, ret2);
  		} else {
  			/*
  			 * If ->needs_lock is true, we're already in async
  			 * context.
  			 */
  			if (!s->needs_lock)
  				io_async_list_note(WRITE, req, iov_count);
  			ret = -EAGAIN;
  		}
2b188cc1b   Jens Axboe   Add io_uring IO i...
1446
  	}
31b515106   Jens Axboe   io_uring: allow w...
1447
  out_free:
2b188cc1b   Jens Axboe   Add io_uring IO i...
1448
  	kfree(iovec);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
  	return ret;
  }
  
  /*
   * IORING_OP_NOP just posts a completion event, nothing else.
   */
  static int io_nop(struct io_kiocb *req, u64 user_data)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	long err = 0;
def596e95   Jens Axboe   io_uring: support...
1459
1460
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
c71ffb673   Jens Axboe   io_uring: remove ...
1461
  	io_cqring_add_event(ctx, user_data, err);
e65ef56db   Jens Axboe   io_uring: use reg...
1462
  	io_put_req(req);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1463
1464
  	return 0;
  }
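  
  /*
   * Example (userspace sketch, assuming liburing; error handling omitted):
   * IORING_OP_NOP only posts a completion, which makes it a convenient way
   * to verify that a ring round-trips at all.
   *
   *	struct io_uring ring;
   *	struct io_uring_cqe *cqe;
   *
   *	io_uring_queue_init(4, &ring, 0);
   *	io_uring_prep_nop(io_uring_get_sqe(&ring));
   *	io_uring_submit(&ring);
   *	io_uring_wait_cqe(&ring, &cqe);
   *	io_uring_cqe_seen(&ring, cqe);
   *	io_uring_queue_exit(&ring);
   */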
c992fe292   Christoph Hellwig   io_uring: add fsy...
1465
1466
  static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
6b06314c4   Jens Axboe   io_uring: add fil...
1467
  	struct io_ring_ctx *ctx = req->ctx;
c992fe292   Christoph Hellwig   io_uring: add fsy...
1468

09bb83943   Jens Axboe   io_uring: fix fge...
1469
1470
  	if (!req->file)
  		return -EBADF;
c992fe292   Christoph Hellwig   io_uring: add fsy...
1471

6b06314c4   Jens Axboe   io_uring: add fil...
1472
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e95   Jens Axboe   io_uring: support...
1473
  		return -EINVAL;
edafccee5   Jens Axboe   io_uring: add sup...
1474
  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe292   Christoph Hellwig   io_uring: add fsy...
1475
  		return -EINVAL;
c992fe292   Christoph Hellwig   io_uring: add fsy...
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
  	return 0;
  }
  
  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		    bool force_nonblock)
  {
  	loff_t sqe_off = READ_ONCE(sqe->off);
  	loff_t sqe_len = READ_ONCE(sqe->len);
  	loff_t end = sqe_off + sqe_len;
  	unsigned fsync_flags;
  	int ret;
  
  	fsync_flags = READ_ONCE(sqe->fsync_flags);
  	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
  		return -EINVAL;
  
  	ret = io_prep_fsync(req, sqe);
  	if (ret)
  		return ret;
  
  	/* fsync always requires a blocking context */
  	if (force_nonblock)
  		return -EAGAIN;
  
  	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
  				end > 0 ? end : LLONG_MAX,
  				fsync_flags & IORING_FSYNC_DATASYNC);
9e645e110   Jens Axboe   io_uring: add sup...
1503
1504
  	if (ret < 0 && (req->flags & REQ_F_LINK))
  		req->flags |= REQ_F_FAIL_LINK;
c71ffb673   Jens Axboe   io_uring: remove ...
1505
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
1506
  	io_put_req(req);
c992fe292   Christoph Hellwig   io_uring: add fsy...
1507
1508
  	return 0;
  }
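  
  /*
   * Example (userspace sketch, assuming liburing, an initialized ring and
   * an open fd; my_tag is any cookie the application wants back in
   * cqe->user_data): an fsync handled above, restricted to data with
   * IORING_FSYNC_DATASYNC. sqe->off/sqe->len can bound the range; the
   * helper leaves them at zero, which the code above turns into a
   * whole-file sync (LLONG_MAX).
   *
   *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
   *
   *	io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
   *	io_uring_sqe_set_data(sqe, my_tag);
   *	io_uring_submit(&ring);
   */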
5d17b4a4b   Jens Axboe   io_uring: add sup...
1509
1510
1511
1512
1513
1514
1515
  static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	int ret = 0;
  
  	if (!req->file)
  		return -EBADF;
5d17b4a4b   Jens Axboe   io_uring: add sup...
1516
1517
1518
1519
1520
  
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
  		return -EINVAL;
5d17b4a4b   Jens Axboe   io_uring: add sup...
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
  	return ret;
  }
  
  static int io_sync_file_range(struct io_kiocb *req,
  			      const struct io_uring_sqe *sqe,
  			      bool force_nonblock)
  {
  	loff_t sqe_off;
  	loff_t sqe_len;
  	unsigned flags;
  	int ret;
  
  	ret = io_prep_sfr(req, sqe);
  	if (ret)
  		return ret;
  
  	/* sync_file_range always requires a blocking context */
  	if (force_nonblock)
  		return -EAGAIN;
  
  	sqe_off = READ_ONCE(sqe->off);
  	sqe_len = READ_ONCE(sqe->len);
  	flags = READ_ONCE(sqe->sync_range_flags);
  
  	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
9e645e110   Jens Axboe   io_uring: add sup...
1546
1547
  	if (ret < 0 && (req->flags & REQ_F_LINK))
  		req->flags |= REQ_F_FAIL_LINK;
c71ffb673   Jens Axboe   io_uring: remove ...
1548
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
5d17b4a4b   Jens Axboe   io_uring: add sup...
1549
1550
1551
  	io_put_req(req);
  	return 0;
  }
0fa03c624   Jens Axboe   io_uring: add sup...
1552
  #if defined(CONFIG_NET)
aa1fa28fc   Jens Axboe   io_uring: add sup...
1553
1554
1555
1556
1557
  static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  			   bool force_nonblock,
  		   long (*fn)(struct socket *, struct user_msghdr __user *,
  				unsigned int))
  {
0fa03c624   Jens Axboe   io_uring: add sup...
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
  	struct socket *sock;
  	int ret;
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  
  	sock = sock_from_file(req->file, &ret);
  	if (sock) {
  		struct user_msghdr __user *msg;
  		unsigned flags;
  
  		flags = READ_ONCE(sqe->msg_flags);
  		if (flags & MSG_DONTWAIT)
  			req->flags |= REQ_F_NOWAIT;
  		else if (force_nonblock)
  			flags |= MSG_DONTWAIT;
7eaf718b8   Jens Axboe   io_uring: fix 32-...
1574
1575
1576
1577
  #ifdef CONFIG_COMPAT
  		if (req->ctx->compat)
  			flags |= MSG_CMSG_COMPAT;
  #endif
0fa03c624   Jens Axboe   io_uring: add sup...
1578
1579
  		msg = (struct user_msghdr __user *) (unsigned long)
  			READ_ONCE(sqe->addr);
aa1fa28fc   Jens Axboe   io_uring: add sup...
1580
  		ret = fn(sock, msg, flags);
0fa03c624   Jens Axboe   io_uring: add sup...
1581
1582
  		if (force_nonblock && ret == -EAGAIN)
  			return ret;
57aabff8c   Jens Axboe   io_uring: transfo...
1583
1584
  		if (ret == -ERESTARTSYS)
  			ret = -EINTR;
0fa03c624   Jens Axboe   io_uring: add sup...
1585
  	}
cac68d12c   Jens Axboe   io_uring: grab ->...
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
  	if (req->fs) {
  		struct fs_struct *fs = req->fs;
  
  		spin_lock(&req->fs->lock);
  		if (--fs->users)
  			fs = NULL;
  		spin_unlock(&req->fs->lock);
  		if (fs)
  			free_fs_struct(fs);
  	}
c71ffb673   Jens Axboe   io_uring: remove ...
1596
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
5d17b4a4b   Jens Axboe   io_uring: add sup...
1597
1598
1599
  	io_put_req(req);
  	return 0;
  }
aa1fa28fc   Jens Axboe   io_uring: add sup...
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
  #endif
  
  static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		      bool force_nonblock)
  {
  #if defined(CONFIG_NET)
  	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
  #else
  	return -EOPNOTSUPP;
  #endif
  }
  
  static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		      bool force_nonblock)
  {
  #if defined(CONFIG_NET)
  	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
0fa03c624   Jens Axboe   io_uring: add sup...
1617
1618
1619
1620
  #else
  	return -EOPNOTSUPP;
  #endif
  }
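  
  /*
   * Example (userspace sketch, assuming liburing, an initialized ring and
   * a connected socket sockfd; error handling omitted): IORING_OP_SENDMSG
   * and IORING_OP_RECVMSG take a struct msghdr through sqe->addr and flags
   * through sqe->msg_flags, so the msghdr and its iovec must stay valid
   * until the request completes.
   *
   *	static char payload[] = "ping";
   *	static struct iovec iov = { .iov_base = payload,
   *				    .iov_len = sizeof(payload) };
   *	static struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
   *
   *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_sendmsg(sqe, sockfd, &msg, 0);
   *	io_uring_submit(&ring);
   */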
5d17b4a4b   Jens Axboe   io_uring: add sup...
1621

221c5eb23   Jens Axboe   io_uring: add sup...
1622
1623
1624
1625
1626
1627
1628
1629
  static void io_poll_remove_one(struct io_kiocb *req)
  {
  	struct io_poll_iocb *poll = &req->poll;
  
  	spin_lock(&poll->head->lock);
  	WRITE_ONCE(poll->canceled, true);
  	if (!list_empty(&poll->wait.entry)) {
  		list_del_init(&poll->wait.entry);
18d9be1a9   Jens Axboe   io_uring: add io_...
1630
  		io_queue_async_work(req->ctx, req);
221c5eb23   Jens Axboe   io_uring: add sup...
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
  	}
  	spin_unlock(&poll->head->lock);
  
  	list_del_init(&req->list);
  }
  
  static void io_poll_remove_all(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  
  	spin_lock_irq(&ctx->completion_lock);
  	while (!list_empty(&ctx->cancel_list)) {
  		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
  		io_poll_remove_one(req);
  	}
  	spin_unlock_irq(&ctx->completion_lock);
  }
  
  /*
   * Find a running poll command that matches one specified in sqe->addr,
   * and remove it if found.
   */
  static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	struct io_kiocb *poll_req, *next;
  	int ret = -ENOENT;
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
  	    sqe->poll_events)
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
  		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
  			io_poll_remove_one(poll_req);
  			ret = 0;
  			break;
  		}
  	}
  	spin_unlock_irq(&ctx->completion_lock);
c71ffb673   Jens Axboe   io_uring: remove ...
1674
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
1675
  	io_put_req(req);
221c5eb23   Jens Axboe   io_uring: add sup...
1676
1677
  	return 0;
  }
8c8387887   Jens Axboe   io_uring: fix pol...
1678
1679
  static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
  			     __poll_t mask)
221c5eb23   Jens Axboe   io_uring: add sup...
1680
  {
8c8387887   Jens Axboe   io_uring: fix pol...
1681
  	req->poll.done = true;
c71ffb673   Jens Axboe   io_uring: remove ...
1682
  	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
8c8387887   Jens Axboe   io_uring: fix pol...
1683
  	io_commit_cqring(ctx);
221c5eb23   Jens Axboe   io_uring: add sup...
1684
1685
1686
1687
1688
1689
1690
1691
  }
  
  static void io_poll_complete_work(struct work_struct *work)
  {
  	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
  	struct io_poll_iocb *poll = &req->poll;
  	struct poll_table_struct pt = { ._key = poll->events };
  	struct io_ring_ctx *ctx = req->ctx;
8387e3688   Jens Axboe   io_uring: async w...
1692
  	const struct cred *old_cred;
221c5eb23   Jens Axboe   io_uring: add sup...
1693
  	__poll_t mask = 0;
8387e3688   Jens Axboe   io_uring: async w...
1694
  	old_cred = override_creds(ctx->creds);
221c5eb23   Jens Axboe   io_uring: add sup...
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
  	if (!READ_ONCE(poll->canceled))
  		mask = vfs_poll(poll->file, &pt) & poll->events;
  
  	/*
  	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
  	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
  	 * synchronize with them.  In the cancellation case the list_del_init
  	 * itself is not actually needed, but harmless so we keep it in to
  	 * avoid further branches in the fast path.
  	 */
  	spin_lock_irq(&ctx->completion_lock);
  	if (!mask && !READ_ONCE(poll->canceled)) {
  		add_wait_queue(poll->head, &poll->wait);
  		spin_unlock_irq(&ctx->completion_lock);
8387e3688   Jens Axboe   io_uring: async w...
1709
  		goto out;
221c5eb23   Jens Axboe   io_uring: add sup...
1710
1711
  	}
  	list_del_init(&req->list);
8c8387887   Jens Axboe   io_uring: fix pol...
1712
  	io_poll_complete(ctx, req, mask);
221c5eb23   Jens Axboe   io_uring: add sup...
1713
  	spin_unlock_irq(&ctx->completion_lock);
8c8387887   Jens Axboe   io_uring: fix pol...
1714
1715
  	io_cqring_ev_posted(ctx);
  	io_put_req(req);
8387e3688   Jens Axboe   io_uring: async w...
1716
1717
  out:
  	revert_creds(old_cred);
221c5eb23   Jens Axboe   io_uring: add sup...
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
  }
  
  static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
  			void *key)
  {
  	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
  							wait);
  	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
  	struct io_ring_ctx *ctx = req->ctx;
  	__poll_t mask = key_to_poll(key);
8c8387887   Jens Axboe   io_uring: fix pol...
1728
  	unsigned long flags;
221c5eb23   Jens Axboe   io_uring: add sup...
1729
1730
  
  	/* for instances that support it, check for an event match first: */
8c8387887   Jens Axboe   io_uring: fix pol...
1731
1732
  	if (mask && !(mask & poll->events))
  		return 0;
221c5eb23   Jens Axboe   io_uring: add sup...
1733

8c8387887   Jens Axboe   io_uring: fix pol...
1734
  	list_del_init(&poll->wait.entry);
221c5eb23   Jens Axboe   io_uring: add sup...
1735

8c8387887   Jens Axboe   io_uring: fix pol...
1736
1737
1738
1739
  	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
  		list_del(&req->list);
  		io_poll_complete(ctx, req, mask);
  		spin_unlock_irqrestore(&ctx->completion_lock, flags);
221c5eb23   Jens Axboe   io_uring: add sup...
1740

8c8387887   Jens Axboe   io_uring: fix pol...
1741
1742
1743
  		io_cqring_ev_posted(ctx);
  		io_put_req(req);
  	} else {
18d9be1a9   Jens Axboe   io_uring: add io_...
1744
  		io_queue_async_work(ctx, req);
221c5eb23   Jens Axboe   io_uring: add sup...
1745
  	}
221c5eb23   Jens Axboe   io_uring: add sup...
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
  	return 1;
  }
  
  struct io_poll_table {
  	struct poll_table_struct pt;
  	struct io_kiocb *req;
  	int error;
  };
  
  static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
  			       struct poll_table_struct *p)
  {
  	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
  
  	if (unlikely(pt->req->poll.head)) {
  		pt->error = -EINVAL;
  		return;
  	}
  
  	pt->error = 0;
  	pt->req->poll.head = head;
  	add_wait_queue(head, &pt->req->poll.wait);
  }
  
  static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_poll_iocb *poll = &req->poll;
  	struct io_ring_ctx *ctx = req->ctx;
  	struct io_poll_table ipt;
8c8387887   Jens Axboe   io_uring: fix pol...
1775
  	bool cancel = false;
221c5eb23   Jens Axboe   io_uring: add sup...
1776
1777
  	__poll_t mask;
  	u16 events;
221c5eb23   Jens Axboe   io_uring: add sup...
1778
1779
1780
1781
1782
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
  		return -EINVAL;
09bb83943   Jens Axboe   io_uring: fix fge...
1783
1784
  	if (!poll->file)
  		return -EBADF;
221c5eb23   Jens Axboe   io_uring: add sup...
1785

6cc47d1d2   Jens Axboe   io_uring: ensure ...
1786
  	req->submit.sqe = NULL;
221c5eb23   Jens Axboe   io_uring: add sup...
1787
1788
1789
  	INIT_WORK(&req->work, io_poll_complete_work);
  	events = READ_ONCE(sqe->poll_events);
  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
221c5eb23   Jens Axboe   io_uring: add sup...
1790
  	poll->head = NULL;
8c8387887   Jens Axboe   io_uring: fix pol...
1791
  	poll->done = false;
221c5eb23   Jens Axboe   io_uring: add sup...
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
  	poll->canceled = false;
  
  	ipt.pt._qproc = io_poll_queue_proc;
  	ipt.pt._key = poll->events;
  	ipt.req = req;
  	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
  
  	/* initialize the list so that we can do list_empty checks */
  	INIT_LIST_HEAD(&poll->wait.entry);
  	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
36703247d   Jens Axboe   io_uring: ensure ...
1802
  	INIT_LIST_HEAD(&req->list);
221c5eb23   Jens Axboe   io_uring: add sup...
1803
  	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
221c5eb23   Jens Axboe   io_uring: add sup...
1804
1805
  
  	spin_lock_irq(&ctx->completion_lock);
8c8387887   Jens Axboe   io_uring: fix pol...
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
  	if (likely(poll->head)) {
  		spin_lock(&poll->head->lock);
  		if (unlikely(list_empty(&poll->wait.entry))) {
  			if (ipt.error)
  				cancel = true;
  			ipt.error = 0;
  			mask = 0;
  		}
  		if (mask || ipt.error)
  			list_del_init(&poll->wait.entry);
  		else if (cancel)
  			WRITE_ONCE(poll->canceled, true);
  		else if (!poll->done) /* actually waiting for an event */
  			list_add_tail(&req->list, &ctx->cancel_list);
  		spin_unlock(&poll->head->lock);
  	}
  	if (mask) { /* no async, we'd stolen it */
221c5eb23   Jens Axboe   io_uring: add sup...
1823
  		ipt.error = 0;
8c8387887   Jens Axboe   io_uring: fix pol...
1824
  		io_poll_complete(ctx, req, mask);
221c5eb23   Jens Axboe   io_uring: add sup...
1825
  	}
221c5eb23   Jens Axboe   io_uring: add sup...
1826
  	spin_unlock_irq(&ctx->completion_lock);
8c8387887   Jens Axboe   io_uring: fix pol...
1827
1828
  	if (mask) {
  		io_cqring_ev_posted(ctx);
e65ef56db   Jens Axboe   io_uring: use reg...
1829
  		io_put_req(req);
221c5eb23   Jens Axboe   io_uring: add sup...
1830
  	}
8c8387887   Jens Axboe   io_uring: fix pol...
1831
  	return ipt.error;
221c5eb23   Jens Axboe   io_uring: add sup...
1832
  }
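  
  /*
   * Example (userspace sketch, assuming liburing, <poll.h>, an initialized
   * ring and an open fd): a one-shot poll armed through io_poll_add()
   * above, then cancelled; io_poll_remove() matches on the original
   * request's user_data, which is what the remove SQE carries in
   * sqe->addr.
   *
   *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_poll_add(sqe, fd, POLLIN);
   *	io_uring_sqe_set_data(sqe, (void *)0x1234);
   *	io_uring_submit(&ring);
   *
   *	(later, to cancel the poll before it triggers:)
   *	sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_poll_remove(sqe, (void *)0x1234);
   *	io_uring_submit(&ring);
   */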
5262f5679   Jens Axboe   io_uring: IORING_...
1833
1834
1835
  static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
  {
  	struct io_ring_ctx *ctx;
ef03681ae   zhangyi (F)   io_uring : correc...
1836
  	struct io_kiocb *req, *prev;
5262f5679   Jens Axboe   io_uring: IORING_...
1837
1838
1839
1840
1841
1842
1843
  	unsigned long flags;
  
  	req = container_of(timer, struct io_kiocb, timeout.timer);
  	ctx = req->ctx;
  	atomic_inc(&ctx->cq_timeouts);
  
  	spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681ae   zhangyi (F)   io_uring : correc...
1844
1845
1846
1847
1848
1849
1850
1851
1852
  	/*
  	 * Adjust the sequence of requests before the current one, because
  	 * this request will consume a slot in the cq_ring and the cq_tail
  	 * pointer will be increased; otherwise other timeout reqs may return
  	 * in advance without waiting for enough wait_nr.
  	 */
  	prev = req;
  	list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
  		prev->sequence++;
5262f5679   Jens Axboe   io_uring: IORING_...
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
  	list_del(&req->list);
  
  	io_cqring_fill_event(ctx, req->user_data, -ETIME);
  	io_commit_cqring(ctx);
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
  	io_cqring_ev_posted(ctx);
  
  	io_put_req(req);
  	return HRTIMER_NORESTART;
  }
  
  static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
5da0fb1ab   yangerkun   io_uring: conside...
1867
  	unsigned count;
5262f5679   Jens Axboe   io_uring: IORING_...
1868
1869
  	struct io_ring_ctx *ctx = req->ctx;
  	struct list_head *entry;
bdf200731   Arnd Bergmann   io_uring: use __k...
1870
  	struct timespec64 ts;
a1f58ba46   zhangyi (F)   io_uring: correct...
1871
  	unsigned span = 0;
5262f5679   Jens Axboe   io_uring: IORING_...
1872
1873
1874
1875
1876
1877
  
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
  	    sqe->len != 1)
  		return -EINVAL;
bdf200731   Arnd Bergmann   io_uring: use __k...
1878
1879
  
  	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
5262f5679   Jens Axboe   io_uring: IORING_...
1880
  		return -EFAULT;
93bd25bb6   Jens Axboe   io_uring: make ti...
1881
  	req->flags |= REQ_F_TIMEOUT;
5262f5679   Jens Axboe   io_uring: IORING_...
1882
1883
  	/*
  	 * sqe->off holds how many events need to occur for this
93bd25bb6   Jens Axboe   io_uring: make ti...
1884
1885
  	 * timeout event to be satisfied. If it isn't set, then this is
  	 * a pure timeout request, sequence isn't used.
5262f5679   Jens Axboe   io_uring: IORING_...
1886
1887
  	 */
  	count = READ_ONCE(sqe->off);
93bd25bb6   Jens Axboe   io_uring: make ti...
1888
1889
1890
1891
1892
1893
  	if (!count) {
  		req->flags |= REQ_F_TIMEOUT_NOSEQ;
  		spin_lock_irq(&ctx->completion_lock);
  		entry = ctx->timeout_list.prev;
  		goto add;
  	}
5262f5679   Jens Axboe   io_uring: IORING_...
1894
1895
  
  	req->sequence = ctx->cached_sq_head + count - 1;
5da0fb1ab   yangerkun   io_uring: conside...
1896
1897
  	/* reuse it to store the count */
  	req->submit.sequence = count;
5262f5679   Jens Axboe   io_uring: IORING_...
1898
1899
1900
1901
1902
  
  	/*
  	 * Insertion sort, ensuring the first entry in the list is always
  	 * the one we need first.
  	 */
5262f5679   Jens Axboe   io_uring: IORING_...
1903
1904
1905
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_prev(entry, &ctx->timeout_list) {
  		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1ab   yangerkun   io_uring: conside...
1906
1907
  		unsigned nxt_sq_head;
  		long long tmp, tmp_nxt;
5262f5679   Jens Axboe   io_uring: IORING_...
1908

93bd25bb6   Jens Axboe   io_uring: make ti...
1909
1910
  		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
  			continue;
5da0fb1ab   yangerkun   io_uring: conside...
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
  		/*
  		 * Since cached_sq_head + count - 1 can overflow, use type long
  		 * long to store it.
  		 */
  		tmp = (long long)ctx->cached_sq_head + count - 1;
  		nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
  		tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
  
  		/*
  		 * cached_sq_head may overflow, and it will never overflow twice
  		 * while there is still a valid timeout req pending.
  		 */
  		if (ctx->cached_sq_head < nxt_sq_head)
8b07a65ad   yangerkun   io_uring: fix log...
1924
  			tmp += UINT_MAX;
5da0fb1ab   yangerkun   io_uring: conside...
1925

a1f58ba46   zhangyi (F)   io_uring: correct...
1926
  		if (tmp > tmp_nxt)
5262f5679   Jens Axboe   io_uring: IORING_...
1927
  			break;
a1f58ba46   zhangyi (F)   io_uring: correct...
1928
1929
1930
1931
1932
1933
1934
  
  		/*
  		 * The sequence of the reqs after the inserted one, and of the
  		 * inserted one itself, should be adjusted because each timeout
  		 * req consumes a slot.
  		 */
  		span++;
  		nxt->sequence++;
5262f5679   Jens Axboe   io_uring: IORING_...
1935
  	}
a1f58ba46   zhangyi (F)   io_uring: correct...
1936
  	req->sequence -= span;
93bd25bb6   Jens Axboe   io_uring: make ti...
1937
  add:
5262f5679   Jens Axboe   io_uring: IORING_...
1938
1939
1940
1941
1942
  	list_add(&req->list, entry);
  	spin_unlock_irq(&ctx->completion_lock);
  
  	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	req->timeout.timer.function = io_timeout_fn;
bdf200731   Arnd Bergmann   io_uring: use __k...
1943
  	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
5262f5679   Jens Axboe   io_uring: IORING_...
1944
1945
1946
  			HRTIMER_MODE_REL);
  	return 0;
  }
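  
  /*
   * Example (userspace sketch, assuming liburing's io_uring_prep_timeout()):
   * a timeout that completes either once one other completion has been
   * posted (the count carried in sqe->off above) or after 2.5 seconds,
   * whichever comes first. Passing a count of 0 gives a pure timer; the
   * CQE carries -ETIME if the timer expired.
   *
   *	struct __kernel_timespec ts = { .tv_sec = 2, .tv_nsec = 500000000 };
   *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
   *
   *	io_uring_prep_timeout(sqe, &ts, 1, 0);
   *	io_uring_submit(&ring);
   */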
de0617e46   Jens Axboe   io_uring: add sup...
1947
  static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
74dcfcd1d   Jens Axboe   io_uring: ensure ...
1948
  			struct sqe_submit *s)
de0617e46   Jens Axboe   io_uring: add sup...
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
  {
  	struct io_uring_sqe *sqe_copy;
  
  	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
  		return 0;
  
  	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
  	if (!sqe_copy)
  		return -EAGAIN;
  
  	spin_lock_irq(&ctx->completion_lock);
  	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
  		spin_unlock_irq(&ctx->completion_lock);
  		kfree(sqe_copy);
  		return 0;
  	}
74dcfcd1d   Jens Axboe   io_uring: ensure ...
1965
1966
  	memcpy(&req->submit, s, sizeof(*s));
  	memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
de0617e46   Jens Axboe   io_uring: add sup...
1967
1968
1969
1970
1971
1972
1973
  	req->submit.sqe = sqe_copy;
  
  	INIT_WORK(&req->work, io_sq_wq_submit_work);
  	list_add_tail(&req->list, &ctx->defer_list);
  	spin_unlock_irq(&ctx->completion_lock);
  	return -EIOCBQUEUED;
  }
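  
  /*
   * Example (userspace sketch, assuming liburing, an initialized ring, an
   * open fd and a filled-in struct iovec iov): IOSQE_IO_DRAIN is what sends
   * a request down the defer path above; the marked SQE is only issued once
   * everything submitted before it has completed.
   *
   *	struct io_uring_sqe *sqe;
   *
   *	sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_writev(sqe, fd, &iov, 1, 0);
   *
   *	sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_fsync(sqe, fd, 0);
   *	sqe->flags |= IOSQE_IO_DRAIN;
   *	(the fsync is held back until the write above has completed)
   *
   *	io_uring_submit(&ring);
   */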
2b188cc1b   Jens Axboe   Add io_uring IO i...
1974
  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
8358e3a82   Jens Axboe   io_uring: remove ...
1975
  			   const struct sqe_submit *s, bool force_nonblock)
2b188cc1b   Jens Axboe   Add io_uring IO i...
1976
  {
e0c5c576d   Jens Axboe   io_uring: make io...
1977
  	int ret, opcode;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1978

9e645e110   Jens Axboe   io_uring: add sup...
1979
  	req->user_data = READ_ONCE(s->sqe->user_data);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1980
1981
  	if (unlikely(s->index >= ctx->sq_entries))
  		return -EINVAL;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1982
1983
1984
1985
1986
1987
1988
  
  	opcode = READ_ONCE(s->sqe->opcode);
  	switch (opcode) {
  	case IORING_OP_NOP:
  		ret = io_nop(req, req->user_data);
  		break;
  	case IORING_OP_READV:
edafccee5   Jens Axboe   io_uring: add sup...
1989
1990
  		if (unlikely(s->sqe->buf_index))
  			return -EINVAL;
8358e3a82   Jens Axboe   io_uring: remove ...
1991
  		ret = io_read(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1992
1993
  		break;
  	case IORING_OP_WRITEV:
edafccee5   Jens Axboe   io_uring: add sup...
1994
1995
  		if (unlikely(s->sqe->buf_index))
  			return -EINVAL;
8358e3a82   Jens Axboe   io_uring: remove ...
1996
  		ret = io_write(req, s, force_nonblock);
edafccee5   Jens Axboe   io_uring: add sup...
1997
1998
  		break;
  	case IORING_OP_READ_FIXED:
8358e3a82   Jens Axboe   io_uring: remove ...
1999
  		ret = io_read(req, s, force_nonblock);
edafccee5   Jens Axboe   io_uring: add sup...
2000
2001
  		break;
  	case IORING_OP_WRITE_FIXED:
8358e3a82   Jens Axboe   io_uring: remove ...
2002
  		ret = io_write(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2003
  		break;
c992fe292   Christoph Hellwig   io_uring: add fsy...
2004
2005
2006
  	case IORING_OP_FSYNC:
  		ret = io_fsync(req, s->sqe, force_nonblock);
  		break;
221c5eb23   Jens Axboe   io_uring: add sup...
2007
2008
2009
2010
2011
2012
  	case IORING_OP_POLL_ADD:
  		ret = io_poll_add(req, s->sqe);
  		break;
  	case IORING_OP_POLL_REMOVE:
  		ret = io_poll_remove(req, s->sqe);
  		break;
5d17b4a4b   Jens Axboe   io_uring: add sup...
2013
2014
2015
  	case IORING_OP_SYNC_FILE_RANGE:
  		ret = io_sync_file_range(req, s->sqe, force_nonblock);
  		break;
0fa03c624   Jens Axboe   io_uring: add sup...
2016
2017
2018
  	case IORING_OP_SENDMSG:
  		ret = io_sendmsg(req, s->sqe, force_nonblock);
  		break;
aa1fa28fc   Jens Axboe   io_uring: add sup...
2019
2020
2021
  	case IORING_OP_RECVMSG:
  		ret = io_recvmsg(req, s->sqe, force_nonblock);
  		break;
5262f5679   Jens Axboe   io_uring: IORING_...
2022
2023
2024
  	case IORING_OP_TIMEOUT:
  		ret = io_timeout(req, s->sqe);
  		break;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2025
2026
2027
2028
  	default:
  		ret = -EINVAL;
  		break;
  	}
def596e95   Jens Axboe   io_uring: support...
2029
2030
2031
2032
  	if (ret)
  		return ret;
  
  	if (ctx->flags & IORING_SETUP_IOPOLL) {
9e645e110   Jens Axboe   io_uring: add sup...
2033
  		if (req->result == -EAGAIN)
def596e95   Jens Axboe   io_uring: support...
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
  			return -EAGAIN;
  
  		/* workqueue context doesn't hold uring_lock, grab it now */
  		if (s->needs_lock)
  			mutex_lock(&ctx->uring_lock);
  		io_iopoll_req_issued(req);
  		if (s->needs_lock)
  			mutex_unlock(&ctx->uring_lock);
  	}
  
  	return 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2045
  }
31b515106   Jens Axboe   io_uring: allow w...
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
  static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
  						 const struct io_uring_sqe *sqe)
  {
  	switch (sqe->opcode) {
  	case IORING_OP_READV:
  	case IORING_OP_READ_FIXED:
  		return &ctx->pending_async[READ];
  	case IORING_OP_WRITEV:
  	case IORING_OP_WRITE_FIXED:
  		return &ctx->pending_async[WRITE];
  	default:
  		return NULL;
  	}
  }
edafccee5   Jens Axboe   io_uring: add sup...
2060
2061
2062
2063
2064
2065
2066
  static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
  {
  	u8 opcode = READ_ONCE(sqe->opcode);
  
  	return !(opcode == IORING_OP_READ_FIXED ||
  		 opcode == IORING_OP_WRITE_FIXED);
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
2067
2068
2069
  static void io_sq_wq_submit_work(struct work_struct *work)
  {
  	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
cac68d12c   Jens Axboe   io_uring: grab ->...
2070
  	struct fs_struct *old_fs_struct = current->fs;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2071
  	struct io_ring_ctx *ctx = req->ctx;
31b515106   Jens Axboe   io_uring: allow w...
2072
2073
  	struct mm_struct *cur_mm = NULL;
  	struct async_list *async_list;
8387e3688   Jens Axboe   io_uring: async w...
2074
  	const struct cred *old_cred;
31b515106   Jens Axboe   io_uring: allow w...
2075
  	LIST_HEAD(req_list);
edafccee5   Jens Axboe   io_uring: add sup...
2076
  	mm_segment_t old_fs;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2077
  	int ret;
8387e3688   Jens Axboe   io_uring: async w...
2078
  	old_cred = override_creds(ctx->creds);
31b515106   Jens Axboe   io_uring: allow w...
2079
2080
2081
2082
2083
  	async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
  restart:
  	do {
  		struct sqe_submit *s = &req->submit;
  		const struct io_uring_sqe *sqe = s->sqe;
d0ee87918   Jackie Liu   io_uring: fix KAS...
2084
  		unsigned int flags = req->flags;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2085

8449eedaa   Stefan Bühler   io_uring: fix han...
2086
  		/* Ensure we clear previously set non-block flag */
31b515106   Jens Axboe   io_uring: allow w...
2087
  		req->rw.ki_flags &= ~IOCB_NOWAIT;
cac68d12c   Jens Axboe   io_uring: grab ->...
2088
2089
2090
2091
2092
2093
2094
2095
  		if (req->fs != current->fs && current->fs != old_fs_struct) {
  			task_lock(current);
  			if (req->fs)
  				current->fs = req->fs;
  			else
  				current->fs = old_fs_struct;
  			task_unlock(current);
  		}
31b515106   Jens Axboe   io_uring: allow w...
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
  		ret = 0;
  		if (io_sqe_needs_user(sqe) && !cur_mm) {
  			if (!mmget_not_zero(ctx->sqo_mm)) {
  				ret = -EFAULT;
  			} else {
  				cur_mm = ctx->sqo_mm;
  				use_mm(cur_mm);
  				old_fs = get_fs();
  				set_fs(USER_DS);
  			}
  		}
  
  		if (!ret) {
  			s->has_user = cur_mm != NULL;
  			s->needs_lock = true;
  			do {
8358e3a82   Jens Axboe   io_uring: remove ...
2112
  				ret = __io_submit_sqe(ctx, req, s, false);
31b515106   Jens Axboe   io_uring: allow w...
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
  				/*
  				 * We can get EAGAIN for polled IO even though
  				 * we're forcing a sync submission from here,
  				 * since we can't wait for request slots on the
  				 * block side.
  				 */
  				if (ret != -EAGAIN)
  					break;
  				cond_resched();
  			} while (1);
  		}
817869d25   Jens Axboe   io_uring: drop re...
2124
2125
2126
  
  		/* drop submission reference */
  		io_put_req(req);
31b515106   Jens Axboe   io_uring: allow w...
2127
  		if (ret) {
c71ffb673   Jens Axboe   io_uring: remove ...
2128
  			io_cqring_add_event(ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
2129
  			io_put_req(req);
31b515106   Jens Axboe   io_uring: allow w...
2130
2131
2132
2133
  		}
  
  		/* async context always use a copy of the sqe */
  		kfree(sqe);
f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2134
  		/* reqs from the defer and link lists need not decrease the async cnt */
d0ee87918   Jackie Liu   io_uring: fix KAS...
2135
  		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2136
  			goto out;
31b515106   Jens Axboe   io_uring: allow w...
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
  		if (!async_list)
  			break;
  		if (!list_empty(&req_list)) {
  			req = list_first_entry(&req_list, struct io_kiocb,
  						list);
  			list_del(&req->list);
  			continue;
  		}
  		if (list_empty(&async_list->list))
  			break;
  
  		req = NULL;
  		spin_lock(&async_list->lock);
  		if (list_empty(&async_list->list)) {
  			spin_unlock(&async_list->lock);
  			break;
  		}
  		list_splice_init(&async_list->list, &req_list);
  		spin_unlock(&async_list->lock);
  
  		req = list_first_entry(&req_list, struct io_kiocb, list);
  		list_del(&req->list);
  	} while (req);
edafccee5   Jens Axboe   io_uring: add sup...
2160
2161
  
  	/*
31b515106   Jens Axboe   io_uring: allow w...
2162
2163
2164
  	 * Rare case of racing with a submitter. If we find the count has
  	 * dropped to zero AND we have pending work items, then restart
  	 * the processing. This is a tiny race window.
edafccee5   Jens Axboe   io_uring: add sup...
2165
  	 */
31b515106   Jens Axboe   io_uring: allow w...
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
  	if (async_list) {
  		ret = atomic_dec_return(&async_list->cnt);
  		while (!ret && !list_empty(&async_list->list)) {
  			spin_lock(&async_list->lock);
  			atomic_inc(&async_list->cnt);
  			list_splice_init(&async_list->list, &req_list);
  			spin_unlock(&async_list->lock);
  
  			if (!list_empty(&req_list)) {
  				req = list_first_entry(&req_list,
  							struct io_kiocb, list);
  				list_del(&req->list);
  				goto restart;
  			}
  			ret = atomic_dec_return(&async_list->cnt);
edafccee5   Jens Axboe   io_uring: add sup...
2181
  		}
edafccee5   Jens Axboe   io_uring: add sup...
2182
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
2183

f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2184
  out:
31b515106   Jens Axboe   io_uring: allow w...
2185
  	if (cur_mm) {
edafccee5   Jens Axboe   io_uring: add sup...
2186
  		set_fs(old_fs);
31b515106   Jens Axboe   io_uring: allow w...
2187
2188
  		unuse_mm(cur_mm);
  		mmput(cur_mm);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2189
  	}
8387e3688   Jens Axboe   io_uring: async w...
2190
  	revert_creds(old_cred);
cac68d12c   Jens Axboe   io_uring: grab ->...
2191
2192
2193
2194
2195
  	if (old_fs_struct) {
  		task_lock(current);
  		current->fs = old_fs_struct;
  		task_unlock(current);
  	}
31b515106   Jens Axboe   io_uring: allow w...
2196
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
2197

31b515106   Jens Axboe   io_uring: allow w...
2198
2199
2200
2201
2202
2203
2204
  /*
   * See if we can piggyback onto previously submitted work that is still
   * running. We currently only allow this if the new request is sequential
   * to the previous one we punted.
   */
  static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
  {
6d5d5ac52   Jens Axboe   io_uring: extend ...
2205
  	bool ret;
31b515106   Jens Axboe   io_uring: allow w...
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
  
  	if (!list)
  		return false;
  	if (!(req->flags & REQ_F_SEQ_PREV))
  		return false;
  	if (!atomic_read(&list->cnt))
  		return false;
  
  	ret = true;
  	spin_lock(&list->lock);
  	list_add_tail(&req->list, &list->list);
c0e48f9de   Zhengyuan Liu   io_uring: add a m...
2217
2218
2219
2220
  	/*
  	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
  	 */
  	smp_mb();
31b515106   Jens Axboe   io_uring: allow w...
2221
2222
2223
2224
2225
2226
  	if (!atomic_read(&list->cnt)) {
  		list_del_init(&req->list);
  		ret = false;
  	}
  	spin_unlock(&list->lock);
  	return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2227
  }
09bb83943   Jens Axboe   io_uring: fix fge...
2228
2229
2230
2231
2232
2233
2234
  static bool io_op_needs_file(const struct io_uring_sqe *sqe)
  {
  	int op = READ_ONCE(sqe->opcode);
  
  	switch (op) {
  	case IORING_OP_NOP:
  	case IORING_OP_POLL_REMOVE:
5683e5406   Pavel Begunkov   io_uring: Fix get...
2235
  	case IORING_OP_TIMEOUT:
09bb83943   Jens Axboe   io_uring: fix fge...
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
  		return false;
  	default:
  		return true;
  	}
  }
  
  static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
  			   struct io_submit_state *state, struct io_kiocb *req)
  {
  	unsigned flags;
  	int fd;
  
  	flags = READ_ONCE(s->sqe->flags);
  	fd = READ_ONCE(s->sqe->fd);
4fe2c9631   Jackie Liu   io_uring: add sup...
2250
  	if (flags & IOSQE_IO_DRAIN)
de0617e46   Jens Axboe   io_uring: add sup...
2251
  		req->flags |= REQ_F_IO_DRAIN;
4fe2c9631   Jackie Liu   io_uring: add sup...
2252
2253
2254
2255
2256
2257
  	/*
  	 * All IO needs to record the previous position; for LINK vs DRAIN,
  	 * it can be used to mark the position of the first IO in the
  	 * link list.
  	 */
  	req->sequence = s->sequence;
de0617e46   Jens Axboe   io_uring: add sup...
2258

60c112b0a   Jens Axboe   io_uring: ensure ...
2259
  	if (!io_op_needs_file(s->sqe))
09bb83943   Jens Axboe   io_uring: fix fge...
2260
  		return 0;
09bb83943   Jens Axboe   io_uring: fix fge...
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
  
  	if (flags & IOSQE_FIXED_FILE) {
  		if (unlikely(!ctx->user_files ||
  		    (unsigned) fd >= ctx->nr_user_files))
  			return -EBADF;
  		req->file = ctx->user_files[fd];
  		req->flags |= REQ_F_FIXED_FILE;
  	} else {
  		if (s->needs_fixed_file)
  			return -EBADF;
  		req->file = io_file_get(state, fd);
  		if (unlikely(!req->file))
  			return -EBADF;
  	}
  
  	return 0;
  }
4fe2c9631   Jackie Liu   io_uring: add sup...
2278
  static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2279
  			struct sqe_submit *s)
2b188cc1b   Jens Axboe   Add io_uring IO i...
2280
  {
e0c5c576d   Jens Axboe   io_uring: make io...
2281
  	int ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2282

bc808bced   Jens Axboe   io_uring: revert ...
2283
  	ret = __io_submit_sqe(ctx, req, s, true);
491381ce0   Jens Axboe   io_uring: fix up ...
2284
2285
2286
2287
2288
2289
2290
  
  	/*
  	 * We async punt it if the file wasn't marked NOWAIT, or if the file
  	 * doesn't support non-blocking read/write attempts
  	 */
  	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
  	    (req->flags & REQ_F_MUST_PUNT))) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
2291
  		struct io_uring_sqe *sqe_copy;
954dab193   Jackie Liu   io_uring: use kme...
2292
  		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2293
  		if (sqe_copy) {
31b515106   Jens Axboe   io_uring: allow w...
2294
  			struct async_list *list;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2295
  			s->sqe = sqe_copy;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2296
  			memcpy(&req->submit, s, sizeof(*s));
31b515106   Jens Axboe   io_uring: allow w...
2297
2298
2299
2300
2301
  			list = io_async_list_from_sqe(ctx, s->sqe);
  			if (!io_add_to_prev_work(list, req)) {
  				if (list)
  					atomic_inc(&list->cnt);
  				INIT_WORK(&req->work, io_sq_wq_submit_work);
18d9be1a9   Jens Axboe   io_uring: add io_...
2302
  				io_queue_async_work(ctx, req);
31b515106   Jens Axboe   io_uring: allow w...
2303
  			}
e65ef56db   Jens Axboe   io_uring: use reg...
2304
2305
2306
  
  			/*
  			 * Queued up for async execution, worker will release
9e645e110   Jens Axboe   io_uring: add sup...
2307
  			 * submit reference when the iocb is actually submitted.
e65ef56db   Jens Axboe   io_uring: use reg...
2308
2309
  			 */
  			return 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2310
2311
  		}
  	}
e65ef56db   Jens Axboe   io_uring: use reg...
2312
2313
2314
2315
2316
  
  	/* drop submission reference */
  	io_put_req(req);
  
  	/* and drop final reference, if we failed */
9e645e110   Jens Axboe   io_uring: add sup...
2317
2318
2319
2320
  	if (ret) {
  		io_cqring_add_event(ctx, req->user_data, ret);
  		if (req->flags & REQ_F_LINK)
  			req->flags |= REQ_F_FAIL_LINK;
e65ef56db   Jens Axboe   io_uring: use reg...
2321
  		io_put_req(req);
9e645e110   Jens Axboe   io_uring: add sup...
2322
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
2323
2324
2325
  
  	return ret;
  }
4fe2c9631   Jackie Liu   io_uring: add sup...
2326
  static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2327
  			struct sqe_submit *s)
4fe2c9631   Jackie Liu   io_uring: add sup...
2328
2329
  {
  	int ret;
74dcfcd1d   Jens Axboe   io_uring: ensure ...
2330
  	ret = io_req_defer(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2331
2332
2333
2334
2335
2336
2337
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);
  			io_cqring_add_event(ctx, s->sqe->user_data, ret);
  		}
  		return 0;
  	}
bc808bced   Jens Axboe   io_uring: revert ...
2338
  	return __io_queue_sqe(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2339
2340
2341
  }
  
  static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2342
  			      struct sqe_submit *s, struct io_kiocb *shadow)
4fe2c9631   Jackie Liu   io_uring: add sup...
2343
2344
2345
2346
2347
  {
  	int ret;
  	int need_submit = false;
  
  	if (!shadow)
bc808bced   Jens Axboe   io_uring: revert ...
2348
  		return io_queue_sqe(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2349
2350
2351
2352
2353
2354
2355
  
  	/*
  	 * Mark the first IO in the link list as DRAIN and let all the
  	 * following IOs enter the defer list. All IO needs to be completed
  	 * before the link list can run.
  	 */
  	req->flags |= REQ_F_IO_DRAIN;
74dcfcd1d   Jens Axboe   io_uring: ensure ...
2356
  	ret = io_req_defer(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2357
2358
2359
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);
7b20238d2   Pavel Begunkov   io_uring: Fix lea...
2360
  			__io_free_req(shadow);
4fe2c9631   Jackie Liu   io_uring: add sup...
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
  			io_cqring_add_event(ctx, s->sqe->user_data, ret);
  			return 0;
  		}
  	} else {
  		/*
  		 * ret == 0 means that all IOs in front of the link io have
  		 * completed, so let's queue the link head.
  		 */
  		need_submit = true;
  	}
  
  	/* Insert shadow req to defer_list, blocking next IOs */
  	spin_lock_irq(&ctx->completion_lock);
  	list_add_tail(&shadow->list, &ctx->defer_list);
  	spin_unlock_irq(&ctx->completion_lock);
  
  	if (need_submit)
bc808bced   Jens Axboe   io_uring: revert ...
2378
  		return __io_queue_sqe(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2379
2380
2381
  
  	return 0;
  }
9e645e110   Jens Axboe   io_uring: add sup...
2382
2383
2384
  #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
  
  static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
bc808bced   Jens Axboe   io_uring: revert ...
2385
  			  struct io_submit_state *state, struct io_kiocb **link)
9e645e110   Jens Axboe   io_uring: add sup...
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
  {
  	struct io_uring_sqe *sqe_copy;
  	struct io_kiocb *req;
  	int ret;
  
  	/* enforce forwards compatibility on users */
  	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
  		ret = -EINVAL;
  		goto err;
  	}
  
  	req = io_get_req(ctx, state);
  	if (unlikely(!req)) {
  		ret = -EAGAIN;
  		goto err;
  	}
  
  	ret = io_req_set_file(ctx, s, state, req);
  	if (unlikely(ret)) {
  err_req:
  		io_free_req(req);
  err:
  		io_cqring_add_event(ctx, s->sqe->user_data, ret);
  		return;
  	}
84d55dc5b   Pavel Begunkov   io_uring: Fix cor...
2411
  	req->user_data = s->sqe->user_data;
cac68d12c   Jens Axboe   io_uring: grab ->...
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
  #if defined(CONFIG_NET)
  	switch (READ_ONCE(s->sqe->opcode)) {
  	case IORING_OP_SENDMSG:
  	case IORING_OP_RECVMSG:
  		spin_lock(&current->fs->lock);
  		if (!current->fs->in_exec) {
  			req->fs = current->fs;
  			req->fs->users++;
  		}
  		spin_unlock(&current->fs->lock);
  		if (!req->fs) {
  			ret = -EAGAIN;
  			goto err_req;
  		}
  	}
  #endif
9e645e110   Jens Axboe   io_uring: add sup...
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
  	/*
  	 * If we already have a head request, queue this one for async
  	 * submittal once the head completes. If we don't have a head but
  	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
  	 * submitted sync once the chain is complete. If none of those
  	 * conditions are true (normal request), then just queue it.
  	 */
  	if (*link) {
  		struct io_kiocb *prev = *link;
  
  		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
  		if (!sqe_copy) {
  			ret = -EAGAIN;
  			goto err_req;
  		}
  
  		s->sqe = sqe_copy;
  		memcpy(&req->submit, s, sizeof(*s));
  		list_add_tail(&req->list, &prev->link_list);
  	} else if (s->sqe->flags & IOSQE_IO_LINK) {
  		req->flags |= REQ_F_LINK;
  
  		memcpy(&req->submit, s, sizeof(*s));
  		INIT_LIST_HEAD(&req->link_list);
  		*link = req;
  	} else {
bc808bced   Jens Axboe   io_uring: revert ...
2454
  		io_queue_sqe(ctx, req, s);
9e645e110   Jens Axboe   io_uring: add sup...
2455
2456
  	}
  }
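  
  /*
   * Example (userspace sketch, assuming liburing, an initialized ring, an
   * open fd and a filled-in struct iovec iov): IOSQE_IO_LINK builds the
   * link_list handled above; each member starts only after the previous one
   * completed successfully, so a write followed by an fsync can be queued
   * as one ordered chain in a single submission.
   *
   *	struct io_uring_sqe *sqe;
   *
   *	sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_writev(sqe, fd, &iov, 1, 0);
   *	sqe->flags |= IOSQE_IO_LINK;
   *
   *	sqe = io_uring_get_sqe(&ring);
   *	io_uring_prep_fsync(sqe, fd, 0);
   *
   *	io_uring_submit(&ring);
   *	(two CQEs come back: the write first, then the fsync)
   */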
9a56a2323   Jens Axboe   io_uring: use fge...
2457
2458
2459
2460
2461
2462
  /*
   * Batched submission is done, ensure local IO is flushed out.
   */
  static void io_submit_state_end(struct io_submit_state *state)
  {
  	blk_finish_plug(&state->plug);
3d6770fbd   Jens Axboe   io_uring: drop io...
2463
  	io_file_put(state);
2579f913d   Jens Axboe   io_uring: batch i...
2464
2465
2466
  	if (state->free_reqs)
  		kmem_cache_free_bulk(req_cachep, state->free_reqs,
  					&state->reqs[state->cur_req]);
  }
  
  /*
   * Start submission side cache.
   */
  static void io_submit_state_start(struct io_submit_state *state,
  				  struct io_ring_ctx *ctx, unsigned max_ios)
  {
  	blk_start_plug(&state->plug);
	state->free_reqs = 0;
  	state->file = NULL;
  	state->ios_left = max_ios;
  }
  static void io_commit_sqring(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;

  	if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
  		/*
  		 * Ensure any loads from the SQEs are done at this point,
  		 * since once we write the new head, the application could
  		 * write new data to them.
  		 */
  		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
  	}
  }
  
  /*
   * Fetch an sqe, if one is available. Note that s->sqe will point to memory
   * that is mapped by userspace. This means that care needs to be taken to
   * ensure that reads are stable, as we cannot rely on userspace always
   * being a good citizen. If members of the sqe are validated and then later
   * used, it's important that those reads are done through READ_ONCE() to
   * prevent a re-load down the line.
   */
  static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
  {
  	struct io_rings *rings = ctx->rings;
  	u32 *sq_array = ctx->sq_array;
  	unsigned head;
  
  	/*
  	 * The cached sq head (or cq tail) serves two purposes:
  	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head.
  	 * 2) allows the kernel side to track the head on its own, even
  	 *    though the application is the one updating it.
  	 */
  	head = ctx->cached_sq_head;
  	/* make sure SQ entry isn't read before tail */
  	if (head == smp_load_acquire(&rings->sq.tail))
		return false;
  	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
  	if (head < ctx->sq_entries) {
  		s->index = head;
  		s->sqe = &ctx->sq_sqes[head];
  		s->sequence = ctx->cached_sq_head;
  		ctx->cached_sq_head++;
  		return true;
  	}
  
  	/* drop invalid entries */
  	ctx->cached_sq_head++;
  	ctx->cached_sq_dropped++;
  	WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
  	return false;
  }
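/*
 * Illustrative sketch (names are generic, not from this file): because of
 * the sq_array indirection read above, userspace publishes an entry in
 * two steps before it moves the tail:
 *
 *	idx = tail & sq_ring_mask;
 *	fill_sqe(&sqes[idx]);
 *	sq_array[idx] = idx;
 *	smp_store_release(&sq_ring->tail, tail + 1);
 *
 * which is why io_get_sqring() re-reads sq_array[] with READ_ONCE() and
 * range-checks the result against sq_entries before trusting it.
 */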
  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
  			  bool has_user, bool mm_fault)
  {
  	struct io_submit_state state, *statep = NULL;
	struct io_kiocb *link = NULL;
	struct io_kiocb *shadow_req = NULL;
  	bool prev_was_link = false;
  	int i, submitted = 0;
  
  	if (nr > IO_PLUG_THRESHOLD) {
  		io_submit_state_start(&state, ctx, nr);
  		statep = &state;
  	}
  
  	for (i = 0; i < nr; i++) {
  		struct sqe_submit s;
  
  		if (!io_get_sqring(ctx, &s))
  			break;
  		/*
  		 * If previous wasn't linked and we have a linked command,
  		 * that's the end of the chain. Submit the previous link.
  		 */
  		if (!prev_was_link && link) {
			io_queue_link_head(ctx, link, &link->submit, shadow_req);
			link = NULL;
			shadow_req = NULL;
  		}
		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;

		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
  			if (!shadow_req) {
  				shadow_req = io_get_req(ctx, NULL);
				if (unlikely(!shadow_req))
					goto out;
				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
				refcount_dec(&shadow_req->refs);
			}
			shadow_req->sequence = s.sequence;
		}
out:
		if (unlikely(mm_fault)) {
			io_cqring_add_event(ctx, s.sqe->user_data,
						-EFAULT);
		} else {
			s.has_user = has_user;
			s.needs_lock = true;
			s.needs_fixed_file = true;
			io_submit_sqe(ctx, &s, statep, &link);
			submitted++;
		}
	}

	if (link)
		io_queue_link_head(ctx, link, &link->submit, shadow_req);
  	if (statep)
  		io_submit_state_end(&state);
  
  	return submitted;
  }
  
  static int io_sq_thread(void *data)
  {
  	struct io_ring_ctx *ctx = data;
  	struct mm_struct *cur_mm = NULL;
	const struct cred *old_cred;
  	mm_segment_t old_fs;
  	DEFINE_WAIT(wait);
  	unsigned inflight;
  	unsigned long timeout;
  	complete(&ctx->sqo_thread_started);
  	old_fs = get_fs();
  	set_fs(USER_DS);
	old_cred = override_creds(ctx->creds);
  
  	timeout = inflight = 0;
  	while (!kthread_should_park()) {
  		bool mm_fault = false;
  		unsigned int to_submit;
  
  		if (inflight) {
  			unsigned nr_events = 0;
  
  			if (ctx->flags & IORING_SETUP_IOPOLL) {
  				/*
  				 * inflight is the count of the maximum possible
  				 * entries we submitted, but it can be smaller
  				 * if we dropped some of them. If we don't have
  				 * poll entries available, then we know that we
  				 * have nothing left to poll for. Reset the
  				 * inflight count to zero in that case.
  				 */
  				mutex_lock(&ctx->uring_lock);
  				if (!list_empty(&ctx->poll_list))
					io_iopoll_getevents(ctx, &nr_events, 0);
  				else
  					inflight = 0;
  				mutex_unlock(&ctx->uring_lock);
  			} else {
  				/*
  				 * Normal IO, just pretend everything completed.
  				 * We don't have to poll completions for that.
  				 */
  				nr_events = inflight;
  			}
  
  			inflight -= nr_events;
  			if (!inflight)
  				timeout = jiffies + ctx->sq_thread_idle;
  		}
  		to_submit = io_sqring_entries(ctx);
  		if (!to_submit) {
			/*
  			 * Drop cur_mm before scheduling, we can't hold it for
  			 * long periods (or over schedule()). Do this before
  			 * adding ourselves to the waitqueue, as the unuse/drop
  			 * may sleep.
  			 */
  			if (cur_mm) {
  				unuse_mm(cur_mm);
  				mmput(cur_mm);
  				cur_mm = NULL;
  			}
  			/*
  			 * We're polling. If we're within the defined idle
  			 * period, then let us spin without work before going
  			 * to sleep.
  			 */
  			if (inflight || !time_after(jiffies, timeout)) {
  				cond_resched();
  				continue;
  			}
  			prepare_to_wait(&ctx->sqo_wait, &wait,
  						TASK_INTERRUPTIBLE);
  
  			/* Tell userspace we may need a wakeup call */
			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
			/* make sure to read SQ tail after writing flags */
			smp_mb();

			to_submit = io_sqring_entries(ctx);
			if (!to_submit) {
  				if (kthread_should_park()) {
  					finish_wait(&ctx->sqo_wait, &wait);
  					break;
  				}
  				if (signal_pending(current))
  					flush_signals(current);
  				schedule();
  				finish_wait(&ctx->sqo_wait, &wait);
				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
  				continue;
  			}
  			finish_wait(&ctx->sqo_wait, &wait);
			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
		}
		/* Unless all new commands are FIXED regions, grab mm */
  		if (!cur_mm) {
  			mm_fault = !mmget_not_zero(ctx->sqo_mm);
  			if (!mm_fault) {
  				use_mm(ctx->sqo_mm);
  				cur_mm = ctx->sqo_mm;
  			}
  		}
  		to_submit = min(to_submit, ctx->sq_entries);
  		inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
  					   mm_fault);
  
  		/* Commit SQ ring head once we've consumed all SQEs */
  		io_commit_sqring(ctx);
  	}
  
  	set_fs(old_fs);
  	if (cur_mm) {
  		unuse_mm(cur_mm);
  		mmput(cur_mm);
  	}
	revert_creds(old_cred);

	kthread_parkme();

	return 0;
}

static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
  {
	struct io_submit_state state, *statep = NULL;
	struct io_kiocb *link = NULL;
	struct io_kiocb *shadow_req = NULL;
	bool prev_was_link = false;
	int i, submit = 0;

	if (to_submit > IO_PLUG_THRESHOLD) {
		io_submit_state_start(&state, ctx, to_submit);
		statep = &state;
	}

	for (i = 0; i < to_submit; i++) {
		struct sqe_submit s;

		if (!io_get_sqring(ctx, &s))
			break;

		/*
		 * If previous wasn't linked and we have a linked command,
		 * that's the end of the chain. Submit the previous link.
		 */
		if (!prev_was_link && link) {
			io_queue_link_head(ctx, link, &link->submit, shadow_req);
			link = NULL;
			shadow_req = NULL;
		}
		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;

		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
			if (!shadow_req) {
				shadow_req = io_get_req(ctx, NULL);
				if (unlikely(!shadow_req))
					goto out;
				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
				refcount_dec(&shadow_req->refs);
			}
			shadow_req->sequence = s.sequence;
		}
out:
		s.has_user = true;
		s.needs_lock = false;
		s.needs_fixed_file = false;
		submit++;
		io_submit_sqe(ctx, &s, statep, &link);
	}

	if (link)
		io_queue_link_head(ctx, link, &link->submit, shadow_req);
	if (statep)
		io_submit_state_end(statep);

	io_commit_sqring(ctx);
	return submit;
  }
  struct io_wait_queue {
  	struct wait_queue_entry wq;
  	struct io_ring_ctx *ctx;
  	unsigned to_wait;
  	unsigned nr_timeouts;
  };
  
  static inline bool io_should_wake(struct io_wait_queue *iowq)
  {
  	struct io_ring_ctx *ctx = iowq->ctx;
  
  	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
  	 * started waiting. For timeouts, we always want to return to userspace,
  	 * regardless of event count.
  	 */
  	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
  			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
  }
  
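/*
 * Wait-queue callback used by io_cqring_wait(): only wake (and dequeue)
 * the waiting task once io_should_wake() says there is enough work,
 * filtering out wakeups the waiter would ignore anyway.
 */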
  static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  			    int wake_flags, void *key)
  {
  	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
  							wq);
  
  	if (!io_should_wake(iowq))
  		return -1;
  
  	return autoremove_wake_function(curr, mode, wake_flags, key);
  }
  /*
   * Wait until events become available, if we don't already have some. The
   * application must reap them itself, as they reside on the shared cq ring.
   */
  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
  			  const sigset_t __user *sig, size_t sigsz)
  {
  	struct io_wait_queue iowq = {
  		.wq = {
  			.private	= current,
  			.func		= io_wake_function,
  			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
  		},
  		.ctx		= ctx,
  		.to_wait	= min_events,
  	};
	struct io_rings *rings = ctx->rings;
	int ret;

	if (io_cqring_events(rings) >= min_events)
  		return 0;
  
  	if (sig) {
  #ifdef CONFIG_COMPAT
  		if (in_compat_syscall())
  			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

  		if (ret)
  			return ret;
  	}
  	ret = 0;
  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
  	do {
  		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
  						TASK_INTERRUPTIBLE);
  		if (io_should_wake(&iowq))
  			break;
  		schedule();
  		if (signal_pending(current)) {
  			ret = -ERESTARTSYS;
  			break;
  		}
  	} while (1);
  	finish_wait(&ctx->wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
  }
  static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
  #if defined(CONFIG_UNIX)
  	if (ctx->ring_sock) {
  		struct sock *sock = ctx->ring_sock->sk;
  		struct sk_buff *skb;
  
  		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
  			kfree_skb(skb);
  	}
  #else
  	int i;
  
  	for (i = 0; i < ctx->nr_user_files; i++)
  		fput(ctx->user_files[i]);
  #endif
  }
  
  static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
  	if (!ctx->user_files)
  		return -ENXIO;
  
  	__io_sqe_files_unregister(ctx);
  	kfree(ctx->user_files);
  	ctx->user_files = NULL;
  	ctx->nr_user_files = 0;
  	return 0;
  }
  static void io_sq_thread_stop(struct io_ring_ctx *ctx)
  {
  	if (ctx->sqo_thread) {
		wait_for_completion(&ctx->sqo_thread_started);
  		/*
  		 * The park is a bit of a work-around, without it we get
  		 * warning spews on shutdown with SQPOLL set and affinity
  		 * set to a single CPU.
  		 */
		kthread_park(ctx->sqo_thread);
  		kthread_stop(ctx->sqo_thread);
  		ctx->sqo_thread = NULL;
  	}
  }
  static void io_finish_async(struct io_ring_ctx *ctx)
  {
	int i;

	io_sq_thread_stop(ctx);

	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
		if (ctx->sqo_wq[i]) {
			destroy_workqueue(ctx->sqo_wq[i]);
			ctx->sqo_wq[i] = NULL;
		}
  	}
  }
  
  #if defined(CONFIG_UNIX)
  static void io_destruct_skb(struct sk_buff *skb)
  {
  	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
  	int i;
  
  	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
  		if (ctx->sqo_wq[i])
  			flush_workqueue(ctx->sqo_wq[i]);

  	unix_destruct_scm(skb);
  }
  
  /*
   * Ensure the UNIX gc is aware of our file set, so we are certain that
   * the io_uring can be safely unregistered on process exit, even if we have
   * loops in the file referencing.
   */
  static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
  {
  	struct sock *sk = ctx->ring_sock->sk;
  	struct scm_fp_list *fpl;
  	struct sk_buff *skb;
  	int i;
  
  	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
  		unsigned long inflight = ctx->user->unix_inflight + nr;
  
  		if (inflight > task_rlimit(current, RLIMIT_NOFILE))
  			return -EMFILE;
  	}
  
  	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
  	if (!fpl)
  		return -ENOMEM;
  
  	skb = alloc_skb(0, GFP_KERNEL);
  	if (!skb) {
  		kfree(fpl);
  		return -ENOMEM;
  	}
  
  	skb->sk = sk;
  	skb->destructor = io_destruct_skb;
  
  	fpl->user = get_uid(ctx->user);
  	for (i = 0; i < nr; i++) {
  		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
  		unix_inflight(fpl->user, fpl->fp[i]);
  	}
  
  	fpl->max = fpl->count = nr;
  	UNIXCB(skb).fp = fpl;
  	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
  	skb_queue_head(&sk->sk_receive_queue, skb);
  
  	for (i = 0; i < nr; i++)
  		fput(fpl->fp[i]);
  
  	return 0;
  }
  
  /*
   * If UNIX sockets are enabled, fd passing can cause a reference cycle which
   * causes regular reference counting to break down. We rely on the UNIX
   * garbage collection to take care of this problem for us.
   */
  static int io_sqe_files_scm(struct io_ring_ctx *ctx)
  {
  	unsigned left, total;
  	int ret = 0;
  
  	total = 0;
  	left = ctx->nr_user_files;
  	while (left) {
  		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
  
  		ret = __io_sqe_files_scm(ctx, this_files, total);
  		if (ret)
  			break;
  		left -= this_files;
  		total += this_files;
  	}
  
  	if (!ret)
  		return 0;
  
  	while (total < ctx->nr_user_files) {
  		fput(ctx->user_files[total]);
  		total++;
  	}
  
  	return ret;
  }
  #else
  static int io_sqe_files_scm(struct io_ring_ctx *ctx)
  {
  	return 0;
  }
  #endif
  
  static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
  				 unsigned nr_args)
  {
  	__s32 __user *fds = (__s32 __user *) arg;
  	int fd, ret = 0;
  	unsigned i;
  
  	if (ctx->user_files)
  		return -EBUSY;
  	if (!nr_args)
  		return -EINVAL;
  	if (nr_args > IORING_MAX_FIXED_FILES)
  		return -EMFILE;
  
  	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
  	if (!ctx->user_files)
  		return -ENOMEM;
  
  	for (i = 0; i < nr_args; i++) {
  		ret = -EFAULT;
  		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
  			break;
  
  		ctx->user_files[i] = fget(fd);
  
  		ret = -EBADF;
  		if (!ctx->user_files[i])
  			break;
  		/*
  		 * Don't allow io_uring instances to be registered. If UNIX
  		 * isn't enabled, then this causes a reference cycle and this
  		 * instance can never get freed. If UNIX is enabled we'll
  		 * handle it just fine, but there's still no point in allowing
  		 * a ring fd as it doesn't support regular read/write anyway.
  		 */
  		if (ctx->user_files[i]->f_op == &io_uring_fops) {
  			fput(ctx->user_files[i]);
  			break;
  		}
  		ctx->nr_user_files++;
  		ret = 0;
  	}
  
  	if (ret) {
  		for (i = 0; i < ctx->nr_user_files; i++)
  			fput(ctx->user_files[i]);
  
  		kfree(ctx->user_files);
		ctx->user_files = NULL;
  		ctx->nr_user_files = 0;
  		return ret;
  	}
  
  	ret = io_sqe_files_scm(ctx);
  	if (ret)
  		io_sqe_files_unregister(ctx);
  
  	return ret;
  }
  static int io_sq_offload_start(struct io_ring_ctx *ctx,
  			       struct io_uring_params *p)
  {
  	int ret;
	init_waitqueue_head(&ctx->sqo_wait);
  	mmgrab(current->mm);
  	ctx->sqo_mm = current->mm;
  	if (ctx->flags & IORING_SETUP_SQPOLL) {
  		ret = -EPERM;
  		if (!capable(CAP_SYS_ADMIN))
  			goto err;
  		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
  		if (!ctx->sq_thread_idle)
  			ctx->sq_thread_idle = HZ;
  		if (p->flags & IORING_SETUP_SQ_AFF) {
  			int cpu = p->sq_thread_cpu;

  			ret = -EINVAL;
  			if (cpu >= nr_cpu_ids)
  				goto err;
			if (!cpu_online(cpu))
  				goto err;
  			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
  							ctx, cpu,
  							"io_uring-sq");
  		} else {
  			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
  							"io_uring-sq");
  		}
  		if (IS_ERR(ctx->sqo_thread)) {
  			ret = PTR_ERR(ctx->sqo_thread);
  			ctx->sqo_thread = NULL;
  			goto err;
  		}
  		wake_up_process(ctx->sqo_thread);
  	} else if (p->flags & IORING_SETUP_SQ_AFF) {
  		/* Can't have SQ_AFF without SQPOLL */
  		ret = -EINVAL;
  		goto err;
  	}
	/* Do QD, or 2 * CPUS, whatever is smallest */
	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
			WQ_UNBOUND | WQ_FREEZABLE,
  			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
  	if (!ctx->sqo_wq[0]) {
  		ret = -ENOMEM;
  		goto err;
  	}
  
  	/*
  	 * This is for buffered writes, where we want to limit the parallelism
  	 * due to file locking in file systems. As "normal" buffered writes
	 * should parallelize on writeout quite nicely, limit us to having 2
  	 * pending. This avoids massive contention on the inode when doing
  	 * buffered async writes.
  	 */
  	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
  						WQ_UNBOUND | WQ_FREEZABLE, 2);
  	if (!ctx->sqo_wq[1]) {
  		ret = -ENOMEM;
  		goto err;
  	}
  
  	return 0;
  err:
  	io_finish_async(ctx);
  	mmdrop(ctx->sqo_mm);
  	ctx->sqo_mm = NULL;
  	return ret;
  }
  
  static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
  {
  	atomic_long_sub(nr_pages, &user->locked_vm);
  }
  
  static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
  {
  	unsigned long page_limit, cur_pages, new_pages;
  
  	/* Don't allow more pages than we can safely lock */
  	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  
  	do {
  		cur_pages = atomic_long_read(&user->locked_vm);
  		new_pages = cur_pages + nr_pages;
  		if (new_pages > page_limit)
  			return -ENOMEM;
  	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
  					new_pages) != cur_pages);
  
  	return 0;
  }
  
  static void io_mem_free(void *ptr)
  {
  	struct page *page;
  
  	if (!ptr)
  		return;

  	page = virt_to_head_page(ptr);
  	if (put_page_testzero(page))
  		free_compound_page(page);
  }
  
  static void *io_mem_alloc(size_t size)
  {
  	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
  				__GFP_NORETRY;
  
  	return (void *) __get_free_pages(gfp_flags, get_order(size));
  }
  static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
  				size_t *sq_offset)
  {
  	struct io_rings *rings;
  	size_t off, sq_array_size;
  
  	off = struct_size(rings, cqes, cq_entries);
  	if (off == SIZE_MAX)
  		return SIZE_MAX;
  
  #ifdef CONFIG_SMP
  	off = ALIGN(off, SMP_CACHE_BYTES);
  	if (off == 0)
  		return SIZE_MAX;
  #endif
  
  	sq_array_size = array_size(sizeof(u32), sq_entries);
  	if (sq_array_size == SIZE_MAX)
  		return SIZE_MAX;
  
  	if (check_add_overflow(off, sq_array_size, &off))
  		return SIZE_MAX;
  
  	if (sq_offset)
  		*sq_offset = off;
  
  	return off;
  }
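/*
 * Illustrative layout note: the single allocation sized above is
 *
 *	struct io_rings | cqes[cq_entries] | (cacheline pad) | u32 sq_array[sq_entries]
 *
 * and *sq_offset reports where sq_array starts, which is what
 * io_allocate_scq_urings() and the mmap offsets below rely on.
 */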
  static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
  {
  	size_t pages;

  	pages = (size_t)1 << get_order(
  		rings_size(sq_entries, cq_entries, NULL));
  	pages += (size_t)1 << get_order(
  		array_size(sizeof(struct io_uring_sqe), sq_entries));

  	return pages;
  }
  static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
  {
  	int i, j;
  
  	if (!ctx->user_bufs)
  		return -ENXIO;
  
  	for (i = 0; i < ctx->nr_user_bufs; i++) {
  		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
  
  		for (j = 0; j < imu->nr_bvecs; j++)
			put_user_page(imu->bvec[j].bv_page);
  
  		if (ctx->account_mem)
  			io_unaccount_mem(ctx->user, imu->nr_bvecs);
  		kvfree(imu->bvec);
  		imu->nr_bvecs = 0;
  	}
  
  	kfree(ctx->user_bufs);
  	ctx->user_bufs = NULL;
  	ctx->nr_user_bufs = 0;
  	return 0;
  }
  
  static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
  		       void __user *arg, unsigned index)
  {
  	struct iovec __user *src;
  
  #ifdef CONFIG_COMPAT
  	if (ctx->compat) {
  		struct compat_iovec __user *ciovs;
  		struct compat_iovec ciov;
  
  		ciovs = (struct compat_iovec __user *) arg;
  		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
  			return -EFAULT;
  
  		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
  		dst->iov_len = ciov.iov_len;
  		return 0;
  	}
  #endif
  	src = (struct iovec __user *) arg;
  	if (copy_from_user(dst, &src[index], sizeof(*dst)))
  		return -EFAULT;
  	return 0;
  }
  
  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
  				  unsigned nr_args)
  {
  	struct vm_area_struct **vmas = NULL;
  	struct page **pages = NULL;
  	int i, j, got_pages = 0;
  	int ret = -EINVAL;
  
  	if (ctx->user_bufs)
  		return -EBUSY;
  	if (!nr_args || nr_args > UIO_MAXIOV)
  		return -EINVAL;
  
  	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
  					GFP_KERNEL);
  	if (!ctx->user_bufs)
  		return -ENOMEM;
  
  	for (i = 0; i < nr_args; i++) {
  		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
  		unsigned long off, start, end, ubuf;
  		int pret, nr_pages;
  		struct iovec iov;
  		size_t size;
  
  		ret = io_copy_iov(ctx, &iov, arg, i);
  		if (ret)
  			goto err;
  
  		/*
  		 * Don't impose further limits on the size and buffer
  		 * constraints here, we'll -EINVAL later when IO is
  		 * submitted if they are wrong.
  		 */
  		ret = -EFAULT;
  		if (!iov.iov_base || !iov.iov_len)
  			goto err;
  
  		/* arbitrary limit, but we need something */
  		if (iov.iov_len > SZ_1G)
  			goto err;
  
  		ubuf = (unsigned long) iov.iov_base;
  		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
  		start = ubuf >> PAGE_SHIFT;
  		nr_pages = end - start;
  
  		if (ctx->account_mem) {
  			ret = io_account_mem(ctx->user, nr_pages);
  			if (ret)
  				goto err;
  		}
  
  		ret = 0;
  		if (!pages || nr_pages > got_pages) {
  			kfree(vmas);
  			kfree(pages);
			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
						GFP_KERNEL);
			vmas = kvmalloc_array(nr_pages,
  					sizeof(struct vm_area_struct *),
  					GFP_KERNEL);
  			if (!pages || !vmas) {
  				ret = -ENOMEM;
  				if (ctx->account_mem)
  					io_unaccount_mem(ctx->user, nr_pages);
  				goto err;
  			}
  			got_pages = nr_pages;
  		}
		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
  						GFP_KERNEL);
  		ret = -ENOMEM;
  		if (!imu->bvec) {
  			if (ctx->account_mem)
  				io_unaccount_mem(ctx->user, nr_pages);
  			goto err;
  		}
  
  		ret = 0;
  		down_read(&current->mm->mmap_sem);
  		pret = get_user_pages(ubuf, nr_pages,
  				      FOLL_WRITE | FOLL_LONGTERM,
  				      pages, vmas);
  		if (pret == nr_pages) {
  			/* don't support file backed memory */
  			for (j = 0; j < nr_pages; j++) {
  				struct vm_area_struct *vma = vmas[j];
  
  				if (vma->vm_file &&
  				    !is_file_hugepages(vma->vm_file)) {
  					ret = -EOPNOTSUPP;
  					break;
  				}
  			}
  		} else {
  			ret = pret < 0 ? pret : -EFAULT;
  		}
  		up_read(&current->mm->mmap_sem);
  		if (ret) {
  			/*
  			 * if we did partial map, or found file backed vmas,
  			 * release any pages we did get
  			 */
  			if (pret > 0)
  				put_user_pages(pages, pret);
  			if (ctx->account_mem)
  				io_unaccount_mem(ctx->user, nr_pages);
  			kvfree(imu->bvec);
  			goto err;
  		}
  
  		off = ubuf & ~PAGE_MASK;
  		size = iov.iov_len;
  		for (j = 0; j < nr_pages; j++) {
  			size_t vec_len;
  
  			vec_len = min_t(size_t, size, PAGE_SIZE - off);
  			imu->bvec[j].bv_page = pages[j];
  			imu->bvec[j].bv_len = vec_len;
  			imu->bvec[j].bv_offset = off;
  			off = 0;
  			size -= vec_len;
  		}
  		/* store original address for later verification */
  		imu->ubuf = ubuf;
  		imu->len = iov.iov_len;
  		imu->nr_bvecs = nr_pages;
  
  		ctx->nr_user_bufs++;
  	}
	kvfree(pages);
	kvfree(vmas);
  	return 0;
  err:
	kvfree(pages);
	kvfree(vmas);
  	io_sqe_buffer_unregister(ctx);
  	return ret;
  }
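/*
 * Illustrative userspace sketch (not part of this file): the buffers pinned
 * above are handed to the kernel once, up front, e.g.:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
 *
 * after which IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED sqes reference a
 * buffer by index instead of pinning pages on every I/O.
 */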
  static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
  {
  	__s32 __user *fds = arg;
  	int fd;
  
  	if (ctx->cq_ev_fd)
  		return -EBUSY;
  
  	if (copy_from_user(&fd, fds, sizeof(*fds)))
  		return -EFAULT;
  
  	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
  	if (IS_ERR(ctx->cq_ev_fd)) {
  		int ret = PTR_ERR(ctx->cq_ev_fd);
  		ctx->cq_ev_fd = NULL;
  		return ret;
  	}
  
  	return 0;
  }
  
  static int io_eventfd_unregister(struct io_ring_ctx *ctx)
  {
  	if (ctx->cq_ev_fd) {
  		eventfd_ctx_put(ctx->cq_ev_fd);
  		ctx->cq_ev_fd = NULL;
  		return 0;
  	}
  
  	return -ENXIO;
  }
  static void io_ring_ctx_free(struct io_ring_ctx *ctx)
  {
	io_finish_async(ctx);
  	if (ctx->sqo_mm)
  		mmdrop(ctx->sqo_mm);
  
  	io_iopoll_reap_events(ctx);
	io_sqe_buffer_unregister(ctx);
	io_sqe_files_unregister(ctx);
	io_eventfd_unregister(ctx);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	if (ctx->account_mem)
		io_unaccount_mem(ctx->user,
				ring_pages(ctx->sq_entries, ctx->cq_entries));
	free_uid(ctx->user);
	if (ctx->creds)
		put_cred(ctx->creds);
  	kfree(ctx);
  }
  
  static __poll_t io_uring_poll(struct file *file, poll_table *wait)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  	__poll_t mask = 0;
  
  	poll_wait(file, &ctx->cq_wait, wait);
  	/*
  	 * synchronizes with barrier from wq_has_sleeper call in
  	 * io_commit_cqring
  	 */
  	smp_rmb();
  	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
  	    ctx->rings->sq_ring_entries)
		mask |= EPOLLOUT | EPOLLWRNORM;
  	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
  		mask |= EPOLLIN | EPOLLRDNORM;
  
  	return mask;
  }
  
  static int io_uring_fasync(int fd, struct file *file, int on)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  
  	return fasync_helper(fd, file, on, &ctx->cq_fasync);
  }
  
  static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  {
  	mutex_lock(&ctx->uring_lock);
  	percpu_ref_kill(&ctx->refs);
  	mutex_unlock(&ctx->uring_lock);
	io_kill_timeouts(ctx);
	io_poll_remove_all(ctx);
	io_iopoll_reap_events(ctx);
  	wait_for_completion(&ctx->ctx_done);
  	io_ring_ctx_free(ctx);
  }
  
  static int io_uring_release(struct inode *inode, struct file *file)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  
  	file->private_data = NULL;
  	io_ring_ctx_wait_and_kill(ctx);
  	return 0;
  }
  
  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
  	unsigned long sz = vma->vm_end - vma->vm_start;
  	struct io_ring_ctx *ctx = file->private_data;
  	unsigned long pfn;
  	struct page *page;
  	void *ptr;
  
  	switch (offset) {
  	case IORING_OFF_SQ_RING:
  	case IORING_OFF_CQ_RING:
  		ptr = ctx->rings;
  		break;
  	case IORING_OFF_SQES:
  		ptr = ctx->sq_sqes;
  		break;
  	default:
  		return -EINVAL;
  	}
  
  	page = virt_to_head_page(ptr);
  	if (sz > page_size(page))
  		return -EINVAL;
  
  	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
  	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
  }
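/*
 * Illustrative userspace sketch (not part of this file): after
 * io_uring_setup(), the application maps the regions handled above with
 * the fixed offsets from the uapi header, roughly:
 *
 *	sq_ptr = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
 *		      MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
 *	sqes   = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
 *		      MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
 *
 * With IORING_FEAT_SINGLE_MMAP (set in io_uring_create() below) the CQ
 * ring lives in the same mapping as the SQ ring, so a separate
 * IORING_OFF_CQ_RING mapping is optional.
 */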
  
  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  		u32, min_complete, u32, flags, const sigset_t __user *, sig,
  		size_t, sigsz)
  {
  	struct io_ring_ctx *ctx;
  	long ret = -EBADF;
  	int submitted = 0;
  	struct fd f;
  	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
  		return -EINVAL;
  
  	f = fdget(fd);
  	if (!f.file)
  		return -EBADF;
  
  	ret = -EOPNOTSUPP;
  	if (f.file->f_op != &io_uring_fops)
  		goto out_fput;
  
  	ret = -ENXIO;
  	ctx = f.file->private_data;
  	if (!percpu_ref_tryget(&ctx->refs))
  		goto out_fput;
  	/*
  	 * For SQ polling, the thread will do all submissions and completions.
  	 * Just return the requested submit count, and wake the thread if
  	 * we were asked to.
  	 */
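	/*
	 * In the SQPOLL case the application never has to enter the kernel
	 * just to submit: it sets IORING_ENTER_SQ_WAKEUP only when it sees
	 * IORING_SQ_NEED_WAKEUP in the SQ ring flags, and the wake_up()
	 * below then kicks the idle io_sq_thread().
	 */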
  	ret = 0;
  	if (ctx->flags & IORING_SETUP_SQPOLL) {
  		if (flags & IORING_ENTER_SQ_WAKEUP)
  			wake_up(&ctx->sqo_wait);
  		submitted = to_submit;
	} else if (to_submit) {
  		to_submit = min(to_submit, ctx->sq_entries);
  
  		mutex_lock(&ctx->uring_lock);
		submitted = io_ring_submit(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);
  
  		if (submitted != to_submit)
  			goto out;
  	}
  	if (flags & IORING_ENTER_GETEVENTS) {
		unsigned nr_events = 0;

		min_complete = min(min_complete, ctx->cq_entries);

		if (ctx->flags & IORING_SETUP_IOPOLL) {
			ret = io_iopoll_check(ctx, &nr_events, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}
  	}
out:
  	percpu_ref_put(&ctx->refs);
  out_fput:
  	fdput(f);
  	return submitted ? submitted : ret;
  }
  
  static const struct file_operations io_uring_fops = {
  	.release	= io_uring_release,
  	.mmap		= io_uring_mmap,
  	.poll		= io_uring_poll,
  	.fasync		= io_uring_fasync,
  };
  
  static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
  				  struct io_uring_params *p)
  {
  	struct io_rings *rings;
  	size_t size, sq_array_offset;

  	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
  	if (size == SIZE_MAX)
  		return -EOVERFLOW;
  
  	rings = io_mem_alloc(size);
  	if (!rings)
  		return -ENOMEM;
  	ctx->rings = rings;
  	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
  	rings->sq_ring_mask = p->sq_entries - 1;
  	rings->cq_ring_mask = p->cq_entries - 1;
  	rings->sq_ring_entries = p->sq_entries;
  	rings->cq_ring_entries = p->cq_entries;
  	ctx->sq_mask = rings->sq_ring_mask;
  	ctx->cq_mask = rings->cq_ring_mask;
  	ctx->sq_entries = rings->sq_ring_entries;
  	ctx->cq_entries = rings->cq_ring_entries;
  
  	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
  	if (size == SIZE_MAX) {
  		io_mem_free(ctx->rings);
  		ctx->rings = NULL;
		return -EOVERFLOW;
  	}
  
  	ctx->sq_sqes = io_mem_alloc(size);
  	if (!ctx->sq_sqes) {
  		io_mem_free(ctx->rings);
  		ctx->rings = NULL;
		return -ENOMEM;
  	}

  	return 0;
  }
  
  /*
   * Allocate an anonymous fd, this is what constitutes the application
   * visible backing of an io_uring instance. The application mmaps this
   * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
   * we have to tie this fd to a socket for file garbage collection purposes.
   */
  static int io_uring_get_fd(struct io_ring_ctx *ctx)
  {
  	struct file *file;
  	int ret;
  
  #if defined(CONFIG_UNIX)
  	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
  				&ctx->ring_sock);
  	if (ret)
  		return ret;
  #endif
  
  	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
  	if (ret < 0)
  		goto err;
  
  	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
  					O_RDWR | O_CLOEXEC);
  	if (IS_ERR(file)) {
  		put_unused_fd(ret);
  		ret = PTR_ERR(file);
  		goto err;
  	}
  
  #if defined(CONFIG_UNIX)
  	ctx->ring_sock->file = file;
  	ctx->ring_sock->sk->sk_user_data = ctx;
  #endif
  	fd_install(ret, file);
  	return ret;
  err:
  #if defined(CONFIG_UNIX)
  	sock_release(ctx->ring_sock);
  	ctx->ring_sock = NULL;
  #endif
  	return ret;
  }
  
  static int io_uring_create(unsigned entries, struct io_uring_params *p)
  {
  	struct user_struct *user = NULL;
  	struct io_ring_ctx *ctx;
  	bool account_mem;
  	int ret;
  
  	if (!entries || entries > IORING_MAX_ENTRIES)
  		return -EINVAL;
  
  	/*
  	 * Use twice as many entries for the CQ ring. It's possible for the
  	 * application to drive a higher depth than the size of the SQ ring,
  	 * since the sqes are only used at submission time. This allows for
  	 * some flexibility in overcommitting a bit.
  	 */
  	p->sq_entries = roundup_pow_of_two(entries);
  	p->cq_entries = 2 * p->sq_entries;
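	/* e.g. entries == 100 yields sq_entries == 128 and cq_entries == 256 */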
  
  	user = get_uid(current_user());
  	account_mem = !capable(CAP_IPC_LOCK);
  
  	if (account_mem) {
  		ret = io_account_mem(user,
  				ring_pages(p->sq_entries, p->cq_entries));
  		if (ret) {
  			free_uid(user);
  			return ret;
  		}
  	}
  
  	ctx = io_ring_ctx_alloc(p);
  	if (!ctx) {
  		if (account_mem)
  			io_unaccount_mem(user, ring_pages(p->sq_entries,
  								p->cq_entries));
  		free_uid(user);
  		return -ENOMEM;
  	}
  	ctx->compat = in_compat_syscall();
  	ctx->account_mem = account_mem;
  	ctx->user = user;
  	ctx->creds = get_current_cred();
  	if (!ctx->creds) {
  		ret = -ENOMEM;
  		goto err;
  	}
  	ret = io_allocate_scq_urings(ctx, p);
  	if (ret)
  		goto err;
	ret = io_sq_offload_start(ctx, p);
  	if (ret)
  		goto err;
	memset(&p->sq_off, 0, sizeof(p->sq_off));
  	p->sq_off.head = offsetof(struct io_rings, sq.head);
  	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
  	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
  	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
  	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
  	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
  	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
  
  	memset(&p->cq_off, 0, sizeof(p->cq_off));
  	p->cq_off.head = offsetof(struct io_rings, cq.head);
  	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
  	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
  	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
  	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
  	p->cq_off.cqes = offsetof(struct io_rings, cqes);

  	/*
  	 * Install ring fd as the very last thing, so we don't risk someone
  	 * having closed it before we finish setup
  	 */
  	ret = io_uring_get_fd(ctx);
  	if (ret < 0)
  		goto err;
  	p->features = IORING_FEAT_SINGLE_MMAP;
  	return ret;
  err:
  	io_ring_ctx_wait_and_kill(ctx);
  	return ret;
  }
  
  /*
 * Sets up an io_uring context, and returns the fd. The application asks for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in the
 * params structure passed in.
   */
  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
  {
  	struct io_uring_params p;
  	long ret;
  	int i;
  
  	if (copy_from_user(&p, params, sizeof(p)))
  		return -EFAULT;
  	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
  		if (p.resv[i])
  			return -EINVAL;
  	}
  	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
  			IORING_SETUP_SQ_AFF))
  		return -EINVAL;
  
  	ret = io_uring_create(entries, &p);
  	if (ret < 0)
  		return ret;
  
  	if (copy_to_user(params, &p, sizeof(p)))
  		return -EFAULT;
  
  	return ret;
  }
  
  SYSCALL_DEFINE2(io_uring_setup, u32, entries,
  		struct io_uring_params __user *, params)
  {
  	return io_uring_setup(entries, params);
  }
  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  			       void __user *arg, unsigned nr_args)
  	__releases(ctx->uring_lock)
  	__acquires(ctx->uring_lock)
  {
  	int ret;
  	/*
  	 * We're inside the ring mutex, if the ref is already dying, then
  	 * someone else killed the ctx or is already going through
  	 * io_uring_register().
  	 */
  	if (percpu_ref_is_dying(&ctx->refs))
  		return -ENXIO;
  	percpu_ref_kill(&ctx->refs);
  
  	/*
  	 * Drop uring mutex before waiting for references to exit. If another
  	 * thread is currently inside io_uring_enter() it might need to grab
  	 * the uring_lock to make progress. If we hold it here across the drain
  	 * wait, then we can deadlock. It's safe to drop the mutex here, since
  	 * no new references will come in after we've killed the percpu ref.
  	 */
  	mutex_unlock(&ctx->uring_lock);
  	wait_for_completion(&ctx->ctx_done);
  	mutex_lock(&ctx->uring_lock);
  
  	switch (opcode) {
  	case IORING_REGISTER_BUFFERS:
  		ret = io_sqe_buffer_register(ctx, arg, nr_args);
  		break;
  	case IORING_UNREGISTER_BUFFERS:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_sqe_buffer_unregister(ctx);
  		break;
  	case IORING_REGISTER_FILES:
  		ret = io_sqe_files_register(ctx, arg, nr_args);
  		break;
  	case IORING_UNREGISTER_FILES:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_sqe_files_unregister(ctx);
  		break;
  	case IORING_REGISTER_EVENTFD:
  		ret = -EINVAL;
  		if (nr_args != 1)
  			break;
  		ret = io_eventfd_register(ctx, arg);
  		break;
  	case IORING_UNREGISTER_EVENTFD:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_eventfd_unregister(ctx);
  		break;
  	default:
  		ret = -EINVAL;
  		break;
  	}
  
  	/* bring the ctx back to life */
  	reinit_completion(&ctx->ctx_done);
  	percpu_ref_reinit(&ctx->refs);
  	return ret;
  }
  
  SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
  		void __user *, arg, unsigned int, nr_args)
  {
  	struct io_ring_ctx *ctx;
  	long ret = -EBADF;
  	struct fd f;
  
  	f = fdget(fd);
  	if (!f.file)
  		return -EBADF;
  
  	ret = -EOPNOTSUPP;
  	if (f.file->f_op != &io_uring_fops)
  		goto out_fput;
  
  	ctx = f.file->private_data;
  
  	mutex_lock(&ctx->uring_lock);
  	ret = __io_uring_register(ctx, opcode, arg, nr_args);
  	mutex_unlock(&ctx->uring_lock);
  out_fput:
  	fdput(f);
  	return ret;
  }
  static int __init io_uring_init(void)
  {
  	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
  	return 0;
  };
  __initcall(io_uring_init);