fs/io_uring.c

  // SPDX-License-Identifier: GPL-2.0
  /*
   * Shared application/kernel submission and completion ring pairs, for
   * supporting fast/efficient IO.
   *
   * A note on the read/write ordering memory barriers that are matched between
   * the application and kernel side.
   *
   * After the application reads the CQ ring tail, it must use an
   * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
   * before writing the tail (using smp_load_acquire to read the tail will
   * do). It also needs a smp_mb() before updating CQ head (ordering the
   * entry load(s) with the head store), pairing with an implicit barrier
   * through a control-dependency in io_get_cqring (smp_store_release to
   * store head will do). Failure to do so could lead to reading invalid
   * CQ entries.
   *
   * Likewise, the application must use an appropriate smp_wmb() before
   * writing the SQ tail (ordering SQ entry stores with the tail store),
   * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
   * to store the tail will do). And it needs a barrier ordering the SQ
   * head load before writing new SQ entries (smp_load_acquire to read
   * head will do).
   *
   * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
   * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
   * updating the SQ tail; a full memory barrier smp_mb() is needed
   * between.
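   *
   * A rough sketch of that sequence from the application side (the ring
   * field and helper names here are illustrative, not the exact uapi
   * layout):
   *
   *	smp_store_release(&sq_ring->tail, local_tail);
   *	smp_mb();
   *	if (READ_ONCE(sq_ring->flags) & IORING_SQ_NEED_WAKEUP)
   *		io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP, NULL);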
   *
   * Also see the examples in the liburing library:
   *
   *	git://git.kernel.dk/liburing
   *
   * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   * from data shared between the kernel and application. This is done both
   * for ordering purposes and to ensure that once a value is loaded from
   * data that the application could potentially modify, it remains stable.
   *
   * Copyright (C) 2018-2019 Jens Axboe
   * Copyright (c) 2018-2019 Christoph Hellwig
   */
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/syscalls.h>
  #include <linux/compat.h>
  #include <linux/refcount.h>
  #include <linux/uio.h>
  
  #include <linux/sched/signal.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/fdtable.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
  #include <linux/mmu_context.h>
  #include <linux/percpu.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>
  #include <linux/kthread.h>
  #include <linux/blkdev.h>
  #include <linux/bvec.h>
  #include <linux/net.h>
  #include <net/sock.h>
  #include <net/af_unix.h>
  #include <net/scm.h>
  #include <linux/anon_inodes.h>
  #include <linux/sched/mm.h>
  #include <linux/uaccess.h>
  #include <linux/nospec.h>
  #include <linux/sizes.h>
  #include <linux/hugetlb.h>
  #include <linux/highmem.h>
  #include <linux/fs_struct.h>
  
  #include <uapi/linux/io_uring.h>
  
  #include "internal.h"
  #define IORING_MAX_ENTRIES	32768
  #define IORING_MAX_FIXED_FILES	1024
  
  struct io_uring {
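  	/*
  	 * head and tail sit in separate cachelines: the kernel and the
  	 * application each write one of them, so keeping them apart avoids
  	 * false sharing between the two sides.
  	 */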
  	u32 head ____cacheline_aligned_in_smp;
  	u32 tail ____cacheline_aligned_in_smp;
  };
  /*
   * This data is shared with the application through the mmap at offsets
   * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
   *
   * The offsets to the member fields are published through struct
   * io_sqring_offsets when calling io_uring_setup.
   */
  struct io_rings {
  	/*
  	 * Head and tail offsets into the ring; the offsets need to be
  	 * masked to get valid indices.
  	 *
  	 * The kernel controls head of the sq ring and the tail of the cq ring,
  	 * and the application controls tail of the sq ring and the head of the
  	 * cq ring.
  	 */
  	struct io_uring		sq, cq;
  	/*
  	 * Bitmasks to apply to head and tail offsets (constant, equals
  	 * ring_entries - 1)
  	 */
  	u32			sq_ring_mask, cq_ring_mask;
  	/* Ring sizes (constant, power of 2) */
  	u32			sq_ring_entries, cq_ring_entries;
  	/*
  	 * Number of invalid entries dropped by the kernel due to
  	 * invalid index stored in array
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application (i.e. get number of "new events" by comparing to
  	 * cached value).
  	 *
  	 * After a new SQ head value was read by the application this
  	 * counter includes all submissions that were dropped reaching
  	 * the new SQ head (and possibly more).
  	 */
  	u32			sq_dropped;
  	/*
  	 * Runtime flags
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application.
  	 *
  	 * The application needs a full memory barrier before checking
  	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  	 */
  	u32			sq_flags;
  	/*
  	 * Number of completion events lost because the queue was full;
  	 * this should be avoided by the application by making sure
  	 * there are not more requests pending than there is space in
  	 * the completion queue.
  	 *
  	 * Written by the kernel, shouldn't be modified by the
  	 * application (i.e. get number of "new events" by comparing to
  	 * cached value).
  	 *
  	 * As completion events come in out of order this counter is not
  	 * ordered with any other data.
  	 */
  	u32			cq_overflow;
  	/*
  	 * Ring buffer of completion events.
  	 *
  	 * The kernel writes completion events fresh every time they are
  	 * produced, so the application is allowed to modify pending
  	 * entries.
  	 */
  	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
  };
  struct io_mapped_ubuf {
  	u64		ubuf;
  	size_t		len;
  	struct		bio_vec *bvec;
  	unsigned int	nr_bvecs;
  };
  struct async_list {
  	spinlock_t		lock;
  	atomic_t		cnt;
  	struct list_head	list;
  
  	struct file		*file;
  	off_t			io_start;
  	size_t			io_len;
  };
  struct io_ring_ctx {
  	struct {
  		struct percpu_ref	refs;
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		unsigned int		flags;
  		bool			compat;
  		bool			account_mem;
  		/*
  		 * Ring buffer of indices into array of io_uring_sqe, which is
  		 * mmapped by the application using the IORING_OFF_SQES offset.
  		 *
  		 * This indirection could e.g. be used to assign fixed
  		 * io_uring_sqe entries to operations and only submit them to
  		 * the queue when needed.
  		 *
  		 * The kernel modifies neither the indices array nor the entries
  		 * array.
  		 */
  		u32			*sq_array;
  		unsigned		cached_sq_head;
  		unsigned		sq_entries;
  		unsigned		sq_mask;
  		unsigned		sq_thread_idle;
  		unsigned		cached_sq_dropped;
  		struct io_uring_sqe	*sq_sqes;
  
  		struct list_head	defer_list;
  		struct list_head	timeout_list;
  	} ____cacheline_aligned_in_smp;
  
  	/* IO offload */
  	struct workqueue_struct	*sqo_wq[2];
  	struct task_struct	*sqo_thread;	/* if using sq thread polling */
  	struct mm_struct	*sqo_mm;
  	wait_queue_head_t	sqo_wait;
  	struct completion	sqo_thread_started;
  
  	struct {
  		unsigned		cached_cq_tail;
  		atomic_t		cached_cq_overflow;
  		unsigned		cq_entries;
  		unsigned		cq_mask;
  		struct wait_queue_head	cq_wait;
  		struct fasync_struct	*cq_fasync;
  		struct eventfd_ctx	*cq_ev_fd;
  		atomic_t		cq_timeouts;
  	} ____cacheline_aligned_in_smp;
  	struct io_rings	*rings;
  	/*
  	 * If used, fixed file set. Writers must ensure that ->refs is dead,
  	 * readers must ensure that ->refs is alive as long as the file* is
  	 * used. Only updated through io_uring_register(2).
  	 */
  	struct file		**user_files;
  	unsigned		nr_user_files;
  	/* if used, fixed mapped user buffers */
  	unsigned		nr_user_bufs;
  	struct io_mapped_ubuf	*user_bufs;
  	struct user_struct	*user;
  	const struct cred	*creds;

  	struct completion	ctx_done;
  
  	struct {
  		struct mutex		uring_lock;
  		wait_queue_head_t	wait;
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		spinlock_t		completion_lock;
  		bool			poll_multi_file;
  		/*
  		 * ->poll_list is protected by the ctx->uring_lock for
  		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
  		 * For SQPOLL, only the single threaded io_sq_thread() will
  		 * manipulate the list, hence no extra locking is needed there.
  		 */
  		struct list_head	poll_list;
  		struct list_head	cancel_list;
  	} ____cacheline_aligned_in_smp;
  	struct async_list	pending_async[2];
  #if defined(CONFIG_UNIX)
  	struct socket		*ring_sock;
  #endif
  
  	struct list_head	task_list;
  	spinlock_t		task_lock;
  };
  
  struct sqe_submit {
  	const struct io_uring_sqe	*sqe;
  	unsigned short			index;
  	u32				sequence;
  	bool				has_user;
  	bool				needs_lock;
  	bool				needs_fixed_file;
  	u8				opcode;
  };
  /*
   * First field must be the file pointer in all the
   * iocb unions! See also 'struct kiocb' in <linux/fs.h>
   */
  struct io_poll_iocb {
  	struct file			*file;
  	struct wait_queue_head		*head;
  	__poll_t			events;
  	bool				done;
  	bool				canceled;
  	struct wait_queue_entry		wait;
  };
  struct io_timeout {
  	struct file			*file;
  	struct hrtimer			timer;
  };
  /*
   * NOTE! Each of the iocb union members has the file pointer
   * as the first entry in their struct definition. So you can
   * access the file pointer through any of the sub-structs,
   * or directly as just 'ki_filp' in this struct.
   */
  struct io_kiocb {
  	union {
  		struct file		*file;
  		struct kiocb		rw;
  		struct io_poll_iocb	poll;
  		struct io_timeout	timeout;
  	};
  
  	struct sqe_submit	submit;
  
  	struct io_ring_ctx	*ctx;
  	struct list_head	list;
  	struct list_head	link_list;
  	unsigned int		flags;
  	refcount_t		refs;
  #define REQ_F_NOWAIT		1	/* must not punt to workers */
  #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
  #define REQ_F_FIXED_FILE	4	/* ctx owns file */
  #define REQ_F_SEQ_PREV		8	/* sequential with previous */
  #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
  #define REQ_F_IO_DRAINED	32	/* drain done */
  #define REQ_F_LINK		64	/* linked sqes */
  #define REQ_F_LINK_DONE		128	/* linked sqes done */
  #define REQ_F_FAIL_LINK		256	/* fail rest of links */
  #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
  #define REQ_F_TIMEOUT		1024	/* timeout request */
  #define REQ_F_ISREG		2048	/* regular file */
  #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
  #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
  #define REQ_F_CANCEL		16384	/* cancel request */
  	unsigned long		fsize;
  	u64			user_data;
  	u32			result;
  	u32			sequence;
  	struct task_struct	*task;

  	struct fs_struct	*fs;
  	struct work_struct	work;
  	struct task_struct	*work_task;
  	struct list_head	task_list;
  };
  
  #define IO_PLUG_THRESHOLD		2
  #define IO_IOPOLL_BATCH			8

  struct io_submit_state {
  	struct blk_plug		plug;
  
  	/*
  	 * io_kiocb alloc cache
  	 */
  	void			*reqs[IO_IOPOLL_BATCH];
  	unsigned		int free_reqs;
  	unsigned		int cur_req;
  
  	/*
  	 * File reference cache
  	 */
  	struct file		*file;
  	unsigned int		fd;
  	unsigned int		has_refs;
  	unsigned int		used_refs;
  	unsigned int		ios_left;
  };
  static void io_sq_wq_submit_work(struct work_struct *work);
  static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
  				 long res);
  static void __io_free_req(struct io_kiocb *req);

  static struct kmem_cache *req_cachep;
  
  static const struct file_operations io_uring_fops;
  
  struct sock *io_uring_get_socket(struct file *file)
  {
  #if defined(CONFIG_UNIX)
  	if (file->f_op == &io_uring_fops) {
  		struct io_ring_ctx *ctx = file->private_data;
  
  		return ctx->ring_sock->sk;
  	}
  #endif
  	return NULL;
  }
  EXPORT_SYMBOL(io_uring_get_socket);
  
  static void io_ring_ctx_ref_free(struct percpu_ref *ref)
  {
  	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
  
  	complete(&ctx->ctx_done);
  }
  
  static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
  {
  	struct io_ring_ctx *ctx;
  	int i;
  
  	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
  	if (!ctx)
  		return NULL;
  	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
  		kfree(ctx);
  		return NULL;
  	}
  
  	ctx->flags = p->flags;
  	init_waitqueue_head(&ctx->sqo_wait);
  	init_waitqueue_head(&ctx->cq_wait);
  	init_completion(&ctx->ctx_done);
  	init_completion(&ctx->sqo_thread_started);
  	mutex_init(&ctx->uring_lock);
  	init_waitqueue_head(&ctx->wait);
  	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
  		spin_lock_init(&ctx->pending_async[i].lock);
  		INIT_LIST_HEAD(&ctx->pending_async[i].list);
  		atomic_set(&ctx->pending_async[i].cnt, 0);
  	}
  	spin_lock_init(&ctx->completion_lock);
  	INIT_LIST_HEAD(&ctx->poll_list);
  	INIT_LIST_HEAD(&ctx->cancel_list);
  	INIT_LIST_HEAD(&ctx->defer_list);
  	INIT_LIST_HEAD(&ctx->timeout_list);
  	INIT_LIST_HEAD(&ctx->task_list);
  	spin_lock_init(&ctx->task_lock);
  	return ctx;
  }
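  /*
   * Rough summary of the sequence check below (used for IOSQE_IO_DRAIN and
   * timeout sequencing): a request still has earlier work in front of it if
   * its submission sequence hasn't yet been caught up by the number of CQEs
   * posted, counting dropped SQEs and overflowed CQEs as completed as well.
   */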
  static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
  				       struct io_kiocb *req)
  {
  	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
  					+ atomic_read(&ctx->cached_cq_overflow);
  }
  static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
  				     struct io_kiocb *req)
  {
  	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
  		return false;
  	return __io_sequence_defer(ctx, req);
  }
  static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
  	if (req && !io_sequence_defer(ctx, req)) {
  		list_del_init(&req->list);
  		return req;
  	}
  
  	return NULL;
  }
  static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  
  	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
  	if (req) {
  		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
  			return NULL;
  		if (!__io_sequence_defer(ctx, req)) {
  			list_del_init(&req->list);
  			return req;
  		}
  	}
  
  	return NULL;
  }
  static void __io_commit_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;

  	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
  		/* order cqe stores with ring update */
  		smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);

  		if (wq_has_sleeper(&ctx->cq_wait)) {
  			wake_up_interruptible(&ctx->cq_wait);
  			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
  		}
  	}
  }
  static inline void io_queue_async_work(struct io_ring_ctx *ctx,
  				       struct io_kiocb *req)
  {
  	unsigned long flags;
  	int rw = 0;

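  	/*
  	 * Pick which of the two sqo_wq workqueues to use: buffered
  	 * (non-O_DIRECT) writes are punted to the second one so that they
  	 * can be limited separately from other async work.
  	 */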
  	if (req->submit.sqe) {
  		switch (req->submit.opcode) {
  		case IORING_OP_WRITEV:
  		case IORING_OP_WRITE_FIXED:
  			rw = !(req->rw.ki_flags & IOCB_DIRECT);
  			break;
  		}
  	}
  	req->task = current;
  
  	spin_lock_irqsave(&ctx->task_lock, flags);
  	list_add(&req->task_list, &ctx->task_list);
  	req->work_task = NULL;
  	spin_unlock_irqrestore(&ctx->task_lock, flags);
  	queue_work(ctx->sqo_wq[rw], &req->work);
  }
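  /*
   * Cancel a pending timeout: if its hrtimer isn't currently firing, post a
   * zero-result completion for it and drop the request.
   */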
  static void io_kill_timeout(struct io_kiocb *req)
  {
  	int ret;
  
  	ret = hrtimer_try_to_cancel(&req->timeout.timer);
  	if (ret != -1) {
  		atomic_inc(&req->ctx->cq_timeouts);
  		list_del(&req->list);
  		io_cqring_fill_event(req->ctx, req->user_data, 0);
  		__io_free_req(req);
  	}
  }
  
  static void io_kill_timeouts(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req, *tmp;
  
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
  		io_kill_timeout(req);
  	spin_unlock_irq(&ctx->completion_lock);
  }
  static void io_commit_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  	while ((req = io_get_timeout_req(ctx)) != NULL)
  		io_kill_timeout(req);
  	__io_commit_cqring(ctx);
  
  	while ((req = io_get_deferred_req(ctx)) != NULL) {
  		if (req->flags & REQ_F_SHADOW_DRAIN) {
  			/* Just for drain, free it. */
  			__io_free_req(req);
  			continue;
  		}
  		req->flags |= REQ_F_IO_DRAINED;
  		io_queue_async_work(ctx, req);
  	}
  }
  static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;
  	unsigned tail;
  
  	tail = ctx->cached_cq_tail;
  	/*
  	 * writes to the cq entry need to come after reading head; the
  	 * control dependency is enough as we're using WRITE_ONCE to
  	 * fill the cq entry
  	 */
  	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
  		return NULL;
  
  	ctx->cached_cq_tail++;
  	return &rings->cqes[tail & ctx->cq_mask];
  }
  
  static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
  				 long res)
  {
  	struct io_uring_cqe *cqe;
  
  	/*
  	 * If we can't get a cq entry, userspace overflowed the
  	 * submission (by quite a lot). Increment the overflow count in
  	 * the ring.
  	 */
  	cqe = io_get_cqring(ctx);
  	if (cqe) {
  		WRITE_ONCE(cqe->user_data, ki_user_data);
  		WRITE_ONCE(cqe->res, res);
  		WRITE_ONCE(cqe->flags, 0);
  	} else {
  		WRITE_ONCE(ctx->rings->cq_overflow,
  				atomic_inc_return(&ctx->cached_cq_overflow));
  	}
  }
  static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
  {
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
  	if (waitqueue_active(&ctx->sqo_wait))
  		wake_up(&ctx->sqo_wait);
  	if (ctx->cq_ev_fd)
  		eventfd_signal(ctx->cq_ev_fd, 1);
  }
  
  static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
  				long res)
  {
  	unsigned long flags;
  
  	spin_lock_irqsave(&ctx->completion_lock, flags);
  	io_cqring_fill_event(ctx, user_data, res);
  	io_commit_cqring(ctx);
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
  	io_cqring_ev_posted(ctx);
  }
  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
  				   struct io_submit_state *state)
  {
  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
  	struct io_kiocb *req;
  
  	if (!percpu_ref_tryget(&ctx->refs))
  		return NULL;
  	if (!state) {
  		req = kmem_cache_alloc(req_cachep, gfp);
  		if (unlikely(!req))
  			goto out;
  	} else if (!state->free_reqs) {
  		size_t sz;
  		int ret;
  
  		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
  		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
  
  		/*
  		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
  		 * retry single alloc to be on the safe side.
  		 */
  		if (unlikely(ret <= 0)) {
  			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
  			if (!state->reqs[0])
  				goto out;
  			ret = 1;
  		}
  		state->free_reqs = ret - 1;
  		state->cur_req = 1;
  		req = state->reqs[0];
  	} else {
  		req = state->reqs[state->cur_req];
  		state->free_reqs--;
  		state->cur_req++;
  	}
  	req->file = NULL;
  	req->ctx = ctx;
  	req->flags = 0;
  	/* one is dropped after submission, the other at completion */
  	refcount_set(&req->refs, 2);
  	req->result = 0;
  	req->fs = NULL;
  	return req;
  out:
  	percpu_ref_put(&ctx->refs);
  	return NULL;
  }
  static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
  {
  	if (*nr) {
  		kmem_cache_free_bulk(req_cachep, *nr, reqs);
  		percpu_ref_put_many(&ctx->refs, *nr);
  		*nr = 0;
  	}
  }
  static void __io_free_req(struct io_kiocb *req)
  {
  	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
  		fput(req->file);
  	percpu_ref_put(&req->ctx->refs);
  	kmem_cache_free(req_cachep, req);
  }
  static void io_req_link_next(struct io_kiocb *req)
  {
  	struct io_kiocb *nxt;
  
  	/*
  	 * The list should never be empty when we are called here. But it could
  	 * potentially happen if the chain is messed up, so check to be on the
  	 * safe side.
  	 */
  	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
  	if (nxt) {
  		list_del(&nxt->list);
  		if (!list_empty(&req->link_list)) {
  			INIT_LIST_HEAD(&nxt->link_list);
  			list_splice(&req->link_list, &nxt->link_list);
  			nxt->flags |= REQ_F_LINK;
  		}
  		nxt->flags |= REQ_F_LINK_DONE;
  		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
  		io_queue_async_work(req->ctx, nxt);
  	}
  }
  
  /*
   * Called if REQ_F_LINK is set, and we fail the head request
   */
  static void io_fail_links(struct io_kiocb *req)
  {
  	struct io_kiocb *link;
  
  	while (!list_empty(&req->link_list)) {
  		link = list_first_entry(&req->link_list, struct io_kiocb, list);
  		list_del(&link->list);
  
  		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
  		__io_free_req(link);
  	}
  }
  
  static void io_free_req(struct io_kiocb *req)
  {
  	/*
  	 * If LINK is set, we have dependent requests in this chain. If we
  	 * didn't fail this request, queue the first one up, moving any other
  	 * dependencies to the next request. In case of failure, fail the rest
  	 * of the chain.
  	 */
  	if (req->flags & REQ_F_LINK) {
  		if (req->flags & REQ_F_FAIL_LINK)
  			io_fail_links(req);
  		else
  			io_req_link_next(req);
  	}
  
  	__io_free_req(req);
  }
  static void io_put_req(struct io_kiocb *req)
  {
  	if (refcount_dec_and_test(&req->refs))
  		io_free_req(req);
  }
  static unsigned io_cqring_events(struct io_rings *rings)
  {
  	/* See comment at the top of this file */
  	smp_rmb();
  	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
  }
  static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
  {
  	struct io_rings *rings = ctx->rings;
  
  	/* make sure SQ entry isn't read before tail */
  	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
  }
  /*
   * Find and free completed poll iocbs
   */
  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
  			       struct list_head *done)
  {
  	void *reqs[IO_IOPOLL_BATCH];
  	struct io_kiocb *req;
  	int to_free;

  	to_free = 0;
  	while (!list_empty(done)) {
  		req = list_first_entry(done, struct io_kiocb, list);
  		list_del(&req->list);
  		io_cqring_fill_event(ctx, req->user_data, req->result);
  		(*nr_events)++;
  		if (refcount_dec_and_test(&req->refs)) {
  			/* If we're not using fixed files, we have to pair the
  			 * completion part with the file put. Use regular
  			 * completions for those, only batch free for fixed
  			 * file and non-linked commands.
  			 */
  			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
  			    REQ_F_FIXED_FILE) {
  				reqs[to_free++] = req;
  				if (to_free == ARRAY_SIZE(reqs))
  					io_free_req_many(ctx, reqs, &to_free);
  			} else {
  				io_free_req(req);
  			}
  		}
  	}

  	io_commit_cqring(ctx);
  	io_free_req_many(ctx, reqs, &to_free);
  }
  
  static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
  			long min)
  {
  	struct io_kiocb *req, *tmp;
  	LIST_HEAD(done);
  	bool spin;
  	int ret;
  
  	/*
  	 * Only spin for completions if we don't have multiple devices hanging
  	 * off our complete list, and we're under the requested amount.
  	 */
  	spin = !ctx->poll_multi_file && *nr_events < min;
  
  	ret = 0;
  	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
  		struct kiocb *kiocb = &req->rw;
  
  		/*
  		 * Move completed entries to our local list. If we find a
  		 * request that requires polling, break out and complete
  		 * the done list first, if we have entries there.
  		 */
  		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
  			list_move_tail(&req->list, &done);
  			continue;
  		}
  		if (!list_empty(&done))
  			break;
  
  		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
  		if (ret < 0)
  			break;
  
  		if (ret && spin)
  			spin = false;
  		ret = 0;
  	}
  
  	if (!list_empty(&done))
  		io_iopoll_complete(ctx, nr_events, &done);
  
  	return ret;
  }
  
  /*
   * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
   * non-spinning poll check - we'll still enter the driver poll loop, but only
   * as a non-spinning completion check.
   */
  static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
  				long min)
  {
  	while (!list_empty(&ctx->poll_list) && !need_resched()) {
  		int ret;
  
  		ret = io_do_iopoll(ctx, nr_events, min);
  		if (ret < 0)
  			return ret;
  		if (!min || *nr_events >= min)
  			return 0;
  	}
  
  	return 1;
  }
  
  /*
   * We can't just wait for polled events to come to us, we have to actively
   * find and complete them.
   */
  static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
  {
  	if (!(ctx->flags & IORING_SETUP_IOPOLL))
  		return;
  
  	mutex_lock(&ctx->uring_lock);
  	while (!list_empty(&ctx->poll_list)) {
  		unsigned int nr_events = 0;
  
  		io_iopoll_getevents(ctx, &nr_events, 1);
  
  		/*
  		 * Ensure we allow local-to-the-cpu processing to take place,
  		 * in this case we need to ensure that we reap all events.
  		 */
  		cond_resched();
  	}
  	mutex_unlock(&ctx->uring_lock);
  }
  static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
  			   long min)
  {
  	int iters = 0, ret = 0;

  	/*
  	 * We disallow the app entering submit/complete with polling, but we
  	 * still need to lock the ring to prevent racing with polled issue
  	 * that got punted to a workqueue.
  	 */
  	mutex_lock(&ctx->uring_lock);
  	do {
  		int tmin = 0;
  		/*
  		 * Don't enter poll loop if we already have events pending.
  		 * If we do, we can potentially be spinning for commands that
  		 * already triggered a CQE (eg in error).
  		 */
  		if (io_cqring_events(ctx->rings))
  			break;
  
  		/*
  		 * If a submit got punted to a workqueue, we can have the
  		 * application entering polling for a command before it gets
  		 * issued. That app will hold the uring_lock for the duration
  		 * of the poll right here, so we need to take a breather every
  		 * now and then to ensure that the issue has a chance to add
  		 * the poll to the issued list. Otherwise we can spin here
  		 * forever, while the workqueue is stuck trying to acquire the
  		 * very same mutex.
  		 */
  		if (!(++iters & 7)) {
  			mutex_unlock(&ctx->uring_lock);
  			mutex_lock(&ctx->uring_lock);
  		}
  		if (*nr_events < min)
  			tmin = min - *nr_events;
  
  		ret = io_iopoll_getevents(ctx, nr_events, tmin);
  		if (ret <= 0)
  			break;
  		ret = 0;
  	} while (min && !*nr_events && !need_resched());
  	mutex_unlock(&ctx->uring_lock);
  	return ret;
  }
  static void kiocb_end_write(struct io_kiocb *req)
  {
  	/*
  	 * Tell lockdep we inherited freeze protection from submission
  	 * thread.
  	 */
  	if (req->flags & REQ_F_ISREG) {
  		struct inode *inode = file_inode(req->file);

  		__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
  	}
  	file_end_write(req->file);
  }
  
  static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
  {
  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
  	if (kiocb->ki_flags & IOCB_WRITE)
  		kiocb_end_write(req);

  	if ((req->flags & REQ_F_LINK) && res != req->result)
  		req->flags |= REQ_F_FAIL_LINK;
  	io_cqring_add_event(req->ctx, req->user_data, res);
  	io_put_req(req);
  }
  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  {
  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
  	if (kiocb->ki_flags & IOCB_WRITE)
  		kiocb_end_write(req);

  	if ((req->flags & REQ_F_LINK) && res != req->result)
  		req->flags |= REQ_F_FAIL_LINK;
  	req->result = res;
  	if (res != -EAGAIN)
  		req->flags |= REQ_F_IOPOLL_COMPLETED;
  }
  
  /*
   * After the iocb has been issued, it's safe to be found on the poll list.
   * Adding the kiocb to the list AFTER submission ensures that we don't
   * find it from a io_iopoll_getevents() thread before the issuer is done
   * accessing the kiocb cookie.
   */
  static void io_iopoll_req_issued(struct io_kiocb *req)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  
  	/*
  	 * Track whether we have multiple files in our lists. This will impact
  	 * how we do polling eventually, not spinning if we're on potentially
  	 * different devices.
  	 */
  	if (list_empty(&ctx->poll_list)) {
  		ctx->poll_multi_file = false;
  	} else if (!ctx->poll_multi_file) {
  		struct io_kiocb *list_req;
  
  		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
  						list);
  		if (list_req->rw.ki_filp != req->rw.ki_filp)
  			ctx->poll_multi_file = true;
  	}
  
  	/*
  	 * For fast devices, IO may have already completed. If it has, add
  	 * it to the front so we find it first.
  	 */
  	if (req->flags & REQ_F_IOPOLL_COMPLETED)
  		list_add(&req->list, &ctx->poll_list);
  	else
  		list_add_tail(&req->list, &ctx->poll_list);
  }
  static void io_file_put(struct io_submit_state *state)
  {
  	if (state->file) {
  		int diff = state->has_refs - state->used_refs;
  
  		if (diff)
  			fput_many(state->file, diff);
  		state->file = NULL;
  	}
  }
  
  /*
   * Get as many references to a file as we have IOs left in this submission,
   * assuming most submissions are for one file, or at least that each file
   * has more than one submission.
   */
  static struct file *io_file_get(struct io_submit_state *state, int fd)
  {
  	if (!state)
  		return fget(fd);
  
  	if (state->file) {
  		if (state->fd == fd) {
  			state->used_refs++;
  			state->ios_left--;
  			return state->file;
  		}
  		io_file_put(state);
  	}
  	state->file = fget_many(fd, state->ios_left);
  	if (!state->file)
  		return NULL;
  
  	state->fd = fd;
  	state->has_refs = state->ios_left;
  	state->used_refs = 1;
  	state->ios_left--;
  	return state->file;
  }
  /*
   * If we tracked the file through the SCM inflight mechanism, we could support
   * any file. For now, just ensure that anything potentially problematic is done
   * inline.
   */
  static bool io_file_supports_async(struct file *file)
  {
  	umode_t mode = file_inode(file)->i_mode;
  
  	if (S_ISBLK(mode) || S_ISCHR(mode))
  		return true;
  	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
  		return true;
  
  	return false;
  }
  static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
  		      bool force_nonblock)
  {
  	const struct io_uring_sqe *sqe = s->sqe;
  	struct io_ring_ctx *ctx = req->ctx;
  	struct kiocb *kiocb = &req->rw;
  	unsigned ioprio;
  	int ret;

  	if (!req->file)
  		return -EBADF;

  	if (S_ISREG(file_inode(req->file)->i_mode))
  		req->flags |= REQ_F_ISREG;
  	if (force_nonblock)
  		req->fsize = rlimit(RLIMIT_FSIZE);
  	/*
  	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
  	 * we know to async punt it even if it was opened O_NONBLOCK
  	 */
  	if (force_nonblock && !io_file_supports_async(req->file)) {
  		req->flags |= REQ_F_MUST_PUNT;
  		return -EAGAIN;
  	}

  	kiocb->ki_pos = READ_ONCE(sqe->off);
  	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
  	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
  
  	ioprio = READ_ONCE(sqe->ioprio);
  	if (ioprio) {
  		ret = ioprio_check_cap(ioprio);
  		if (ret)
  			return ret;
  
  		kiocb->ki_ioprio = ioprio;
  	} else
  		kiocb->ki_ioprio = get_current_ioprio();
  
  	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
  	if (unlikely(ret))
  		return ret;
  
  	/* don't allow async punt if RWF_NOWAIT was requested */
  	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
  	    (req->file->f_flags & O_NONBLOCK))
  		req->flags |= REQ_F_NOWAIT;
  
  	if (force_nonblock)
  		kiocb->ki_flags |= IOCB_NOWAIT;

  	if (ctx->flags & IORING_SETUP_IOPOLL) {
  		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
  		    !kiocb->ki_filp->f_op->iopoll)
  			return -EOPNOTSUPP;

  		kiocb->ki_flags |= IOCB_HIPRI;
  		kiocb->ki_complete = io_complete_rw_iopoll;
  		req->result = 0;
  	} else {
  		if (kiocb->ki_flags & IOCB_HIPRI)
  			return -EINVAL;
  		kiocb->ki_complete = io_complete_rw;
  	}
  	return 0;
  }
  
  static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
  {
  	switch (ret) {
  	case -EIOCBQUEUED:
  		break;
  	case -ERESTARTSYS:
  	case -ERESTARTNOINTR:
  	case -ERESTARTNOHAND:
  	case -ERESTART_RESTARTBLOCK:
  		/*
  		 * We can't just restart the syscall, since previously
  		 * submitted sqes may already be in progress. Just fail this
  		 * IO with EINTR.
  		 */
  		ret = -EINTR;
  		/* fall through */
  	default:
  		kiocb->ki_complete(kiocb, ret, 0);
  	}
  }
  static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
  			   const struct io_uring_sqe *sqe,
  			   struct iov_iter *iter)
  {
  	size_t len = READ_ONCE(sqe->len);
  	struct io_mapped_ubuf *imu;
  	unsigned index, buf_index;
  	size_t offset;
  	u64 buf_addr;
  
  	/* attempt to use fixed buffers without having provided iovecs */
  	if (unlikely(!ctx->user_bufs))
  		return -EFAULT;
  
  	buf_index = READ_ONCE(sqe->buf_index);
  	if (unlikely(buf_index >= ctx->nr_user_bufs))
  		return -EFAULT;
  
  	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
  	imu = &ctx->user_bufs[index];
  	buf_addr = READ_ONCE(sqe->addr);
  
  	/* overflow */
  	if (buf_addr + len < buf_addr)
  		return -EFAULT;
  	/* not inside the mapped region */
  	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
  		return -EFAULT;
  
  	/*
  	 * May not be a start of buffer, set size appropriately
  	 * and advance us to the beginning.
  	 */
  	offset = buf_addr - imu->ubuf;
  	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
  
  	if (offset) {
  		/*
  		 * Don't use iov_iter_advance() here, as it's really slow for
  		 * using the latter parts of a big fixed buffer - it iterates
  		 * over each segment manually. We can cheat a bit here, because
  		 * we know that:
  		 *
  		 * 1) it's a BVEC iter, we set it up
  		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
  		 *    first and last bvec
  		 *
  		 * So just find our index, and adjust the iterator afterwards.
  		 * If the offset is within the first bvec (or the whole first
  		 * bvec), just use iov_iter_advance(). This makes it easier
  		 * since we can just skip the first segment, which may not
  		 * be PAGE_SIZE aligned.
  		 */
  		const struct bio_vec *bvec = imu->bvec;
  
  		if (offset <= bvec->bv_len) {
  			iov_iter_advance(iter, offset);
  		} else {
  			unsigned long seg_skip;
  
  			/* skip first vec */
  			offset -= bvec->bv_len;
  			seg_skip = 1 + (offset >> PAGE_SHIFT);
  
  			iter->bvec = bvec + seg_skip;
  			iter->nr_segs -= seg_skip;
  			iter->count -= bvec->bv_len + offset;
  			iter->iov_offset = offset & ~PAGE_MASK;
  		}
  	}
  	return len;
  }
  static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
  			       struct io_kiocb *req, struct iovec **iovec,
  			       struct iov_iter *iter)
  {
  	const struct io_uring_sqe *sqe = req->submit.sqe;
  	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
  	size_t sqe_len = READ_ONCE(sqe->len);
  	u8 opcode;
  	opcode = req->submit.opcode;
  	if (opcode == IORING_OP_READ_FIXED ||
  	    opcode == IORING_OP_WRITE_FIXED) {
  		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
  		*iovec = NULL;
  		return ret;
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
1207

a4d61e66e   Jens Axboe   io_uring: prevent...
1208
  	if (!req->submit.has_user)
  		return -EFAULT;
  
  #ifdef CONFIG_COMPAT
  	if (ctx->compat)
  		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
  						iovec, iter);
  #endif
  
  	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
  }
  static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
  {
  	if (al->file == kiocb->ki_filp) {
  		off_t start, end;
  
  		/*
  		 * Allow merging if we're anywhere in the range of the same
  		 * page. Generally this happens for sub-page reads or writes,
  		 * and it's beneficial to allow the first worker to bring the
  		 * page in, so that the piggybacked work can then operate on
  		 * the cached page.
  		 */
  		start = al->io_start & PAGE_MASK;
  		end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
  		if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
  			return true;
  	}
  
  	al->file = NULL;
  	return false;
  }
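  /*
   * A small worked example of the range check above (a sketch assuming
   * 4 KiB pages): with al->io_start = 4100 and al->io_len = 100, start
   * rounds down to 4096 and end rounds up to 8192, so a new request is
   * considered mergeable if its ki_pos lands anywhere in [4096, 8192].
   */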
  /*
   * Make a note of the last file/offset/direction we punted to async
   * context. We'll use this information to see if we can piggyback a
   * sequential request onto the previous one, if it still hasn't been
   * completed by the async worker.
   */
  static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
  {
  	struct async_list *async_list = &req->ctx->pending_async[rw];
  	struct kiocb *kiocb = &req->rw;
  	struct file *filp = kiocb->ki_filp;
31b515106   Jens Axboe   io_uring: allow w...
1251

6d5d5ac52   Jens Axboe   io_uring: extend ...
1252
  	if (io_should_merge(async_list, kiocb)) {
9310a7ba6   Zhengyuan Liu   io_uring: track i...
1253
  		unsigned long max_bytes;
31b515106   Jens Axboe   io_uring: allow w...
1254
1255
  
  		/* Use 8x RA size as a decent limiter for both reads/writes */
  		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
  		if (!max_bytes)
  			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
  
  		/* If the max length is exceeded, reset the state */
  		if (async_list->io_len + len <= max_bytes) {
31b515106   Jens Axboe   io_uring: allow w...
1262
  			req->flags |= REQ_F_SEQ_PREV;
9310a7ba6   Zhengyuan Liu   io_uring: track i...
1263
  			async_list->io_len += len;
31b515106   Jens Axboe   io_uring: allow w...
1264
  		} else {
6d5d5ac52   Jens Axboe   io_uring: extend ...
1265
  			async_list->file = NULL;
  		}
  	}
  
  	/* New file? Reset state. */
  	if (async_list->file != filp) {
6d5d5ac52   Jens Axboe   io_uring: extend ...
1271
1272
  		async_list->io_start = kiocb->ki_pos;
  		async_list->io_len = len;
31b515106   Jens Axboe   io_uring: allow w...
1273
1274
  		async_list->file = filp;
  	}
31b515106   Jens Axboe   io_uring: allow w...
1275
  }
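  /*
   * Example of the limiter above (a sketch assuming 4 KiB pages): with a
   * readahead window of f_ra.ra_pages = 32, max_bytes = 32 << (12 + 3),
   * i.e. roughly 1 MiB of sequential IO may be piggybacked onto the punted
   * work before the state is reset.
   */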
  /*
   * For files that don't have ->read_iter() and ->write_iter(), handle them
   * by looping over ->read() or ->write() manually.
   */
  static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
  			   struct iov_iter *iter)
  {
  	ssize_t ret = 0;
  
  	/*
  	 * Don't support polled IO through this interface, and we can't
  	 * support non-blocking either. For the latter, this just causes
  	 * the kiocb to be handled from an async context.
  	 */
  	if (kiocb->ki_flags & IOCB_HIPRI)
  		return -EOPNOTSUPP;
  	if (kiocb->ki_flags & IOCB_NOWAIT)
  		return -EAGAIN;
  
  	while (iov_iter_count(iter)) {
f246eedba   Pavel Begunkov   io_uring: fix dea...
1296
  		struct iovec iovec;
32960613b   Jens Axboe   io_uring: correct...
1297
  		ssize_t nr;
  		if (!iov_iter_is_bvec(iter)) {
  			iovec = iov_iter_iovec(iter);
  		} else {
  			/* fixed buffers import bvec */
  			iovec.iov_base = kmap(iter->bvec->bv_page)
  						+ iter->iov_offset;
  			iovec.iov_len = min(iter->count,
  					iter->bvec->bv_len - iter->iov_offset);
  		}
  		if (rw == READ) {
  			nr = file->f_op->read(file, iovec.iov_base,
  					      iovec.iov_len, &kiocb->ki_pos);
  		} else {
  			nr = file->f_op->write(file, iovec.iov_base,
  					       iovec.iov_len, &kiocb->ki_pos);
  		}
f246eedba   Pavel Begunkov   io_uring: fix dea...
1314
1315
  		if (iov_iter_is_bvec(iter))
  			kunmap(iter->bvec->bv_page);
  		if (nr < 0) {
  			if (!ret)
  				ret = nr;
  			break;
  		}
  		ret += nr;
  		if (nr != iovec.iov_len)
  			break;
  		iov_iter_advance(iter, nr);
  	}
  
  	return ret;
  }
e0c5c576d   Jens Axboe   io_uring: make io...
1329
  static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
8358e3a82   Jens Axboe   io_uring: remove ...
1330
  		   bool force_nonblock)
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct kiocb *kiocb = &req->rw;
  	struct iov_iter iter;
  	struct file *file;
31b515106   Jens Axboe   io_uring: allow w...
1336
  	size_t iov_count;
9d93a3f5a   Jens Axboe   io_uring: punt sh...
1337
  	ssize_t read_size, ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1338

8358e3a82   Jens Axboe   io_uring: remove ...
1339
  	ret = io_prep_rw(req, s, force_nonblock);
  	if (ret)
  		return ret;
  	file = kiocb->ki_filp;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1343
  	if (unlikely(!(file->f_mode & FMODE_READ)))
09bb83943   Jens Axboe   io_uring: fix fge...
1344
  		return -EBADF;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1345

a4d61e66e   Jens Axboe   io_uring: prevent...
1346
  	ret = io_import_iovec(req->ctx, READ, req, &iovec, &iter);
87e5e6dab   Jens Axboe   uio: make import_...
1347
  	if (ret < 0)
09bb83943   Jens Axboe   io_uring: fix fge...
1348
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1349

9d93a3f5a   Jens Axboe   io_uring: punt sh...
1350
  	read_size = ret;
9e645e110   Jens Axboe   io_uring: add sup...
1351
1352
  	if (req->flags & REQ_F_LINK)
  		req->result = read_size;
31b515106   Jens Axboe   io_uring: allow w...
1353
1354
  	iov_count = iov_iter_count(&iter);
  	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1355
1356
  	if (!ret) {
  		ssize_t ret2;
32960613b   Jens Axboe   io_uring: correct...
1357
1358
  		if (file->f_op->read_iter)
  			ret2 = call_read_iter(file, kiocb, &iter);
5de0b5247   Guoyu Huang   io_uring: Fix NUL...
1359
  		else if (req->file->f_op->read)
32960613b   Jens Axboe   io_uring: correct...
1360
  			ret2 = loop_rw_iter(READ, file, kiocb, &iter);
5de0b5247   Guoyu Huang   io_uring: Fix NUL...
1361
1362
  		else
  			ret2 = -EINVAL;
32960613b   Jens Axboe   io_uring: correct...
1363

  		/*
  		 * In case of a short read, punt to async. This can happen
  		 * if we have data partially cached. Alternatively we can
  		 * return the short read, in which case the application will
  		 * need to issue another SQE and wait for it. That SQE will
  		 * need async punt anyway, so it's more efficient to do it
  		 * here.
  		 */
  		if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
  		    (req->flags & REQ_F_ISREG) &&
  		    ret2 > 0 && ret2 < read_size)
9d93a3f5a   Jens Axboe   io_uring: punt sh...
1375
1376
  			ret2 = -EAGAIN;
  		/* Catch -EAGAIN return for forced non-blocking submission */
31b515106   Jens Axboe   io_uring: allow w...
1377
  		if (!force_nonblock || ret2 != -EAGAIN) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
1378
  			io_rw_done(kiocb, ret2);
  		} else {
  			/*
  			 * If ->needs_lock is true, we're already in async
  			 * context.
  			 */
  			if (!s->needs_lock)
  				io_async_list_note(READ, req, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1386
  			ret = -EAGAIN;
31b515106   Jens Axboe   io_uring: allow w...
1387
  		}
2b188cc1b   Jens Axboe   Add io_uring IO i...
1388
1389
  	}
  	kfree(iovec);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1390
1391
  	return ret;
  }
e0c5c576d   Jens Axboe   io_uring: make io...
1392
  static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
8358e3a82   Jens Axboe   io_uring: remove ...
1393
  		    bool force_nonblock)
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct kiocb *kiocb = &req->rw;
  	struct iov_iter iter;
  	struct file *file;
31b515106   Jens Axboe   io_uring: allow w...
1399
  	size_t iov_count;
87e5e6dab   Jens Axboe   uio: make import_...
1400
  	ssize_t ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1401

8358e3a82   Jens Axboe   io_uring: remove ...
1402
  	ret = io_prep_rw(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1403
1404
  	if (ret)
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1405

2b188cc1b   Jens Axboe   Add io_uring IO i...
1406
1407
  	file = kiocb->ki_filp;
  	if (unlikely(!(file->f_mode & FMODE_WRITE)))
09bb83943   Jens Axboe   io_uring: fix fge...
1408
  		return -EBADF;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1409

a4d61e66e   Jens Axboe   io_uring: prevent...
1410
  	ret = io_import_iovec(req->ctx, WRITE, req, &iovec, &iter);
87e5e6dab   Jens Axboe   uio: make import_...
1411
  	if (ret < 0)
09bb83943   Jens Axboe   io_uring: fix fge...
1412
  		return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1413

9e645e110   Jens Axboe   io_uring: add sup...
1414
1415
  	if (req->flags & REQ_F_LINK)
  		req->result = ret;
  	iov_count = iov_iter_count(&iter);
  
  	ret = -EAGAIN;
  	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
  		/* If ->needs_lock is true, we're already in async context. */
  		if (!s->needs_lock)
  			io_async_list_note(WRITE, req, iov_count);
  		goto out_free;
  	}
  
  	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1427
  	if (!ret) {
9bf7933fc   Roman Penyaev   io_uring: offload...
1428
  		ssize_t ret2;
  		/*
  		 * Open-code file_start_write here to grab freeze protection,
  		 * which will be released by another thread in
  		 * io_complete_rw().  Fool lockdep by telling it the lock got
  		 * released so that it doesn't complain about the held lock when
  		 * we return to userspace.
  		 */
491381ce0   Jens Axboe   io_uring: fix up ...
1436
  		if (req->flags & REQ_F_ISREG) {
  			__sb_start_write(file_inode(file)->i_sb,
  						SB_FREEZE_WRITE, true);
  			__sb_writers_release(file_inode(file)->i_sb,
  						SB_FREEZE_WRITE);
  		}
  		kiocb->ki_flags |= IOCB_WRITE;
9bf7933fc   Roman Penyaev   io_uring: offload...
1443

7661469ef   Jens Axboe   io_uring: honor o...
1444
1445
  		if (!force_nonblock)
  			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
32960613b   Jens Axboe   io_uring: correct...
1446
1447
  		if (file->f_op->write_iter)
  			ret2 = call_write_iter(file, kiocb, &iter);
5de0b5247   Guoyu Huang   io_uring: Fix NUL...
1448
  		else if (req->file->f_op->write)
32960613b   Jens Axboe   io_uring: correct...
1449
  			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
5de0b5247   Guoyu Huang   io_uring: Fix NUL...
1450
1451
  		else
  			ret2 = -EINVAL;
  
  		if (!force_nonblock)
  			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
  		if (!force_nonblock || ret2 != -EAGAIN) {
  			io_rw_done(kiocb, ret2);
  		} else {
  			/*
  			 * If ->needs_lock is true, we're already in async
  			 * context.
  			 */
  			if (!s->needs_lock)
  				io_async_list_note(WRITE, req, iov_count);
  			ret = -EAGAIN;
  		}
2b188cc1b   Jens Axboe   Add io_uring IO i...
1466
  	}
31b515106   Jens Axboe   io_uring: allow w...
1467
  out_free:
2b188cc1b   Jens Axboe   Add io_uring IO i...
1468
  	kfree(iovec);
  	return ret;
  }
  
  /*
   * IORING_OP_NOP just posts a completion event, nothing else.
   */
  static int io_nop(struct io_kiocb *req, u64 user_data)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	long err = 0;
def596e95   Jens Axboe   io_uring: support...
1479
1480
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
c71ffb673   Jens Axboe   io_uring: remove ...
1481
  	io_cqring_add_event(ctx, user_data, err);
e65ef56db   Jens Axboe   io_uring: use reg...
1482
  	io_put_req(req);
2b188cc1b   Jens Axboe   Add io_uring IO i...
1483
1484
  	return 0;
  }
c992fe292   Christoph Hellwig   io_uring: add fsy...
1485
1486
  static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
6b06314c4   Jens Axboe   io_uring: add fil...
1487
  	struct io_ring_ctx *ctx = req->ctx;
c992fe292   Christoph Hellwig   io_uring: add fsy...
1488

09bb83943   Jens Axboe   io_uring: fix fge...
1489
1490
  	if (!req->file)
  		return -EBADF;
c992fe292   Christoph Hellwig   io_uring: add fsy...
1491

6b06314c4   Jens Axboe   io_uring: add fil...
1492
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e95   Jens Axboe   io_uring: support...
1493
  		return -EINVAL;
edafccee5   Jens Axboe   io_uring: add sup...
1494
  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe292   Christoph Hellwig   io_uring: add fsy...
1495
  		return -EINVAL;
  	return 0;
  }
  
  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		    bool force_nonblock)
  {
  	loff_t sqe_off = READ_ONCE(sqe->off);
  	loff_t sqe_len = READ_ONCE(sqe->len);
  	loff_t end = sqe_off + sqe_len;
  	unsigned fsync_flags;
  	int ret;
  
  	fsync_flags = READ_ONCE(sqe->fsync_flags);
  	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
  		return -EINVAL;
  
  	ret = io_prep_fsync(req, sqe);
  	if (ret)
  		return ret;
  
  	/* fsync always requires a blocking context */
  	if (force_nonblock)
  		return -EAGAIN;
  
  	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
  				end > 0 ? end : LLONG_MAX,
  				fsync_flags & IORING_FSYNC_DATASYNC);
9e645e110   Jens Axboe   io_uring: add sup...
1523
1524
  	if (ret < 0 && (req->flags & REQ_F_LINK))
  		req->flags |= REQ_F_FAIL_LINK;
c71ffb673   Jens Axboe   io_uring: remove ...
1525
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
1526
  	io_put_req(req);
c992fe292   Christoph Hellwig   io_uring: add fsy...
1527
1528
  	return 0;
  }
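  /*
   * For illustration, a hedged sketch of how an application might fill the
   * SQE for this opcode (field names are those read above; the fd value
   * itself is assumed):
   *
   *	sqe->opcode      = IORING_OP_FSYNC;
   *	sqe->fd          = fd;
   *	sqe->off         = 0;                      (start of the range)
   *	sqe->len         = 0;                      (0 means sync to LLONG_MAX)
   *	sqe->fsync_flags = IORING_FSYNC_DATASYNC;
   *
   * With off and len both 0, 'end' is 0 and the range falls back to
   * LLONG_MAX, i.e. the whole file is synced with fdatasync() semantics.
   */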
  static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	int ret = 0;
  
  	if (!req->file)
  		return -EBADF;
  
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
  		return -EINVAL;
  	return ret;
  }
  
  static int io_sync_file_range(struct io_kiocb *req,
  			      const struct io_uring_sqe *sqe,
  			      bool force_nonblock)
  {
  	loff_t sqe_off;
  	loff_t sqe_len;
  	unsigned flags;
  	int ret;
  
  	ret = io_prep_sfr(req, sqe);
  	if (ret)
  		return ret;
  
  	/* sync_file_range always requires a blocking context */
  	if (force_nonblock)
  		return -EAGAIN;
  
  	sqe_off = READ_ONCE(sqe->off);
  	sqe_len = READ_ONCE(sqe->len);
  	flags = READ_ONCE(sqe->sync_range_flags);
  
  	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
9e645e110   Jens Axboe   io_uring: add sup...
1566
1567
  	if (ret < 0 && (req->flags & REQ_F_LINK))
  		req->flags |= REQ_F_FAIL_LINK;
c71ffb673   Jens Axboe   io_uring: remove ...
1568
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
  	io_put_req(req);
  	return 0;
  }
0fa03c624   Jens Axboe   io_uring: add sup...
1572
  #if defined(CONFIG_NET)
  static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  			   bool force_nonblock,
  		   long (*fn)(struct socket *, struct user_msghdr __user *,
  				unsigned int))
  {
  	struct socket *sock;
  	int ret;
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  
  	sock = sock_from_file(req->file, &ret);
  	if (sock) {
  		struct user_msghdr __user *msg;
  		unsigned flags;
  
  		flags = READ_ONCE(sqe->msg_flags);
  		if (flags & MSG_DONTWAIT)
  			req->flags |= REQ_F_NOWAIT;
  		else if (force_nonblock)
  			flags |= MSG_DONTWAIT;
  #ifdef CONFIG_COMPAT
  		if (req->ctx->compat)
  			flags |= MSG_CMSG_COMPAT;
  #endif
0fa03c624   Jens Axboe   io_uring: add sup...
1598
1599
  		msg = (struct user_msghdr __user *) (unsigned long)
  			READ_ONCE(sqe->addr);
aa1fa28fc   Jens Axboe   io_uring: add sup...
1600
  		ret = fn(sock, msg, flags);
0fa03c624   Jens Axboe   io_uring: add sup...
1601
1602
  		if (force_nonblock && ret == -EAGAIN)
  			return ret;
57aabff8c   Jens Axboe   io_uring: transfo...
1603
1604
  		if (ret == -ERESTARTSYS)
  			ret = -EINTR;
0fa03c624   Jens Axboe   io_uring: add sup...
1605
  	}
  	if (req->fs) {
  		struct fs_struct *fs = req->fs;
  
  		spin_lock(&req->fs->lock);
  		if (--fs->users)
  			fs = NULL;
  		spin_unlock(&req->fs->lock);
  		if (fs)
  			free_fs_struct(fs);
  	}
c71ffb673   Jens Axboe   io_uring: remove ...
1616
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
  	io_put_req(req);
  	return 0;
  }
  #endif
  
  static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		      bool force_nonblock)
  {
  #if defined(CONFIG_NET)
  	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
  #else
  	return -EOPNOTSUPP;
  #endif
  }
  
  static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  		      bool force_nonblock)
  {
  #if defined(CONFIG_NET)
  	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
  #else
  	return -EOPNOTSUPP;
  #endif
  }
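  /*
   * Both wrappers above funnel into io_send_recvmsg(); a hedged sketch of
   * the SQE fields involved (names taken from the code, the msghdr and
   * socket fd themselves are assumed):
   *
   *	sqe->opcode    = IORING_OP_SENDMSG;   (or IORING_OP_RECVMSG)
   *	sqe->fd        = sockfd;
   *	sqe->addr      = (u64) &msg;          (struct user_msghdr pointer)
   *	sqe->msg_flags = 0;                   (MSG_DONTWAIT is OR'ed in for
   *	                                       a nonblocking submission)
   */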
5d17b4a4b   Jens Axboe   io_uring: add sup...
1641

  static void io_poll_remove_one(struct io_kiocb *req)
  {
  	struct io_poll_iocb *poll = &req->poll;
  
  	spin_lock(&poll->head->lock);
  	WRITE_ONCE(poll->canceled, true);
  	if (!list_empty(&poll->wait.entry)) {
  		list_del_init(&poll->wait.entry);
18d9be1a9   Jens Axboe   io_uring: add io_...
1650
  		io_queue_async_work(req->ctx, req);
  	}
  	spin_unlock(&poll->head->lock);
  
  	list_del_init(&req->list);
  }
  
  static void io_poll_remove_all(struct io_ring_ctx *ctx)
  {
  	struct io_kiocb *req;
  
  	spin_lock_irq(&ctx->completion_lock);
  	while (!list_empty(&ctx->cancel_list)) {
  		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
  		io_poll_remove_one(req);
  	}
  	spin_unlock_irq(&ctx->completion_lock);
  }
  
  /*
   * Find a running poll command that matches one specified in sqe->addr,
   * and remove it if found.
   */
  static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_ring_ctx *ctx = req->ctx;
  	struct io_kiocb *poll_req, *next;
  	int ret = -ENOENT;
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
  	    sqe->poll_events)
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
  		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
  			io_poll_remove_one(poll_req);
  			ret = 0;
  			break;
  		}
  	}
  	spin_unlock_irq(&ctx->completion_lock);
c71ffb673   Jens Axboe   io_uring: remove ...
1694
  	io_cqring_add_event(req->ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
1695
  	io_put_req(req);
221c5eb23   Jens Axboe   io_uring: add sup...
1696
1697
  	return 0;
  }
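  /*
   * Usage note (an assumed sketch based on the matching loop above): to
   * cancel a pending IORING_OP_POLL_ADD, userspace submits an
   * IORING_OP_POLL_REMOVE whose sqe->addr holds the user_data of the
   * original poll request; if no matching entry is still on cancel_list,
   * the request completes with -ENOENT.
   */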
8c8387887   Jens Axboe   io_uring: fix pol...
1698
1699
  static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
  			     __poll_t mask)
221c5eb23   Jens Axboe   io_uring: add sup...
1700
  {
8c8387887   Jens Axboe   io_uring: fix pol...
1701
  	req->poll.done = true;
c71ffb673   Jens Axboe   io_uring: remove ...
1702
  	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
8c8387887   Jens Axboe   io_uring: fix pol...
1703
  	io_commit_cqring(ctx);
  }
  
  static void io_poll_complete_work(struct work_struct *work)
  {
  	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
  	struct io_poll_iocb *poll = &req->poll;
  	struct poll_table_struct pt = { ._key = poll->events };
  	struct io_ring_ctx *ctx = req->ctx;
8387e3688   Jens Axboe   io_uring: async w...
1712
  	const struct cred *old_cred;
221c5eb23   Jens Axboe   io_uring: add sup...
1713
  	__poll_t mask = 0;
8387e3688   Jens Axboe   io_uring: async w...
1714
  	old_cred = override_creds(ctx->creds);
  	if (!READ_ONCE(poll->canceled))
  		mask = vfs_poll(poll->file, &pt) & poll->events;
  
  	/*
  	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
  	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
  	 * synchronize with them.  In the cancellation case the list_del_init
  	 * itself is not actually needed, but harmless so we keep it in to
  	 * avoid further branches in the fast path.
  	 */
  	spin_lock_irq(&ctx->completion_lock);
  	if (!mask && !READ_ONCE(poll->canceled)) {
  		add_wait_queue(poll->head, &poll->wait);
  		spin_unlock_irq(&ctx->completion_lock);
8387e3688   Jens Axboe   io_uring: async w...
1729
  		goto out;
221c5eb23   Jens Axboe   io_uring: add sup...
1730
1731
  	}
  	list_del_init(&req->list);
8c8387887   Jens Axboe   io_uring: fix pol...
1732
  	io_poll_complete(ctx, req, mask);
221c5eb23   Jens Axboe   io_uring: add sup...
1733
  	spin_unlock_irq(&ctx->completion_lock);
8c8387887   Jens Axboe   io_uring: fix pol...
1734
1735
  	io_cqring_ev_posted(ctx);
  	io_put_req(req);
8387e3688   Jens Axboe   io_uring: async w...
1736
1737
  out:
  	revert_creds(old_cred);
  }
  
  static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
  			void *key)
  {
  	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
  							wait);
  	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
  	struct io_ring_ctx *ctx = req->ctx;
  	__poll_t mask = key_to_poll(key);
8c8387887   Jens Axboe   io_uring: fix pol...
1748
  	unsigned long flags;
221c5eb23   Jens Axboe   io_uring: add sup...
1749
1750
  
  	/* for instances that support it check for an event match first: */
8c8387887   Jens Axboe   io_uring: fix pol...
1751
1752
  	if (mask && !(mask & poll->events))
  		return 0;
221c5eb23   Jens Axboe   io_uring: add sup...
1753

8c8387887   Jens Axboe   io_uring: fix pol...
1754
  	list_del_init(&poll->wait.entry);
221c5eb23   Jens Axboe   io_uring: add sup...
1755

  	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
  		list_del(&req->list);
  		io_poll_complete(ctx, req, mask);
  		spin_unlock_irqrestore(&ctx->completion_lock, flags);
221c5eb23   Jens Axboe   io_uring: add sup...
1760

  		io_cqring_ev_posted(ctx);
  		io_put_req(req);
  	} else {
18d9be1a9   Jens Axboe   io_uring: add io_...
1764
  		io_queue_async_work(ctx, req);
221c5eb23   Jens Axboe   io_uring: add sup...
1765
  	}
  	return 1;
  }
  
  struct io_poll_table {
  	struct poll_table_struct pt;
  	struct io_kiocb *req;
  	int error;
  };
  
  static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
  			       struct poll_table_struct *p)
  {
  	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
  
  	if (unlikely(pt->req->poll.head)) {
  		pt->error = -EINVAL;
  		return;
  	}
  
  	pt->error = 0;
  	pt->req->poll.head = head;
  	add_wait_queue(head, &pt->req->poll.wait);
  }
  
  static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
  	struct io_poll_iocb *poll = &req->poll;
  	struct io_ring_ctx *ctx = req->ctx;
  	struct io_poll_table ipt;
8c8387887   Jens Axboe   io_uring: fix pol...
1795
  	bool cancel = false;
221c5eb23   Jens Axboe   io_uring: add sup...
1796
1797
  	__poll_t mask;
  	u16 events;
  
  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
  		return -EINVAL;
09bb83943   Jens Axboe   io_uring: fix fge...
1803
1804
  	if (!poll->file)
  		return -EBADF;
221c5eb23   Jens Axboe   io_uring: add sup...
1805

6cc47d1d2   Jens Axboe   io_uring: ensure ...
1806
  	req->submit.sqe = NULL;
  	INIT_WORK(&req->work, io_poll_complete_work);
  	events = READ_ONCE(sqe->poll_events);
  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
221c5eb23   Jens Axboe   io_uring: add sup...
1810
  	poll->head = NULL;
8c8387887   Jens Axboe   io_uring: fix pol...
1811
  	poll->done = false;
  	poll->canceled = false;
  
  	ipt.pt._qproc = io_poll_queue_proc;
  	ipt.pt._key = poll->events;
  	ipt.req = req;
  	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
  
  	/* initialize the list so that we can do list_empty checks */
  	INIT_LIST_HEAD(&poll->wait.entry);
  	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
36703247d   Jens Axboe   io_uring: ensure ...
1822
  	INIT_LIST_HEAD(&req->list);
221c5eb23   Jens Axboe   io_uring: add sup...
1823
  	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
221c5eb23   Jens Axboe   io_uring: add sup...
1824
1825
  
  	spin_lock_irq(&ctx->completion_lock);
  	if (likely(poll->head)) {
  		spin_lock(&poll->head->lock);
  		if (unlikely(list_empty(&poll->wait.entry))) {
  			if (ipt.error)
  				cancel = true;
  			ipt.error = 0;
  			mask = 0;
  		}
  		if (mask || ipt.error)
  			list_del_init(&poll->wait.entry);
  		else if (cancel)
  			WRITE_ONCE(poll->canceled, true);
  		else if (!poll->done) /* actually waiting for an event */
  			list_add_tail(&req->list, &ctx->cancel_list);
  		spin_unlock(&poll->head->lock);
  	}
  	if (mask) { /* no async, we'd stolen it */
221c5eb23   Jens Axboe   io_uring: add sup...
1843
  		ipt.error = 0;
8c8387887   Jens Axboe   io_uring: fix pol...
1844
  		io_poll_complete(ctx, req, mask);
221c5eb23   Jens Axboe   io_uring: add sup...
1845
  	}
221c5eb23   Jens Axboe   io_uring: add sup...
1846
  	spin_unlock_irq(&ctx->completion_lock);
8c8387887   Jens Axboe   io_uring: fix pol...
1847
1848
  	if (mask) {
  		io_cqring_ev_posted(ctx);
e65ef56db   Jens Axboe   io_uring: use reg...
1849
  		io_put_req(req);
221c5eb23   Jens Axboe   io_uring: add sup...
1850
  	}
8c8387887   Jens Axboe   io_uring: fix pol...
1851
  	return ipt.error;
221c5eb23   Jens Axboe   io_uring: add sup...
1852
  }
  static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
  {
  	struct io_ring_ctx *ctx;
ef03681ae   zhangyi (F)   io_uring : correc...
1856
  	struct io_kiocb *req, *prev;
  	unsigned long flags;
  
  	req = container_of(timer, struct io_kiocb, timeout.timer);
  	ctx = req->ctx;
  	atomic_inc(&ctx->cq_timeouts);
  
  	spin_lock_irqsave(&ctx->completion_lock, flags);
  	/*
  	 * Adjust the sequence of requests before the current one, because
  	 * this request will consume a slot in the cq_ring and the cq_tail
  	 * pointer will be increased; otherwise other timeout reqs may return
  	 * early without waiting for enough wait_nr.
  	 */
  	prev = req;
  	list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
  		prev->sequence++;
  	list_del(&req->list);
  
  	io_cqring_fill_event(ctx, req->user_data, -ETIME);
  	io_commit_cqring(ctx);
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
  	io_cqring_ev_posted(ctx);
  
  	io_put_req(req);
  	return HRTIMER_NORESTART;
  }
  
  static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
5da0fb1ab   yangerkun   io_uring: conside...
1887
  	unsigned count;
5262f5679   Jens Axboe   io_uring: IORING_...
1888
1889
  	struct io_ring_ctx *ctx = req->ctx;
  	struct list_head *entry;
bdf200731   Arnd Bergmann   io_uring: use __k...
1890
  	struct timespec64 ts;
a1f58ba46   zhangyi (F)   io_uring: correct...
1891
  	unsigned span = 0;
  
  	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
  		return -EINVAL;
  	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
  	    sqe->len != 1)
  		return -EINVAL;
bdf200731   Arnd Bergmann   io_uring: use __k...
1898
1899
  
  	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
5262f5679   Jens Axboe   io_uring: IORING_...
1900
  		return -EFAULT;
93bd25bb6   Jens Axboe   io_uring: make ti...
1901
  	req->flags |= REQ_F_TIMEOUT;
5262f5679   Jens Axboe   io_uring: IORING_...
1902
1903
  	/*
  	 * sqe->off holds how many events need to occur for this
93bd25bb6   Jens Axboe   io_uring: make ti...
1904
1905
  	 * timeout event to be satisfied. If it isn't set, then this is
  	 * a pure timeout request, sequence isn't used.
5262f5679   Jens Axboe   io_uring: IORING_...
1906
1907
  	 */
  	count = READ_ONCE(sqe->off);
  	if (!count) {
  		req->flags |= REQ_F_TIMEOUT_NOSEQ;
  		spin_lock_irq(&ctx->completion_lock);
  		entry = ctx->timeout_list.prev;
  		goto add;
  	}
5262f5679   Jens Axboe   io_uring: IORING_...
1914
1915
  
  	req->sequence = ctx->cached_sq_head + count - 1;
5da0fb1ab   yangerkun   io_uring: conside...
1916
1917
  	/* reuse it to store the count */
  	req->submit.sequence = count;
  
  	/*
  	 * Insertion sort, ensuring the first entry in the list is always
  	 * the one we need first.
  	 */
  	spin_lock_irq(&ctx->completion_lock);
  	list_for_each_prev(entry, &ctx->timeout_list) {
  		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1ab   yangerkun   io_uring: conside...
1926
1927
  		unsigned nxt_sq_head;
  		long long tmp, tmp_nxt;
5262f5679   Jens Axboe   io_uring: IORING_...
1928

93bd25bb6   Jens Axboe   io_uring: make ti...
1929
1930
  		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
  			continue;
  		/*
  		 * Since cached_sq_head + count - 1 can overflow, use type long
  		 * long to store it.
  		 */
  		tmp = (long long)ctx->cached_sq_head + count - 1;
  		nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
  		tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
  
  		/*
  		 * cached_sq_head may overflow, but it will never overflow twice
  		 * while there is still a valid timeout req pending.
  		 */
  		if (ctx->cached_sq_head < nxt_sq_head)
8b07a65ad   yangerkun   io_uring: fix log...
1944
  			tmp += UINT_MAX;
5da0fb1ab   yangerkun   io_uring: conside...
1945

a1f58ba46   zhangyi (F)   io_uring: correct...
1946
  		if (tmp > tmp_nxt)
5262f5679   Jens Axboe   io_uring: IORING_...
1947
  			break;
  
  		/*
  		 * The sequence of reqs after the one being inserted, and of the
  		 * inserted req itself, should be adjusted because each timeout
  		 * req consumes a slot.
  		 */
  		span++;
  		nxt->sequence++;
5262f5679   Jens Axboe   io_uring: IORING_...
1955
  	}
a1f58ba46   zhangyi (F)   io_uring: correct...
1956
  	req->sequence -= span;
93bd25bb6   Jens Axboe   io_uring: make ti...
1957
  add:
  	list_add(&req->list, entry);
  	spin_unlock_irq(&ctx->completion_lock);
  
  	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	req->timeout.timer.function = io_timeout_fn;
bdf200731   Arnd Bergmann   io_uring: use __k...
1963
  	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
  			HRTIMER_MODE_REL);
  	return 0;
  }
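  /*
   * Usage note (an assumed sketch): sqe->addr points at the timespec read
   * via get_timespec64() above, and sqe->off is how many completions to
   * wait for. If the request is still pending when the timer fires,
   * io_timeout_fn() completes it with -ETIME; with off == 0 it is a pure
   * relative timeout and the sequence bookkeeping above is skipped.
   */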
de0617e46   Jens Axboe   io_uring: add sup...
1967
  static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
74dcfcd1d   Jens Axboe   io_uring: ensure ...
1968
  			struct sqe_submit *s)
  {
  	struct io_uring_sqe *sqe_copy;
  
  	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
  		return 0;
  
  	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
  	if (!sqe_copy)
  		return -EAGAIN;
  
  	spin_lock_irq(&ctx->completion_lock);
  	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
  		spin_unlock_irq(&ctx->completion_lock);
  		kfree(sqe_copy);
  		return 0;
  	}
74dcfcd1d   Jens Axboe   io_uring: ensure ...
1985
1986
  	memcpy(&req->submit, s, sizeof(*s));
  	memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
  	req->submit.sqe = sqe_copy;
  
  	INIT_WORK(&req->work, io_sq_wq_submit_work);
  	list_add_tail(&req->list, &ctx->defer_list);
  	spin_unlock_irq(&ctx->completion_lock);
  	return -EIOCBQUEUED;
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
1994
  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
8358e3a82   Jens Axboe   io_uring: remove ...
1995
  			   const struct sqe_submit *s, bool force_nonblock)
2b188cc1b   Jens Axboe   Add io_uring IO i...
1996
  {
a4d61e66e   Jens Axboe   io_uring: prevent...
1997
  	int ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
1998

9e645e110   Jens Axboe   io_uring: add sup...
1999
  	req->user_data = READ_ONCE(s->sqe->user_data);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2000
2001
  	if (unlikely(s->index >= ctx->sq_entries))
  		return -EINVAL;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2002

a4d61e66e   Jens Axboe   io_uring: prevent...
2003
  	switch (req->submit.opcode) {
  	case IORING_OP_NOP:
  		ret = io_nop(req, req->user_data);
  		break;
  	case IORING_OP_READV:
edafccee5   Jens Axboe   io_uring: add sup...
2008
2009
  		if (unlikely(s->sqe->buf_index))
  			return -EINVAL;
8358e3a82   Jens Axboe   io_uring: remove ...
2010
  		ret = io_read(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2011
2012
  		break;
  	case IORING_OP_WRITEV:
edafccee5   Jens Axboe   io_uring: add sup...
2013
2014
  		if (unlikely(s->sqe->buf_index))
  			return -EINVAL;
8358e3a82   Jens Axboe   io_uring: remove ...
2015
  		ret = io_write(req, s, force_nonblock);
edafccee5   Jens Axboe   io_uring: add sup...
2016
2017
  		break;
  	case IORING_OP_READ_FIXED:
8358e3a82   Jens Axboe   io_uring: remove ...
2018
  		ret = io_read(req, s, force_nonblock);
edafccee5   Jens Axboe   io_uring: add sup...
2019
2020
  		break;
  	case IORING_OP_WRITE_FIXED:
8358e3a82   Jens Axboe   io_uring: remove ...
2021
  		ret = io_write(req, s, force_nonblock);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2022
  		break;
  	case IORING_OP_FSYNC:
  		ret = io_fsync(req, s->sqe, force_nonblock);
  		break;
  	case IORING_OP_POLL_ADD:
  		ret = io_poll_add(req, s->sqe);
  		break;
  	case IORING_OP_POLL_REMOVE:
  		ret = io_poll_remove(req, s->sqe);
  		break;
  	case IORING_OP_SYNC_FILE_RANGE:
  		ret = io_sync_file_range(req, s->sqe, force_nonblock);
  		break;
  	case IORING_OP_SENDMSG:
  		ret = io_sendmsg(req, s->sqe, force_nonblock);
  		break;
  	case IORING_OP_RECVMSG:
  		ret = io_recvmsg(req, s->sqe, force_nonblock);
  		break;
  	case IORING_OP_TIMEOUT:
  		ret = io_timeout(req, s->sqe);
  		break;
  	default:
  		ret = -EINVAL;
  		break;
  	}
  	if (ret)
  		return ret;
  
  	if (ctx->flags & IORING_SETUP_IOPOLL) {
9e645e110   Jens Axboe   io_uring: add sup...
2052
  		if (req->result == -EAGAIN)
  			return -EAGAIN;
  
  		/* workqueue context doesn't hold uring_lock, grab it now */
  		if (s->needs_lock)
  			mutex_lock(&ctx->uring_lock);
  		io_iopoll_req_issued(req);
  		if (s->needs_lock)
  			mutex_unlock(&ctx->uring_lock);
  	}
  
  	return 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2064
  }
a4d61e66e   Jens Axboe   io_uring: prevent...
2065
2066
  static struct async_list *io_async_list_from_req(struct io_ring_ctx *ctx,
  						 struct io_kiocb *req)
31b515106   Jens Axboe   io_uring: allow w...
2067
  {
a4d61e66e   Jens Axboe   io_uring: prevent...
2068
  	switch (req->submit.opcode) {
  	case IORING_OP_READV:
  	case IORING_OP_READ_FIXED:
  		return &ctx->pending_async[READ];
  	case IORING_OP_WRITEV:
  	case IORING_OP_WRITE_FIXED:
  		return &ctx->pending_async[WRITE];
  	default:
  		return NULL;
  	}
  }
a4d61e66e   Jens Axboe   io_uring: prevent...
2079
  static inline bool io_req_needs_user(struct io_kiocb *req)
edafccee5   Jens Axboe   io_uring: add sup...
2080
  {
a4d61e66e   Jens Axboe   io_uring: prevent...
2081
2082
  	return !(req->submit.opcode == IORING_OP_READ_FIXED ||
  		req->submit.opcode == IORING_OP_WRITE_FIXED);
edafccee5   Jens Axboe   io_uring: add sup...
2083
  }
  static void io_sq_wq_submit_work(struct work_struct *work)
  {
  	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
cac68d12c   Jens Axboe   io_uring: grab ->...
2087
  	struct fs_struct *old_fs_struct = current->fs;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2088
  	struct io_ring_ctx *ctx = req->ctx;
31b515106   Jens Axboe   io_uring: allow w...
2089
2090
  	struct mm_struct *cur_mm = NULL;
  	struct async_list *async_list;
8387e3688   Jens Axboe   io_uring: async w...
2091
  	const struct cred *old_cred;
31b515106   Jens Axboe   io_uring: allow w...
2092
  	LIST_HEAD(req_list);
edafccee5   Jens Axboe   io_uring: add sup...
2093
  	mm_segment_t old_fs;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2094
  	int ret;
8387e3688   Jens Axboe   io_uring: async w...
2095
  	old_cred = override_creds(ctx->creds);
a4d61e66e   Jens Axboe   io_uring: prevent...
2096
  	async_list = io_async_list_from_req(ctx, req);
1c4404efc   Jens Axboe   io_uring: make su...
2097
2098
  
  	allow_kernel_signal(SIGINT);
  restart:
  	do {
  		struct sqe_submit *s = &req->submit;
  		const struct io_uring_sqe *sqe = s->sqe;
d0ee87918   Jackie Liu   io_uring: fix KAS...
2103
  		unsigned int flags = req->flags;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2104

8449eedaa   Stefan Bühler   io_uring: fix han...
2105
  		/* Ensure we clear previously set non-block flag */
31b515106   Jens Axboe   io_uring: allow w...
2106
  		req->rw.ki_flags &= ~IOCB_NOWAIT;
  		if (req->fs != current->fs && current->fs != old_fs_struct) {
  			task_lock(current);
  			if (req->fs)
  				current->fs = req->fs;
  			else
  				current->fs = old_fs_struct;
  			task_unlock(current);
  		}
31b515106   Jens Axboe   io_uring: allow w...
2115
  		ret = 0;
a4d61e66e   Jens Axboe   io_uring: prevent...
2116
  		if (io_req_needs_user(req) && !cur_mm) {
31b515106   Jens Axboe   io_uring: allow w...
2117
2118
  			if (!mmget_not_zero(ctx->sqo_mm)) {
  				ret = -EFAULT;
e8053c683   Guoyu Huang   io_uring: Fix use...
2119
  				goto end_req;
  			} else {
  				cur_mm = ctx->sqo_mm;
  				use_mm(cur_mm);
  				old_fs = get_fs();
  				set_fs(USER_DS);
  			}
  		}
  
  		if (!ret) {
  			req->work_task = current;
  			if (req->flags & REQ_F_CANCEL) {
  				ret = -ECANCELED;
  				goto end_req;
  			}
  			s->has_user = cur_mm != NULL;
  			s->needs_lock = true;
  			do {
8358e3a82   Jens Axboe   io_uring: remove ...
2137
  				ret = __io_submit_sqe(ctx, req, s, false);
  				/*
  				 * We can get EAGAIN for polled IO even though
  				 * we're forcing a sync submission from here,
  				 * since we can't wait for request slots on the
  				 * block side.
  				 */
  				if (ret != -EAGAIN)
  					break;
  				cond_resched();
  			} while (1);
  end_req:
  			if (!list_empty(&req->task_list)) {
  				spin_lock_irq(&ctx->task_lock);
  				list_del_init(&req->task_list);
  				spin_unlock_irq(&ctx->task_lock);
  			}
31b515106   Jens Axboe   io_uring: allow w...
2154
  		}
  
  		/* drop submission reference */
  		io_put_req(req);
31b515106   Jens Axboe   io_uring: allow w...
2158
  		if (ret) {
c71ffb673   Jens Axboe   io_uring: remove ...
2159
  			io_cqring_add_event(ctx, sqe->user_data, ret);
e65ef56db   Jens Axboe   io_uring: use reg...
2160
  			io_put_req(req);
  		}
  
  		/* async context always use a copy of the sqe */
  		kfree(sqe);
f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2165
  		/* req from defer and link list needn't decrease async cnt */
d0ee87918   Jackie Liu   io_uring: fix KAS...
2166
  		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2167
  			goto out;
  		if (!async_list)
  			break;
  		if (!list_empty(&req_list)) {
  			req = list_first_entry(&req_list, struct io_kiocb,
  						list);
  			list_del(&req->list);
  			continue;
  		}
  		if (list_empty(&async_list->list))
  			break;
  
  		req = NULL;
  		spin_lock(&async_list->lock);
  		if (list_empty(&async_list->list)) {
  			spin_unlock(&async_list->lock);
  			break;
  		}
  		list_splice_init(&async_list->list, &req_list);
  		spin_unlock(&async_list->lock);
  
  		req = list_first_entry(&req_list, struct io_kiocb, list);
  		list_del(&req->list);
  	} while (req);
edafccee5   Jens Axboe   io_uring: add sup...
2191
2192
  
  	/*
  	 * Rare case of racing with a submitter. If we find the count has
  	 * dropped to zero AND we have pending work items, then restart
  	 * the processing. This is a tiny race window.
edafccee5   Jens Axboe   io_uring: add sup...
2196
  	 */
  	if (async_list) {
  		ret = atomic_dec_return(&async_list->cnt);
  		while (!ret && !list_empty(&async_list->list)) {
  			spin_lock(&async_list->lock);
  			atomic_inc(&async_list->cnt);
  			list_splice_init(&async_list->list, &req_list);
  			spin_unlock(&async_list->lock);
  
  			if (!list_empty(&req_list)) {
  				req = list_first_entry(&req_list,
  							struct io_kiocb, list);
  				list_del(&req->list);
  				goto restart;
  			}
  			ret = atomic_dec_return(&async_list->cnt);
edafccee5   Jens Axboe   io_uring: add sup...
2212
  		}
edafccee5   Jens Axboe   io_uring: add sup...
2213
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
2214

f7b76ac9d   Zhengyuan Liu   io_uring: fix cou...
2215
  out:
1c4404efc   Jens Axboe   io_uring: make su...
2216
  	disallow_signal(SIGINT);
31b515106   Jens Axboe   io_uring: allow w...
2217
  	if (cur_mm) {
edafccee5   Jens Axboe   io_uring: add sup...
2218
  		set_fs(old_fs);
31b515106   Jens Axboe   io_uring: allow w...
2219
2220
  		unuse_mm(cur_mm);
  		mmput(cur_mm);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2221
  	}
8387e3688   Jens Axboe   io_uring: async w...
2222
  	revert_creds(old_cred);
  	if (old_fs_struct) {
  		task_lock(current);
  		current->fs = old_fs_struct;
  		task_unlock(current);
  	}
31b515106   Jens Axboe   io_uring: allow w...
2228
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
2229

  /*
   * See if we can piggyback onto previously submitted work that is still
   * running. We currently only allow this if the new request is sequential
   * to the previous one we punted.
   */
  static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
  {
6d5d5ac52   Jens Axboe   io_uring: extend ...
2237
  	bool ret;
  
  	if (!list)
  		return false;
  	if (!(req->flags & REQ_F_SEQ_PREV))
  		return false;
  	if (!atomic_read(&list->cnt))
  		return false;
  
  	ret = true;
  	spin_lock(&list->lock);
  	list_add_tail(&req->list, &list->list);
  	/*
  	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
  	 */
  	smp_mb();
  	if (!atomic_read(&list->cnt)) {
  		list_del_init(&req->list);
  		ret = false;
  	}
  
  	if (ret) {
  		struct io_ring_ctx *ctx = req->ctx;
  
  		spin_lock_irq(&ctx->task_lock);
  		list_add(&req->task_list, &ctx->task_list);
  		req->work_task = NULL;
  		spin_unlock_irq(&ctx->task_lock);
  	}
31b515106   Jens Axboe   io_uring: allow w...
2266
2267
  	spin_unlock(&list->lock);
  	return ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2268
  }
a4d61e66e   Jens Axboe   io_uring: prevent...
2269
  static bool io_op_needs_file(struct io_kiocb *req)
09bb83943   Jens Axboe   io_uring: fix fge...
2270
  {
a4d61e66e   Jens Axboe   io_uring: prevent...
2271
  	switch (req->submit.opcode) {
09bb83943   Jens Axboe   io_uring: fix fge...
2272
2273
  	case IORING_OP_NOP:
  	case IORING_OP_POLL_REMOVE:
5683e5406   Pavel Begunkov   io_uring: Fix get...
2274
  	case IORING_OP_TIMEOUT:
  		return false;
  	default:
  		return true;
  	}
  }
  
  static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
  			   struct io_submit_state *state, struct io_kiocb *req)
  {
  	unsigned flags;
  	int fd;
  
  	flags = READ_ONCE(s->sqe->flags);
  	fd = READ_ONCE(s->sqe->fd);
4fe2c9631   Jackie Liu   io_uring: add sup...
2289
  	if (flags & IOSQE_IO_DRAIN)
de0617e46   Jens Axboe   io_uring: add sup...
2290
  		req->flags |= REQ_F_IO_DRAIN;
  	/*
  	 * All IO needs to record the previous position so that, if LINK or
  	 * DRAIN is used, it can be used to mark the position of the first
  	 * IO in the link list.
  	 */
  	req->sequence = s->sequence;
de0617e46   Jens Axboe   io_uring: add sup...
2297

a4d61e66e   Jens Axboe   io_uring: prevent...
2298
  	if (!io_op_needs_file(req))
09bb83943   Jens Axboe   io_uring: fix fge...
2299
  		return 0;
  
  	if (flags & IOSQE_FIXED_FILE) {
  		if (unlikely(!ctx->user_files ||
  		    (unsigned) fd >= ctx->nr_user_files))
  			return -EBADF;
  		req->file = ctx->user_files[fd];
  		req->flags |= REQ_F_FIXED_FILE;
  	} else {
  		if (s->needs_fixed_file)
  			return -EBADF;
  		req->file = io_file_get(state, fd);
  		if (unlikely(!req->file))
  			return -EBADF;
  	}
  
  	return 0;
  }
4fe2c9631   Jackie Liu   io_uring: add sup...
2317
  static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2318
  			struct sqe_submit *s)
2b188cc1b   Jens Axboe   Add io_uring IO i...
2319
  {
e0c5c576d   Jens Axboe   io_uring: make io...
2320
  	int ret;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2321

bc808bced   Jens Axboe   io_uring: revert ...
2322
  	ret = __io_submit_sqe(ctx, req, s, true);
  
  	/*
  	 * We async punt it if the file wasn't marked NOWAIT, or if the file
  	 * doesn't support non-blocking read/write attempts
  	 */
  	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
  	    (req->flags & REQ_F_MUST_PUNT))) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
2330
  		struct io_uring_sqe *sqe_copy;
954dab193   Jackie Liu   io_uring: use kme...
2331
  		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2332
  		if (sqe_copy) {
31b515106   Jens Axboe   io_uring: allow w...
2333
  			struct async_list *list;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2334
  			s->sqe = sqe_copy;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2335
  			memcpy(&req->submit, s, sizeof(*s));
a4d61e66e   Jens Axboe   io_uring: prevent...
2336
  			list = io_async_list_from_req(ctx, req);
  			if (!io_add_to_prev_work(list, req)) {
  				if (list)
  					atomic_inc(&list->cnt);
  				INIT_WORK(&req->work, io_sq_wq_submit_work);
18d9be1a9   Jens Axboe   io_uring: add io_...
2341
  				io_queue_async_work(ctx, req);
31b515106   Jens Axboe   io_uring: allow w...
2342
  			}
  
  			/*
  			 * Queued up for async execution, worker will release
9e645e110   Jens Axboe   io_uring: add sup...
2346
  			 * submit reference when the iocb is actually submitted.
e65ef56db   Jens Axboe   io_uring: use reg...
2347
2348
  			 */
  			return 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2349
2350
  		}
  	}
  
  	/* drop submission reference */
  	io_put_req(req);
  
  	/* and drop final reference, if we failed */
  	if (ret) {
  		io_cqring_add_event(ctx, req->user_data, ret);
  		if (req->flags & REQ_F_LINK)
  			req->flags |= REQ_F_FAIL_LINK;
e65ef56db   Jens Axboe   io_uring: use reg...
2360
  		io_put_req(req);
9e645e110   Jens Axboe   io_uring: add sup...
2361
  	}
  
  	return ret;
  }
4fe2c9631   Jackie Liu   io_uring: add sup...
2365
  static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2366
  			struct sqe_submit *s)
4fe2c9631   Jackie Liu   io_uring: add sup...
2367
2368
  {
  	int ret;
74dcfcd1d   Jens Axboe   io_uring: ensure ...
2369
  	ret = io_req_defer(ctx, req, s);
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);
  			io_cqring_add_event(ctx, s->sqe->user_data, ret);
  		}
  		return 0;
  	}
bc808bced   Jens Axboe   io_uring: revert ...
2377
  	return __io_queue_sqe(ctx, req, s);
  }
  
  static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
bc808bced   Jens Axboe   io_uring: revert ...
2381
  			      struct sqe_submit *s, struct io_kiocb *shadow)
  {
  	int ret;
  	int need_submit = false;
  
  	if (!shadow)
bc808bced   Jens Axboe   io_uring: revert ...
2387
  		return io_queue_sqe(ctx, req, s);
  
  	/*
  	 * Mark the first IO in the link list as DRAIN and let all the
  	 * following IOs enter the defer list. All prior IO needs to be
  	 * completed before the link list can run.
  	 */
  	req->flags |= REQ_F_IO_DRAIN;
74dcfcd1d   Jens Axboe   io_uring: ensure ...
2395
  	ret = io_req_defer(ctx, req, s);
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);
7b20238d2   Pavel Begunkov   io_uring: Fix lea...
2399
  			__io_free_req(shadow);
4fe2c9631   Jackie Liu   io_uring: add sup...
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
  			io_cqring_add_event(ctx, s->sqe->user_data, ret);
  			return 0;
  		}
  	} else {
  		/*
  		 * If ret == 0, all IOs ahead of the link io have already
  		 * completed; queue the link head now.
  		 */
  		need_submit = true;
  	}
  
  	/* Insert shadow req to defer_list, blocking next IOs */
  	spin_lock_irq(&ctx->completion_lock);
  	list_add_tail(&shadow->list, &ctx->defer_list);
  	spin_unlock_irq(&ctx->completion_lock);
  
  	if (need_submit)
bc808bced   Jens Axboe   io_uring: revert ...
2417
  		return __io_queue_sqe(ctx, req, s);
4fe2c9631   Jackie Liu   io_uring: add sup...
2418
2419
2420
  
  	return 0;
  }
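  
  /*
   * Illustrative note (not part of the original source): the shadow request
   * handed to io_queue_link_head() above is only allocated by the submission
   * loops further down when an SQE carrying IOSQE_IO_DRAIN shows up while a
   * link chain is being built (e.g. IOSQE_IO_LINK | IOSQE_IO_DRAIN). The
   * chain head then inherits the drain behavior, and the shadow entry parked
   * on ->defer_list holds back SQEs submitted after the chain.
   */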
9e645e110   Jens Axboe   io_uring: add sup...
2421
2422
2423
  #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
  
  static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
bc808bced   Jens Axboe   io_uring: revert ...
2424
  			  struct io_submit_state *state, struct io_kiocb **link)
9e645e110   Jens Axboe   io_uring: add sup...
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
  {
  	struct io_uring_sqe *sqe_copy;
  	struct io_kiocb *req;
  	int ret;
  
  	/* enforce forwards compatibility on users */
  	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
  		ret = -EINVAL;
  		goto err;
  	}
  
  	req = io_get_req(ctx, state);
  	if (unlikely(!req)) {
  		ret = -EAGAIN;
  		goto err;
  	}
a02df82a5   Liu Yong   fs/io_uring.c: Fi...
2441
  	memcpy(&req->submit, s, sizeof(*s));
9e645e110   Jens Axboe   io_uring: add sup...
2442
2443
2444
2445
2446
2447
2448
2449
  	ret = io_req_set_file(ctx, s, state, req);
  	if (unlikely(ret)) {
  err_req:
  		io_free_req(req);
  err:
  		io_cqring_add_event(ctx, s->sqe->user_data, ret);
  		return;
  	}
84d55dc5b   Pavel Begunkov   io_uring: Fix cor...
2450
  	req->user_data = s->sqe->user_data;
cac68d12c   Jens Axboe   io_uring: grab ->...
2451
  #if defined(CONFIG_NET)
a4d61e66e   Jens Axboe   io_uring: prevent...
2452
  	switch (req->submit.opcode) {
cac68d12c   Jens Axboe   io_uring: grab ->...
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
  	case IORING_OP_SENDMSG:
  	case IORING_OP_RECVMSG:
  		spin_lock(&current->fs->lock);
  		if (!current->fs->in_exec) {
  			req->fs = current->fs;
  			req->fs->users++;
  		}
  		spin_unlock(&current->fs->lock);
  		if (!req->fs) {
  			ret = -EAGAIN;
  			goto err_req;
  		}
  	}
  #endif
9e645e110   Jens Axboe   io_uring: add sup...
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
  	/*
  	 * If we already have a head request, queue this one for async
  	 * submittal once the head completes. If we don't have a head but
  	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
  	 * submitted sync once the chain is complete. If none of those
  	 * conditions are true (normal request), then just queue it.
  	 */
  	if (*link) {
  		struct io_kiocb *prev = *link;
  
  		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
  		if (!sqe_copy) {
  			ret = -EAGAIN;
  			goto err_req;
  		}
  
  		s->sqe = sqe_copy;
  		memcpy(&req->submit, s, sizeof(*s));
  		list_add_tail(&req->list, &prev->link_list);
  	} else if (s->sqe->flags & IOSQE_IO_LINK) {
  		req->flags |= REQ_F_LINK;
  
  		memcpy(&req->submit, s, sizeof(*s));
  		INIT_LIST_HEAD(&req->link_list);
  		*link = req;
  	} else {
bc808bced   Jens Axboe   io_uring: revert ...
2493
  		io_queue_sqe(ctx, req, s);
9e645e110   Jens Axboe   io_uring: add sup...
2494
2495
  	}
  }
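  
  /*
   * Illustrative example (not part of the original source): three SQEs
   * flagged
   *
   *	sqe[0].flags = IOSQE_IO_LINK;	<- starts a chain, becomes *link
   *	sqe[1].flags = IOSQE_IO_LINK;	<- copied and added to link_list
   *	sqe[2].flags = 0;		<- added to link_list, ends the chain
   *
   * pass through io_submit_sqe() above one at a time; the caller then
   * submits the whole chain with io_queue_link_head() once it sees the
   * chain is complete (or when the submission batch ends).
   */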
9a56a2323   Jens Axboe   io_uring: use fge...
2496
2497
2498
2499
2500
2501
  /*
   * Batched submission is done, ensure local IO is flushed out.
   */
  static void io_submit_state_end(struct io_submit_state *state)
  {
  	blk_finish_plug(&state->plug);
3d6770fbd   Jens Axboe   io_uring: drop io...
2502
  	io_file_put(state);
2579f913d   Jens Axboe   io_uring: batch i...
2503
2504
2505
  	if (state->free_reqs)
  		kmem_cache_free_bulk(req_cachep, state->free_reqs,
  					&state->reqs[state->cur_req]);
9a56a2323   Jens Axboe   io_uring: use fge...
2506
2507
2508
2509
2510
2511
2512
2513
2514
  }
  
  /*
   * Start submission side cache.
   */
  static void io_submit_state_start(struct io_submit_state *state,
  				  struct io_ring_ctx *ctx, unsigned max_ios)
  {
  	blk_start_plug(&state->plug);
2579f913d   Jens Axboe   io_uring: batch i...
2515
  	state->free_reqs = 0;
9a56a2323   Jens Axboe   io_uring: use fge...
2516
2517
2518
  	state->file = NULL;
  	state->ios_left = max_ios;
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
2519
2520
  static void io_commit_sqring(struct io_ring_ctx *ctx)
  {
75b28affd   Hristo Venev   io_uring: allocat...
2521
  	struct io_rings *rings = ctx->rings;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2522

75b28affd   Hristo Venev   io_uring: allocat...
2523
  	if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
2524
2525
2526
2527
2528
  		/*
  		 * Ensure any loads from the SQEs are done at this point,
  		 * since once we write the new head, the application could
  		 * write new data to them.
  		 */
75b28affd   Hristo Venev   io_uring: allocat...
2529
  		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2530
2531
2532
2533
  	}
  }
  
  /*
2b188cc1b   Jens Axboe   Add io_uring IO i...
2534
2535
2536
2537
2538
2539
2540
2541
2542
   * Fetch an sqe, if one is available. Note that s->sqe will point to memory
   * that is mapped by userspace. This means that care needs to be taken to
   * ensure that reads are stable, as we cannot rely on userspace always
   * being a good citizen. If members of the sqe are validated and then later
   * used, it's important that those reads are done through READ_ONCE() to
   * prevent a re-load down the line.
   */
  static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
  {
75b28affd   Hristo Venev   io_uring: allocat...
2543
2544
  	struct io_rings *rings = ctx->rings;
  	u32 *sq_array = ctx->sq_array;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
  	unsigned head;
  
  	/*
  	 * The cached sq head (or cq tail) serves two purposes:
  	 *
  	 * 1) allows us to batch the cost of the user visible head
  	 *    updates.
  	 * 2) allows the kernel side to track the head on its own, even
  	 *    though the application is the one updating it.
  	 */
  	head = ctx->cached_sq_head;
e523a29c4   Stefan Bühler   io_uring: fix rac...
2556
  	/* make sure SQ entry isn't read before tail */
75b28affd   Hristo Venev   io_uring: allocat...
2557
  	if (head == smp_load_acquire(&rings->sq.tail))
2b188cc1b   Jens Axboe   Add io_uring IO i...
2558
  		return false;
75b28affd   Hristo Venev   io_uring: allocat...
2559
  	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2560
2561
2562
  	if (head < ctx->sq_entries) {
  		s->index = head;
  		s->sqe = &ctx->sq_sqes[head];
a4d61e66e   Jens Axboe   io_uring: prevent...
2563
  		s->opcode = READ_ONCE(s->sqe->opcode);
8776f3fa1   Jackie Liu   io_uring: fix wro...
2564
  		s->sequence = ctx->cached_sq_head;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2565
2566
2567
2568
2569
2570
  		ctx->cached_sq_head++;
  		return true;
  	}
  
  	/* drop invalid entries */
  	ctx->cached_sq_head++;
498ccd9ed   Jens Axboe   io_uring: used ca...
2571
2572
  	ctx->cached_sq_dropped++;
  	WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2573
2574
  	return false;
  }
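  
  /*
   * Worked example (illustrative, not from the original source): with
   * sq_entries == 8 the mask is 7, so a cached_sq_head of 10 selects ring
   * slot 10 & 7 == 2; sq_array[2] holds the application-chosen SQE index
   * (say 5), and s->sqe then points at ctx->sq_sqes[5].
   */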
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2575
2576
  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
  			  bool has_user, bool mm_fault)
6c271ce2f   Jens Axboe   io_uring: add sub...
2577
2578
  {
  	struct io_submit_state state, *statep = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2579
  	struct io_kiocb *link = NULL;
4fe2c9631   Jackie Liu   io_uring: add sup...
2580
  	struct io_kiocb *shadow_req = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2581
2582
  	bool prev_was_link = false;
  	int i, submitted = 0;
6c271ce2f   Jens Axboe   io_uring: add sub...
2583
2584
2585
2586
2587
2588
2589
  
  	if (nr > IO_PLUG_THRESHOLD) {
  		io_submit_state_start(&state, ctx, nr);
  		statep = &state;
  	}
  
  	for (i = 0; i < nr; i++) {
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2590
2591
2592
2593
  		struct sqe_submit s;
  
  		if (!io_get_sqring(ctx, &s))
  			break;
9e645e110   Jens Axboe   io_uring: add sup...
2594
2595
2596
2597
2598
  		/*
  		 * If the previous request wasn't linked and a link chain is
  		 * pending, that's the end of the chain. Submit the previous link.
  		 */
  		if (!prev_was_link && link) {
bc808bced   Jens Axboe   io_uring: revert ...
2599
  			io_queue_link_head(ctx, link, &link->submit, shadow_req);
9e645e110   Jens Axboe   io_uring: add sup...
2600
  			link = NULL;
5f5ad9ced   Jackie Liu   io_uring: fix use...
2601
  			shadow_req = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2602
  		}
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2603
  		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
9e645e110   Jens Axboe   io_uring: add sup...
2604

fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2605
  		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
4fe2c9631   Jackie Liu   io_uring: add sup...
2606
2607
  			if (!shadow_req) {
  				shadow_req = io_get_req(ctx, NULL);
a1041c27b   Jackie Liu   io_uring: fix pot...
2608
2609
  				if (unlikely(!shadow_req))
  					goto out;
4fe2c9631   Jackie Liu   io_uring: add sup...
2610
2611
2612
  				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
  				refcount_dec(&shadow_req->refs);
  			}
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2613
  			shadow_req->sequence = s.sequence;
4fe2c9631   Jackie Liu   io_uring: add sup...
2614
  		}
a1041c27b   Jackie Liu   io_uring: fix pot...
2615
  out:
6c271ce2f   Jens Axboe   io_uring: add sub...
2616
  		if (unlikely(mm_fault)) {
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2617
  			io_cqring_add_event(ctx, s.sqe->user_data,
9e645e110   Jens Axboe   io_uring: add sup...
2618
  						-EFAULT);
6c271ce2f   Jens Axboe   io_uring: add sub...
2619
  		} else {
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2620
2621
2622
2623
  			s.has_user = has_user;
  			s.needs_lock = true;
  			s.needs_fixed_file = true;
  			io_submit_sqe(ctx, &s, statep, &link);
6c271ce2f   Jens Axboe   io_uring: add sub...
2624
  			submitted++;
6c271ce2f   Jens Axboe   io_uring: add sub...
2625
  		}
6c271ce2f   Jens Axboe   io_uring: add sub...
2626
  	}
9e645e110   Jens Axboe   io_uring: add sup...
2627
  	if (link)
bc808bced   Jens Axboe   io_uring: revert ...
2628
  		io_queue_link_head(ctx, link, &link->submit, shadow_req);
6c271ce2f   Jens Axboe   io_uring: add sub...
2629
2630
2631
2632
2633
2634
2635
2636
  	if (statep)
  		io_submit_state_end(&state);
  
  	return submitted;
  }
  
  static int io_sq_thread(void *data)
  {
6c271ce2f   Jens Axboe   io_uring: add sub...
2637
2638
  	struct io_ring_ctx *ctx = data;
  	struct mm_struct *cur_mm = NULL;
8387e3688   Jens Axboe   io_uring: async w...
2639
  	const struct cred *old_cred;
6c271ce2f   Jens Axboe   io_uring: add sub...
2640
2641
2642
2643
  	mm_segment_t old_fs;
  	DEFINE_WAIT(wait);
  	unsigned inflight;
  	unsigned long timeout;
a4c0b3dec   Jackie Liu   io_uring: fix io_...
2644
  	complete(&ctx->sqo_thread_started);
6c271ce2f   Jens Axboe   io_uring: add sub...
2645
2646
  	old_fs = get_fs();
  	set_fs(USER_DS);
8387e3688   Jens Axboe   io_uring: async w...
2647
  	old_cred = override_creds(ctx->creds);
6c271ce2f   Jens Axboe   io_uring: add sub...
2648
2649
  
  	timeout = inflight = 0;
2bbcd6d3b   Roman Penyaev   io_uring: fix inf...
2650
  	while (!kthread_should_park()) {
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2651
2652
  		bool mm_fault = false;
  		unsigned int to_submit;
6c271ce2f   Jens Axboe   io_uring: add sub...
2653
2654
2655
2656
2657
  
  		if (inflight) {
  			unsigned nr_events = 0;
  
  			if (ctx->flags & IORING_SETUP_IOPOLL) {
2b2ed9750   Jens Axboe   io_uring: fix bad...
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
  				/*
  				 * inflight is the count of the maximum possible
  				 * entries we submitted, but it can be smaller
  				 * if we dropped some of them. If we don't have
  				 * poll entries available, then we know that we
  				 * have nothing left to poll for. Reset the
  				 * inflight count to zero in that case.
  				 */
  				mutex_lock(&ctx->uring_lock);
  				if (!list_empty(&ctx->poll_list))
c7deb9612   Xiaoguang Wang   io_uring: fix __i...
2668
  					io_iopoll_getevents(ctx, &nr_events, 0);
2b2ed9750   Jens Axboe   io_uring: fix bad...
2669
2670
2671
  				else
  					inflight = 0;
  				mutex_unlock(&ctx->uring_lock);
6c271ce2f   Jens Axboe   io_uring: add sub...
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
  			} else {
  				/*
  				 * Normal IO, just pretend everything completed.
  				 * We don't have to poll completions for that.
  				 */
  				nr_events = inflight;
  			}
  
  			inflight -= nr_events;
  			if (!inflight)
  				timeout = jiffies + ctx->sq_thread_idle;
  		}
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2684
2685
  		to_submit = io_sqring_entries(ctx);
  		if (!to_submit) {
6c271ce2f   Jens Axboe   io_uring: add sub...
2686
  			/*
6c271ce2f   Jens Axboe   io_uring: add sub...
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
  			 * Drop cur_mm before scheduling, we can't hold it for
  			 * long periods (or over schedule()). Do this before
  			 * adding ourselves to the waitqueue, as the unuse/drop
  			 * may sleep.
  			 */
  			if (cur_mm) {
  				unuse_mm(cur_mm);
  				mmput(cur_mm);
  				cur_mm = NULL;
  			}
8eb92c122   Stefano Garzarella   io_uring: prevent...
2697
2698
2699
2700
2701
2702
2703
2704
2705
  			/*
  			 * We're polling. If we're within the defined idle
  			 * period, then let us spin without work before going
  			 * to sleep.
  			 */
  			if (inflight || !time_after(jiffies, timeout)) {
  				cond_resched();
  				continue;
  			}
6c271ce2f   Jens Axboe   io_uring: add sub...
2706
2707
2708
2709
  			prepare_to_wait(&ctx->sqo_wait, &wait,
  						TASK_INTERRUPTIBLE);
  
  			/* Tell userspace we may need a wakeup call */
75b28affd   Hristo Venev   io_uring: allocat...
2710
  			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69c   Stefan Bühler   io_uring: fix rac...
2711
2712
  			/* make sure to read SQ tail after writing flags */
  			smp_mb();
6c271ce2f   Jens Axboe   io_uring: add sub...
2713

fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2714
2715
  			to_submit = io_sqring_entries(ctx);
  			if (!to_submit) {
2bbcd6d3b   Roman Penyaev   io_uring: fix inf...
2716
  				if (kthread_should_park()) {
6c271ce2f   Jens Axboe   io_uring: add sub...
2717
2718
2719
2720
2721
2722
2723
  					finish_wait(&ctx->sqo_wait, &wait);
  					break;
  				}
  				if (signal_pending(current))
  					flush_signals(current);
  				schedule();
  				finish_wait(&ctx->sqo_wait, &wait);
75b28affd   Hristo Venev   io_uring: allocat...
2724
  				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2f   Jens Axboe   io_uring: add sub...
2725
2726
2727
  				continue;
  			}
  			finish_wait(&ctx->sqo_wait, &wait);
75b28affd   Hristo Venev   io_uring: allocat...
2728
  			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2f   Jens Axboe   io_uring: add sub...
2729
  		}
6c271ce2f   Jens Axboe   io_uring: add sub...
2730
  		/* Unless all new commands are FIXED regions, grab mm */
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2731
  		if (!cur_mm) {
6c271ce2f   Jens Axboe   io_uring: add sub...
2732
2733
2734
2735
2736
2737
  			mm_fault = !mmget_not_zero(ctx->sqo_mm);
  			if (!mm_fault) {
  				use_mm(ctx->sqo_mm);
  				cur_mm = ctx->sqo_mm;
  			}
  		}
fb5ccc987   Pavel Begunkov   io_uring: Fix bro...
2738
2739
2740
  		to_submit = min(to_submit, ctx->sq_entries);
  		inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
  					   mm_fault);
6c271ce2f   Jens Axboe   io_uring: add sub...
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
  
  		/* Commit SQ ring head once we've consumed all SQEs */
  		io_commit_sqring(ctx);
  	}
  
  	set_fs(old_fs);
  	if (cur_mm) {
  		unuse_mm(cur_mm);
  		mmput(cur_mm);
  	}
8387e3688   Jens Axboe   io_uring: async w...
2751
  	revert_creds(old_cred);
060586324   Jens Axboe   io_uring: park SQ...
2752

2bbcd6d3b   Roman Penyaev   io_uring: fix inf...
2753
  	kthread_parkme();
060586324   Jens Axboe   io_uring: park SQ...
2754

6c271ce2f   Jens Axboe   io_uring: add sub...
2755
2756
  	return 0;
  }
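  
  /*
   * Illustrative userspace sketch (not part of this file): with
   * IORING_SETUP_SQPOLL the application normally never enters the kernel to
   * submit; it only needs to when the thread above has gone idle and set
   * IORING_SQ_NEED_WAKEUP in the mapped sq_flags word. Assuming a
   * liburing-style io_uring_enter() wrapper and a "sq_flags" pointer into
   * the SQ ring mapping:
   *
   *	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
   *		io_uring_enter(ring_fd, to_submit, 0,
   *			       IORING_ENTER_SQ_WAKEUP, NULL);
   */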
bc808bced   Jens Axboe   io_uring: revert ...
2757
  static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
2b188cc1b   Jens Axboe   Add io_uring IO i...
2758
  {
9a56a2323   Jens Axboe   io_uring: use fge...
2759
  	struct io_submit_state state, *statep = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2760
  	struct io_kiocb *link = NULL;
4fe2c9631   Jackie Liu   io_uring: add sup...
2761
  	struct io_kiocb *shadow_req = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2762
  	bool prev_was_link = false;
5c8b0b54d   Jens Axboe   io_uring: have su...
2763
  	int i, submit = 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2764

9a56a2323   Jens Axboe   io_uring: use fge...
2765
2766
2767
2768
  	if (to_submit > IO_PLUG_THRESHOLD) {
  		io_submit_state_start(&state, ctx, to_submit);
  		statep = &state;
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
2769
2770
2771
2772
2773
2774
  
  	for (i = 0; i < to_submit; i++) {
  		struct sqe_submit s;
  
  		if (!io_get_sqring(ctx, &s))
  			break;
9e645e110   Jens Axboe   io_uring: add sup...
2775
2776
2777
2778
2779
  		/*
  		 * If the previous request wasn't linked and a link chain is
  		 * pending, that's the end of the chain. Submit the previous link.
  		 */
  		if (!prev_was_link && link) {
bc808bced   Jens Axboe   io_uring: revert ...
2780
  			io_queue_link_head(ctx, link, &link->submit, shadow_req);
9e645e110   Jens Axboe   io_uring: add sup...
2781
  			link = NULL;
5f5ad9ced   Jackie Liu   io_uring: fix use...
2782
  			shadow_req = NULL;
9e645e110   Jens Axboe   io_uring: add sup...
2783
2784
  		}
  		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
4fe2c9631   Jackie Liu   io_uring: add sup...
2785
2786
2787
  		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
  			if (!shadow_req) {
  				shadow_req = io_get_req(ctx, NULL);
a1041c27b   Jackie Liu   io_uring: fix pot...
2788
2789
  				if (unlikely(!shadow_req))
  					goto out;
4fe2c9631   Jackie Liu   io_uring: add sup...
2790
2791
2792
2793
2794
  				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
  				refcount_dec(&shadow_req->refs);
  			}
  			shadow_req->sequence = s.sequence;
  		}
a1041c27b   Jackie Liu   io_uring: fix pot...
2795
  out:
2b188cc1b   Jens Axboe   Add io_uring IO i...
2796
  		s.has_user = true;
def596e95   Jens Axboe   io_uring: support...
2797
  		s.needs_lock = false;
6c271ce2f   Jens Axboe   io_uring: add sub...
2798
  		s.needs_fixed_file = false;
5c8b0b54d   Jens Axboe   io_uring: have su...
2799
  		submit++;
bc808bced   Jens Axboe   io_uring: revert ...
2800
  		io_submit_sqe(ctx, &s, statep, &link);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2801
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
2802

9e645e110   Jens Axboe   io_uring: add sup...
2803
  	if (link)
bc808bced   Jens Axboe   io_uring: revert ...
2804
  		io_queue_link_head(ctx, link, &link->submit, shadow_req);
9a56a2323   Jens Axboe   io_uring: use fge...
2805
2806
  	if (statep)
  		io_submit_state_end(statep);
2b188cc1b   Jens Axboe   Add io_uring IO i...
2807

935d1e459   Pavel Begunkov   io_uring: Fix rac...
2808
  	io_commit_sqring(ctx);
5c8b0b54d   Jens Axboe   io_uring: have su...
2809
  	return submit;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2810
  }
bda521624   Jens Axboe   io_uring: make CQ...
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
  struct io_wait_queue {
  	struct wait_queue_entry wq;
  	struct io_ring_ctx *ctx;
  	unsigned to_wait;
  	unsigned nr_timeouts;
  };
  
  static inline bool io_should_wake(struct io_wait_queue *iowq)
  {
  	struct io_ring_ctx *ctx = iowq->ctx;
  
  	/*
  	 * Wake up if we have enough events, or if a timeout occurred since we
  	 * started waiting. For timeouts, we always want to return to userspace,
  	 * regardless of event count.
  	 */
  	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
  			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
  }
  
  static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  			    int wake_flags, void *key)
  {
  	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
  							wq);
  
  	if (!io_should_wake(iowq))
  		return -1;
  
  	return autoremove_wake_function(curr, mode, wake_flags, key);
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
2842
2843
2844
2845
2846
2847
2848
  /*
   * Wait until events become available, if we don't already have some. The
   * application must reap them itself, as they reside on the shared cq ring.
   */
  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
  			  const sigset_t __user *sig, size_t sigsz)
  {
bda521624   Jens Axboe   io_uring: make CQ...
2849
2850
2851
2852
2853
2854
2855
2856
2857
  	struct io_wait_queue iowq = {
  		.wq = {
  			.private	= current,
  			.func		= io_wake_function,
  			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
  		},
  		.ctx		= ctx,
  		.to_wait	= min_events,
  	};
75b28affd   Hristo Venev   io_uring: allocat...
2858
  	struct io_rings *rings = ctx->rings;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2859
  	int ret;
75b28affd   Hristo Venev   io_uring: allocat...
2860
  	if (io_cqring_events(rings) >= min_events)
2b188cc1b   Jens Axboe   Add io_uring IO i...
2861
2862
2863
  		return 0;
  
  	if (sig) {
9e75ad5d8   Arnd Bergmann   io_uring: fix big...
2864
2865
2866
  #ifdef CONFIG_COMPAT
  		if (in_compat_syscall())
  			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434be   Oleg Nesterov   signal: simplify ...
2867
  						      sigsz);
9e75ad5d8   Arnd Bergmann   io_uring: fix big...
2868
2869
  		else
  #endif
b772434be   Oleg Nesterov   signal: simplify ...
2870
  			ret = set_user_sigmask(sig, sigsz);
9e75ad5d8   Arnd Bergmann   io_uring: fix big...
2871

2b188cc1b   Jens Axboe   Add io_uring IO i...
2872
2873
2874
  		if (ret)
  			return ret;
  	}
bda521624   Jens Axboe   io_uring: make CQ...
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
  	ret = 0;
  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
  	do {
  		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
  						TASK_INTERRUPTIBLE);
  		if (io_should_wake(&iowq))
  			break;
  		schedule();
  		if (signal_pending(current)) {
  			ret = -ERESTARTSYS;
  			break;
  		}
  	} while (1);
  	finish_wait(&ctx->wait, &iowq.wq);
b772434be   Oleg Nesterov   signal: simplify ...
2889
  	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
97abc889e   Oleg Nesterov   signal: remove th...
2890
2891
  	if (ret == -ERESTARTSYS)
  		ret = -EINTR;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2892

75b28affd   Hristo Venev   io_uring: allocat...
2893
  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
2894
  }
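  
  /*
   * Illustrative userspace sketch (not part of this file): the wait above
   * backs a call such as
   *
   *	io_uring_enter(ring_fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
   *
   * (liburing-style wrapper assumed), which returns once at least one CQE
   * is available, or fails with EINTR if a signal arrives first.
   */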
6b06314c4   Jens Axboe   io_uring: add fil...
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
  static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
  #if defined(CONFIG_UNIX)
  	if (ctx->ring_sock) {
  		struct sock *sock = ctx->ring_sock->sk;
  		struct sk_buff *skb;
  
  		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
  			kfree_skb(skb);
  	}
  #else
  	int i;
  
  	for (i = 0; i < ctx->nr_user_files; i++)
  		fput(ctx->user_files[i]);
  #endif
  }
  
  static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
  	if (!ctx->user_files)
  		return -ENXIO;
  
  	__io_sqe_files_unregister(ctx);
  	kfree(ctx->user_files);
  	ctx->user_files = NULL;
  	ctx->nr_user_files = 0;
  	return 0;
  }
6c271ce2f   Jens Axboe   io_uring: add sub...
2924
2925
2926
  static void io_sq_thread_stop(struct io_ring_ctx *ctx)
  {
  	if (ctx->sqo_thread) {
a4c0b3dec   Jackie Liu   io_uring: fix io_...
2927
  		wait_for_completion(&ctx->sqo_thread_started);
2bbcd6d3b   Roman Penyaev   io_uring: fix inf...
2928
2929
2930
2931
2932
  		/*
  		 * The park is a bit of a work-around; without it we get
  		 * warning spews on shutdown with SQPOLL set and affinity
  		 * set to a single CPU.
  		 */
060586324   Jens Axboe   io_uring: park SQ...
2933
  		kthread_park(ctx->sqo_thread);
6c271ce2f   Jens Axboe   io_uring: add sub...
2934
2935
2936
2937
  		kthread_stop(ctx->sqo_thread);
  		ctx->sqo_thread = NULL;
  	}
  }
6b06314c4   Jens Axboe   io_uring: add fil...
2938
2939
  static void io_finish_async(struct io_ring_ctx *ctx)
  {
54a91f3bb   Jens Axboe   io_uring: limit p...
2940
  	int i;
6c271ce2f   Jens Axboe   io_uring: add sub...
2941
  	io_sq_thread_stop(ctx);
54a91f3bb   Jens Axboe   io_uring: limit p...
2942
2943
2944
2945
2946
  	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
  		if (ctx->sqo_wq[i]) {
  			destroy_workqueue(ctx->sqo_wq[i]);
  			ctx->sqo_wq[i] = NULL;
  		}
6b06314c4   Jens Axboe   io_uring: add fil...
2947
2948
2949
2950
2951
2952
2953
  	}
  }
  
  #if defined(CONFIG_UNIX)
  static void io_destruct_skb(struct sk_buff *skb)
  {
  	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
8a9973408   Jens Axboe   io_uring: only fl...
2954
2955
2956
2957
2958
  	int i;
  
  	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
  		if (ctx->sqo_wq[i])
  			flush_workqueue(ctx->sqo_wq[i]);
6b06314c4   Jens Axboe   io_uring: add fil...
2959

6b06314c4   Jens Axboe   io_uring: add fil...
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
  	unix_destruct_scm(skb);
  }
  
  /*
   * Ensure the UNIX gc is aware of our file set, so we are certain that
   * the io_uring can be safely unregistered on process exit, even if we have
   * loops in the file referencing.
   */
  static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
  {
  	struct sock *sk = ctx->ring_sock->sk;
  	struct scm_fp_list *fpl;
  	struct sk_buff *skb;
  	int i;
6b06314c4   Jens Axboe   io_uring: add fil...
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
  	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
  	if (!fpl)
  		return -ENOMEM;
  
  	skb = alloc_skb(0, GFP_KERNEL);
  	if (!skb) {
  		kfree(fpl);
  		return -ENOMEM;
  	}
  
  	skb->sk = sk;
  	skb->destructor = io_destruct_skb;
  
  	fpl->user = get_uid(ctx->user);
  	for (i = 0; i < nr; i++) {
  		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
  		unix_inflight(fpl->user, fpl->fp[i]);
  	}
  
  	fpl->max = fpl->count = nr;
  	UNIXCB(skb).fp = fpl;
  	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
  	skb_queue_head(&sk->sk_receive_queue, skb);
  
  	for (i = 0; i < nr; i++)
  		fput(fpl->fp[i]);
  
  	return 0;
  }
  
  /*
   * If UNIX sockets are enabled, fd passing can cause a reference cycle which
   * causes regular reference counting to break down. We rely on the UNIX
   * garbage collection to take care of this problem for us.
   */
  static int io_sqe_files_scm(struct io_ring_ctx *ctx)
  {
  	unsigned left, total;
  	int ret = 0;
  
  	total = 0;
  	left = ctx->nr_user_files;
  	while (left) {
  		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c4   Jens Axboe   io_uring: add fil...
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
  
  		ret = __io_sqe_files_scm(ctx, this_files, total);
  		if (ret)
  			break;
  		left -= this_files;
  		total += this_files;
  	}
  
  	if (!ret)
  		return 0;
  
  	while (total < ctx->nr_user_files) {
  		fput(ctx->user_files[total]);
  		total++;
  	}
  
  	return ret;
  }
  #else
  static int io_sqe_files_scm(struct io_ring_ctx *ctx)
  {
  	return 0;
  }
  #endif
  
  static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
  				 unsigned nr_args)
  {
  	__s32 __user *fds = (__s32 __user *) arg;
  	int fd, ret = 0;
  	unsigned i;
  
  	if (ctx->user_files)
  		return -EBUSY;
  	if (!nr_args)
  		return -EINVAL;
  	if (nr_args > IORING_MAX_FIXED_FILES)
  		return -EMFILE;
  
  	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
  	if (!ctx->user_files)
  		return -ENOMEM;
  
  	for (i = 0; i < nr_args; i++) {
  		ret = -EFAULT;
  		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
  			break;
  
  		ctx->user_files[i] = fget(fd);
  
  		ret = -EBADF;
  		if (!ctx->user_files[i])
  			break;
  		/*
  		 * Don't allow io_uring instances to be registered. If UNIX
  		 * isn't enabled, then this causes a reference cycle and this
  		 * instance can never get freed. If UNIX is enabled we'll
  		 * handle it just fine, but there's still no point in allowing
  		 * a ring fd as it doesn't support regular read/write anyway.
  		 */
  		if (ctx->user_files[i]->f_op == &io_uring_fops) {
  			fput(ctx->user_files[i]);
  			break;
  		}
  		ctx->nr_user_files++;
  		ret = 0;
  	}
  
  	if (ret) {
  		for (i = 0; i < ctx->nr_user_files; i++)
  			fput(ctx->user_files[i]);
  
  		kfree(ctx->user_files);
25adf50fe   Jens Axboe   io_uring: fix dou...
3091
  		ctx->user_files = NULL;
6b06314c4   Jens Axboe   io_uring: add fil...
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
  		ctx->nr_user_files = 0;
  		return ret;
  	}
  
  	ret = io_sqe_files_scm(ctx);
  	if (ret)
  		io_sqe_files_unregister(ctx);
  
  	return ret;
  }
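  
  /*
   * Illustrative userspace sketch (not part of this file): the fixed file
   * table above is filled from a registration call such as
   *
   *	int fds[2] = { sock_fd, file_fd };
   *	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 2);
   *
   * (liburing-style wrapper assumed); SQEs can then set IOSQE_FIXED_FILE
   * and refer to these files by index instead of by file descriptor.
   */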
6c271ce2f   Jens Axboe   io_uring: add sub...
3102
3103
  static int io_sq_offload_start(struct io_ring_ctx *ctx,
  			       struct io_uring_params *p)
2b188cc1b   Jens Axboe   Add io_uring IO i...
3104
3105
3106
3107
3108
  {
  	int ret;
  
  	mmgrab(current->mm);
  	ctx->sqo_mm = current->mm;
6c271ce2f   Jens Axboe   io_uring: add sub...
3109
  	if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d15   Jens Axboe   io_uring: restric...
3110
3111
3112
  		ret = -EPERM;
  		if (!capable(CAP_SYS_ADMIN))
  			goto err;
917257daa   Jens Axboe   io_uring: only te...
3113
3114
3115
  		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
  		if (!ctx->sq_thread_idle)
  			ctx->sq_thread_idle = HZ;
6c271ce2f   Jens Axboe   io_uring: add sub...
3116
  		if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18a   Jens Axboe   io_uring: fix fai...
3117
  			int cpu = p->sq_thread_cpu;
6c271ce2f   Jens Axboe   io_uring: add sub...
3118

917257daa   Jens Axboe   io_uring: only te...
3119
  			ret = -EINVAL;
44a9bd18a   Jens Axboe   io_uring: fix fai...
3120
3121
  			if (cpu >= nr_cpu_ids)
  				goto err;
7889f44dd   Shenghui Wang   io_uring: use cpu...
3122
  			if (!cpu_online(cpu))
917257daa   Jens Axboe   io_uring: only te...
3123
  				goto err;
6c271ce2f   Jens Axboe   io_uring: add sub...
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
  			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
  							ctx, cpu,
  							"io_uring-sq");
  		} else {
  			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
  							"io_uring-sq");
  		}
  		if (IS_ERR(ctx->sqo_thread)) {
  			ret = PTR_ERR(ctx->sqo_thread);
  			ctx->sqo_thread = NULL;
  			goto err;
  		}
  		wake_up_process(ctx->sqo_thread);
  	} else if (p->flags & IORING_SETUP_SQ_AFF) {
  		/* Can't have SQ_AFF without SQPOLL */
  		ret = -EINVAL;
  		goto err;
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
3142
  	/* Do QD, or 2 * CPUS, whatever is smallest */
54a91f3bb   Jens Axboe   io_uring: limit p...
3143
3144
  	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
  			WQ_UNBOUND | WQ_FREEZABLE,
2b188cc1b   Jens Axboe   Add io_uring IO i...
3145
  			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
54a91f3bb   Jens Axboe   io_uring: limit p...
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
  	if (!ctx->sqo_wq[0]) {
  		ret = -ENOMEM;
  		goto err;
  	}
  
  	/*
  	 * This is for buffered writes, where we want to limit the parallelism
  	 * due to file locking in file systems. As "normal" buffered writes
  	 * should parallelize on writeout quite nicely, limit us to having 2
  	 * pending. This avoids massive contention on the inode when doing
  	 * buffered async writes.
  	 */
  	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
  						WQ_UNBOUND | WQ_FREEZABLE, 2);
  	if (!ctx->sqo_wq[1]) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
3161
3162
3163
3164
3165
3166
  		ret = -ENOMEM;
  		goto err;
  	}
  
  	return 0;
  err:
54a91f3bb   Jens Axboe   io_uring: limit p...
3167
  	io_finish_async(ctx);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
  	mmdrop(ctx->sqo_mm);
  	ctx->sqo_mm = NULL;
  	return ret;
  }
  
  static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
  {
  	atomic_long_sub(nr_pages, &user->locked_vm);
  }
  
  static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
  {
  	unsigned long page_limit, cur_pages, new_pages;
  
  	/* Don't allow more pages than we can safely lock */
  	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
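  	/*
  	 * Lock-free accounting: re-read ->locked_vm and retry the cmpxchg
  	 * below until the pages are either charged or found to exceed the
  	 * limit.
  	 */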
  
  	do {
  		cur_pages = atomic_long_read(&user->locked_vm);
  		new_pages = cur_pages + nr_pages;
  		if (new_pages > page_limit)
  			return -ENOMEM;
  	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
  					new_pages) != cur_pages);
  
  	return 0;
  }
  
  static void io_mem_free(void *ptr)
  {
52e04ef4c   Mark Rutland   io_uring: free al...
3198
3199
3200
3201
  	struct page *page;
  
  	if (!ptr)
  		return;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3202

52e04ef4c   Mark Rutland   io_uring: free al...
3203
  	page = virt_to_head_page(ptr);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
  	if (put_page_testzero(page))
  		free_compound_page(page);
  }
  
  static void *io_mem_alloc(size_t size)
  {
  	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
  				__GFP_NORETRY;
  
  	return (void *) __get_free_pages(gfp_flags, get_order(size));
  }
75b28affd   Hristo Venev   io_uring: allocat...
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
  static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
  				size_t *sq_offset)
  {
  	struct io_rings *rings;
  	size_t off, sq_array_size;
  
  	off = struct_size(rings, cqes, cq_entries);
  	if (off == SIZE_MAX)
  		return SIZE_MAX;
  
  #ifdef CONFIG_SMP
  	off = ALIGN(off, SMP_CACHE_BYTES);
  	if (off == 0)
  		return SIZE_MAX;
  #endif
0b1799662   Dmitry Vyukov   io_uring: fix sq ...
3230
3231
  	if (sq_offset)
  		*sq_offset = off;
75b28affd   Hristo Venev   io_uring: allocat...
3232
3233
3234
3235
3236
3237
  	sq_array_size = array_size(sizeof(u32), sq_entries);
  	if (sq_array_size == SIZE_MAX)
  		return SIZE_MAX;
  
  	if (check_add_overflow(off, sq_array_size, &off))
  		return SIZE_MAX;
75b28affd   Hristo Venev   io_uring: allocat...
3238
3239
  	return off;
  }
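  
  /*
   * Worked example (illustrative, not from the original source): with
   * cq_entries == 8, off starts as struct_size(rings, cqes, 8), i.e. the
   * io_rings header plus eight CQEs (16 bytes each), rounded up to a cache
   * line on SMP; that value is reported through *sq_offset, and the u32 SQ
   * index array (sq_entries * 4 bytes) is added on top to give the total
   * allocation size returned.
   */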
2b188cc1b   Jens Axboe   Add io_uring IO i...
3240
3241
  static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
  {
75b28affd   Hristo Venev   io_uring: allocat...
3242
  	size_t pages;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3243

75b28affd   Hristo Venev   io_uring: allocat...
3244
3245
3246
3247
  	pages = (size_t)1 << get_order(
  		rings_size(sq_entries, cq_entries, NULL));
  	pages += (size_t)1 << get_order(
  		array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1b   Jens Axboe   Add io_uring IO i...
3248

75b28affd   Hristo Venev   io_uring: allocat...
3249
  	return pages;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3250
  }
edafccee5   Jens Axboe   io_uring: add sup...
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
  static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
  {
  	int i, j;
  
  	if (!ctx->user_bufs)
  		return -ENXIO;
  
  	for (i = 0; i < ctx->nr_user_bufs; i++) {
  		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
  
  		for (j = 0; j < imu->nr_bvecs; j++)
27c4d3a32   John Hubbard   fs/io_uring.c: co...
3262
  			put_user_page(imu->bvec[j].bv_page);
edafccee5   Jens Axboe   io_uring: add sup...
3263
3264
3265
  
  		if (ctx->account_mem)
  			io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef64751   Mark Rutland   io_uring: avoid p...
3266
  		kvfree(imu->bvec);
edafccee5   Jens Axboe   io_uring: add sup...
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
  		imu->nr_bvecs = 0;
  	}
  
  	kfree(ctx->user_bufs);
  	ctx->user_bufs = NULL;
  	ctx->nr_user_bufs = 0;
  	return 0;
  }
  
  static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
  		       void __user *arg, unsigned index)
  {
  	struct iovec __user *src;
  
  #ifdef CONFIG_COMPAT
  	if (ctx->compat) {
  		struct compat_iovec __user *ciovs;
  		struct compat_iovec ciov;
  
  		ciovs = (struct compat_iovec __user *) arg;
  		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
  			return -EFAULT;
  
  		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
  		dst->iov_len = ciov.iov_len;
  		return 0;
  	}
  #endif
  	src = (struct iovec __user *) arg;
  	if (copy_from_user(dst, &src[index], sizeof(*dst)))
  		return -EFAULT;
  	return 0;
  }
  
  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
  				  unsigned nr_args)
  {
  	struct vm_area_struct **vmas = NULL;
  	struct page **pages = NULL;
  	int i, j, got_pages = 0;
  	int ret = -EINVAL;
  
  	if (ctx->user_bufs)
  		return -EBUSY;
  	if (!nr_args || nr_args > UIO_MAXIOV)
  		return -EINVAL;
  
  	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
  					GFP_KERNEL);
  	if (!ctx->user_bufs)
  		return -ENOMEM;
  
  	for (i = 0; i < nr_args; i++) {
  		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
  		unsigned long off, start, end, ubuf;
  		int pret, nr_pages;
  		struct iovec iov;
  		size_t size;
  
  		ret = io_copy_iov(ctx, &iov, arg, i);
  		if (ret)
a278682da   Pavel Begunkov   io_uring: Fix __i...
3328
  			goto err;
edafccee5   Jens Axboe   io_uring: add sup...
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
  
  		/*
  		 * Don't impose further limits on the size and buffer
  		 * constraints here, we'll -EINVAL later when IO is
  		 * submitted if they are wrong.
  		 */
  		ret = -EFAULT;
  		if (!iov.iov_base || !iov.iov_len)
  			goto err;
  
  		/* arbitrary limit, but we need something */
  		if (iov.iov_len > SZ_1G)
  			goto err;
  
  		ubuf = (unsigned long) iov.iov_base;
  		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
  		start = ubuf >> PAGE_SHIFT;
  		nr_pages = end - start;
  
  		if (ctx->account_mem) {
  			ret = io_account_mem(ctx->user, nr_pages);
  			if (ret)
  				goto err;
  		}
  
  		ret = 0;
  		if (!pages || nr_pages > got_pages) {
ab2df991e   Denis Efremov   io_uring: use kvf...
3356
3357
  			kvfree(vmas);
  			kvfree(pages);
d4ef64751   Mark Rutland   io_uring: avoid p...
3358
  			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee5   Jens Axboe   io_uring: add sup...
3359
  						GFP_KERNEL);
d4ef64751   Mark Rutland   io_uring: avoid p...
3360
  			vmas = kvmalloc_array(nr_pages,
edafccee5   Jens Axboe   io_uring: add sup...
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
  					sizeof(struct vm_area_struct *),
  					GFP_KERNEL);
  			if (!pages || !vmas) {
  				ret = -ENOMEM;
  				if (ctx->account_mem)
  					io_unaccount_mem(ctx->user, nr_pages);
  				goto err;
  			}
  			got_pages = nr_pages;
  		}
d4ef64751   Mark Rutland   io_uring: avoid p...
3371
  		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee5   Jens Axboe   io_uring: add sup...
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
  						GFP_KERNEL);
  		ret = -ENOMEM;
  		if (!imu->bvec) {
  			if (ctx->account_mem)
  				io_unaccount_mem(ctx->user, nr_pages);
  			goto err;
  		}
  
  		ret = 0;
  		down_read(&current->mm->mmap_sem);
932f4a630   Ira Weiny   mm/gup: replace g...
3382
3383
3384
  		pret = get_user_pages(ubuf, nr_pages,
  				      FOLL_WRITE | FOLL_LONGTERM,
  				      pages, vmas);
edafccee5   Jens Axboe   io_uring: add sup...
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
  		if (pret == nr_pages) {
  			/* don't support file backed memory */
  			for (j = 0; j < nr_pages; j++) {
  				struct vm_area_struct *vma = vmas[j];
  
  				if (vma->vm_file &&
  				    !is_file_hugepages(vma->vm_file)) {
  					ret = -EOPNOTSUPP;
  					break;
  				}
  			}
  		} else {
  			ret = pret < 0 ? pret : -EFAULT;
  		}
  		up_read(&current->mm->mmap_sem);
  		if (ret) {
  			/*
  			 * if we did partial map, or found file backed vmas,
  			 * release any pages we did get
  			 */
27c4d3a32   John Hubbard   fs/io_uring.c: co...
3405
3406
  			if (pret > 0)
  				put_user_pages(pages, pret);
edafccee5   Jens Axboe   io_uring: add sup...
3407
3408
  			if (ctx->account_mem)
  				io_unaccount_mem(ctx->user, nr_pages);
d4ef64751   Mark Rutland   io_uring: avoid p...
3409
  			kvfree(imu->bvec);
edafccee5   Jens Axboe   io_uring: add sup...
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
  			goto err;
  		}
  
  		off = ubuf & ~PAGE_MASK;
  		size = iov.iov_len;
  		for (j = 0; j < nr_pages; j++) {
  			size_t vec_len;
  
  			vec_len = min_t(size_t, size, PAGE_SIZE - off);
  			imu->bvec[j].bv_page = pages[j];
  			imu->bvec[j].bv_len = vec_len;
  			imu->bvec[j].bv_offset = off;
  			off = 0;
  			size -= vec_len;
  		}
  		/* store original address for later verification */
  		imu->ubuf = ubuf;
  		imu->len = iov.iov_len;
  		imu->nr_bvecs = nr_pages;
  
  		ctx->nr_user_bufs++;
  	}
d4ef64751   Mark Rutland   io_uring: avoid p...
3432
3433
  	kvfree(pages);
  	kvfree(vmas);
edafccee5   Jens Axboe   io_uring: add sup...
3434
3435
  	return 0;
  err:
d4ef64751   Mark Rutland   io_uring: avoid p...
3436
3437
  	kvfree(pages);
  	kvfree(vmas);
edafccee5   Jens Axboe   io_uring: add sup...
3438
3439
3440
  	io_sqe_buffer_unregister(ctx);
  	return ret;
  }
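  
  /*
   * Illustrative userspace sketch (not part of this file): the pages pinned
   * above come from a registration call such as
   *
   *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
   *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
   *
   * (liburing-style wrapper assumed); fixed-buffer reads and writes can then
   * name the registered buffer by index rather than re-pinning pages on
   * every submission.
   */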
9b402849e   Jens Axboe   io_uring: add sup...
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
  static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
  {
  	__s32 __user *fds = arg;
  	int fd;
  
  	if (ctx->cq_ev_fd)
  		return -EBUSY;
  
  	if (copy_from_user(&fd, fds, sizeof(*fds)))
  		return -EFAULT;
  
  	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
  	if (IS_ERR(ctx->cq_ev_fd)) {
  		int ret = PTR_ERR(ctx->cq_ev_fd);
  		ctx->cq_ev_fd = NULL;
  		return ret;
  	}
  
  	return 0;
  }
  
  static int io_eventfd_unregister(struct io_ring_ctx *ctx)
  {
  	if (ctx->cq_ev_fd) {
  		eventfd_ctx_put(ctx->cq_ev_fd);
  		ctx->cq_ev_fd = NULL;
  		return 0;
  	}
  
  	return -ENXIO;
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
3472
3473
  static void io_ring_ctx_free(struct io_ring_ctx *ctx)
  {
6b06314c4   Jens Axboe   io_uring: add fil...
3474
  	io_finish_async(ctx);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3475
3476
  	if (ctx->sqo_mm)
  		mmdrop(ctx->sqo_mm);
def596e95   Jens Axboe   io_uring: support...
3477
3478
  
  	io_iopoll_reap_events(ctx);
edafccee5   Jens Axboe   io_uring: add sup...
3479
  	io_sqe_buffer_unregister(ctx);
6b06314c4   Jens Axboe   io_uring: add fil...
3480
  	io_sqe_files_unregister(ctx);
9b402849e   Jens Axboe   io_uring: add sup...
3481
  	io_eventfd_unregister(ctx);
def596e95   Jens Axboe   io_uring: support...
3482

2b188cc1b   Jens Axboe   Add io_uring IO i...
3483
  #if defined(CONFIG_UNIX)
355e8d26f   Eric Biggers   io_uring: fix mem...
3484
3485
  	if (ctx->ring_sock) {
  		ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1b   Jens Axboe   Add io_uring IO i...
3486
  		sock_release(ctx->ring_sock);
355e8d26f   Eric Biggers   io_uring: fix mem...
3487
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
3488
  #endif
75b28affd   Hristo Venev   io_uring: allocat...
3489
  	io_mem_free(ctx->rings);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3490
  	io_mem_free(ctx->sq_sqes);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3491
3492
3493
3494
3495
3496
  
  	percpu_ref_exit(&ctx->refs);
  	if (ctx->account_mem)
  		io_unaccount_mem(ctx->user,
  				ring_pages(ctx->sq_entries, ctx->cq_entries));
  	free_uid(ctx->user);
8387e3688   Jens Axboe   io_uring: async w...
3497
3498
  	if (ctx->creds)
  		put_cred(ctx->creds);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3499
3500
3501
3502
3503
3504
3505
3506
3507
  	kfree(ctx);
  }
  
  static __poll_t io_uring_poll(struct file *file, poll_table *wait)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  	__poll_t mask = 0;
  
  	poll_wait(file, &ctx->cq_wait, wait);
4f7067c3f   Stefan Bühler   io_uring: remove ...
3508
3509
3510
3511
  	/*
  	 * synchronizes with barrier from wq_has_sleeper call in
  	 * io_commit_cqring
  	 */
2b188cc1b   Jens Axboe   Add io_uring IO i...
3512
  	smp_rmb();
75b28affd   Hristo Venev   io_uring: allocat...
3513
3514
  	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
  	    ctx->rings->sq_ring_entries)
2b188cc1b   Jens Axboe   Add io_uring IO i...
3515
  		mask |= EPOLLOUT | EPOLLWRNORM;
daa5de541   yangerkun   io_uring: compare...
3516
  	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
2b188cc1b   Jens Axboe   Add io_uring IO i...
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
  		mask |= EPOLLIN | EPOLLRDNORM;
  
  	return mask;
  }
  
  static int io_uring_fasync(int fd, struct file *file, int on)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  
  	return fasync_helper(fd, file, on, &ctx->cq_fasync);
  }
1c4404efc   Jens Axboe   io_uring: make su...
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
  static void io_cancel_async_work(struct io_ring_ctx *ctx,
  				 struct task_struct *task)
  {
  	if (list_empty(&ctx->task_list))
  		return;
  
  	spin_lock_irq(&ctx->task_lock);
  	while (!list_empty(&ctx->task_list)) {
  		struct io_kiocb *req;
  
  		req = list_first_entry(&ctx->task_list, struct io_kiocb, task_list);
  		list_del_init(&req->task_list);
  		req->flags |= REQ_F_CANCEL;
  		if (req->work_task && (!task || req->task == task))
  			send_sig(SIGINT, req->work_task, 1);
  	}
  	spin_unlock_irq(&ctx->task_lock);
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
3546
3547
3548
3549
3550
  static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  {
  	mutex_lock(&ctx->uring_lock);
  	percpu_ref_kill(&ctx->refs);
  	mutex_unlock(&ctx->uring_lock);
1c4404efc   Jens Axboe   io_uring: make su...
3551
  	io_cancel_async_work(ctx, NULL);
5262f5679   Jens Axboe   io_uring: IORING_...
3552
  	io_kill_timeouts(ctx);
221c5eb23   Jens Axboe   io_uring: add sup...
3553
  	io_poll_remove_all(ctx);
def596e95   Jens Axboe   io_uring: support...
3554
  	io_iopoll_reap_events(ctx);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3555
3556
3557
  	wait_for_completion(&ctx->ctx_done);
  	io_ring_ctx_free(ctx);
  }
1c4404efc   Jens Axboe   io_uring: make su...
3558
3559
3560
3561
3562
3563
3564
3565
3566
  static int io_uring_flush(struct file *file, void *data)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  
  	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
  		io_cancel_async_work(ctx, current);
  
  	return 0;
  }
2b188cc1b   Jens Axboe   Add io_uring IO i...
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
  static int io_uring_release(struct inode *inode, struct file *file)
  {
  	struct io_ring_ctx *ctx = file->private_data;
  
  	file->private_data = NULL;
  	io_ring_ctx_wait_and_kill(ctx);
  	return 0;
  }
  
  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
  	unsigned long sz = vma->vm_end - vma->vm_start;
  	struct io_ring_ctx *ctx = file->private_data;
  	unsigned long pfn;
  	struct page *page;
  	void *ptr;
  
  	switch (offset) {
  	case IORING_OFF_SQ_RING:
75b28affd   Hristo Venev   io_uring: allocat...
3587
3588
  	case IORING_OFF_CQ_RING:
  		ptr = ctx->rings;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3589
3590
3591
3592
  		break;
  	case IORING_OFF_SQES:
  		ptr = ctx->sq_sqes;
  		break;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3593
3594
3595
3596
3597
  	default:
  		return -EINVAL;
  	}
  
  	page = virt_to_head_page(ptr);
a50b854e0   Matthew Wilcox (Oracle)   mm: introduce pag...
3598
  	if (sz > page_size(page))
2b188cc1b   Jens Axboe   Add io_uring IO i...
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
  		return -EINVAL;
  
  	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
  	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
  }
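  
  /*
   * Illustrative userspace sketch (not part of this file): the offsets
   * handled above correspond to the application's setup-time mappings, e.g.
   *
   *	sq_ring = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
   *		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
   *	sqes    = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
   *		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
   *
   * where sq_ring_sz and sqes_sz are sizes the application derives from the
   * io_uring_params offsets returned at setup time (names assumed here).
   */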
  
  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  		u32, min_complete, u32, flags, const sigset_t __user *, sig,
  		size_t, sigsz)
  {
  	struct io_ring_ctx *ctx;
  	long ret = -EBADF;
  	int submitted = 0;
  	struct fd f;
6c271ce2f   Jens Axboe   io_uring: add sub...
3613
  	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1b   Jens Axboe   Add io_uring IO i...
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
  		return -EINVAL;
  
  	f = fdget(fd);
  	if (!f.file)
  		return -EBADF;
  
  	ret = -EOPNOTSUPP;
  	if (f.file->f_op != &io_uring_fops)
  		goto out_fput;
  
  	ret = -ENXIO;
  	ctx = f.file->private_data;
  	if (!percpu_ref_tryget(&ctx->refs))
  		goto out_fput;
6c271ce2f   Jens Axboe   io_uring: add sub...
3628
3629
3630
3631
3632
  	/*
  	 * For SQ polling, the thread will do all submissions and completions.
  	 * Just return the requested submit count, and wake the thread if
  	 * we were asked to.
  	 */
b2a9eadab   Jens Axboe   io_uring: make sq...
3633
  	ret = 0;
6c271ce2f   Jens Axboe   io_uring: add sub...
3634
3635
3636
3637
  	if (ctx->flags & IORING_SETUP_SQPOLL) {
  		if (flags & IORING_ENTER_SQ_WAKEUP)
  			wake_up(&ctx->sqo_wait);
  		submitted = to_submit;
b2a9eadab   Jens Axboe   io_uring: make sq...
3638
  	} else if (to_submit) {
2b188cc1b   Jens Axboe   Add io_uring IO i...
3639
3640
3641
  		to_submit = min(to_submit, ctx->sq_entries);
  
  		mutex_lock(&ctx->uring_lock);
bc808bced   Jens Axboe   io_uring: revert ...
3642
  		submitted = io_ring_submit(ctx, to_submit);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3643
  		mutex_unlock(&ctx->uring_lock);
002352747   Pavel Begunkov   io_uring: don't w...
3644
3645
3646
  
  		if (submitted != to_submit)
  			goto out;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3647
3648
  	}
  	if (flags & IORING_ENTER_GETEVENTS) {
def596e95   Jens Axboe   io_uring: support...
3649
  		unsigned nr_events = 0;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3650
  		min_complete = min(min_complete, ctx->cq_entries);
def596e95   Jens Axboe   io_uring: support...
3651
  		if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e95   Jens Axboe   io_uring: support...
3652
  			ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e95   Jens Axboe   io_uring: support...
3653
3654
3655
  		} else {
  			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
  		}
2b188cc1b   Jens Axboe   Add io_uring IO i...
3656
  	}
002352747   Pavel Begunkov   io_uring: don't w...
3657
  out:
6805b32ec   Pavel Begunkov   io_uring: remove ...
3658
  	percpu_ref_put(&ctx->refs);
2b188cc1b   Jens Axboe   Add io_uring IO i...
3659
3660
3661
3662
3663
3664
3665
  out_fput:
  	fdput(f);
  	return submitted ? submitted : ret;
  }
  
  static const struct file_operations io_uring_fops = {
  	.release	= io_uring_release,
1c4404efc   Jens Axboe   io_uring: make su...
3666
  	.flush		= io_uring_flush,
2b188cc1b   Jens Axboe   Add io_uring IO i...
3667
3668
3669
3670
3671
3672
3673
3674
  	.mmap		= io_uring_mmap,
  	.poll		= io_uring_poll,
  	.fasync		= io_uring_fasync,
  };
  
  static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
  				  struct io_uring_params *p)
  {
75b28affd   Hristo Venev   io_uring: allocat...
3675
3676
  	struct io_rings *rings;
  	size_t size, sq_array_offset;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3677

3c512bd3d   Jens Axboe   io_uring: set ctx...
3678
3679
3680
  	/* make sure these are sane, as we already accounted them */
  	ctx->sq_entries = p->sq_entries;
  	ctx->cq_entries = p->cq_entries;
75b28affd   Hristo Venev   io_uring: allocat...
3681
3682
3683
3684
3685
3686
  	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
  	if (size == SIZE_MAX)
  		return -EOVERFLOW;
  
  	rings = io_mem_alloc(size);
  	if (!rings)
2b188cc1b   Jens Axboe   Add io_uring IO i...
3687
  		return -ENOMEM;
75b28affd   Hristo Venev   io_uring: allocat...
3688
3689
3690
3691
3692
3693
3694
3695
  	ctx->rings = rings;
  	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
  	rings->sq_ring_mask = p->sq_entries - 1;
  	rings->cq_ring_mask = p->cq_entries - 1;
  	rings->sq_ring_entries = p->sq_entries;
  	rings->cq_ring_entries = p->cq_entries;
  	ctx->sq_mask = rings->sq_ring_mask;
  	ctx->cq_mask = rings->cq_ring_mask;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3696
3697
  
  	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
1768acaa6   Jens Axboe   io_uring: io_allo...
3698
3699
3700
  	if (size == SIZE_MAX) {
  		io_mem_free(ctx->rings);
  		ctx->rings = NULL;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3701
  		return -EOVERFLOW;
1768acaa6   Jens Axboe   io_uring: io_allo...
3702
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
3703
3704
  
  	ctx->sq_sqes = io_mem_alloc(size);
1768acaa6   Jens Axboe   io_uring: io_allo...
3705
3706
3707
  	if (!ctx->sq_sqes) {
  		io_mem_free(ctx->rings);
  		ctx->rings = NULL;
2b188cc1b   Jens Axboe   Add io_uring IO i...
3708
  		return -ENOMEM;
1768acaa6   Jens Axboe   io_uring: io_allo...
3709
  	}
2b188cc1b   Jens Axboe   Add io_uring IO i...
3710

2b188cc1b   Jens Axboe   Add io_uring IO i...
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
  	return 0;
  }
  
  /*
   * Allocate an anonymous fd, this is what constitutes the application
   * visible backing of an io_uring instance. The application mmaps this
   * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
   * we have to tie this fd to a socket for file garbage collection purposes.
   */
  static int io_uring_get_fd(struct io_ring_ctx *ctx)
  {
  	struct file *file;
  	int ret;
  
  #if defined(CONFIG_UNIX)
  	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
  				&ctx->ring_sock);
  	if (ret)
  		return ret;
  #endif
  
  	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
  	if (ret < 0)
  		goto err;
  
  	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
  					O_RDWR | O_CLOEXEC);
  	if (IS_ERR(file)) {
  		put_unused_fd(ret);
  		ret = PTR_ERR(file);
  		goto err;
  	}
  
  #if defined(CONFIG_UNIX)
  	ctx->ring_sock->file = file;
  	ctx->ring_sock->sk->sk_user_data = ctx;
  #endif
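  	/*
  	 * fd_install() publishes the file in the fd table and makes the
  	 * ring visible to userspace; nothing after this point may fail.
  	 */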
  	fd_install(ret, file);
  	return ret;
  err:
  #if defined(CONFIG_UNIX)
  	sock_release(ctx->ring_sock);
  	ctx->ring_sock = NULL;
  #endif
  	return ret;
  }
  
  static int io_uring_create(unsigned entries, struct io_uring_params *p)
  {
  	struct user_struct *user = NULL;
  	struct io_ring_ctx *ctx;
  	bool account_mem;
  	int ret;
  
  	if (!entries || entries > IORING_MAX_ENTRIES)
  		return -EINVAL;
  
  	/*
  	 * Use twice as many entries for the CQ ring. It's possible for the
  	 * application to drive a higher depth than the size of the SQ ring,
  	 * since the sqes are only used at submission time. This allows for
  	 * some flexibility in overcommitting a bit.
  	 */
  	p->sq_entries = roundup_pow_of_two(entries);
  	p->cq_entries = 2 * p->sq_entries;
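  	/* e.g. entries == 100 gives a 128 entry SQ ring and a 256 entry CQ ring */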
  
  	user = get_uid(current_user());
  	account_mem = !capable(CAP_IPC_LOCK);
  
  	if (account_mem) {
  		ret = io_account_mem(user,
  				ring_pages(p->sq_entries, p->cq_entries));
  		if (ret) {
  			free_uid(user);
  			return ret;
  		}
  	}
  
  	ctx = io_ring_ctx_alloc(p);
  	if (!ctx) {
  		if (account_mem)
  			io_unaccount_mem(user, ring_pages(p->sq_entries,
  								p->cq_entries));
  		free_uid(user);
  		return -ENOMEM;
  	}
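  	/* record whether the creating task is a 32-bit compat caller */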
  	ctx->compat = in_compat_syscall();
  	ctx->account_mem = account_mem;
  	ctx->user = user;
  	ctx->creds = get_current_cred();
  	if (!ctx->creds) {
  		ret = -ENOMEM;
  		goto err;
  	}
  	ret = io_allocate_scq_urings(ctx, p);
  	if (ret)
  		goto err;
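
  	/*
  	 * Start the async offload side: the workqueue used for punted
  	 * requests and, with IORING_SETUP_SQPOLL, the submission queue
  	 * polling thread.
  	 */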
  	ret = io_sq_offload_start(ctx, p);
  	if (ret)
  		goto err;
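
  	/*
  	 * Report the ring field offsets back to the application; it uses
  	 * these to locate heads, tails, masks and the arrays inside the
  	 * mmap'ed ring memory.
  	 */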
  	memset(&p->sq_off, 0, sizeof(p->sq_off));
  	p->sq_off.head = offsetof(struct io_rings, sq.head);
  	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
  	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
  	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
  	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
  	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
  	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
  
  	memset(&p->cq_off, 0, sizeof(p->cq_off));
  	p->cq_off.head = offsetof(struct io_rings, cq.head);
  	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
  	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
  	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
  	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
  	p->cq_off.cqes = offsetof(struct io_rings, cqes);
  	/*
  	 * Install ring fd as the very last thing, so we don't risk someone
  	 * having closed it before we finish setup
  	 */
  	ret = io_uring_get_fd(ctx);
  	if (ret < 0)
  		goto err;
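
  	/*
  	 * Advertise that the SQ and CQ rings share a single mapping: the
  	 * application can mmap IORING_OFF_SQ_RING once and locate the CQ
  	 * ring through the offsets returned above.
  	 */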
  	p->features = IORING_FEAT_SINGLE_MMAP;
  	return ret;
  err:
  	io_ring_ctx_wait_and_kill(ctx);
  	return ret;
  }
  
  /*
   * Sets up an io_uring context and returns the fd. The application asks for a
   * ring size; we return the actual sq/cq ring sizes (among other things) in the
   * params structure passed in.
   */
  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
  {
  	struct io_uring_params p;
  	long ret;
  	int i;
  
  	if (copy_from_user(&p, params, sizeof(p)))
  		return -EFAULT;
  	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
  		if (p.resv[i])
  			return -EINVAL;
  	}
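
  	/* reject setup flags this kernel does not know about */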
  	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
  			IORING_SETUP_SQ_AFF))
  		return -EINVAL;
  
  	ret = io_uring_create(entries, &p);
  	if (ret < 0)
  		return ret;
  
  	if (copy_to_user(params, &p, sizeof(p)))
  		return -EFAULT;
  
  	return ret;
  }
  
  SYSCALL_DEFINE2(io_uring_setup, u32, entries,
  		struct io_uring_params __user *, params)
  {
  	return io_uring_setup(entries, params);
  }
  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  			       void __user *arg, unsigned nr_args)
  	__releases(ctx->uring_lock)
  	__acquires(ctx->uring_lock)
  {
  	int ret;
  	/*
  	 * We're inside the ring mutex; if the ref is already dying, then
  	 * someone else killed the ctx or is already going through
  	 * io_uring_register().
  	 */
  	if (percpu_ref_is_dying(&ctx->refs))
  		return -ENXIO;
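
  	/*
  	 * Quiesce the ring for the duration of the registration: kill the
  	 * percpu ref and wait for all existing users to drop their
  	 * references; the ref is re-initialised once the opcode is handled.
  	 */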
  	percpu_ref_kill(&ctx->refs);
  
  	/*
  	 * Drop uring mutex before waiting for references to exit. If another
  	 * thread is currently inside io_uring_enter() it might need to grab
  	 * the uring_lock to make progress. If we hold it here across the drain
  	 * wait, then we can deadlock. It's safe to drop the mutex here, since
  	 * no new references will come in after we've killed the percpu ref.
  	 */
  	mutex_unlock(&ctx->uring_lock);
  	wait_for_completion(&ctx->ctx_done);
  	mutex_lock(&ctx->uring_lock);
  
  	switch (opcode) {
  	case IORING_REGISTER_BUFFERS:
  		ret = io_sqe_buffer_register(ctx, arg, nr_args);
  		break;
  	case IORING_UNREGISTER_BUFFERS:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_sqe_buffer_unregister(ctx);
  		break;
  	case IORING_REGISTER_FILES:
  		ret = io_sqe_files_register(ctx, arg, nr_args);
  		break;
  	case IORING_UNREGISTER_FILES:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_sqe_files_unregister(ctx);
  		break;
  	case IORING_REGISTER_EVENTFD:
  		ret = -EINVAL;
  		if (nr_args != 1)
  			break;
  		ret = io_eventfd_register(ctx, arg);
  		break;
  	case IORING_UNREGISTER_EVENTFD:
  		ret = -EINVAL;
  		if (arg || nr_args)
  			break;
  		ret = io_eventfd_unregister(ctx);
  		break;
  	default:
  		ret = -EINVAL;
  		break;
  	}
  
  	/* bring the ctx back to life */
  	reinit_completion(&ctx->ctx_done);
  	percpu_ref_reinit(&ctx->refs);
  	return ret;
  }
  
  SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
  		void __user *, arg, unsigned int, nr_args)
  {
  	struct io_ring_ctx *ctx;
  	long ret = -EBADF;
  	struct fd f;
  
  	f = fdget(fd);
  	if (!f.file)
  		return -EBADF;
  
  	ret = -EOPNOTSUPP;
  	if (f.file->f_op != &io_uring_fops)
  		goto out_fput;
  
  	ctx = f.file->private_data;
  
  	mutex_lock(&ctx->uring_lock);
  	ret = __io_uring_register(ctx, opcode, arg, nr_args);
  	mutex_unlock(&ctx->uring_lock);
  out_fput:
  	fdput(f);
  	return ret;
  }
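
  /* boot-time init: create the slab cache that backs io_kiocb allocations */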
  static int __init io_uring_init(void)
  {
  	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
  	return 0;
  }
  __initcall(io_uring_init);