Blame view

fs/select.c 34.8 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
  /*
   * This file contains the procedures for the handling of select and poll
   *
   * Created for Linux based loosely upon Mathius Lattner's minix
   * patches by Peter MacDonald. Heavily edited by Linus.
   *
   *  4 February 1994
   *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
   *     flag set in its personality we do *not* modify the given timeout
   *     parameter to reflect time remaining.
   *
   *  24 January 2000
   *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
   *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
   */
022a16924   Milind Arun Choudhary   ROUND_UP macro cl...
17
  #include <linux/kernel.h>
3f07c0144   Ingo Molnar   sched/headers: Pr...
18
19
  #include <linux/sched/signal.h>
  #include <linux/sched/rt.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
20
  #include <linux/syscalls.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
21
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
25
  #include <linux/poll.h>
  #include <linux/personality.h> /* for STICKY_TIMEOUTS */
  #include <linux/file.h>
9f3acc314   Al Viro   [PATCH] split lin...
26
  #include <linux/fdtable.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
27
  #include <linux/fs.h>
b835996f6   Dipankar Sarma   [PATCH] files: lo...
28
  #include <linux/rcupdate.h>
8ff3e8e85   Arjan van de Ven   select: switch se...
29
  #include <linux/hrtimer.h>
9745cdb36   Colin Cross   select: use freez...
30
  #include <linux/freezer.h>
076bb0c82   Eliezer Tamir   net: rename inclu...
31
  #include <net/busy_poll.h>
2d19309cf   Vlastimil Babka   fs/select: add vm...
32
  #include <linux/vmalloc.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
33

7c0f6ba68   Linus Torvalds   Replace <asm/uacc...
34
  #include <linux/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
35

90d6e24a3   Arjan van de Ven   hrtimer: make sel...
36
37
38
39
40
41
42
43
44
45
46
47
  
  /*
   * Estimate expected accuracy in ns from a timeval.
   *
   * After quite a bit of churning around, we've settled on
   * a simple thing of taking 0.1% of the timeout as the
   * slack, with a cap of 100 msec.
   * "nice" tasks get a 0.5% slack instead.
   *
   * Consider this comment an open invitation to come up with even
   * better solutions..
   */
5ae87e79e   Guillaume Knispel   poll/select: avoi...
48
  #define MAX_SLACK	(100 * NSEC_PER_MSEC)
766b9f928   Deepa Dinamani   fs: poll/select/r...
49
  static long __estimate_accuracy(struct timespec64 *tv)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
50
  {
96d2ab484   Arjan van de Ven   hrtimer: fix sign...
51
  	long slack;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
52
  	int divfactor = 1000;
5ae87e79e   Guillaume Knispel   poll/select: avoi...
53
54
  	if (tv->tv_sec < 0)
  		return 0;
4ce105d30   Arjan van de Ven   hrtimer: incorpor...
55
  	if (task_nice(current) > 0)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
56
  		divfactor = divfactor / 5;
5ae87e79e   Guillaume Knispel   poll/select: avoi...
57
58
  	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
  		return MAX_SLACK;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
59
60
  	slack = tv->tv_nsec / divfactor;
  	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
5ae87e79e   Guillaume Knispel   poll/select: avoi...
61
62
  	if (slack > MAX_SLACK)
  		return MAX_SLACK;
96d2ab484   Arjan van de Ven   hrtimer: fix sign...
63

90d6e24a3   Arjan van de Ven   hrtimer: make sel...
64
65
  	return slack;
  }
766b9f928   Deepa Dinamani   fs: poll/select/r...
66
  u64 select_estimate_accuracy(struct timespec64 *tv)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
67
  {
da8b44d5a   John Stultz   timer: convert ti...
68
  	u64 ret;
766b9f928   Deepa Dinamani   fs: poll/select/r...
69
  	struct timespec64 now;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
70
71
72
73
  
  	/*
  	 * Realtime tasks get a slack of 0 for obvious reasons.
  	 */
4ce105d30   Arjan van de Ven   hrtimer: incorpor...
74
  	if (rt_task(current))
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
75
  		return 0;
766b9f928   Deepa Dinamani   fs: poll/select/r...
76
77
  	ktime_get_ts64(&now);
  	now = timespec64_sub(*tv, now);
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
78
79
80
81
82
  	ret = __estimate_accuracy(&now);
  	if (ret < current->timer_slack_ns)
  		return current->timer_slack_ns;
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  /*
   * One page worth of overflow poll entries, used once the inline entries
   * of a poll_wqueues are exhausted.
   *
   * @next:    previously allocated page (pages form a singly linked list)
   * @entry:   next unused slot inside @entries
   * @entries: entry storage; a flexible array member that occupies the
   *           remainder of the page allocated for this structure
   */
  struct poll_table_page {
  	struct poll_table_page * next;
  	struct poll_table_entry * entry;
  	struct poll_table_entry entries[];
  };
  
  /* True when one more entry would run past the end of @table's page. */
  #define POLL_TABLE_FULL(table) \
  	((unsigned long)((table)->entry + 1) > \
  	 (unsigned long)(table) + PAGE_SIZE)
  
  /*
   * Ok, Peter made a complicated, but straightforward multiple_wait() function.
   * I have rewritten this, taking some shortcuts: This code may not be easy to
   * follow, but it should be free of race-conditions, and it's practical. If you
   * understand what I'm doing here, then you understand how the linux
   * sleep/wakeup mechanism works.
   *
   * Two very simple procedures, poll_wait() and poll_freewait() make all the
   * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
   * as all select/poll functions have to call it to add an entry to the
   * poll table.
   */
75c96f858   Adrian Bunk   [PATCH] make some...
104
105
  /* Defined below; declared here because poll_initwait() refers to it. */
  static void __pollwait(struct file *filp,
  		       wait_queue_head_t *wait_address,
  		       poll_table *p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
106
107
108
109
  
  void poll_initwait(struct poll_wqueues *pwq)
  {
  	init_poll_funcptr(&pwq->pt, __pollwait);
5f820f648   Tejun Heo   poll: allow f_op-...
110
  	pwq->polling_task = current;
b2add73db   Guillaume Knispel   poll/select: init...
111
  	pwq->triggered = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
112
113
  	pwq->error = 0;
  	pwq->table = NULL;
70674f95c   Andi Kleen   [PATCH] Optimize ...
114
  	pwq->inline_index = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
115
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
116
  EXPORT_SYMBOL(poll_initwait);
70674f95c   Andi Kleen   [PATCH] Optimize ...
117
118
  static void free_poll_entry(struct poll_table_entry *entry)
  {
ccf6780dc   WANG Cong   Style fix in fs/s...
119
  	remove_wait_queue(entry->wait_address, &entry->wait);
70674f95c   Andi Kleen   [PATCH] Optimize ...
120
121
  	fput(entry->filp);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
122
123
124
  void poll_freewait(struct poll_wqueues *pwq)
  {
  	struct poll_table_page * p = pwq->table;
70674f95c   Andi Kleen   [PATCH] Optimize ...
125
126
127
  	int i;
  	for (i = 0; i < pwq->inline_index; i++)
  		free_poll_entry(pwq->inline_entries + i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
128
129
130
131
132
133
134
  	while (p) {
  		struct poll_table_entry * entry;
  		struct poll_table_page *old;
  
  		entry = p->entry;
  		do {
  			entry--;
70674f95c   Andi Kleen   [PATCH] Optimize ...
135
  			free_poll_entry(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
136
137
138
139
140
141
  		} while (entry > p->entries);
  		old = p;
  		p = p->next;
  		free_page((unsigned long) old);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
142
  EXPORT_SYMBOL(poll_freewait);
5f820f648   Tejun Heo   poll: allow f_op-...
143
  static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
145
  	struct poll_table_page *table = p->table;
70674f95c   Andi Kleen   [PATCH] Optimize ...
146
147
  	if (p->inline_index < N_INLINE_POLL_ENTRIES)
  		return p->inline_entries + p->inline_index++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
148
149
150
151
152
153
  	if (!table || POLL_TABLE_FULL(table)) {
  		struct poll_table_page *new_table;
  
  		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
  		if (!new_table) {
  			p->error = -ENOMEM;
70674f95c   Andi Kleen   [PATCH] Optimize ...
154
  			return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155
156
157
158
159
160
  		}
  		new_table->entry = new_table->entries;
  		new_table->next = table;
  		p->table = new_table;
  		table = new_table;
  	}
70674f95c   Andi Kleen   [PATCH] Optimize ...
161
162
  	return table->entry++;
  }
ac6424b98   Ingo Molnar   sched/wait: Renam...
163
  /*
   * Mark the poll_wqueues behind @wait as triggered and wake its polling
   * task.  Returns whatever default_wake_function() returns.
   */
  static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
  {
  	struct poll_wqueues *waiter = wait->private;
  	DECLARE_WAITQUEUE(proxy_wait, waiter->polling_task);
  	int woken;

  	/*
  	 * Although this function is called under waitqueue lock, LOCK
  	 * doesn't imply write barrier and the users expect write
  	 * barrier semantics on wakeup functions.  The following
  	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
  	 * and is paired with smp_store_mb() in poll_schedule_timeout.
  	 */
  	smp_wmb();
  	waiter->triggered = 1;

  	/*
  	 * Perform the default wake up operation using a dummy
  	 * waitqueue.
  	 *
  	 * TODO: This is hacky but there currently is no interface to
  	 * pass in @sync.  @sync is scheduled to be removed and once
  	 * that happens, wake_up_process() can be used directly.
  	 */
  	woken = default_wake_function(&proxy_wait, mode, sync, key);

  	return woken;
  }
ac6424b98   Ingo Molnar   sched/wait: Renam...
188
  /*
   * Wake function installed on every entry by __pollwait().
   *
   * A wake-up carrying a non-NULL @key is ignored (returns 0) unless the key
   * shares at least one bit with the events recorded in the entry; a NULL
   * @key always wakes.
   */
  static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
  {
  	struct poll_table_entry *entry =
  		container_of(wait, struct poll_table_entry, wait);

  	if (!key || ((unsigned long)key & entry->key))
  		return __pollwake(wait, mode, sync, key);

  	return 0;
  }
70674f95c   Andi Kleen   [PATCH] Optimize ...
197
198
199
200
  /*
   * Add a new entry: take a reference on @filp, remember the wait queue and
   * the event key currently set in @p, and queue the entry on @wait_address
   * with pollwake() as its wake function.
   *
   * Does nothing if no entry can be obtained (poll_get_entry() has then
   * recorded the error in the poll_wqueues).
   */
  static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
  				poll_table *p)
  {
  	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
  	struct poll_table_entry *entry = poll_get_entry(pwq);

  	if (entry) {
  		entry->filp = get_file(filp);
  		entry->wait_address = wait_address;
  		entry->key = p->_key;

  		init_waitqueue_func_entry(&entry->wait, pollwake);
  		entry->wait.private = pwq;

  		add_wait_queue(wait_address, &entry->wait);
  	}
  }
5f820f648   Tejun Heo   poll: allow f_op-...
212
213
214
215
216
217
218
  /*
   * Sleep in @state until woken, or until @expires (an absolute time, with
   * @slack of leeway) passes.  If @pwq was already triggered the sleep is
   * skipped and -EINTR is returned; otherwise the result of
   * schedule_hrtimeout_range() is returned.  The triggered flag is cleared
   * before returning.
   */
  int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
  			  ktime_t *expires, unsigned long slack)
  {
  	int rc;

  	set_current_state(state);
  	if (pwq->triggered)
  		rc = -EINTR;
  	else
  		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
  	__set_current_state(TASK_RUNNING);

  	/*
  	 * Prepare for the next iteration.
  	 *
  	 * The following smp_store_mb() serves two purposes.  First, it's
  	 * the counterpart rmb of the wmb in pollwake() such that data
  	 * written before wake up is always visible after wake up.
  	 * Second, the full barrier guarantees that triggered clearing
  	 * doesn't pass event check of the next iteration.  Note that
  	 * this problem doesn't exist for the first iteration as
  	 * add_wait_queue() has full barrier semantics.
  	 */
  	smp_store_mb(pwq->triggered, 0);

  	return rc;
  }
  EXPORT_SYMBOL(poll_schedule_timeout);
b773ad40a   Thomas Gleixner   select: add poll_...
238
239
  /**
   * poll_select_set_timeout - helper function to setup the timeout value
766b9f928   Deepa Dinamani   fs: poll/select/r...
240
   * @to:		pointer to timespec64 variable for the final timeout
b773ad40a   Thomas Gleixner   select: add poll_...
241
242
243
244
245
246
247
248
   * @sec:	seconds (from user space)
   * @nsec:	nanoseconds (from user space)
   *
   * Note, we do not use a timespec for the user space value here, That
   * way we can use the function for timeval and compat interfaces as well.
   *
   * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
   */
766b9f928   Deepa Dinamani   fs: poll/select/r...
249
  int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
b773ad40a   Thomas Gleixner   select: add poll_...
250
  {
766b9f928   Deepa Dinamani   fs: poll/select/r...
251
  	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
b773ad40a   Thomas Gleixner   select: add poll_...
252

766b9f928   Deepa Dinamani   fs: poll/select/r...
253
  	if (!timespec64_valid(&ts))
b773ad40a   Thomas Gleixner   select: add poll_...
254
255
256
257
258
259
  		return -EINVAL;
  
  	/* Optimize for the zero timeout value here */
  	if (!sec && !nsec) {
  		to->tv_sec = to->tv_nsec = 0;
  	} else {
766b9f928   Deepa Dinamani   fs: poll/select/r...
260
261
  		ktime_get_ts64(to);
  		*to = timespec64_add_safe(*to, ts);
b773ad40a   Thomas Gleixner   select: add poll_...
262
263
264
  	}
  	return 0;
  }
766b9f928   Deepa Dinamani   fs: poll/select/r...
265
266
  /*
   * Write the time left until @end_time back to the user timeout at @p, as
   * a struct timeval when @timeval is non-zero and a struct timespec
   * otherwise, and return the (possibly adjusted) syscall result @ret.
   *
   * Nothing is written when @p is NULL, when the task has STICKY_TIMEOUTS
   * set in its personality, or when @end_time is the zero timeout.  When
   * the timeout is left untouched because of STICKY_TIMEOUTS or because the
   * copy to user space failed, -ERESTARTNOHAND is turned into -EINTR.
   */
  static int poll_select_copy_remaining(struct timespec64 *end_time,
  				      void __user *p,
  				      int timeval, int ret)
  {
  	struct timespec64 rts64;
  	struct timespec rts;
  	struct timeval rtv;
  	unsigned long uncopied;

  	if (!p)
  		return ret;

  	if (!(current->personality & STICKY_TIMEOUTS)) {
  		/* No update for zero timeout */
  		if (!end_time->tv_sec && !end_time->tv_nsec)
  			return ret;

  		ktime_get_ts64(&rts64);
  		rts64 = timespec64_sub(*end_time, rts64);
  		if (rts64.tv_sec < 0) {
  			/* Already past the deadline: report zero remaining. */
  			rts64.tv_nsec = 0;
  			rts64.tv_sec = 0;
  		}

  		if (timeval) {
  			/* Clear any padding so no stack bytes reach user space. */
  			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
  				memset(&rtv, 0, sizeof(rtv));
  			rtv.tv_sec = rts64.tv_sec;
  			rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
  			uncopied = copy_to_user(p, &rtv, sizeof(rtv));
  		} else {
  			rts = timespec64_to_timespec(rts64);
  			uncopied = copy_to_user(p, &rts, sizeof(rts));
  		}

  		if (!uncopied)
  			return ret;
  	}

  	/*
  	 * If an application puts its timeval in read-only memory, we
  	 * don't want the Linux-specific update to the timeval to
  	 * cause a fault after the select has completed
  	 * successfully. However, because we're not updating the
  	 * timeval, we can't restart the system call.
  	 */
  	return ret == -ERESTARTNOHAND ? -EINTR : ret;
  }
e99ca56ce   Al Viro   move compat selec...
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
  /*
   * Scalable version of the fd_set: three request bitmaps and the three
   * result bitmaps that mirror them.
   */

  typedef struct {
  	unsigned long *in;	/* requested: readable */
  	unsigned long *out;	/* requested: writable */
  	unsigned long *ex;	/* requested: exceptional */
  	unsigned long *res_in;	/* result for in */
  	unsigned long *res_out;	/* result for out */
  	unsigned long *res_ex;	/* result for ex */
  } fd_set_bits;
  
  /*
   * Bitmap sizing helpers: bits in one long, longs needed to hold "nr" bits
   * (rounded up), and the byte size of that many longs.
   */
  #define FDS_BITPERLONG	(sizeof(long) * 8)
  #define FDS_LONGS(nr)	(((nr) + (FDS_BITPERLONG - 1)) / FDS_BITPERLONG)
  #define FDS_BYTES(nr)	(sizeof(long) * FDS_LONGS(nr))
  
  /*
   * Fetch one user fd_set covering @nr descriptors into the kernel bitmap
   * @fdset.
   *
   * The size is rounded up to whole longs, so user-mode fd_sets are accessed
   * in "unsigned long" units and may be long-aligned.  A NULL @ufdset stands
   * for an empty set and yields an all-zero bitmap.
   *
   * Returns 0 on success, -EFAULT if the user buffer cannot be read.
   */
  static inline
  int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
  {
  	const unsigned long bytes = FDS_BYTES(nr);

  	if (!ufdset) {
  		memset(fdset, 0, bytes);
  		return 0;
  	}

  	if (copy_from_user(fdset, ufdset, bytes))
  		return -EFAULT;
  	return 0;
  }
  
  /*
   * Copy the result bitmap @fdset for @nr descriptors out to @ufdset.
   * A NULL @ufdset is skipped.  Returns the value of __copy_to_user()
   * (non-zero on failure), or 0 when there was nothing to copy.
   *
   * NOTE(review): __copy_to_user() is used here; presumably the range was
   * already validated when get_fd_set() read the same buffer - confirm.
   */
  static inline unsigned long __must_check
  set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
  {
  	if (!ufdset)
  		return 0;

  	return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
  }
  
  /* Clear the kernel bitmap @fdset, sized for @nr descriptors. */
  static inline
  void zero_fd_set(unsigned long nr, unsigned long *fdset)
  {
  	const unsigned long bytes = FDS_BYTES(nr);

  	memset(fdset, 0, bytes);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
360
361
362
363
364
365
366
367
368
369
370
  /*
   * Address of word @n in the in/out/ex request bitmaps of @fds, and the
   * union of the three request words at index @n.
   *
   * Both arguments are parenthesized so that expressions such as "&set" or
   * "cond ? a : b" can be passed safely.
   */
  #define FDS_IN(fds, n)		((fds)->in + (n))
  #define FDS_OUT(fds, n)		((fds)->out + (n))
  #define FDS_EX(fds, n)		((fds)->ex + (n))

  #define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
  
  static int max_select_fd(unsigned long n, fd_set_bits *fds)
  {
  	unsigned long *open_fds;
  	unsigned long set;
  	int max;
badf16621   Dipankar Sarma   [PATCH] files: br...
371
  	struct fdtable *fdt;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
372
373
  
  	/* handle last in-complete long-word first */
8ded2bbc1   Josh Boyer   posix_types.h: Cl...
374
375
  	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
  	n /= BITS_PER_LONG;
badf16621   Dipankar Sarma   [PATCH] files: br...
376
  	fdt = files_fdtable(current->files);
1fd36adcd   David Howells   Replace the fd_se...
377
  	open_fds = fdt->open_fds + n;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
  	max = 0;
  	if (set) {
  		set &= BITS(fds, n);
  		if (set) {
  			if (!(set & ~*open_fds))
  				goto get_max;
  			return -EBADF;
  		}
  	}
  	while (n) {
  		open_fds--;
  		n--;
  		set = BITS(fds, n);
  		if (!set)
  			continue;
  		if (set & ~*open_fds)
  			return -EBADF;
  		if (max)
  			continue;
  get_max:
  		do {
  			max++;
  			set >>= 1;
  		} while (set);
8ded2bbc1   Josh Boyer   posix_types.h: Cl...
402
  		max += n * BITS_PER_LONG;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
403
404
405
406
  	}
  
  	return max;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
407
408
409
  #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
  #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
  #define POLLEX_SET (POLLPRI)
4938d7e02   Eric Dumazet   poll: avoid extra...
410
  static inline void wait_key_set(poll_table *wait, unsigned long in,
2d48d67fa   Eliezer Tamir   net: poll/select ...
411
412
  				unsigned long out, unsigned long bit,
  				unsigned int ll_flag)
4938d7e02   Eric Dumazet   poll: avoid extra...
413
  {
2d48d67fa   Eliezer Tamir   net: poll/select ...
414
  	wait->_key = POLLEX_SET | ll_flag;
626cf2366   Hans Verkuil   poll: add poll_re...
415
416
417
418
  	if (in & bit)
  		wait->_key |= POLLIN_SET;
  	if (out & bit)
  		wait->_key |= POLLOUT_SET;
4938d7e02   Eric Dumazet   poll: avoid extra...
419
  }
e99ca56ce   Al Viro   move compat selec...
420
  /*
   * Core of select(): poll every descriptor requested in @fds (below @n)
   * until at least one is ready, the timeout @end_time expires, a signal is
   * pending or an error is recorded in the poll table.
   *
   * @end_time is an absolute time; NULL means no timeout and an all-zero
   * value means "do not wait at all".
   *
   * Returns the number of ready (descriptor, set) pairs written to the
   * result bitmaps, 0 if nothing became ready, or a negative error code
   * (-EBADF from max_select_fd(), or the poll table error).
   */
  static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
  {
  	ktime_t expire, *to = NULL;
  	struct poll_wqueues table;
  	poll_table *wait;
  	int retval, i, timed_out = 0;
  	u64 slack = 0;
  	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
  	unsigned long busy_start = 0;

  	rcu_read_lock();
  	retval = max_select_fd(n, fds);
  	rcu_read_unlock();

  	if (retval < 0)
  		return retval;
  	n = retval;

  	poll_initwait(&table);
  	wait = &table.pt;

  	/* Zero timeout: a single pass without queueing on any wait queue. */
  	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
  		wait->_qproc = NULL;
  		timed_out = 1;
  	}

  	if (end_time && !timed_out)
  		slack = select_estimate_accuracy(end_time);

  	retval = 0;
  	for (;;) {
  		bool can_busy_loop = false;
  		unsigned long word;

  		/* One bitmap word (BITS_PER_LONG descriptors) per iteration. */
  		for (i = 0, word = 0; i < n; ++word) {
  			const unsigned long in = fds->in[word];
  			const unsigned long out = fds->out[word];
  			const unsigned long ex = fds->ex[word];
  			const unsigned long all_bits = in | out | ex;
  			unsigned long res_in = 0, res_out = 0, res_ex = 0;
  			unsigned long bit = 1, j;

  			if (!all_bits) {
  				i += BITS_PER_LONG;
  				continue;
  			}

  			for (j = 0; j < BITS_PER_LONG && i < n;
  			     ++j, ++i, bit <<= 1) {
  				const struct file_operations *f_op;
  				unsigned long mask;
  				struct fd f;

  				if (!(bit & all_bits))
  					continue;

  				f = fdget(i);
  				if (!f.file)
  					continue;

  				f_op = f.file->f_op;
  				if (f_op->poll) {
  					wait_key_set(wait, in, out,
  						     bit, busy_flag);
  					mask = (*f_op->poll)(f.file, wait);
  				} else {
  					mask = DEFAULT_POLLMASK;
  				}
  				fdput(f);

  				/* Once anything is ready, stop queueing. */
  				if ((mask & POLLIN_SET) && (in & bit)) {
  					res_in |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}
  				if ((mask & POLLOUT_SET) && (out & bit)) {
  					res_out |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}
  				if ((mask & POLLEX_SET) && (ex & bit)) {
  					res_ex |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}

  				if (retval) {
  					/* got something, stop busy polling */
  					can_busy_loop = false;
  					busy_flag = 0;
  				} else if (busy_flag & mask) {
  					/*
  					 * only remember a returned
  					 * POLL_BUSY_LOOP if we asked for it
  					 */
  					can_busy_loop = true;
  				}
  			}

  			if (res_in)
  				fds->res_in[word] = res_in;
  			if (res_out)
  				fds->res_out[word] = res_out;
  			if (res_ex)
  				fds->res_ex[word] = res_ex;
  			cond_resched();
  		}

  		/* Later passes must not queue the entries a second time. */
  		wait->_qproc = NULL;
  		if (retval || timed_out || signal_pending(current))
  			break;
  		if (table.error) {
  			retval = table.error;
  			break;
  		}

  		/* only if found POLL_BUSY_LOOP sockets && not out of time */
  		if (can_busy_loop && !need_resched()) {
  			if (!busy_start) {
  				busy_start = busy_loop_current_time();
  				continue;
  			}
  			if (!busy_loop_timeout(busy_start))
  				continue;
  		}
  		busy_flag = 0;

  		/*
  		 * If this is the first loop and we have a timeout
  		 * given, then we convert to ktime_t and set the to
  		 * pointer to the expiry value.
  		 */
  		if (end_time && !to) {
  			expire = timespec64_to_ktime(*end_time);
  			to = &expire;
  		}

  		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
  					   to, slack))
  			timed_out = 1;
  	}

  	poll_freewait(&table);

  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
555
556
557
558
559
560
561
562
  /*
   * We can actually return ERESTARTSYS instead of EINTR, but I'd
   * like to be certain this leads to no problems. So I return
   * EINTR just for safety.
   *
   * Update: ERESTARTSYS breaks at least the xview clock binary, so
   * I'm trying ERESTARTNOHAND which restart only when you want to.
   */
a2dcb44c3   Al Viro   [PATCH] make osf_...
563
  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
766b9f928   Deepa Dinamani   fs: poll/select/r...
564
  			   fd_set __user *exp, struct timespec64 *end_time)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
565
566
  {
  	fd_set_bits fds;
29ff2db55   Andrew Morton   [PATCH] select() ...
567
  	void *bits;
bbea9f696   Vadim Lobanov   [PATCH] fdtable: ...
568
  	int ret, max_fds;
2d19309cf   Vlastimil Babka   fs/select: add vm...
569
  	size_t size, alloc_size;
badf16621   Dipankar Sarma   [PATCH] files: br...
570
  	struct fdtable *fdt;
70674f95c   Andi Kleen   [PATCH] Optimize ...
571
  	/* Allocate small arguments on the stack to save memory and be faster */
30c14e40e   Jes Sorensen   [PATCH] avoid una...
572
  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
573

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
574
575
576
  	ret = -EINVAL;
  	if (n < 0)
  		goto out_nofds;
bbea9f696   Vadim Lobanov   [PATCH] fdtable: ...
577
  	/* max_fds can increase, so grab it once to avoid race */
b835996f6   Dipankar Sarma   [PATCH] files: lo...
578
  	rcu_read_lock();
badf16621   Dipankar Sarma   [PATCH] files: br...
579
  	fdt = files_fdtable(current->files);
bbea9f696   Vadim Lobanov   [PATCH] fdtable: ...
580
  	max_fds = fdt->max_fds;
b835996f6   Dipankar Sarma   [PATCH] files: lo...
581
  	rcu_read_unlock();
bbea9f696   Vadim Lobanov   [PATCH] fdtable: ...
582
583
  	if (n > max_fds)
  		n = max_fds;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
584
585
586
587
588
589
  
  	/*
  	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
  	 * since we used fdset we need to allocate memory in units of
  	 * long-words. 
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
590
  	size = FDS_BYTES(n);
b04eb6aa0   Mitchell Blank Jr   [PATCH] select: d...
591
592
593
594
  	bits = stack_fds;
  	if (size > sizeof(stack_fds) / 6) {
  		/* Not enough space in on-stack array; must use kmalloc */
  		ret = -ENOMEM;
2d19309cf   Vlastimil Babka   fs/select: add vm...
595
596
597
598
  		if (size > (SIZE_MAX / 6))
  			goto out_nofds;
  
  		alloc_size = 6 * size;
752ade68c   Michal Hocko   treewide: use kv[...
599
  		bits = kvmalloc(alloc_size, GFP_KERNEL);
b04eb6aa0   Mitchell Blank Jr   [PATCH] select: d...
600
601
602
  		if (!bits)
  			goto out_nofds;
  	}
29ff2db55   Andrew Morton   [PATCH] select() ...
603
604
605
606
607
608
  	fds.in      = bits;
  	fds.out     = bits +   size;
  	fds.ex      = bits + 2*size;
  	fds.res_in  = bits + 3*size;
  	fds.res_out = bits + 4*size;
  	fds.res_ex  = bits + 5*size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
609
610
611
612
613
614
615
616
  
  	if ((ret = get_fd_set(n, inp, fds.in)) ||
  	    (ret = get_fd_set(n, outp, fds.out)) ||
  	    (ret = get_fd_set(n, exp, fds.ex)))
  		goto out;
  	zero_fd_set(n, fds.res_in);
  	zero_fd_set(n, fds.res_out);
  	zero_fd_set(n, fds.res_ex);
8ff3e8e85   Arjan van de Ven   select: switch se...
617
  	ret = do_select(n, &fds, end_time);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
  
  	if (ret < 0)
  		goto out;
  	if (!ret) {
  		ret = -ERESTARTNOHAND;
  		if (signal_pending(current))
  			goto out;
  		ret = 0;
  	}
  
  	if (set_fd_set(n, inp, fds.res_in) ||
  	    set_fd_set(n, outp, fds.res_out) ||
  	    set_fd_set(n, exp, fds.res_ex))
  		ret = -EFAULT;
  
  out:
70674f95c   Andi Kleen   [PATCH] Optimize ...
634
  	if (bits != stack_fds)
2d19309cf   Vlastimil Babka   fs/select: add vm...
635
  		kvfree(bits);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
636
637
638
  out_nofds:
  	return ret;
  }
5a8a82b1d   Heiko Carstens   [CVE-2009-0029] S...
639
640
  /*
   * select() system call.  A non-NULL @tvp is read as a struct timeval and
   * turned into an absolute end time (whole seconds contained in tv_usec
   * are carried into the seconds); after the wait, the remaining time is
   * written back through poll_select_copy_remaining().
   *
   * Returns -EFAULT if @tvp cannot be read, -EINVAL if the timeout is
   * rejected by poll_select_set_timeout(), otherwise the result of
   * core_sys_select() as adjusted by poll_select_copy_remaining().
   */
  SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
  		fd_set __user *, exp, struct timeval __user *, tvp)
  {
  	struct timespec64 end_time, *to = NULL;
  	struct timeval tv;
  	int ret;

  	if (tvp) {
  		time64_t sec;
  		long nsec;

  		if (copy_from_user(&tv, tvp, sizeof(tv)))
  			return -EFAULT;

  		sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
  		nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

  		to = &end_time;
  		if (poll_select_set_timeout(to, sec, nsec))
  			return -EINVAL;
  	}

  	ret = core_sys_select(n, inp, outp, exp, to);

  	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
  }
c9da9f212   Heiko Carstens   [CVE-2009-0029] M...
660
661
662
  /*
   * Common body of pselect6: like select, but the timeout is a timespec and
   * an optional signal mask is installed for the duration of the call.
   *
   * Returns -EFAULT if tsp or sigmask cannot be read, -EINVAL for a timeout
   * rejected by poll_select_set_timeout() or a sigsetsize other than
   * sizeof(sigset_t), else the result of poll_select_copy_remaining().
   */
  static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
  		       fd_set __user *exp, struct timespec __user *tsp,
  		       const sigset_t __user *sigmask, size_t sigsetsize)
  {
  	sigset_t ksigmask, sigsaved;
  	struct timespec64 end_time;
  	struct timespec64 *to = NULL;
  	int ret;

  	if (tsp) {
  		struct timespec ts;
  		struct timespec64 ts64;

  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;
  		ts64 = timespec_to_timespec64(ts);

  		if (poll_select_set_timeout(&end_time, ts64.tv_sec, ts64.tv_nsec))
  			return -EINVAL;
  		to = &end_time;
  	}

  	if (sigmask) {
  		/* XXX: Don't preclude handling different sized sigset_t's.  */
  		if (sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
  			return -EFAULT;

  		/* SIGKILL and SIGSTOP are always removed from the new mask */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = core_sys_select(n, inp, outp, exp, to);
  	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

  	/* Nothing to undo when the caller supplied no mask. */
  	if (!sigmask)
  		return ret;

  	if (ret != -ERESTARTNOHAND) {
  		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  		return ret;
  	}

  	/*
  	 * Don't restore the signal mask yet. Let do_signal() deliver
  	 * the signal on the way back to userspace, before the signal
  	 * mask is restored.
  	 */
  	memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
  	set_restore_sigmask();

  	return ret;
  }
  
  /*
   * Most architectures can't handle 7-argument syscalls. So we provide a
   * 6-argument version where the sixth argument is a pointer to a structure
   * which has a pointer to the sigset_t itself followed by a size_t containing
   * the sigset size.
   */
d4e82042c   Heiko Carstens   [CVE-2009-0029] S...
715
716
717
  SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
  		fd_set __user *, exp, struct timespec __user *, tsp,
  		void __user *, sig)
9f72949f6   David Woodhouse   [PATCH] Add psele...
718
719
720
721
722
723
  {
  	size_t sigsetsize = 0;
  	sigset_t __user *up = NULL;
  
  	if (sig) {
  		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
e110ab94e   Al Viro   [PATCH] fix __use...
724
  		    || __get_user(up, (sigset_t __user * __user *)sig)
9f72949f6   David Woodhouse   [PATCH] Add psele...
725
  		    || __get_user(sigsetsize,
e110ab94e   Al Viro   [PATCH] fix __use...
726
  				(size_t __user *)(sig+sizeof(void *))))
9f72949f6   David Woodhouse   [PATCH] Add psele...
727
728
  			return -EFAULT;
  	}
c9da9f212   Heiko Carstens   [CVE-2009-0029] M...
729
  	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
9f72949f6   David Woodhouse   [PATCH] Add psele...
730
  }
9f72949f6   David Woodhouse   [PATCH] Add psele...
731

5d0e52830   Christoph Hellwig   Add generic sys_o...
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
  #ifdef __ARCH_WANT_SYS_OLD_SELECT
  /*
   * Argument block for the old_select syscall: the five select arguments
   * packed into one user-space structure.
   */
  struct sel_arg_struct {
  	unsigned long n;
  	fd_set __user *inp;
  	fd_set __user *outp;
  	fd_set __user *exp;
  	struct timeval __user *tvp;
  };

  /*
   * old_select: copy the argument block in from user space and forward its
   * members to sys_select().  Returns -EFAULT if the block cannot be read.
   */
  SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
  {
  	struct sel_arg_struct args;

  	if (copy_from_user(&args, arg, sizeof(args)))
  		return -EFAULT;

  	return sys_select(args.n, args.inp, args.outp, args.exp, args.tvp);
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
748
749
750
751
752
753
754
  /*
   * One chunk of a singly linked list of pollfd entries.
   *
   * @next:    following chunk, or NULL for the last one
   * @len:     number of valid elements in @entries
   * @entries: the pollfd records themselves; a C99 flexible array member,
   *           so the storage is whatever the allocator reserved past the
   *           header (do_sys_poll() sizes it as
   *           sizeof(struct poll_list) + len * sizeof(struct pollfd)).
   */
  struct poll_list {
  	struct poll_list *next;
  	int len;
  	struct pollfd entries[];
  };

  /* How many pollfd entries fit in one page after the poll_list header. */
  #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
755
756
757
758
759
  /*
   * Fish for pollable events on the pollfd->fd file descriptor. We're only
   * interested in events matching the pollfd->events mask, and the result
   * matching that mask is both recorded in pollfd->revents and returned. The
   * pwait poll_table will be used by the fd-provided poll handler for waiting,
626cf2366   Hans Verkuil   poll: add poll_re...
760
   * if pwait->_qproc is non-NULL.
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
761
   */
2d48d67fa   Eliezer Tamir   net: poll/select ...
762
  static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
cbf55001b   Eliezer Tamir   net: rename low l...
763
764
  				     bool *can_busy_poll,
  				     unsigned int busy_flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
765
  {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
766
767
768
769
770
771
  	unsigned int mask;
  	int fd;
  
  	mask = 0;
  	fd = pollfd->fd;
  	if (fd >= 0) {
2903ff019   Al Viro   switch simple cas...
772
  		struct fd f = fdget(fd);
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
773
  		mask = POLLNVAL;
2903ff019   Al Viro   switch simple cas...
774
  		if (f.file) {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
775
  			mask = DEFAULT_POLLMASK;
72c2d5319   Al Viro   file->f_op is nev...
776
  			if (f.file->f_op->poll) {
626cf2366   Hans Verkuil   poll: add poll_re...
777
  				pwait->_key = pollfd->events|POLLERR|POLLHUP;
cbf55001b   Eliezer Tamir   net: rename low l...
778
  				pwait->_key |= busy_flag;
2903ff019   Al Viro   switch simple cas...
779
  				mask = f.file->f_op->poll(f.file, pwait);
cbf55001b   Eliezer Tamir   net: rename low l...
780
781
  				if (mask & busy_flag)
  					*can_busy_poll = true;
4938d7e02   Eric Dumazet   poll: avoid extra...
782
  			}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
783
784
  			/* Mask out unneeded events. */
  			mask &= pollfd->events | POLLERR | POLLHUP;
2903ff019   Al Viro   switch simple cas...
785
  			fdput(f);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
786
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
787
  	}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
788
789
790
  	pollfd->revents = mask;
  
  	return mask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
791
  }
ccec5ee30   Mateusz Guzik   poll: plug an unu...
792
  static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
766b9f928   Deepa Dinamani   fs: poll/select/r...
793
  		   struct timespec64 *end_time)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
794
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
795
  	poll_table* pt = &wait->pt;
8ff3e8e85   Arjan van de Ven   select: switch se...
796
797
  	ktime_t expire, *to = NULL;
  	int timed_out = 0, count = 0;
da8b44d5a   John Stultz   timer: convert ti...
798
  	u64 slack = 0;
cbf55001b   Eliezer Tamir   net: rename low l...
799
  	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
37056719b   Alexander Duyck   net: Track start ...
800
  	unsigned long busy_start = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
801

9f72949f6   David Woodhouse   [PATCH] Add psele...
802
  	/* Optimise the no-wait case */
8ff3e8e85   Arjan van de Ven   select: switch se...
803
  	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
626cf2366   Hans Verkuil   poll: add poll_re...
804
  		pt->_qproc = NULL;
8ff3e8e85   Arjan van de Ven   select: switch se...
805
806
  		timed_out = 1;
  	}
9bf084f70   Oleg Nesterov   do_poll: return -...
807

96d2ab484   Arjan van de Ven   hrtimer: fix sign...
808
  	if (end_time && !timed_out)
231f3d393   Andrew Morton   select: rename es...
809
  		slack = select_estimate_accuracy(end_time);
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
810

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
811
812
  	for (;;) {
  		struct poll_list *walk;
cbf55001b   Eliezer Tamir   net: rename low l...
813
  		bool can_busy_loop = false;
9f72949f6   David Woodhouse   [PATCH] Add psele...
814

4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
815
816
817
818
819
820
821
822
  		for (walk = list; walk != NULL; walk = walk->next) {
  			struct pollfd * pfd, * pfd_end;
  
  			pfd = walk->entries;
  			pfd_end = pfd + walk->len;
  			for (; pfd != pfd_end; pfd++) {
  				/*
  				 * Fish for events. If we found one, record it
626cf2366   Hans Verkuil   poll: add poll_re...
823
  				 * and kill poll_table->_qproc, so we don't
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
824
825
826
827
  				 * needlessly register any other waiters after
  				 * this. They'll get immediately deregistered
  				 * when we break out and return.
  				 */
cbf55001b   Eliezer Tamir   net: rename low l...
828
829
  				if (do_pollfd(pfd, pt, &can_busy_loop,
  					      busy_flag)) {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
830
  					count++;
626cf2366   Hans Verkuil   poll: add poll_re...
831
  					pt->_qproc = NULL;
cbf55001b   Eliezer Tamir   net: rename low l...
832
833
834
  					/* found something, stop busy polling */
  					busy_flag = 0;
  					can_busy_loop = false;
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
835
836
  				}
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
837
  		}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
838
839
  		/*
  		 * All waiters have already been registered, so don't provide
626cf2366   Hans Verkuil   poll: add poll_re...
840
  		 * a poll_table->_qproc to them on the next loop iteration.
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
841
  		 */
626cf2366   Hans Verkuil   poll: add poll_re...
842
  		pt->_qproc = NULL;
9bf084f70   Oleg Nesterov   do_poll: return -...
843
844
845
846
847
  		if (!count) {
  			count = wait->error;
  			if (signal_pending(current))
  				count = -EINTR;
  		}
8ff3e8e85   Arjan van de Ven   select: switch se...
848
  		if (count || timed_out)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
849
  			break;
9f72949f6   David Woodhouse   [PATCH] Add psele...
850

cbf55001b   Eliezer Tamir   net: rename low l...
851
  		/* only if found POLL_BUSY_LOOP sockets && not out of time */
76b1e9b98   Eliezer Tamir   net/fs: change bu...
852
  		if (can_busy_loop && !need_resched()) {
37056719b   Alexander Duyck   net: Track start ...
853
854
  			if (!busy_start) {
  				busy_start = busy_loop_current_time();
76b1e9b98   Eliezer Tamir   net/fs: change bu...
855
856
  				continue;
  			}
37056719b   Alexander Duyck   net: Track start ...
857
  			if (!busy_loop_timeout(busy_start))
76b1e9b98   Eliezer Tamir   net/fs: change bu...
858
859
860
  				continue;
  		}
  		busy_flag = 0;
91e2fd337   Eliezer Tamir   net: avoid callin...
861

8ff3e8e85   Arjan van de Ven   select: switch se...
862
863
864
865
866
867
  		/*
  		 * If this is the first loop and we have a timeout
  		 * given, then we convert to ktime_t and set the to
  		 * pointer to the expiry value.
  		 */
  		if (end_time && !to) {
766b9f928   Deepa Dinamani   fs: poll/select/r...
868
  			expire = timespec64_to_ktime(*end_time);
8ff3e8e85   Arjan van de Ven   select: switch se...
869
  			to = &expire;
9f72949f6   David Woodhouse   [PATCH] Add psele...
870
  		}
5f820f648   Tejun Heo   poll: allow f_op-...
871
  		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
8ff3e8e85   Arjan van de Ven   select: switch se...
872
  			timed_out = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
873
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
874
875
  	return count;
  }
70674f95c   Andi Kleen   [PATCH] Optimize ...
876
877
  #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
  			sizeof(struct pollfd))
e99ca56ce   Al Viro   move compat selec...
878
  static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
766b9f928   Deepa Dinamani   fs: poll/select/r...
879
  		struct timespec64 *end_time)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
880
881
  {
  	struct poll_wqueues table;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
882
   	int err = -EFAULT, fdcount, len, size;
30c14e40e   Jes Sorensen   [PATCH] avoid una...
883
884
885
886
  	/* Allocate small arguments on the stack to save memory and be
  	   faster - use long to make sure the buffer is aligned properly
  	   on 64 bit archs to avoid unaligned access */
  	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
252e5725c   Oleg Nesterov   do_sys_poll: simp...
887
888
889
  	struct poll_list *const head = (struct poll_list *)stack_pps;
   	struct poll_list *walk = head;
   	unsigned long todo = nfds;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
890

d554ed895   Jiri Slaby   fs: use rlimit he...
891
  	if (nfds > rlimit(RLIMIT_NOFILE))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
892
  		return -EINVAL;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
893
894
895
896
897
898
  	len = min_t(unsigned int, nfds, N_STACK_PPS);
  	for (;;) {
  		walk->next = NULL;
  		walk->len = len;
  		if (!len)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
899

252e5725c   Oleg Nesterov   do_sys_poll: simp...
900
901
902
903
904
905
906
  		if (copy_from_user(walk->entries, ufds + nfds-todo,
  					sizeof(struct pollfd) * walk->len))
  			goto out_fds;
  
  		todo -= walk->len;
  		if (!todo)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
907

252e5725c   Oleg Nesterov   do_sys_poll: simp...
908
909
910
911
912
  		len = min(todo, POLLFD_PER_PAGE);
  		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
  		walk = walk->next = kmalloc(size, GFP_KERNEL);
  		if (!walk) {
  			err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
913
914
  			goto out_fds;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
915
  	}
9f72949f6   David Woodhouse   [PATCH] Add psele...
916

252e5725c   Oleg Nesterov   do_sys_poll: simp...
917
  	poll_initwait(&table);
ccec5ee30   Mateusz Guzik   poll: plug an unu...
918
  	fdcount = do_poll(head, &table, end_time);
252e5725c   Oleg Nesterov   do_sys_poll: simp...
919
  	poll_freewait(&table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
920

252e5725c   Oleg Nesterov   do_sys_poll: simp...
921
  	for (walk = head; walk; walk = walk->next) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
922
923
  		struct pollfd *fds = walk->entries;
  		int j;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
924
925
  		for (j = 0; j < walk->len; j++, ufds++)
  			if (__put_user(fds[j].revents, &ufds->revents))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
926
  				goto out_fds;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
927
    	}
252e5725c   Oleg Nesterov   do_sys_poll: simp...
928

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
929
  	err = fdcount;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
930
  out_fds:
252e5725c   Oleg Nesterov   do_sys_poll: simp...
931
932
933
934
935
  	walk = head->next;
  	while (walk) {
  		struct poll_list *pos = walk;
  		walk = walk->next;
  		kfree(pos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
936
  	}
252e5725c   Oleg Nesterov   do_sys_poll: simp...
937

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938
939
  	return err;
  }
9f72949f6   David Woodhouse   [PATCH] Add psele...
940

3075d9da0   Chris Wright   Use ERESTART_REST...
941
942
  static long do_restart_poll(struct restart_block *restart_block)
  {
8ff3e8e85   Arjan van de Ven   select: switch se...
943
944
  	struct pollfd __user *ufds = restart_block->poll.ufds;
  	int nfds = restart_block->poll.nfds;
766b9f928   Deepa Dinamani   fs: poll/select/r...
945
  	struct timespec64 *to = NULL, end_time;
3075d9da0   Chris Wright   Use ERESTART_REST...
946
  	int ret;
8ff3e8e85   Arjan van de Ven   select: switch se...
947
948
949
950
951
952
953
  	if (restart_block->poll.has_timeout) {
  		end_time.tv_sec = restart_block->poll.tv_sec;
  		end_time.tv_nsec = restart_block->poll.tv_nsec;
  		to = &end_time;
  	}
  
  	ret = do_sys_poll(ufds, nfds, to);
3075d9da0   Chris Wright   Use ERESTART_REST...
954
955
  	if (ret == -EINTR) {
  		restart_block->fn = do_restart_poll;
3075d9da0   Chris Wright   Use ERESTART_REST...
956
957
958
959
  		ret = -ERESTART_RESTARTBLOCK;
  	}
  	return ret;
  }
5a8a82b1d   Heiko Carstens   [CVE-2009-0029] S...
960
  /*
   * poll(2) system call.
   *
   * A negative timeout_msecs means no timeout (NULL end time); otherwise the
   * millisecond value is split into seconds and nanoseconds for
   * poll_select_set_timeout().
   *
   * When do_sys_poll() returns -EINTR, the arguments and the computed end
   * time are stored in current->restart_block with do_restart_poll() as the
   * handler, and -ERESTART_RESTARTBLOCK is returned instead.
   */
  SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
  		int, timeout_msecs)
  {
  	struct timespec64 end_time;
  	struct timespec64 *to = NULL;
  	struct restart_block *rb;
  	int ret;

  	if (timeout_msecs >= 0) {
  		to = &end_time;
  		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
  			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
  	}

  	ret = do_sys_poll(ufds, nfds, to);
  	if (ret != -EINTR)
  		return ret;

  	rb = &current->restart_block;
  	rb->fn = do_restart_poll;
  	rb->poll.ufds = ufds;
  	rb->poll.nfds = nfds;

  	/* 'to' is non-NULL exactly when a timeout was requested above */
  	if (to) {
  		rb->poll.tv_sec = end_time.tv_sec;
  		rb->poll.tv_nsec = end_time.tv_nsec;
  		rb->poll.has_timeout = 1;
  	} else {
  		rb->poll.has_timeout = 0;
  	}

  	return -ERESTART_RESTARTBLOCK;
  }
d4e82042c   Heiko Carstens   [CVE-2009-0029] S...
990
991
992
  /*
   * ppoll(2) system call: poll with a timespec timeout and an optional
   * signal mask that is installed for the duration of the call.
   *
   * Returns -EFAULT if tsp or sigmask cannot be read, -EINVAL for a timeout
   * rejected by poll_select_set_timeout() or a sigsetsize other than
   * sizeof(sigset_t).  An -EINTR from do_sys_poll() is turned into
   * -ERESTARTNOHAND before the result goes through
   * poll_select_copy_remaining().
   */
  SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
  		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
  		size_t, sigsetsize)
  {
  	sigset_t ksigmask, sigsaved;
  	struct timespec64 end_time;
  	struct timespec64 *to = NULL;
  	int ret;

  	if (tsp) {
  		struct timespec ts;

  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;

  		if (poll_select_set_timeout(&end_time, ts.tv_sec, ts.tv_nsec))
  			return -EINVAL;
  		to = &end_time;
  	}

  	if (sigmask) {
  		/* XXX: Don't preclude handling different sized sigset_t's.  */
  		if (sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
  			return -EFAULT;

  		/* SIGKILL and SIGSTOP are always removed from the new mask */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = do_sys_poll(ufds, nfds, to);

  	if (ret != -EINTR) {
  		if (sigmask)
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  	} else {
  		/* We can restart this syscall, usually */
  		if (sigmask) {
  			/*
  			 * Don't restore the signal mask yet. Let do_signal()
  			 * deliver the signal on the way back to userspace,
  			 * before the signal mask is restored.
  			 */
  			memcpy(&current->saved_sigmask, &sigsaved,
  					sizeof(sigsaved));
  			set_restore_sigmask();
  		}
  		ret = -ERESTARTNOHAND;
  	}

  	return poll_select_copy_remaining(&end_time, tsp, 0, ret);
  }
e99ca56ce   Al Viro   move compat selec...
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
  
  #ifdef CONFIG_COMPAT
  #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
  
  /*
   * Write the time left until *end_time back to the compat user buffer @p.
   *
   * @end_time: end time of the wait
   * @p:        user buffer, or NULL for "nothing to update"
   * @timeval:  non-zero to write a compat_timeval, zero for a compat_timespec
   * @ret:      result of the select/poll call so far
   *
   * Returns @ret unchanged when there is nothing to update (NULL @p or an
   * all-zero end time) or when the update was written.  If the task has
   * STICKY_TIMEOUTS set, or the write-back fails, -ERESTARTNOHAND is
   * converted to -EINTR and every other value passes through.
   */
  static
  int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
  				      int timeval, int ret)
  {
  	if (!p)
  		return ret;

  	if (!(current->personality & STICKY_TIMEOUTS)) {
  		struct timespec now, left;
  		unsigned long uncopied;

  		/* No update for zero timeout */
  		if (!end_time->tv_sec && !end_time->tv_nsec)
  			return ret;

  		ktime_get_ts(&now);
  		left = timespec_sub(*end_time, now);
  		if (left.tv_sec < 0) {
  			/* already past the end time: report zero remaining */
  			left.tv_nsec = 0;
  			left.tv_sec = 0;
  		}

  		if (timeval) {
  			struct compat_timeval rtv;

  			rtv.tv_sec = left.tv_sec;
  			rtv.tv_usec = left.tv_nsec / NSEC_PER_USEC;
  			uncopied = copy_to_user(p, &rtv, sizeof(rtv));
  		} else {
  			struct compat_timespec rts;

  			rts.tv_sec = left.tv_sec;
  			rts.tv_nsec = left.tv_nsec;
  			uncopied = copy_to_user(p, &rts, sizeof(rts));
  		}

  		if (!uncopied)
  			return ret;

  		/*
  		 * If an application puts its timeval in read-only memory, we
  		 * don't want the Linux-specific update to the timeval to
  		 * cause a fault after the select has completed
  		 * successfully. However, because we're not updating the
  		 * timeval, we can't restart the system call.
  		 */
  	}

  	return ret == -ERESTARTNOHAND ? -EINTR : ret;
  }
  
  /*
   * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
   * 64-bit unsigned longs.
   */
  /*
   * Load a compat fd bitmap of @nr bits from user space into @fdset.
   * A NULL @ufdset zeroes @fdset via zero_fd_set() and returns 0; otherwise
   * the result of compat_get_bitmap() is returned.
   */
  static
  int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
  			unsigned long *fdset)
  {
  	if (!ufdset) {
  		zero_fd_set(nr, fdset);
  		return 0;
  	}

  	return compat_get_bitmap(fdset, ufdset, nr);
  }
  
  /*
   * Store the @nr-bit kernel bitmap @fdset into the compat user bitmap
   * @ufdset.  A NULL @ufdset is a no-op returning 0; otherwise the result of
   * compat_put_bitmap() is returned.
   */
  static
  int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
  		      unsigned long *fdset)
  {
  	if (ufdset)
  		return compat_put_bitmap(ufdset, fdset, nr);

  	return 0;
  }
  
  
  /*
   * This is a virtual copy of core_sys_select() earlier in this file and
   * probably should be compared to it from time to time
   */
  
  /*
   * We can actually return ERESTARTSYS instead of EINTR, but I'd
   * like to be certain this leads to no problems. So I return
   * EINTR just for safety.
   *
   * Update: ERESTARTSYS breaks at least the xview clock binary, so
   * I'm trying ERESTARTNOHAND which restart only when you want to.
   */
  /*
   * Compat counterpart of core_sys_select(): fetch the three compat fd
   * bitmaps, run do_select() and write the three result bitmaps back.
   *
   * @n:        highest fd number + 1 as passed by the caller; clamped to the
   *            current fd table size
   * @inp/@outp/@exp: compat user bitmaps, any of which may be NULL
   * @end_time: timeout handed to do_select(), or NULL
   *
   * Returns the do_select() result, -EINVAL for negative @n, -ENOMEM if the
   * bitmap buffer cannot be allocated, the error from compat_get_fd_set(),
   * -ERESTARTNOHAND if nothing was ready and a signal is pending, or -EFAULT
   * if a result bitmap cannot be written.
   */
  static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
  	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
  	struct timespec *end_time)
  {
  	fd_set_bits fds;
  	void *bits;
  	int size, max_fds, ret = -EINVAL;
  	struct fdtable *fdt;
  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

  	if (n < 0)
  		goto out_nofds;

  	/* max_fds can increase, so grab it once to avoid race */
  	rcu_read_lock();
  	fdt = files_fdtable(current->files);
  	max_fds = fdt->max_fds;
  	rcu_read_unlock();
  	if (n > max_fds)
  		n = max_fds;

  	/*
  	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
  	 * since we used fdset we need to allocate memory in units of
  	 * long-words.
  	 */
  	size = FDS_BYTES(n);
  	bits = stack_fds;
  	if (size > sizeof(stack_fds) / 6) {
  		/*
  		 * Not small: use kvmalloc() like core_sys_select() does, so
  		 * a large fd table does not depend on a high-order
  		 * contiguous allocation succeeding.
  		 */
  		bits = kvmalloc(6 * size, GFP_KERNEL);
  		ret = -ENOMEM;
  		if (!bits)
  			goto out_nofds;
  	}
  	fds.in      = (unsigned long *)  bits;
  	fds.out     = (unsigned long *) (bits +   size);
  	fds.ex      = (unsigned long *) (bits + 2*size);
  	fds.res_in  = (unsigned long *) (bits + 3*size);
  	fds.res_out = (unsigned long *) (bits + 4*size);
  	fds.res_ex  = (unsigned long *) (bits + 5*size);

  	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
  	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
  	    (ret = compat_get_fd_set(n, exp, fds.ex)))
  		goto out;
  	zero_fd_set(n, fds.res_in);
  	zero_fd_set(n, fds.res_out);
  	zero_fd_set(n, fds.res_ex);

  	ret = do_select(n, &fds, end_time);

  	if (ret < 0)
  		goto out;
  	if (!ret) {
  		/* nothing ready: restartable only if a signal is pending */
  		ret = -ERESTARTNOHAND;
  		if (signal_pending(current))
  			goto out;
  		ret = 0;
  	}

  	if (compat_set_fd_set(n, inp, fds.res_in) ||
  	    compat_set_fd_set(n, outp, fds.res_out) ||
  	    compat_set_fd_set(n, exp, fds.res_ex))
  		ret = -EFAULT;
  out:
  	/* the on-stack buffer must not be handed to kvfree() */
  	if (bits != stack_fds)
  		kvfree(bits);
  out_nofds:
  	return ret;
  }
  
  /*
   * Compat select(2): same flow as the native syscall, but the timeout is a
   * compat_timeval and the work is done by compat_core_sys_select() and
   * compat_poll_select_copy_remaining().
   *
   * Returns -EFAULT if tvp cannot be read and -EINVAL if
   * poll_select_set_timeout() rejects the value.
   */
  COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
  	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
  	struct compat_timeval __user *, tvp)
  {
  	struct timespec end_time;
  	struct timespec *to = NULL;
  	int ret;

  	if (tvp) {
  		struct compat_timeval tv;

  		if (copy_from_user(&tv, tvp, sizeof(tv)))
  			return -EFAULT;

  		/* tv_usec may exceed one second; carry the excess into seconds */
  		if (poll_select_set_timeout(&end_time,
  				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
  				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
  			return -EINVAL;

  		to = &end_time;
  	}

  	ret = compat_core_sys_select(n, inp, outp, exp, to);

  	return compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
  }
  
  /*
   * Argument block for the compat old_select syscall: the five select
   * arguments, with the pointers stored as compat_uptr_t values.
   */
  struct compat_sel_arg_struct {
  	compat_ulong_t n;
  	compat_uptr_t inp;
  	compat_uptr_t outp;
  	compat_uptr_t exp;
  	compat_uptr_t tvp;
  };

  /*
   * Compat old_select: copy the argument block in, convert each stored
   * pointer with compat_ptr() and forward to compat_sys_select().
   * Returns -EFAULT if the block cannot be read.
   */
  COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
  {
  	struct compat_sel_arg_struct args;

  	if (copy_from_user(&args, arg, sizeof(args)))
  		return -EFAULT;

  	return compat_sys_select(args.n, compat_ptr(args.inp),
  				 compat_ptr(args.outp), compat_ptr(args.exp),
  				 compat_ptr(args.tvp));
  }
  
  /*
   * Common body of compat pselect6: compat timespec timeout plus an optional
   * compat signal mask installed for the duration of the call.
   *
   * Returns -EFAULT if tsp or sigmask cannot be read, -EINVAL for a timeout
   * rejected by poll_select_set_timeout() or a sigsetsize other than
   * sizeof(compat_sigset_t), else the result of
   * compat_poll_select_copy_remaining().
   */
  static long do_compat_pselect(int n, compat_ulong_t __user *inp,
  	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
  	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
  	compat_size_t sigsetsize)
  {
  	sigset_t ksigmask, sigsaved;
  	struct timespec end_time;
  	struct timespec *to = NULL;
  	int ret;

  	if (tsp) {
  		struct compat_timespec ts;

  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;

  		if (poll_select_set_timeout(&end_time, ts.tv_sec, ts.tv_nsec))
  			return -EINVAL;
  		to = &end_time;
  	}

  	if (sigmask) {
  		compat_sigset_t ss32;

  		if (sigsetsize != sizeof(compat_sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
  			return -EFAULT;
  		sigset_from_compat(&ksigmask, &ss32);

  		/* SIGKILL and SIGSTOP are always removed from the new mask */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = compat_core_sys_select(n, inp, outp, exp, to);
  	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);

  	/* Nothing to undo when the caller supplied no mask. */
  	if (!sigmask)
  		return ret;

  	if (ret != -ERESTARTNOHAND) {
  		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  		return ret;
  	}

  	/*
  	 * Don't restore the signal mask yet. Let do_signal() deliver
  	 * the signal on the way back to userspace, before the signal
  	 * mask is restored.
  	 */
  	memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
  	set_restore_sigmask();

  	return ret;
  }
  
  /*
   * Compat pselect6: @sig, when non-NULL, points to a user structure holding
   * a compat_uptr_t to the signal set followed by a compat_size_t with its
   * size.  A NULL @sig means no signal mask (pointer 0, size 0).
   *
   * Returns -EFAULT if that structure cannot be read, else the result of
   * do_compat_pselect().
   */
  COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
  	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
  	struct compat_timespec __user *, tsp, void __user *, sig)
  {
  	compat_uptr_t user_set = 0;
  	compat_size_t set_size = 0;

  	if (sig) {
  		/* validate the whole { pointer, size } pair once ... */
  		if (!access_ok(VERIFY_READ, sig,
  				sizeof(compat_uptr_t)+sizeof(compat_size_t)))
  			return -EFAULT;
  		/* ... then fetch both members with the unchecked accessors */
  		if (__get_user(user_set, (compat_uptr_t __user *)sig))
  			return -EFAULT;
  		if (__get_user(set_size,
  				(compat_size_t __user *)(sig+sizeof(user_set))))
  			return -EFAULT;
  	}

  	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(user_set),
  				 set_size);
  }
  
  /*
   * Compat ppoll(2): poll with a compat timespec timeout and an optional
   * compat signal mask installed for the duration of the call.
   *
   * Returns -EFAULT if tsp or sigmask cannot be read, -EINVAL for a timeout
   * rejected by poll_select_set_timeout() or a sigsetsize other than
   * sizeof(compat_sigset_t).  An -EINTR from do_sys_poll() is turned into
   * -ERESTARTNOHAND before the result goes through
   * compat_poll_select_copy_remaining().
   */
  COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
  	unsigned int,  nfds, struct compat_timespec __user *, tsp,
  	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
  {
  	sigset_t ksigmask, sigsaved;
  	struct timespec end_time;
  	struct timespec *to = NULL;
  	int ret;

  	if (tsp) {
  		struct compat_timespec ts;

  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;

  		if (poll_select_set_timeout(&end_time, ts.tv_sec, ts.tv_nsec))
  			return -EINVAL;
  		to = &end_time;
  	}

  	if (sigmask) {
  		compat_sigset_t ss32;

  		if (sigsetsize != sizeof(compat_sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
  			return -EFAULT;
  		sigset_from_compat(&ksigmask, &ss32);

  		/* SIGKILL and SIGSTOP are always removed from the new mask */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = do_sys_poll(ufds, nfds, to);

  	if (ret != -EINTR) {
  		if (sigmask)
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  	} else {
  		/* We can restart this syscall, usually */
  		if (sigmask) {
  			/*
  			 * Don't restore the signal mask yet. Let do_signal()
  			 * deliver the signal on the way back to userspace,
  			 * before the signal mask is restored.
  			 */
  			memcpy(&current->saved_sigmask, &sigsaved,
  				sizeof(sigsaved));
  			set_restore_sigmask();
  		}
  		ret = -ERESTARTNOHAND;
  	}

  	return compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
  }
  #endif