Blame view

fs/select.c 25.4 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
  /*
   * This file contains the procedures for the handling of select and poll
   *
   * Created for Linux based loosely upon Mathius Lattner's minix
   * patches by Peter MacDonald. Heavily edited by Linus.
   *
   *  4 February 1994
   *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
   *     flag set in its personality we do *not* modify the given timeout
   *     parameter to reflect time remaining.
   *
   *  24 January 2000
   *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
   *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
   */
022a16924   Milind Arun Choudhary   ROUND_UP macro cl...
16
  #include <linux/kernel.h>
a99bbaf5e   Alexey Dobriyan   headers: remove s...
17
  #include <linux/sched.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
18
  #include <linux/syscalls.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
19
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
20
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21
22
23
  #include <linux/poll.h>
  #include <linux/personality.h> /* for STICKY_TIMEOUTS */
  #include <linux/file.h>
9f3acc314   Al Viro   [PATCH] split lin...
24
  #include <linux/fdtable.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
25
  #include <linux/fs.h>
b835996f6   Dipankar Sarma   [PATCH] files: lo...
26
  #include <linux/rcupdate.h>
8ff3e8e85   Arjan van de Ven   select: switch se...
27
  #include <linux/hrtimer.h>
8bd75c77b   Clark Williams   sched/rt: Move rt...
28
  #include <linux/sched/rt.h>
9745cdb36   Colin Cross   select: use freez...
29
  #include <linux/freezer.h>
076bb0c82   Eliezer Tamir   net: rename inclu...
30
  #include <net/busy_poll.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
31
32
  
  #include <asm/uaccess.h>
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
33
34
35
36
37
38
39
40
41
42
43
44
  
  /*
   * Estimate expected accuracy in ns from a timeval.
   *
   * After quite a bit of churning around, we've settled on
   * a simple thing of taking 0.1% of the timeout as the
   * slack, with a cap of 100 msec.
   * "nice" tasks get a 0.5% slack instead.
   *
   * Consider this comment an open invitation to come up with even
   * better solutions..
   */
5ae87e79e   Guillaume Knispel   poll/select: avoi...
45
  #define MAX_SLACK	(100 * NSEC_PER_MSEC)
96d2ab484   Arjan van de Ven   hrtimer: fix sign...
46
  static long __estimate_accuracy(struct timespec *tv)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
47
  {
96d2ab484   Arjan van de Ven   hrtimer: fix sign...
48
  	long slack;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
49
  	int divfactor = 1000;
5ae87e79e   Guillaume Knispel   poll/select: avoi...
50
51
  	if (tv->tv_sec < 0)
  		return 0;
4ce105d30   Arjan van de Ven   hrtimer: incorpor...
52
  	if (task_nice(current) > 0)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
53
  		divfactor = divfactor / 5;
5ae87e79e   Guillaume Knispel   poll/select: avoi...
54
55
  	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
  		return MAX_SLACK;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
56
57
  	slack = tv->tv_nsec / divfactor;
  	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
5ae87e79e   Guillaume Knispel   poll/select: avoi...
58
59
  	if (slack > MAX_SLACK)
  		return MAX_SLACK;
96d2ab484   Arjan van de Ven   hrtimer: fix sign...
60

90d6e24a3   Arjan van de Ven   hrtimer: make sel...
61
62
  	return slack;
  }
95aac7b1c   Shawn Bohrer   epoll: make epoll...
63
  long select_estimate_accuracy(struct timespec *tv)
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
64
65
66
67
68
69
70
  {
  	unsigned long ret;
  	struct timespec now;
  
  	/*
  	 * Realtime tasks get a slack of 0 for obvious reasons.
  	 */
4ce105d30   Arjan van de Ven   hrtimer: incorpor...
71
  	if (rt_task(current))
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
72
73
74
75
76
77
78
79
80
  		return 0;
  
  	ktime_get_ts(&now);
  	now = timespec_sub(*tv, now);
  	ret = __estimate_accuracy(&now);
  	if (ret < current->timer_slack_ns)
  		return current->timer_slack_ns;
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  /*
   * One page worth of poll table entries, chained through @next.
   *
   * @next:    the previously allocated page (NULL for the first one)
   * @entry:   the next unused slot in @entries
   * @entries: the entries themselves; a C99 flexible array member that
   *           occupies the rest of the page after this header
   */
  struct poll_table_page {
  	struct poll_table_page * next;
  	struct poll_table_entry * entry;
  	struct poll_table_entry entries[];
  };
  
  /*
   * True when one more entry after (table)->entry would end beyond the
   * PAGE_SIZE bytes that start at (table), i.e. the page has no room left.
   */
  #define POLL_TABLE_FULL(table) \
  	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
  
  /*
   * Ok, Peter made a complicated, but straightforward multiple_wait() function.
   * I have rewritten this, taking some shortcuts: This code may not be easy to
   * follow, but it should be free of race-conditions, and it's practical. If you
   * understand what I'm doing here, then you understand how the linux
   * sleep/wakeup mechanism works.
   *
   * Two very simple procedures, poll_wait() and poll_freewait(), do all the
   * work.  poll_wait() is an inline function defined in <linux/poll.h>,
   * as all select/poll functions have to call it to add an entry to the
   * poll table.
   */
75c96f858   Adrian Bunk   [PATCH] make some...
102
103
  /*
   * Forward declaration: poll_initwait() below installs __pollwait as the
   * poll table callback before the function itself is defined.
   */
  static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
  		       poll_table *p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
104
105
106
107
  
  void poll_initwait(struct poll_wqueues *pwq)
  {
  	init_poll_funcptr(&pwq->pt, __pollwait);
5f820f648   Tejun Heo   poll: allow f_op-...
108
  	pwq->polling_task = current;
b2add73db   Guillaume Knispel   poll/select: init...
109
  	pwq->triggered = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
110
111
  	pwq->error = 0;
  	pwq->table = NULL;
70674f95c   Andi Kleen   [PATCH] Optimize ...
112
  	pwq->inline_index = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
113
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
114
  EXPORT_SYMBOL(poll_initwait);
70674f95c   Andi Kleen   [PATCH] Optimize ...
115
116
  static void free_poll_entry(struct poll_table_entry *entry)
  {
ccf6780dc   WANG Cong   Style fix in fs/s...
117
  	remove_wait_queue(entry->wait_address, &entry->wait);
70674f95c   Andi Kleen   [PATCH] Optimize ...
118
119
  	fput(entry->filp);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
120
121
122
  void poll_freewait(struct poll_wqueues *pwq)
  {
  	struct poll_table_page * p = pwq->table;
70674f95c   Andi Kleen   [PATCH] Optimize ...
123
124
125
  	int i;
  	for (i = 0; i < pwq->inline_index; i++)
  		free_poll_entry(pwq->inline_entries + i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
126
127
128
129
130
131
132
  	while (p) {
  		struct poll_table_entry * entry;
  		struct poll_table_page *old;
  
  		entry = p->entry;
  		do {
  			entry--;
70674f95c   Andi Kleen   [PATCH] Optimize ...
133
  			free_poll_entry(entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
134
135
136
137
138
139
  		} while (entry > p->entries);
  		old = p;
  		p = p->next;
  		free_page((unsigned long) old);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140
  EXPORT_SYMBOL(poll_freewait);
5f820f648   Tejun Heo   poll: allow f_op-...
141
  static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
142
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
143
  	struct poll_table_page *table = p->table;
70674f95c   Andi Kleen   [PATCH] Optimize ...
144
145
  	if (p->inline_index < N_INLINE_POLL_ENTRIES)
  		return p->inline_entries + p->inline_index++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
146
147
148
149
150
151
  	if (!table || POLL_TABLE_FULL(table)) {
  		struct poll_table_page *new_table;
  
  		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
  		if (!new_table) {
  			p->error = -ENOMEM;
70674f95c   Andi Kleen   [PATCH] Optimize ...
152
  			return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
153
154
155
156
157
158
  		}
  		new_table->entry = new_table->entries;
  		new_table->next = table;
  		p->table = new_table;
  		table = new_table;
  	}
70674f95c   Andi Kleen   [PATCH] Optimize ...
159
160
  	return table->entry++;
  }
4938d7e02   Eric Dumazet   poll: avoid extra...
161
  /*
   * Mark the poll_wqueues behind @wait as triggered and wake up its
   * polling task.  Returns whatever default_wake_function() returns.
   */
  static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
  {
  	struct poll_wqueues *queues = wait->private;
  	DECLARE_WAITQUEUE(task_wait, queues->polling_task);
  
  	/*
  	 * Although this function is called under waitqueue lock, LOCK
  	 * doesn't imply write barrier and the users expect write
  	 * barrier semantics on wakeup functions.  The smp_wmb() below
  	 * is equivalent to the smp_wmb() in try_to_wake_up() and is
  	 * paired with the set_mb() in poll_schedule_timeout().
  	 */
  	smp_wmb();
  	queues->triggered = 1;
  
  	/*
  	 * Do the default wake up through a temporary waitqueue entry
  	 * that refers to the polling task.
  	 *
  	 * TODO: This is hacky but there currently is no interface to
  	 * pass in @sync.  @sync is scheduled to be removed and once
  	 * that happens, wake_up_process() can be used directly.
  	 */
  	return default_wake_function(&task_wait, mode, sync, key);
  }
4938d7e02   Eric Dumazet   poll: avoid extra...
186
187
188
189
190
191
192
193
194
  /*
   * Wake function installed on every poll table entry.
   *
   * When the waker passes a non-NULL @key that shares no bits with the
   * events the entry was registered for, the wake-up is ignored and 0 is
   * returned; otherwise the polling task is woken via __pollwake().
   */
  static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
  {
  	struct poll_table_entry *pte =
  		container_of(wait, struct poll_table_entry, wait);
  
  	if (!key || ((unsigned long)key & pte->key))
  		return __pollwake(wait, mode, sync, key);
  
  	return 0;
  }
70674f95c   Andi Kleen   [PATCH] Optimize ...
195
196
197
198
  /*
   * Poll table callback: register the current poll/select run on
   * @wait_address so that a wake-up there reaches pollwake().
   *
   * Takes a reference on @filp that free_poll_entry() drops later.  If no
   * entry can be obtained the function returns without queueing anything.
   */
  static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
  				poll_table *p)
  {
  	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
  	struct poll_table_entry *slot;
  
  	slot = poll_get_entry(queues);
  	if (!slot)
  		return;
  
  	slot->key = p->_key;
  	slot->wait_address = wait_address;
  	slot->filp = get_file(filp);
  
  	init_waitqueue_func_entry(&slot->wait, pollwake);
  	slot->wait.private = queues;
  
  	add_wait_queue(wait_address, &slot->wait);
  }
5f820f648   Tejun Heo   poll: allow f_op-...
210
211
212
213
214
215
216
  /*
   * Sleep in @state until woken or until @expires (absolute, with @slack)
   * passes, unless @pwq has already been triggered.
   *
   * Returns -EINTR without sleeping when @pwq was already triggered,
   * otherwise the result of schedule_hrtimeout_range().  The triggered
   * flag is cleared again before returning.
   */
  int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
  			  ktime_t *expires, unsigned long slack)
  {
  	int rc;
  
  	set_current_state(state);
  	if (pwq->triggered)
  		rc = -EINTR;
  	else
  		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
  	__set_current_state(TASK_RUNNING);
  
  	/*
  	 * Get ready for the next iteration.
  	 *
  	 * The set_mb() below does two jobs.  It is the rmb counterpart
  	 * of the wmb in pollwake(), so data written before the wake up
  	 * is visible after it.  It is also a full barrier, so clearing
  	 * triggered cannot pass the event check of the next iteration.
  	 * The first iteration does not have this problem because
  	 * add_wait_queue() has full barrier semantics.
  	 */
  	set_mb(pwq->triggered, 0);
  
  	return rc;
  }
  EXPORT_SYMBOL(poll_schedule_timeout);
b773ad40a   Thomas Gleixner   select: add poll_...
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
  /**
   * poll_select_set_timeout - helper function to setup the timeout value
   * @to:		pointer to timespec variable for the final timeout
   * @sec:	seconds (from user space)
   * @nsec:	nanoseconds (from user space)
   *
   * Note, we do not use a timespec for the user space value here, That
   * way we can use the function for timeval and compat interfaces as well.
   *
   * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
   */
  int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
  {
  	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
  
  	if (!timespec_valid(&ts))
  		return -EINVAL;
  
  	/* Optimize for the zero timeout value here */
  	if (!sec && !nsec) {
  		to->tv_sec = to->tv_nsec = 0;
  	} else {
  		ktime_get_ts(to);
  		*to = timespec_add_safe(*to, ts);
  	}
  	return 0;
  }
  
  /*
   * Write the time left until @end_time back to the user buffer @p, as a
   * struct timeval when @timeval is non-zero and as a struct timespec
   * otherwise, and return the (possibly adjusted) result code @ret.
   *
   * Nothing is written when @p is NULL, when the task has the
   * STICKY_TIMEOUTS personality flag, or when @end_time is zero.  If the
   * value was not written back because of STICKY_TIMEOUTS or because the
   * copy failed, -ERESTARTNOHAND is turned into -EINTR.
   */
  static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
  				      int timeval, int ret)
  {
  	struct timespec now, left;
  	struct timeval left_tv;
  	int copy_failed;
  
  	if (!p)
  		return ret;
  
  	if (!(current->personality & STICKY_TIMEOUTS)) {
  		/* No update for zero timeout */
  		if (!end_time->tv_sec && !end_time->tv_nsec)
  			return ret;
  
  		ktime_get_ts(&now);
  		left = timespec_sub(*end_time, now);
  		if (left.tv_sec < 0) {
  			left.tv_nsec = 0;
  			left.tv_sec = 0;
  		}
  
  		if (timeval) {
  			/* Clear the struct first if it is larger than its two fields. */
  			if (sizeof(left_tv) > sizeof(left_tv.tv_sec) + sizeof(left_tv.tv_usec))
  				memset(&left_tv, 0, sizeof(left_tv));
  			left_tv.tv_sec = left.tv_sec;
  			left_tv.tv_usec = left.tv_nsec / NSEC_PER_USEC;
  
  			copy_failed = copy_to_user(p, &left_tv, sizeof(left_tv)) != 0;
  		} else {
  			copy_failed = copy_to_user(p, &left, sizeof(left)) != 0;
  		}
  
  		if (!copy_failed)
  			return ret;
  	}
  
  	/*
  	 * If an application puts its timeval in read-only memory, we
  	 * don't want the Linux-specific update to the timeval to
  	 * cause a fault after the select has completed
  	 * successfully. However, because we're not updating the
  	 * timeval, we can't restart the system call.
  	 */
  	return (ret == -ERESTARTNOHAND) ? -EINTR : ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
310
311
312
313
314
315
316
317
318
319
320
  /*
   * Pointer to the n-th long-word of the requested in/out/ex bitmap of
   * fds.  Both arguments are parenthesized so that expressions such as
   * "&set" or "i + 1" expand correctly.
   */
  #define FDS_IN(fds, n)		((fds)->in + (n))
  #define FDS_OUT(fds, n)		((fds)->out + (n))
  #define FDS_EX(fds, n)		((fds)->ex + (n))
  
  /* OR of the n-th long-word of all three requested bitmaps. */
  #define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
  
  /*
   * Number of bit positions up to and including the highest set bit of
   * @word.  @word must be non-zero.
   */
  static int select_top_bit(unsigned long word)
  {
  	int count = 0;
  
  	do {
  		count++;
  		word >>= 1;
  	} while (word);
  
  	return count;
  }
  
  /*
   * Check the first @n bits of the requested in/out/ex bitmaps in @fds
   * against the open-fd bitmap of the current task, scanning from the
   * highest long-word down.
   *
   * Returns -EBADF if any requested bit is not set in the open-fd bitmap,
   * otherwise the highest requested descriptor plus one (0 if nothing is
   * requested).
   */
  static int max_select_fd(unsigned long n, fd_set_bits *fds)
  {
  	struct fdtable *fdt;
  	unsigned long *open_word;
  	unsigned long wanted;
  	int max = 0;
  
  	/* handle last in-complete long-word first */
  	wanted = ~(~0UL << (n & (BITS_PER_LONG-1)));
  	n /= BITS_PER_LONG;
  	fdt = files_fdtable(current->files);
  	open_word = fdt->open_fds + n;
  
  	/* Only look at the partial word if it covers any bits at all. */
  	if (wanted)
  		wanted &= BITS(fds, n);
  	if (wanted) {
  		if (wanted & ~*open_word)
  			return -EBADF;
  		max = select_top_bit(wanted) + n * BITS_PER_LONG;
  	}
  
  	/* Now the complete long-words below it, highest first. */
  	while (n) {
  		open_word--;
  		n--;
  		wanted = BITS(fds, n);
  		if (!wanted)
  			continue;
  		if (wanted & ~*open_word)
  			return -EBADF;
  		/* The first non-empty word seen determines the maximum. */
  		if (!max)
  			max = select_top_bit(wanted) + n * BITS_PER_LONG;
  	}
  
  	return max;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
357
358
359
  /*
   * Poll event bits that do_select() treats as satisfying a descriptor
   * requested in the in, out and ex sets respectively.
   */
  #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
  #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
  #define POLLEX_SET (POLLPRI)
4938d7e02   Eric Dumazet   poll: avoid extra...
360
  /*
   * Set wait->_key to the events of interest for the descriptor selected
   * by @bit: POLLEX_SET and @ll_flag always, POLLIN_SET if @bit is set in
   * @in, and POLLOUT_SET if @bit is set in @out.
   */
  static inline void wait_key_set(poll_table *wait, unsigned long in,
  				unsigned long out, unsigned long bit,
  				unsigned int ll_flag)
  {
  	unsigned long events = POLLEX_SET | ll_flag;
  
  	if (in & bit)
  		events |= POLLIN_SET;
  	if (out & bit)
  		events |= POLLOUT_SET;
  
  	wait->_key = events;
  }
8ff3e8e85   Arjan van de Ven   select: switch se...
370
  /*
   * Core of select(): poll every descriptor requested in @fds (the first
   * @n bits) and record the ready ones in the res_in/res_out/res_ex
   * bitmaps, sleeping between passes until something is ready, the
   * timeout @end_time expires, or a signal is pending.
   *
   * @end_time is an absolute expiry; NULL means no timeout and an all-zero
   * value means a single pass without sleeping.
   *
   * Returns the number of ready (descriptor, set) pairs, 0 if none, or a
   * negative error: the one from max_select_fd() or the poll table error.
   */
  int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
  {
  	struct poll_wqueues table;
  	poll_table *wait;
  	ktime_t expire, *to = NULL;
  	unsigned long slack = 0;
  	unsigned long busy_end = 0;
  	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
  	int timed_out = 0;
  	int retval, i;
  
  	rcu_read_lock();
  	retval = max_select_fd(n, fds);
  	rcu_read_unlock();
  
  	if (retval < 0)
  		return retval;
  	n = retval;
  
  	poll_initwait(&table);
  	wait = &table.pt;
  
  	if (end_time) {
  		if (!end_time->tv_sec && !end_time->tv_nsec) {
  			/* Zero timeout: never queue, never sleep. */
  			wait->_qproc = NULL;
  			timed_out = 1;
  		} else {
  			slack = select_estimate_accuracy(end_time);
  		}
  	}
  
  	retval = 0;
  	for (;;) {
  		bool can_busy_loop = false;
  		unsigned long word;
  
  		/* One long-word of each bitmap per outer iteration. */
  		for (i = 0, word = 0; i < n; word++) {
  			unsigned long in = fds->in[word];
  			unsigned long out = fds->out[word];
  			unsigned long ex = fds->ex[word];
  			unsigned long all_bits = in | out | ex;
  			unsigned long res_in = 0, res_out = 0, res_ex = 0;
  			unsigned long bit, mask, j;
  
  			if (!all_bits) {
  				i += BITS_PER_LONG;
  				continue;
  			}
  
  			for (j = 0, bit = 1; j < BITS_PER_LONG && i < n;
  			     ++j, ++i, bit <<= 1) {
  				const struct file_operations *f_op;
  				struct fd f;
  
  				if (!(all_bits & bit))
  					continue;
  
  				f = fdget(i);
  				if (!f.file)
  					continue;
  
  				f_op = f.file->f_op;
  				if (f_op->poll) {
  					wait_key_set(wait, in, out,
  						     bit, busy_flag);
  					mask = (*f_op->poll)(f.file, wait);
  				} else {
  					mask = DEFAULT_POLLMASK;
  				}
  				fdput(f);
  
  				/* Once anything is ready, stop queueing. */
  				if ((in & bit) && (mask & POLLIN_SET)) {
  					res_in |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}
  				if ((out & bit) && (mask & POLLOUT_SET)) {
  					res_out |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}
  				if ((ex & bit) && (mask & POLLEX_SET)) {
  					res_ex |= bit;
  					retval++;
  					wait->_qproc = NULL;
  				}
  
  				if (retval) {
  					/* got something, stop busy polling */
  					can_busy_loop = false;
  					busy_flag = 0;
  				} else if (busy_flag & mask) {
  					/*
  					 * only remember a returned
  					 * POLL_BUSY_LOOP if we asked for it
  					 */
  					can_busy_loop = true;
  				}
  			}
  
  			if (res_in)
  				fds->res_in[word] = res_in;
  			if (res_out)
  				fds->res_out[word] = res_out;
  			if (res_ex)
  				fds->res_ex[word] = res_ex;
  			cond_resched();
  		}
  
  		/* Wait queues are only registered during the first pass. */
  		wait->_qproc = NULL;
  		if (retval || timed_out || signal_pending(current))
  			break;
  		if (table.error) {
  			retval = table.error;
  			break;
  		}
  
  		/* only if found POLL_BUSY_LOOP sockets && not out of time */
  		if (can_busy_loop && !need_resched()) {
  			if (!busy_end) {
  				busy_end = busy_loop_end_time();
  				continue;
  			}
  			if (!busy_loop_timeout(busy_end))
  				continue;
  		}
  		busy_flag = 0;
  
  		/*
  		 * First time we are about to sleep with a timeout:
  		 * convert it to ktime_t and point "to" at it.
  		 */
  		if (end_time && !to) {
  			expire = timespec_to_ktime(*end_time);
  			to = &expire;
  		}
  
  		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
  					   to, slack))
  			timed_out = 1;
  	}
  
  	poll_freewait(&table);
  
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
505
506
507
508
509
510
511
512
  /*
   * We can actually return ERESTARTSYS instead of EINTR, but I'd
   * like to be certain this leads to no problems. So I return
   * EINTR just for safety.
   *
   * Update: ERESTARTSYS breaks at least the xview clock binary, so
   * I'm trying ERESTARTNOHAND which restart only when you want to.
   */
a2dcb44c3   Al Viro   [PATCH] make osf_...
513
  /*
   * Copy the three user fd sets in, run do_select() on them and copy the
   * three result sets back out.
   *
   * @n is clamped to the current fd table size.  The six bitmaps live in
   * an on-stack buffer when they fit and in a kmalloc()ed one otherwise.
   *
   * Returns the do_select() result on success, -EINVAL for a negative @n,
   * -ENOMEM if the buffer cannot be allocated, the get_fd_set() error if
   * copying in fails, -EFAULT if copying out fails, and -ERESTARTNOHAND
   * when nothing was ready and a signal is pending.
   */
  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
  			   fd_set __user *exp, struct timespec *end_time)
  {
  	/* Allocate small arguments on the stack to save memory and be faster */
  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
  	struct fdtable *fdt;
  	fd_set_bits fds;
  	unsigned int size;
  	void *bits;
  	int max_fds;
  	int ret;
  
  	if (n < 0)
  		return -EINVAL;
  
  	/* max_fds can increase, so grab it once to avoid race */
  	rcu_read_lock();
  	fdt = files_fdtable(current->files);
  	max_fds = fdt->max_fds;
  	rcu_read_unlock();
  	if (n > max_fds)
  		n = max_fds;
  
  	/*
  	 * Six bitmaps are needed (in/out/ex, each for the request and
  	 * for the result), every one of them FDS_BYTES(n) long.
  	 */
  	size = FDS_BYTES(n);
  	bits = stack_fds;
  	if (size > sizeof(stack_fds) / 6) {
  		/* Not enough space in on-stack array; must use kmalloc */
  		bits = kmalloc(6 * size, GFP_KERNEL);
  		if (!bits)
  			return -ENOMEM;
  	}
  
  	fds.in      = bits;
  	fds.out     = bits +   size;
  	fds.ex      = bits + 2*size;
  	fds.res_in  = bits + 3*size;
  	fds.res_out = bits + 4*size;
  	fds.res_ex  = bits + 5*size;
  
  	ret = get_fd_set(n, inp, fds.in);
  	if (ret)
  		goto out;
  	ret = get_fd_set(n, outp, fds.out);
  	if (ret)
  		goto out;
  	ret = get_fd_set(n, exp, fds.ex);
  	if (ret)
  		goto out;
  
  	zero_fd_set(n, fds.res_in);
  	zero_fd_set(n, fds.res_out);
  	zero_fd_set(n, fds.res_ex);
  
  	ret = do_select(n, &fds, end_time);
  	if (ret < 0)
  		goto out;
  
  	/* Nothing ready and a signal pending: skip the copy-out. */
  	if (ret == 0 && signal_pending(current)) {
  		ret = -ERESTARTNOHAND;
  		goto out;
  	}
  
  	if (set_fd_set(n, inp, fds.res_in) ||
  	    set_fd_set(n, outp, fds.res_out) ||
  	    set_fd_set(n, exp, fds.res_ex))
  		ret = -EFAULT;
  
  out:
  	if (bits != stack_fds)
  		kfree(bits);
  	return ret;
  }
5a8a82b1d   Heiko Carstens   [CVE-2009-0029] S...
585
586
  /*
   * select() system call: read the optional struct timeval timeout from
   * user space, turn it into an absolute expiry, run core_sys_select()
   * and write the remaining time back through @tvp.
   *
   * Returns -EFAULT if @tvp cannot be read and -EINVAL if the timeout is
   * rejected by poll_select_set_timeout().
   */
  SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
  		fd_set __user *, exp, struct timeval __user *, tvp)
  {
  	struct timespec end_time;
  	struct timespec *deadline = NULL;
  	struct timeval user_tv;
  	int ret;
  
  	if (tvp) {
  		if (copy_from_user(&user_tv, tvp, sizeof(user_tv)))
  			return -EFAULT;
  
  		deadline = &end_time;
  		/* Fold whole seconds held in tv_usec into the seconds value. */
  		if (poll_select_set_timeout(deadline,
  				user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
  				(user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
  			return -EINVAL;
  	}
  
  	ret = core_sys_select(n, inp, outp, exp, deadline);
  
  	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
  }
c9da9f212   Heiko Carstens   [CVE-2009-0029] M...
606
607
608
  /*
   * Implementation of pselect: like select, but with a struct timespec
   * timeout and an optional signal mask that is installed for the
   * duration of the call.
   *
   * Returns -EFAULT if @tsp or @sigmask cannot be read, and -EINVAL for a
   * rejected timeout or a @sigsetsize other than sizeof(sigset_t);
   * otherwise the core_sys_select() result as adjusted by
   * poll_select_copy_remaining().
   */
  static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
  		       fd_set __user *exp, struct timespec __user *tsp,
  		       const sigset_t __user *sigmask, size_t sigsetsize)
  {
  	struct timespec user_ts, end_time;
  	struct timespec *deadline = NULL;
  	sigset_t new_mask, old_mask;
  	int ret;
  
  	if (tsp) {
  		if (copy_from_user(&user_ts, tsp, sizeof(user_ts)))
  			return -EFAULT;
  
  		deadline = &end_time;
  		if (poll_select_set_timeout(deadline, user_ts.tv_sec,
  					    user_ts.tv_nsec))
  			return -EINVAL;
  	}
  
  	if (sigmask) {
  		/* XXX: Don't preclude handling different sized sigset_t's.  */
  		if (sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&new_mask, sigmask, sizeof(new_mask)))
  			return -EFAULT;
  
  		/* SIGKILL and SIGSTOP are removed from the mask to install. */
  		sigdelsetmask(&new_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &new_mask, &old_mask);
  	}
  
  	ret = core_sys_select(n, inp, outp, exp, deadline);
  	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
  
  	if (!sigmask)
  		return ret;
  
  	if (ret != -ERESTARTNOHAND) {
  		sigprocmask(SIG_SETMASK, &old_mask, NULL);
  		return ret;
  	}
  
  	/*
  	 * Don't restore the signal mask yet. Let do_signal() deliver
  	 * the signal on the way back to userspace, before the signal
  	 * mask is restored.
  	 */
  	memcpy(&current->saved_sigmask, &old_mask, sizeof(old_mask));
  	set_restore_sigmask();
  
  	return ret;
  }
  
  /*
   * Most architectures can't handle 7-argument syscalls. So we provide a
   * 6-argument version where the sixth argument is a pointer to a structure
   * which has a pointer to the sigset_t itself followed by a size_t containing
   * the sigset size.
   */
d4e82042c   Heiko Carstens   [CVE-2009-0029] S...
658
659
660
  /*
   * pselect6() system call.  @sig, when non-NULL, points to a user space
   * pair made of a sigset_t pointer followed by a size_t holding the
   * sigset size; both are fetched here and handed to do_pselect().
   *
   * Returns -EFAULT if that pair cannot be read.
   */
  SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
  		fd_set __user *, exp, struct timespec __user *, tsp,
  		void __user *, sig)
  {
  	sigset_t __user *up = NULL;
  	size_t sigsetsize = 0;
  
  	if (sig) {
  		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)))
  			return -EFAULT;
  		if (__get_user(up, (sigset_t __user * __user *)sig))
  			return -EFAULT;
  		if (__get_user(sigsetsize,
  				(size_t __user *)(sig+sizeof(void *))))
  			return -EFAULT;
  	}
  
  	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
  }
9f72949f6   David Woodhouse   [PATCH] Add psele...
674

5d0e52830   Christoph Hellwig   Add generic sys_o...
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
  #ifdef __ARCH_WANT_SYS_OLD_SELECT
  /*
   * Argument block for the legacy select entry point: user space passes a
   * single pointer to this structure instead of five separate arguments.
   */
  struct sel_arg_struct {
  	unsigned long n;
  	fd_set __user *inp;
  	fd_set __user *outp;
  	fd_set __user *exp;
  	struct timeval __user *tvp;
  };

  SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
  {
  	struct sel_arg_struct args;

  	/* Pull the whole argument block in, then forward to sys_select(). */
  	if (copy_from_user(&args, arg, sizeof(args)))
  		return -EFAULT;

  	return sys_select(args.n, args.inp, args.outp, args.exp, args.tvp);
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
691
692
693
694
695
696
697
  /*
   * One chunk in a singly linked chain of pollfd arrays.
   *
   * @next:    following chunk, or NULL for the last one
   * @len:     number of valid elements in @entries
   * @entries: trailing storage for the pollfd records; declared as a C99
   *           flexible array member (rather than a zero-length array) so
   *           the compiler can diagnose misuse.  The structure layout and
   *           sizeof(struct poll_list) are unaffected.
   */
  struct poll_list {
  	struct poll_list *next;
  	int len;
  	struct pollfd entries[];
  };

  /* How many pollfd records fit in one page next to the chunk header. */
  #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
698
699
700
701
702
  /*
   * Fish for pollable events on the pollfd->fd file descriptor. We're only
   * interested in events matching the pollfd->events mask, and the result
   * matching that mask is both recorded in pollfd->revents and returned. The
   * pwait poll_table will be used by the fd-provided poll handler for waiting,
626cf2366   Hans Verkuil   poll: add poll_re...
703
   * if pwait->_qproc is non-NULL.
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
704
   */
2d48d67fa   Eliezer Tamir   net: poll/select ...
705
  static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
cbf55001b   Eliezer Tamir   net: rename low l...
706
707
  				     bool *can_busy_poll,
  				     unsigned int busy_flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
708
  {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
709
710
711
712
713
714
  	unsigned int mask;
  	int fd;
  
  	mask = 0;
  	fd = pollfd->fd;
  	if (fd >= 0) {
2903ff019   Al Viro   switch simple cas...
715
  		struct fd f = fdget(fd);
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
716
  		mask = POLLNVAL;
2903ff019   Al Viro   switch simple cas...
717
  		if (f.file) {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
718
  			mask = DEFAULT_POLLMASK;
72c2d5319   Al Viro   file->f_op is nev...
719
  			if (f.file->f_op->poll) {
626cf2366   Hans Verkuil   poll: add poll_re...
720
  				pwait->_key = pollfd->events|POLLERR|POLLHUP;
cbf55001b   Eliezer Tamir   net: rename low l...
721
  				pwait->_key |= busy_flag;
2903ff019   Al Viro   switch simple cas...
722
  				mask = f.file->f_op->poll(f.file, pwait);
cbf55001b   Eliezer Tamir   net: rename low l...
723
724
  				if (mask & busy_flag)
  					*can_busy_poll = true;
4938d7e02   Eric Dumazet   poll: avoid extra...
725
  			}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
726
727
  			/* Mask out unneeded events. */
  			mask &= pollfd->events | POLLERR | POLLHUP;
2903ff019   Al Viro   switch simple cas...
728
  			fdput(f);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
729
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
730
  	}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
731
732
733
  	pollfd->revents = mask;
  
  	return mask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
734
735
736
  }
  
  static int do_poll(unsigned int nfds,  struct poll_list *list,
8ff3e8e85   Arjan van de Ven   select: switch se...
737
  		   struct poll_wqueues *wait, struct timespec *end_time)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
738
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
739
  	poll_table* pt = &wait->pt;
8ff3e8e85   Arjan van de Ven   select: switch se...
740
741
  	ktime_t expire, *to = NULL;
  	int timed_out = 0, count = 0;
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
742
  	unsigned long slack = 0;
cbf55001b   Eliezer Tamir   net: rename low l...
743
  	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
76b1e9b98   Eliezer Tamir   net/fs: change bu...
744
  	unsigned long busy_end = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
745

9f72949f6   David Woodhouse   [PATCH] Add psele...
746
  	/* Optimise the no-wait case */
8ff3e8e85   Arjan van de Ven   select: switch se...
747
  	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
626cf2366   Hans Verkuil   poll: add poll_re...
748
  		pt->_qproc = NULL;
8ff3e8e85   Arjan van de Ven   select: switch se...
749
750
  		timed_out = 1;
  	}
9bf084f70   Oleg Nesterov   do_poll: return -...
751

96d2ab484   Arjan van de Ven   hrtimer: fix sign...
752
  	if (end_time && !timed_out)
231f3d393   Andrew Morton   select: rename es...
753
  		slack = select_estimate_accuracy(end_time);
90d6e24a3   Arjan van de Ven   hrtimer: make sel...
754

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
755
756
  	for (;;) {
  		struct poll_list *walk;
cbf55001b   Eliezer Tamir   net: rename low l...
757
  		bool can_busy_loop = false;
9f72949f6   David Woodhouse   [PATCH] Add psele...
758

4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
759
760
761
762
763
764
765
766
  		for (walk = list; walk != NULL; walk = walk->next) {
  			struct pollfd * pfd, * pfd_end;
  
  			pfd = walk->entries;
  			pfd_end = pfd + walk->len;
  			for (; pfd != pfd_end; pfd++) {
  				/*
  				 * Fish for events. If we found one, record it
626cf2366   Hans Verkuil   poll: add poll_re...
767
  				 * and kill poll_table->_qproc, so we don't
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
768
769
770
771
  				 * needlessly register any other waiters after
  				 * this. They'll get immediately deregistered
  				 * when we break out and return.
  				 */
cbf55001b   Eliezer Tamir   net: rename low l...
772
773
  				if (do_pollfd(pfd, pt, &can_busy_loop,
  					      busy_flag)) {
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
774
  					count++;
626cf2366   Hans Verkuil   poll: add poll_re...
775
  					pt->_qproc = NULL;
cbf55001b   Eliezer Tamir   net: rename low l...
776
777
778
  					/* found something, stop busy polling */
  					busy_flag = 0;
  					can_busy_loop = false;
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
779
780
  				}
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
781
  		}
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
782
783
  		/*
  		 * All waiters have already been registered, so don't provide
626cf2366   Hans Verkuil   poll: add poll_re...
784
  		 * a poll_table->_qproc to them on the next loop iteration.
4a4b69f79   Vadim Lobanov   [PATCH] Poll clea...
785
  		 */
626cf2366   Hans Verkuil   poll: add poll_re...
786
  		pt->_qproc = NULL;
9bf084f70   Oleg Nesterov   do_poll: return -...
787
788
789
790
791
  		if (!count) {
  			count = wait->error;
  			if (signal_pending(current))
  				count = -EINTR;
  		}
8ff3e8e85   Arjan van de Ven   select: switch se...
792
  		if (count || timed_out)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
793
  			break;
9f72949f6   David Woodhouse   [PATCH] Add psele...
794

cbf55001b   Eliezer Tamir   net: rename low l...
795
  		/* only if found POLL_BUSY_LOOP sockets && not out of time */
76b1e9b98   Eliezer Tamir   net/fs: change bu...
796
797
798
799
800
801
802
803
804
  		if (can_busy_loop && !need_resched()) {
  			if (!busy_end) {
  				busy_end = busy_loop_end_time();
  				continue;
  			}
  			if (!busy_loop_timeout(busy_end))
  				continue;
  		}
  		busy_flag = 0;
91e2fd337   Eliezer Tamir   net: avoid callin...
805

8ff3e8e85   Arjan van de Ven   select: switch se...
806
807
808
809
810
811
812
813
  		/*
  		 * If this is the first loop and we have a timeout
  		 * given, then we convert to ktime_t and set the to
  		 * pointer to the expiry value.
  		 */
  		if (end_time && !to) {
  			expire = timespec_to_ktime(*end_time);
  			to = &expire;
9f72949f6   David Woodhouse   [PATCH] Add psele...
814
  		}
5f820f648   Tejun Heo   poll: allow f_op-...
815
  		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
8ff3e8e85   Arjan van de Ven   select: switch se...
816
  			timed_out = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
817
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
818
819
  	return count;
  }
70674f95c   Andi Kleen   [PATCH] Optimize ...
820
821
  #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
  			sizeof(struct pollfd))
8ff3e8e85   Arjan van de Ven   select: switch se...
822
823
  int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
  		struct timespec *end_time)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
824
825
  {
  	struct poll_wqueues table;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
826
   	int err = -EFAULT, fdcount, len, size;
30c14e40e   Jes Sorensen   [PATCH] avoid una...
827
828
829
830
  	/* Allocate small arguments on the stack to save memory and be
  	   faster - use long to make sure the buffer is aligned properly
  	   on 64 bit archs to avoid unaligned access */
  	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
252e5725c   Oleg Nesterov   do_sys_poll: simp...
831
832
833
  	struct poll_list *const head = (struct poll_list *)stack_pps;
   	struct poll_list *walk = head;
   	unsigned long todo = nfds;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
834

d554ed895   Jiri Slaby   fs: use rlimit he...
835
  	if (nfds > rlimit(RLIMIT_NOFILE))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
836
  		return -EINVAL;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
837
838
839
840
841
842
  	len = min_t(unsigned int, nfds, N_STACK_PPS);
  	for (;;) {
  		walk->next = NULL;
  		walk->len = len;
  		if (!len)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
843

252e5725c   Oleg Nesterov   do_sys_poll: simp...
844
845
846
847
848
849
850
  		if (copy_from_user(walk->entries, ufds + nfds-todo,
  					sizeof(struct pollfd) * walk->len))
  			goto out_fds;
  
  		todo -= walk->len;
  		if (!todo)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
851

252e5725c   Oleg Nesterov   do_sys_poll: simp...
852
853
854
855
856
  		len = min(todo, POLLFD_PER_PAGE);
  		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
  		walk = walk->next = kmalloc(size, GFP_KERNEL);
  		if (!walk) {
  			err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
857
858
  			goto out_fds;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
859
  	}
9f72949f6   David Woodhouse   [PATCH] Add psele...
860

252e5725c   Oleg Nesterov   do_sys_poll: simp...
861
  	poll_initwait(&table);
8ff3e8e85   Arjan van de Ven   select: switch se...
862
  	fdcount = do_poll(nfds, head, &table, end_time);
252e5725c   Oleg Nesterov   do_sys_poll: simp...
863
  	poll_freewait(&table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
864

252e5725c   Oleg Nesterov   do_sys_poll: simp...
865
  	for (walk = head; walk; walk = walk->next) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
866
867
  		struct pollfd *fds = walk->entries;
  		int j;
252e5725c   Oleg Nesterov   do_sys_poll: simp...
868
869
  		for (j = 0; j < walk->len; j++, ufds++)
  			if (__put_user(fds[j].revents, &ufds->revents))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
870
  				goto out_fds;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
871
    	}
252e5725c   Oleg Nesterov   do_sys_poll: simp...
872

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
873
  	err = fdcount;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
874
  out_fds:
252e5725c   Oleg Nesterov   do_sys_poll: simp...
875
876
877
878
879
  	walk = head->next;
  	while (walk) {
  		struct poll_list *pos = walk;
  		walk = walk->next;
  		kfree(pos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
880
  	}
252e5725c   Oleg Nesterov   do_sys_poll: simp...
881

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
882
883
  	return err;
  }
9f72949f6   David Woodhouse   [PATCH] Add psele...
884

3075d9da0   Chris Wright   Use ERESTART_REST...
885
886
  static long do_restart_poll(struct restart_block *restart_block)
  {
8ff3e8e85   Arjan van de Ven   select: switch se...
887
888
889
  	struct pollfd __user *ufds = restart_block->poll.ufds;
  	int nfds = restart_block->poll.nfds;
  	struct timespec *to = NULL, end_time;
3075d9da0   Chris Wright   Use ERESTART_REST...
890
  	int ret;
8ff3e8e85   Arjan van de Ven   select: switch se...
891
892
893
894
895
896
897
  	if (restart_block->poll.has_timeout) {
  		end_time.tv_sec = restart_block->poll.tv_sec;
  		end_time.tv_nsec = restart_block->poll.tv_nsec;
  		to = &end_time;
  	}
  
  	ret = do_sys_poll(ufds, nfds, to);
3075d9da0   Chris Wright   Use ERESTART_REST...
898
899
  	if (ret == -EINTR) {
  		restart_block->fn = do_restart_poll;
3075d9da0   Chris Wright   Use ERESTART_REST...
900
901
902
903
  		ret = -ERESTART_RESTARTBLOCK;
  	}
  	return ret;
  }
5a8a82b1d   Heiko Carstens   [CVE-2009-0029] S...
904
  SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
  		int, timeout_msecs)
  {
  	struct timespec end_time;
  	struct timespec *to = NULL;
  	struct restart_block *rb;
  	int ret;

  	/* A negative timeout means "wait forever": leave @to NULL. */
  	if (timeout_msecs >= 0) {
  		to = &end_time;
  		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
  			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
  	}

  	ret = do_sys_poll(ufds, nfds, to);
  	if (ret != -EINTR)
  		return ret;

  	/*
  	 * Interrupted: save the arguments and the end time in the
  	 * thread's restart block so do_restart_poll() can re-issue the
  	 * call.
  	 */
  	rb = &current_thread_info()->restart_block;
  	rb->fn = do_restart_poll;
  	rb->poll.ufds = ufds;
  	rb->poll.nfds = nfds;

  	if (timeout_msecs >= 0) {
  		rb->poll.tv_sec = end_time.tv_sec;
  		rb->poll.tv_nsec = end_time.tv_nsec;
  		rb->poll.has_timeout = 1;
  	} else {
  		rb->poll.has_timeout = 0;
  	}

  	return -ERESTART_RESTARTBLOCK;
  }
d4e82042c   Heiko Carstens   [CVE-2009-0029] S...
934
935
936
  SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
  		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
  		size_t, sigsetsize)
  {
  	sigset_t ksigmask, sigsaved;
  	struct timespec ts;
  	struct timespec end_time;
  	struct timespec *to = NULL;
  	int ret;

  	/* Optional timeout: copy it in and turn it into an end time. */
  	if (tsp) {
  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;

  		to = &end_time;
  		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
  			return -EINVAL;
  	}

  	/* Optional signal mask: install it for the duration of the wait. */
  	if (sigmask) {
  		/* XXX: Don't preclude handling different sized sigset_t's.  */
  		if (sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
  			return -EFAULT;

  		/* SIGKILL and SIGSTOP are removed from the requested mask. */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = do_sys_poll(ufds, nfds, to);

  	if (ret != -EINTR) {
  		if (sigmask)
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  	} else {
  		/*
  		 * We can restart this syscall, usually.  Do not restore
  		 * the signal mask yet: do_signal() must deliver the
  		 * signal on the way back to user space before the old
  		 * mask is put back.
  		 */
  		if (sigmask) {
  			current->saved_sigmask = sigsaved;
  			set_restore_sigmask();
  		}
  		ret = -ERESTARTNOHAND;
  	}

  	return poll_select_copy_remaining(&end_time, tsp, 0, ret);
  }