Blame view

fs/eventpoll.c 37.5 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
  /*
c7ea76302   Davide Libenzi   epoll locks chang...
2
3
   *  fs/eventpoll.c (Efficent event polling implementation)
   *  Copyright (C) 2001,...,2007	 Davide Libenzi
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4
5
6
7
8
9
10
11
12
   *
   *  This program is free software; you can redistribute it and/or modify
   *  it under the terms of the GNU General Public License as published by
   *  the Free Software Foundation; either version 2 of the License, or
   *  (at your option) any later version.
   *
   *  Davide Libenzi <davidel@xmailserver.org>
   *
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
14
15
16
17
18
19
20
21
22
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/signal.h>
  #include <linux/errno.h>
  #include <linux/mm.h>
  #include <linux/slab.h>
  #include <linux/poll.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
25
26
27
  #include <linux/string.h>
  #include <linux/list.h>
  #include <linux/hash.h>
  #include <linux/spinlock.h>
  #include <linux/syscalls.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
28
29
30
31
32
  #include <linux/rbtree.h>
  #include <linux/wait.h>
  #include <linux/eventpoll.h>
  #include <linux/mount.h>
  #include <linux/bitops.h>
144efe3e3   Arjan van de Ven   [PATCH] sem2mutex...
33
  #include <linux/mutex.h>
da66f7cb0   Davide Libenzi   epoll: use anonym...
34
  #include <linux/anon_inodes.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
35
36
37
38
39
  #include <asm/uaccess.h>
  #include <asm/system.h>
  #include <asm/io.h>
  #include <asm/mman.h>
  #include <asm/atomic.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
41
42
43
44
  /*
   * LOCKING:
   * There are three level of locking required by epoll :
   *
144efe3e3   Arjan van de Ven   [PATCH] sem2mutex...
45
   * 1) epmutex (mutex)
c7ea76302   Davide Libenzi   epoll locks chang...
46
47
   * 2) ep->mtx (mutex)
   * 3) ep->lock (spinlock)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
49
50
51
52
53
54
55
56
   *
   * The acquire order is the one listed above, from 1 to 3.
   * We need a spinlock (ep->lock) because we manipulate objects
   * from inside the poll callback, that might be triggered from
   * a wake_up() that in turn might be called from IRQ context.
   * So we can't sleep inside the poll callback and hence we need
   * a spinlock. During the event transfer loop (from kernel to
   * user space) we could end up sleeping due a copy_to_user(), so
   * we need a lock that will allow us to sleep. This lock is a
d47de16c7   Davide Libenzi   fix epoll single ...
57
58
59
60
61
   * mutex (ep->mtx). It is acquired during the event transfer loop,
   * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
   * Then we also need a global mutex to serialize eventpoll_release_file()
   * and ep_free().
   * This mutex is acquired by ep_free() during the epoll file
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
62
63
64
   * cleanup path and it is also acquired by eventpoll_release_file()
   * if a file has been pushed inside an epoll set and it is then
   * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
d47de16c7   Davide Libenzi   fix epoll single ...
65
66
67
   * It is possible to drop the "ep->mtx" and to use the global
   * mutex "epmutex" (together with "ep->lock") to have it working,
   * but having "ep->mtx" will make the interface more scalable.
144efe3e3   Arjan van de Ven   [PATCH] sem2mutex...
68
   * Events that require holding "epmutex" are very rare, while for
d47de16c7   Davide Libenzi   fix epoll single ...
69
70
   * normal operations the epoll private "ep->mtx" will guarantee
   * a better scalability.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
71
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
  #define DEBUG_EPOLL 0
  
  #if DEBUG_EPOLL > 0
  #define DPRINTK(x) printk x
  #define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
  #else /* #if DEBUG_EPOLL > 0 */
  #define DPRINTK(x) (void) 0
  #define DNPRINTK(n, x) (void) 0
  #endif /* #if DEBUG_EPOLL > 0 */
  
  #define DEBUG_EPI 0
  
  #if DEBUG_EPI != 0
  #define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
  #else /* #if DEBUG_EPI != 0 */
  #define EPI_SLAB_DEBUG 0
  #endif /* #if DEBUG_EPI != 0 */
  
  /* Epoll private bits inside the event mask */
  #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
  
  /* Maximum number of poll wake up nests we are allowing */
  #define EP_MAX_POLLWAKE_NESTS 4
e3306dd5f   Davide Libenzi   [PATCH] epoll: ha...
95
96
  /* Maximum msec timeout value storeable in a long int */
  #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
97
  #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
d47de16c7   Davide Libenzi   fix epoll single ...
98
  #define EP_UNACTIVE_PTR ((void *) -1L)
7ef9964e6   Davide Libenzi   epoll: introduce ...
99
  #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
100
101
102
103
104
105
106
107
108
  struct epoll_filefd {
  	struct file *file;
  	int fd;
  };
  
  /*
   * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
   * It is used to keep track on all tasks that are currently inside the wake_up() code
   * to 1) short-circuit the one coming from the same task and same wait queue head
d47de16c7   Davide Libenzi   fix epoll single ...
109
   * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
110
111
112
113
   * 3) let go the ones coming from other tasks.
   */
  struct wake_task_node {
  	struct list_head llink;
36c8b5868   Ingo Molnar   [PATCH] sched: cl...
114
  	struct task_struct *task;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
115
116
117
118
119
120
121
122
123
124
125
126
127
  	wait_queue_head_t *wq;
  };
  
  /*
   * This is used to implement the safe poll wake up avoiding to reenter
   * the poll callback from inside wake_up().
   */
  struct poll_safewake {
  	struct list_head wake_task_list;
  	spinlock_t lock;
  };
  
  /*
d47de16c7   Davide Libenzi   fix epoll single ...
128
129
130
131
   * Each file descriptor added to the eventpoll interface will
   * have an entry of this type linked to the "rbr" RB tree.
   */
  struct epitem {
67647d0fb   Davide Libenzi   epoll: fix some c...
132
  	/* RB tree node used to link this structure to the eventpoll RB tree */
d47de16c7   Davide Libenzi   fix epoll single ...
133
134
135
136
  	struct rb_node rbn;
  
  	/* List header used to link this structure to the eventpoll ready list */
  	struct list_head rdllink;
c7ea76302   Davide Libenzi   epoll locks chang...
137
138
139
140
141
  	/*
  	 * Works together "struct eventpoll"->ovflist in keeping the
  	 * single linked chain of items.
  	 */
  	struct epitem *next;
d47de16c7   Davide Libenzi   fix epoll single ...
142
143
144
145
146
147
148
149
150
151
152
  	/* The file descriptor information this item refers to */
  	struct epoll_filefd ffd;
  
  	/* Number of active wait queue attached to poll operations */
  	int nwait;
  
  	/* List containing poll wait queues */
  	struct list_head pwqlist;
  
  	/* The "container" of this item */
  	struct eventpoll *ep;
d47de16c7   Davide Libenzi   fix epoll single ...
153
154
  	/* List header used to link this item to the "struct file" items list */
  	struct list_head fllink;
c7ea76302   Davide Libenzi   epoll locks chang...
155
156
  	/* The structure that describe the interested events and the source fd */
  	struct epoll_event event;
d47de16c7   Davide Libenzi   fix epoll single ...
157
158
159
  };
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
160
161
162
163
164
165
   * This structure is stored inside the "private_data" member of the file
   * structure and rapresent the main data sructure for the eventpoll
   * interface.
   */
  struct eventpoll {
  	/* Protect the this structure access */
c7ea76302   Davide Libenzi   epoll locks chang...
166
  	spinlock_t lock;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
167
168
  
  	/*
d47de16c7   Davide Libenzi   fix epoll single ...
169
170
171
172
  	 * This mutex is used to ensure that files are not removed
  	 * while epoll is using them. This is held during the event
  	 * collection loop, the file cleanup path, the epoll file exit
  	 * code and the ctl operations.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
173
  	 */
d47de16c7   Davide Libenzi   fix epoll single ...
174
  	struct mutex mtx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
175
176
177
178
179
180
181
182
183
  
  	/* Wait queue used by sys_epoll_wait() */
  	wait_queue_head_t wq;
  
  	/* Wait queue used by file->poll() */
  	wait_queue_head_t poll_wait;
  
  	/* List of ready file descriptors */
  	struct list_head rdllist;
67647d0fb   Davide Libenzi   epoll: fix some c...
184
  	/* RB tree root used to store monitored fd structs */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
185
  	struct rb_root rbr;
d47de16c7   Davide Libenzi   fix epoll single ...
186
187
188
189
190
191
192
  
  	/*
  	 * This is a single linked list that chains all the "struct epitem" that
  	 * happened while transfering ready events to userspace w/out
  	 * holding ->lock.
  	 */
  	struct epitem *ovflist;
7ef9964e6   Davide Libenzi   epoll: introduce ...
193
194
195
  
  	/* The user that created the eventpoll descriptor */
  	struct user_struct *user;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
  };
  
  /* Wait structure used by the poll hooks */
  struct eppoll_entry {
  	/* List header used to link this structure to the "struct epitem" */
  	struct list_head llink;
  
  	/* The "base" pointer is set to the container "struct epitem" */
  	void *base;
  
  	/*
  	 * Wait queue item that will be linked to the target file wait
  	 * queue head.
  	 */
  	wait_queue_t wait;
  
  	/* The wait queue head that linked the "wait" wait queue item */
  	wait_queue_head_t *whead;
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
215
216
217
218
219
  /* Wrapper struct used by poll queueing */
  struct ep_pqueue {
  	poll_table pt;
  	struct epitem *epi;
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
220
  /*
7ef9964e6   Davide Libenzi   epoll: introduce ...
221
222
223
224
225
226
227
228
   * Configuration options available inside /proc/sys/fs/epoll/
   */
  /* Maximum number of epoll devices, per user */
  static int max_user_instances __read_mostly;
  /* Maximum number of epoll watched descriptors, per user */
  static int max_user_watches __read_mostly;
  
  /*
d47de16c7   Davide Libenzi   fix epoll single ...
229
   * This mutex is used to serialize ep_free() and eventpoll_release_file().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
230
   */
7ef9964e6   Davide Libenzi   epoll: introduce ...
231
  static DEFINE_MUTEX(epmutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
232
233
234
235
236
  
  /* Safe wake up implementation */
  static struct poll_safewake psw;
  
  /* Slab cache used to allocate "struct epitem" */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
237
  static struct kmem_cache *epi_cache __read_mostly;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
238
239
  
  /* Slab cache used to allocate "struct eppoll_entry" */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
240
  static struct kmem_cache *pwq_cache __read_mostly;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
241

7ef9964e6   Davide Libenzi   epoll: introduce ...
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
  #ifdef CONFIG_SYSCTL
  
  #include <linux/sysctl.h>
  
  static int zero;
  
  ctl_table epoll_table[] = {
  	{
  		.procname	= "max_user_instances",
  		.data		= &max_user_instances,
  		.maxlen		= sizeof(int),
  		.mode		= 0644,
  		.proc_handler	= &proc_dointvec_minmax,
  		.extra1		= &zero,
  	},
  	{
  		.procname	= "max_user_watches",
  		.data		= &max_user_watches,
  		.maxlen		= sizeof(int),
  		.mode		= 0644,
  		.proc_handler	= &proc_dointvec_minmax,
  		.extra1		= &zero,
  	},
  	{ .ctl_name = 0 }
  };
  #endif /* CONFIG_SYSCTL */
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
268

67647d0fb   Davide Libenzi   epoll: fix some c...
269
  /* Setup the structure that is used as key for the RB tree */
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
270
271
272
273
274
275
  static inline void ep_set_ffd(struct epoll_filefd *ffd,
  			      struct file *file, int fd)
  {
  	ffd->file = file;
  	ffd->fd = fd;
  }
67647d0fb   Davide Libenzi   epoll: fix some c...
276
  /* Compare RB tree keys */
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
277
278
279
280
281
282
  static inline int ep_cmp_ffd(struct epoll_filefd *p1,
  			     struct epoll_filefd *p2)
  {
  	return (p1->file > p2->file ? +1:
  	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
  }
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
283
284
285
286
287
288
289
  /* Tells us if the item is currently linked */
  static inline int ep_is_linked(struct list_head *p)
  {
  	return !list_empty(p);
  }
  
  /* Get the "struct epitem" from a wait queue pointer */
cdac75e6f   Davide Libenzi   epoll: avoid kmem...
290
  static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
291
292
293
294
295
  {
  	return container_of(p, struct eppoll_entry, wait)->base;
  }
  
  /* Get the "struct epitem" from an epoll queue wrapper */
cdac75e6f   Davide Libenzi   epoll: avoid kmem...
296
  static inline struct epitem *ep_item_from_epqueue(poll_table *p)
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
297
298
299
300
301
  {
  	return container_of(p, struct ep_pqueue, pt)->epi;
  }
  
  /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
6192bd536   Davide Libenzi   epoll: optimizati...
302
  static inline int ep_op_has_event(int op)
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
303
304
305
  {
  	return op != EPOLL_CTL_DEL;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306
307
308
309
310
311
312
  /* Initialize the poll safe wake up structure */
  static void ep_poll_safewake_init(struct poll_safewake *psw)
  {
  
  	INIT_LIST_HEAD(&psw->wake_task_list);
  	spin_lock_init(&psw->lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
  /*
   * Perform a safe wake up of the poll wait list. The problem is that
   * with the new callback'd wake up system, it is possible that the
   * poll callback is reentered from inside the call to wake_up() done
   * on the poll wait queue head. The rule is that we cannot reenter the
   * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
   * and we cannot reenter the same wait queue head at all. This will
   * enable to have a hierarchy of epoll file descriptor of no more than
   * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
   * because this one gets called by the poll callback, that in turn is called
   * from inside a wake_up(), that might be called from irq context.
   */
  static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
  {
  	int wake_nests = 0;
  	unsigned long flags;
36c8b5868   Ingo Molnar   [PATCH] sched: cl...
329
  	struct task_struct *this_task = current;
b70c39409   Matthias Kaehlcke   fs/eventpoll.c: u...
330
  	struct list_head *lsthead = &psw->wake_task_list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
331
332
333
334
335
336
  	struct wake_task_node *tncur;
  	struct wake_task_node tnode;
  
  	spin_lock_irqsave(&psw->lock, flags);
  
  	/* Try to see if the current task is already inside this wakeup call */
b70c39409   Matthias Kaehlcke   fs/eventpoll.c: u...
337
  	list_for_each_entry(tncur, lsthead, llink) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
  
  		if (tncur->wq == wq ||
  		    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
  			/*
  			 * Ops ... loop detected or maximum nest level reached.
  			 * We abort this wake by breaking the cycle itself.
  			 */
  			spin_unlock_irqrestore(&psw->lock, flags);
  			return;
  		}
  	}
  
  	/* Add the current task to the list */
  	tnode.task = this_task;
  	tnode.wq = wq;
  	list_add(&tnode.llink, lsthead);
  
  	spin_unlock_irqrestore(&psw->lock, flags);
  
  	/* Do really wake up now */
0ccf831cb   Peter Zijlstra   lockdep: annotate...
358
  	wake_up_nested(wq, 1 + wake_nests);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
359
360
361
362
363
364
  
  	/* Remove the current task from the list */
  	spin_lock_irqsave(&psw->lock, flags);
  	list_del(&tnode.llink);
  	spin_unlock_irqrestore(&psw->lock, flags);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
365
  /*
7699acd13   Davide Libenzi   epoll cleanups: e...
366
367
368
   * This function unregister poll callbacks from the associated file descriptor.
   * Since this must be called without holding "ep->lock" the atomic exchange trick
   * will protect us from multiple unregister.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
369
   */
7699acd13   Davide Libenzi   epoll cleanups: e...
370
  static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
371
  {
7699acd13   Davide Libenzi   epoll cleanups: e...
372
373
374
  	int nwait;
  	struct list_head *lsthead = &epi->pwqlist;
  	struct eppoll_entry *pwq;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
375

7699acd13   Davide Libenzi   epoll cleanups: e...
376
377
  	/* This is called without locks, so we need the atomic exchange */
  	nwait = xchg(&epi->nwait, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
378

7699acd13   Davide Libenzi   epoll cleanups: e...
379
380
381
  	if (nwait) {
  		while (!list_empty(lsthead)) {
  			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
382

7699acd13   Davide Libenzi   epoll cleanups: e...
383
384
385
386
  			list_del_init(&pwq->llink);
  			remove_wait_queue(pwq->whead, &pwq->wait);
  			kmem_cache_free(pwq_cache, pwq);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
387
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
388
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
389
  /*
7699acd13   Davide Libenzi   epoll cleanups: e...
390
   * Removes a "struct epitem" from the eventpoll RB tree and deallocates
c7ea76302   Davide Libenzi   epoll locks chang...
391
   * all the associated resources. Must be called with "mtx" held.
7699acd13   Davide Libenzi   epoll cleanups: e...
392
393
394
   */
  static int ep_remove(struct eventpoll *ep, struct epitem *epi)
  {
7699acd13   Davide Libenzi   epoll cleanups: e...
395
396
  	unsigned long flags;
  	struct file *file = epi->ffd.file;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
397
398
  
  	/*
7699acd13   Davide Libenzi   epoll cleanups: e...
399
400
401
402
403
404
  	 * Removes poll wait queue hooks. We _have_ to do this without holding
  	 * the "ep->lock" otherwise a deadlock might occur. This because of the
  	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
  	 * queue head lock when unregistering the wait queue. The wakeup callback
  	 * will run by holding the wait queue head lock and will call our callback
  	 * that will try to get "ep->lock".
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
405
  	 */
7699acd13   Davide Libenzi   epoll cleanups: e...
406
  	ep_unregister_pollwait(ep, epi);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
407

7699acd13   Davide Libenzi   epoll cleanups: e...
408
409
410
411
412
  	/* Remove the current item from the list of epoll hooks */
  	spin_lock(&file->f_ep_lock);
  	if (ep_is_linked(&epi->fllink))
  		list_del_init(&epi->fllink);
  	spin_unlock(&file->f_ep_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
413

cdac75e6f   Davide Libenzi   epoll: avoid kmem...
414
  	rb_erase(&epi->rbn, &ep->rbr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
415

c7ea76302   Davide Libenzi   epoll locks chang...
416
417
418
419
  	spin_lock_irqsave(&ep->lock, flags);
  	if (ep_is_linked(&epi->rdllink))
  		list_del_init(&epi->rdllink);
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
420

7699acd13   Davide Libenzi   epoll cleanups: e...
421
  	/* At this point it is safe to free the eventpoll item */
c7ea76302   Davide Libenzi   epoll locks chang...
422
  	kmem_cache_free(epi_cache, epi);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
423

7ef9964e6   Davide Libenzi   epoll: introduce ...
424
  	atomic_dec(&ep->user->epoll_watches);
c7ea76302   Davide Libenzi   epoll locks chang...
425
426
427
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)
  ",
  		     current, ep, file));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
428

c7ea76302   Davide Libenzi   epoll locks chang...
429
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430
  }
7699acd13   Davide Libenzi   epoll cleanups: e...
431
  static void ep_free(struct eventpoll *ep)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
  {
7699acd13   Davide Libenzi   epoll cleanups: e...
433
434
  	struct rb_node *rbp;
  	struct epitem *epi;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
435

7699acd13   Davide Libenzi   epoll cleanups: e...
436
437
438
  	/* We need to release all tasks waiting for these file */
  	if (waitqueue_active(&ep->poll_wait))
  		ep_poll_safewake(&psw, &ep->poll_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
439

7699acd13   Davide Libenzi   epoll cleanups: e...
440
441
442
  	/*
  	 * We need to lock this because we could be hit by
  	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
d47de16c7   Davide Libenzi   fix epoll single ...
443
  	 * We do not need to hold "ep->mtx" here because the epoll file
7699acd13   Davide Libenzi   epoll cleanups: e...
444
445
446
447
448
  	 * is on the way to be removed and no one has references to it
  	 * anymore. The only hit might come from eventpoll_release_file() but
  	 * holding "epmutex" is sufficent here.
  	 */
  	mutex_lock(&epmutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
449
450
  
  	/*
7699acd13   Davide Libenzi   epoll cleanups: e...
451
  	 * Walks through the whole tree by unregistering poll callbacks.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
452
  	 */
7699acd13   Davide Libenzi   epoll cleanups: e...
453
454
455
456
457
  	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
  		epi = rb_entry(rbp, struct epitem, rbn);
  
  		ep_unregister_pollwait(ep, epi);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
458
459
  
  	/*
7699acd13   Davide Libenzi   epoll cleanups: e...
460
461
  	 * Walks through the whole tree by freeing each "struct epitem". At this
  	 * point we are sure no poll callbacks will be lingering around, and also by
d47de16c7   Davide Libenzi   fix epoll single ...
462
  	 * holding "epmutex" we can be sure that no file cleanup code will hit
7699acd13   Davide Libenzi   epoll cleanups: e...
463
  	 * us during this operation. So we can avoid the lock on "ep->lock".
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
464
  	 */
c80544dc0   Stephen Hemminger   sparse pointer us...
465
  	while ((rbp = rb_first(&ep->rbr)) != NULL) {
7699acd13   Davide Libenzi   epoll cleanups: e...
466
467
468
  		epi = rb_entry(rbp, struct epitem, rbn);
  		ep_remove(ep, epi);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
469

7699acd13   Davide Libenzi   epoll cleanups: e...
470
  	mutex_unlock(&epmutex);
d47de16c7   Davide Libenzi   fix epoll single ...
471
  	mutex_destroy(&ep->mtx);
7ef9964e6   Davide Libenzi   epoll: introduce ...
472
473
  	atomic_dec(&ep->user->epoll_devs);
  	free_uid(ep->user);
f0ee9aabb   Davide Libenzi   epoll: move kfree...
474
  	kfree(ep);
7699acd13   Davide Libenzi   epoll cleanups: e...
475
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
476

7699acd13   Davide Libenzi   epoll cleanups: e...
477
478
479
  static int ep_eventpoll_release(struct inode *inode, struct file *file)
  {
  	struct eventpoll *ep = file->private_data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
480

f0ee9aabb   Davide Libenzi   epoll: move kfree...
481
  	if (ep)
7699acd13   Davide Libenzi   epoll cleanups: e...
482
  		ep_free(ep);
7699acd13   Davide Libenzi   epoll cleanups: e...
483
484
485
486
  
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p
  ", current, ep));
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
487
  }
7699acd13   Davide Libenzi   epoll cleanups: e...
488
489
490
491
492
  static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
  {
  	unsigned int pollflags = 0;
  	unsigned long flags;
  	struct eventpoll *ep = file->private_data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
493

7699acd13   Davide Libenzi   epoll cleanups: e...
494
495
496
497
  	/* Insert inside our poll wait queue */
  	poll_wait(file, &ep->poll_wait, wait);
  
  	/* Check our condition */
c7ea76302   Davide Libenzi   epoll locks chang...
498
  	spin_lock_irqsave(&ep->lock, flags);
7699acd13   Davide Libenzi   epoll cleanups: e...
499
500
  	if (!list_empty(&ep->rdllist))
  		pollflags = POLLIN | POLLRDNORM;
c7ea76302   Davide Libenzi   epoll locks chang...
501
  	spin_unlock_irqrestore(&ep->lock, flags);
7699acd13   Davide Libenzi   epoll cleanups: e...
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
  
  	return pollflags;
  }
  
  /* File callbacks that implement the eventpoll file behaviour */
  static const struct file_operations eventpoll_fops = {
  	.release	= ep_eventpoll_release,
  	.poll		= ep_eventpoll_poll
  };
  
  /* Fast test to see if the file is an evenpoll file */
  static inline int is_file_epoll(struct file *f)
  {
  	return f->f_op == &eventpoll_fops;
  }
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
517
518
  
  /*
7699acd13   Davide Libenzi   epoll cleanups: e...
519
520
521
   * This is called from eventpoll_release() to unlink files from the eventpoll
   * interface. We need to have this facility to cleanup correctly files that are
   * closed without being removed from the eventpoll interface.
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
522
   */
7699acd13   Davide Libenzi   epoll cleanups: e...
523
  void eventpoll_release_file(struct file *file)
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
524
  {
7699acd13   Davide Libenzi   epoll cleanups: e...
525
526
527
  	struct list_head *lsthead = &file->f_ep_links;
  	struct eventpoll *ep;
  	struct epitem *epi;
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
528
529
  
  	/*
7699acd13   Davide Libenzi   epoll cleanups: e...
530
531
532
  	 * We don't want to get "file->f_ep_lock" because it is not
  	 * necessary. It is not necessary because we're in the "struct file"
  	 * cleanup path, and this means that noone is using this file anymore.
67647d0fb   Davide Libenzi   epoll: fix some c...
533
534
  	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
  	 * point, the file counter already went to zero and fget() would fail.
d47de16c7   Davide Libenzi   fix epoll single ...
535
  	 * The only hit might come from ep_free() but by holding the mutex
7699acd13   Davide Libenzi   epoll cleanups: e...
536
  	 * will correctly serialize the operation. We do need to acquire
d47de16c7   Davide Libenzi   fix epoll single ...
537
  	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
7699acd13   Davide Libenzi   epoll cleanups: e...
538
  	 * from anywhere but ep_free().
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
539
  	 */
7699acd13   Davide Libenzi   epoll cleanups: e...
540
  	mutex_lock(&epmutex);
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
541

7699acd13   Davide Libenzi   epoll cleanups: e...
542
543
  	while (!list_empty(lsthead)) {
  		epi = list_first_entry(lsthead, struct epitem, fllink);
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
544

7699acd13   Davide Libenzi   epoll cleanups: e...
545
546
  		ep = epi->ep;
  		list_del_init(&epi->fllink);
d47de16c7   Davide Libenzi   fix epoll single ...
547
  		mutex_lock(&ep->mtx);
7699acd13   Davide Libenzi   epoll cleanups: e...
548
  		ep_remove(ep, epi);
d47de16c7   Davide Libenzi   fix epoll single ...
549
  		mutex_unlock(&ep->mtx);
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
550
  	}
7699acd13   Davide Libenzi   epoll cleanups: e...
551
  	mutex_unlock(&epmutex);
b611967de   Davide Libenzi   [PATCH] epoll_pwa...
552
  }
53d2be79d   Davide Libenzi   [PATCH] epoll: fi...
553
  static int ep_alloc(struct eventpoll **pep)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
554
  {
7ef9964e6   Davide Libenzi   epoll: introduce ...
555
556
557
  	int error;
  	struct user_struct *user;
  	struct eventpoll *ep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
558

7ef9964e6   Davide Libenzi   epoll: introduce ...
559
560
561
562
563
564
565
566
567
  	user = get_current_user();
  	error = -EMFILE;
  	if (unlikely(atomic_read(&user->epoll_devs) >=
  			max_user_instances))
  		goto free_uid;
  	error = -ENOMEM;
  	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
  	if (unlikely(!ep))
  		goto free_uid;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
568

c7ea76302   Davide Libenzi   epoll locks chang...
569
  	spin_lock_init(&ep->lock);
d47de16c7   Davide Libenzi   fix epoll single ...
570
  	mutex_init(&ep->mtx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
571
572
573
574
  	init_waitqueue_head(&ep->wq);
  	init_waitqueue_head(&ep->poll_wait);
  	INIT_LIST_HEAD(&ep->rdllist);
  	ep->rbr = RB_ROOT;
d47de16c7   Davide Libenzi   fix epoll single ...
575
  	ep->ovflist = EP_UNACTIVE_PTR;
7ef9964e6   Davide Libenzi   epoll: introduce ...
576
  	ep->user = user;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
577

53d2be79d   Davide Libenzi   [PATCH] epoll: fi...
578
  	*pep = ep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
579

53d2be79d   Davide Libenzi   [PATCH] epoll: fi...
580
581
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p
  ",
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
582
583
  		     current, ep));
  	return 0;
7ef9964e6   Davide Libenzi   epoll: introduce ...
584
585
586
587
  
  free_uid:
  	free_uid(user);
  	return error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
588
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
589
  /*
c7ea76302   Davide Libenzi   epoll locks chang...
590
591
592
   * Search the file inside the eventpoll tree. The RB tree operations
   * are protected by the "mtx" mutex, and ep_find() must be called with
   * "mtx" held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
593
594
595
596
   */
  static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
  {
  	int kcmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
597
598
599
  	struct rb_node *rbp;
  	struct epitem *epi, *epir = NULL;
  	struct epoll_filefd ffd;
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
600
  	ep_set_ffd(&ffd, file, fd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
601
602
  	for (rbp = ep->rbr.rb_node; rbp; ) {
  		epi = rb_entry(rbp, struct epitem, rbn);
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
603
  		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
604
605
606
607
608
  		if (kcmp > 0)
  			rbp = rbp->rb_right;
  		else if (kcmp < 0)
  			rbp = rbp->rb_left;
  		else {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
609
610
611
612
  			epir = epi;
  			break;
  		}
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
613
614
615
616
617
618
619
  
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p
  ",
  		     current, file, epir));
  
  	return epir;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
620
  /*
7699acd13   Davide Libenzi   epoll cleanups: e...
621
622
623
   * This is the callback that is passed to the wait queue wakeup
   * machanism. It is called by the stored file descriptors when they
   * have events to report.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
624
   */
7699acd13   Davide Libenzi   epoll cleanups: e...
625
  static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
626
  {
7699acd13   Davide Libenzi   epoll cleanups: e...
627
628
629
630
  	int pwake = 0;
  	unsigned long flags;
  	struct epitem *epi = ep_item_from_wait(wait);
  	struct eventpoll *ep = epi->ep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
631

7699acd13   Davide Libenzi   epoll cleanups: e...
632
633
634
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p
  ",
  		     current, epi->ffd.file, epi, ep));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
635

c7ea76302   Davide Libenzi   epoll locks chang...
636
  	spin_lock_irqsave(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
637

7699acd13   Davide Libenzi   epoll cleanups: e...
638
639
640
641
642
643
644
  	/*
  	 * If the event mask does not contain any poll(2) event, we consider the
  	 * descriptor to be disabled. This condition is likely the effect of the
  	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
  	 * until the next EPOLL_CTL_MOD will be issued.
  	 */
  	if (!(epi->event.events & ~EP_PRIVATE_BITS))
d47de16c7   Davide Libenzi   fix epoll single ...
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
  		goto out_unlock;
  
  	/*
  	 * If we are trasfering events to userspace, we can hold no locks
  	 * (because we're accessing user memory, and because of linux f_op->poll()
  	 * semantics). All the events that happens during that period of time are
  	 * chained in ep->ovflist and requeued later on.
  	 */
  	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
  		if (epi->next == EP_UNACTIVE_PTR) {
  			epi->next = ep->ovflist;
  			ep->ovflist = epi;
  		}
  		goto out_unlock;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
660

7699acd13   Davide Libenzi   epoll cleanups: e...
661
662
663
664
665
666
667
668
669
670
671
672
  	/* If this file is already in the ready list we exit soon */
  	if (ep_is_linked(&epi->rdllink))
  		goto is_linked;
  
  	list_add_tail(&epi->rdllink, &ep->rdllist);
  
  is_linked:
  	/*
  	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
  	 * wait list.
  	 */
  	if (waitqueue_active(&ep->wq))
4a6e9e2ce   Matthew Wilcox   Use wake_up_locke...
673
  		wake_up_locked(&ep->wq);
7699acd13   Davide Libenzi   epoll cleanups: e...
674
675
  	if (waitqueue_active(&ep->poll_wait))
  		pwake++;
d47de16c7   Davide Libenzi   fix epoll single ...
676
  out_unlock:
c7ea76302   Davide Libenzi   epoll locks chang...
677
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
678

7699acd13   Davide Libenzi   epoll cleanups: e...
679
680
681
682
683
684
  	/* We have to call this outside the lock */
  	if (pwake)
  		ep_poll_safewake(&psw, &ep->poll_wait);
  
  	return 1;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
685
686
687
688
689
690
691
692
  
  /*
   * This is the callback that is used to add our wait queue to the
   * target file wakeup lists.
   */
  static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
  				 poll_table *pt)
  {
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
693
  	struct epitem *epi = ep_item_from_epqueue(pt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
694
  	struct eppoll_entry *pwq;
e94b17660   Christoph Lameter   [PATCH] slab: rem...
695
  	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
696
697
698
699
700
701
702
703
704
705
706
  		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
  		pwq->whead = whead;
  		pwq->base = epi;
  		add_wait_queue(whead, &pwq->wait);
  		list_add_tail(&pwq->llink, &epi->pwqlist);
  		epi->nwait++;
  	} else {
  		/* We have to signal that an error occurred */
  		epi->nwait = -1;
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
707
708
709
710
711
712
713
714
715
  static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
  {
  	int kcmp;
  	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
  	struct epitem *epic;
  
  	while (*p) {
  		parent = *p;
  		epic = rb_entry(parent, struct epitem, rbn);
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
716
  		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
717
718
719
720
721
722
723
724
  		if (kcmp > 0)
  			p = &parent->rb_right;
  		else
  			p = &parent->rb_left;
  	}
  	rb_link_node(&epi->rbn, parent, p);
  	rb_insert_color(&epi->rbn, &ep->rbr);
  }
c7ea76302   Davide Libenzi   epoll locks chang...
725
726
727
  /*
   * Must be called with "mtx" held.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
728
729
730
731
732
733
734
  static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
  		     struct file *tfile, int fd)
  {
  	int error, revents, pwake = 0;
  	unsigned long flags;
  	struct epitem *epi;
  	struct ep_pqueue epq;
7ef9964e6   Davide Libenzi   epoll: introduce ...
735
736
737
  	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
  		     max_user_watches))
  		return -ENOSPC;
e94b17660   Christoph Lameter   [PATCH] slab: rem...
738
  	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
7ef9964e6   Davide Libenzi   epoll: introduce ...
739
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
740
741
  
  	/* Item initialization follow here ... */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
742
743
  	INIT_LIST_HEAD(&epi->rdllink);
  	INIT_LIST_HEAD(&epi->fllink);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
744
745
  	INIT_LIST_HEAD(&epi->pwqlist);
  	epi->ep = ep;
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
746
  	ep_set_ffd(&epi->ffd, tfile, fd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
747
  	epi->event = *event;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
748
  	epi->nwait = 0;
d47de16c7   Davide Libenzi   fix epoll single ...
749
  	epi->next = EP_UNACTIVE_PTR;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
750
751
752
753
754
755
756
757
  
  	/* Initialize the poll table using the queue callback */
  	epq.epi = epi;
  	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
  
  	/*
  	 * Attach the item to the poll hooks and get current event bits.
  	 * We can safely use the file* here because its usage count has
c7ea76302   Davide Libenzi   epoll locks chang...
758
759
760
  	 * been increased by the caller of this function. Note that after
  	 * this operation completes, the poll callback can start hitting
  	 * the new item.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
761
762
763
764
765
766
767
768
  	 */
  	revents = tfile->f_op->poll(tfile, &epq.pt);
  
  	/*
  	 * We have to check if something went wrong during the poll wait queue
  	 * install process. Namely an allocation for a wait queue failed due
  	 * high memory pressure.
  	 */
7ef9964e6   Davide Libenzi   epoll: introduce ...
769
  	error = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
770
  	if (epi->nwait < 0)
7699acd13   Davide Libenzi   epoll cleanups: e...
771
  		goto error_unregister;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
772
773
774
775
776
  
  	/* Add the current item to the list of active epoll hook for this file */
  	spin_lock(&tfile->f_ep_lock);
  	list_add_tail(&epi->fllink, &tfile->f_ep_links);
  	spin_unlock(&tfile->f_ep_lock);
c7ea76302   Davide Libenzi   epoll locks chang...
777
778
779
780
  	/*
  	 * Add the current item to the RB tree. All RB tree operations are
  	 * protected by "mtx", and ep_insert() is called with "mtx" held.
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
781
  	ep_rbtree_insert(ep, epi);
c7ea76302   Davide Libenzi   epoll locks chang...
782
783
  	/* We have to drop the new item inside our item list to keep track of it */
  	spin_lock_irqsave(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
784
  	/* If the file is already "ready" we drop it inside the ready list */
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
785
  	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
786
787
788
789
  		list_add_tail(&epi->rdllink, &ep->rdllist);
  
  		/* Notify waiting tasks that events are available */
  		if (waitqueue_active(&ep->wq))
4a6e9e2ce   Matthew Wilcox   Use wake_up_locke...
790
  			wake_up_locked(&ep->wq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
791
792
793
  		if (waitqueue_active(&ep->poll_wait))
  			pwake++;
  	}
c7ea76302   Davide Libenzi   epoll locks chang...
794
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
795

7ef9964e6   Davide Libenzi   epoll: introduce ...
796
  	atomic_inc(&ep->user->epoll_watches);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
797
798
799
800
801
802
803
804
805
  	/* We have to call this outside the lock */
  	if (pwake)
  		ep_poll_safewake(&psw, &ep->poll_wait);
  
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)
  ",
  		     current, ep, tfile, fd));
  
  	return 0;
7699acd13   Davide Libenzi   epoll cleanups: e...
806
  error_unregister:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
807
808
809
810
  	ep_unregister_pollwait(ep, epi);
  
  	/*
  	 * We need to do this because an event could have been arrived on some
67647d0fb   Davide Libenzi   epoll: fix some c...
811
812
813
  	 * allocated wait queue. Note that we don't care about the ep->ovflist
  	 * list, since that is used/cleaned only inside a section bound by "mtx".
  	 * And ep_insert() is called with "mtx" held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
814
  	 */
c7ea76302   Davide Libenzi   epoll locks chang...
815
  	spin_lock_irqsave(&ep->lock, flags);
b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
816
  	if (ep_is_linked(&epi->rdllink))
6192bd536   Davide Libenzi   epoll: optimizati...
817
  		list_del_init(&epi->rdllink);
c7ea76302   Davide Libenzi   epoll locks chang...
818
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
819

b030a4dd6   Pekka Enberg   [PATCH] Remove ev...
820
  	kmem_cache_free(epi_cache, epi);
7ef9964e6   Davide Libenzi   epoll: introduce ...
821

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
822
823
  	return error;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
824
825
  /*
   * Modify the interest event mask by dropping an event if the new mask
c7ea76302   Davide Libenzi   epoll locks chang...
826
   * has a match in the current file status. Must be called with "mtx" held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
   */
  static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
  {
  	int pwake = 0;
  	unsigned int revents;
  	unsigned long flags;
  
  	/*
  	 * Set the new event interest mask before calling f_op->poll(), otherwise
  	 * a potential race might occur. In fact if we do this operation inside
  	 * the lock, an event might happen between the f_op->poll() call and the
  	 * new event set registering.
  	 */
  	epi->event.events = event->events;
  
  	/*
  	 * Get current event bits. We can safely use the file* here because
  	 * its usage count has been increased by the caller of this function.
  	 */
  	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
c7ea76302   Davide Libenzi   epoll locks chang...
847
  	spin_lock_irqsave(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
848
849
850
851
852
  
  	/* Copy the data member from inside the lock */
  	epi->event.data = event->data;
  
  	/*
c7ea76302   Davide Libenzi   epoll locks chang...
853
  	 * If the item is "hot" and it is not registered inside the ready
67647d0fb   Davide Libenzi   epoll: fix some c...
854
  	 * list, push it inside.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
855
  	 */
c7ea76302   Davide Libenzi   epoll locks chang...
856
857
858
859
860
861
  	if (revents & event->events) {
  		if (!ep_is_linked(&epi->rdllink)) {
  			list_add_tail(&epi->rdllink, &ep->rdllist);
  
  			/* Notify waiting tasks that events are available */
  			if (waitqueue_active(&ep->wq))
4a6e9e2ce   Matthew Wilcox   Use wake_up_locke...
862
  				wake_up_locked(&ep->wq);
c7ea76302   Davide Libenzi   epoll locks chang...
863
864
  			if (waitqueue_active(&ep->poll_wait))
  				pwake++;
7699acd13   Davide Libenzi   epoll cleanups: e...
865
866
  		}
  	}
c7ea76302   Davide Libenzi   epoll locks chang...
867
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
868

7699acd13   Davide Libenzi   epoll cleanups: e...
869
870
871
  	/* We have to call this outside the lock */
  	if (pwake)
  		ep_poll_safewake(&psw, &ep->poll_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
872

7699acd13   Davide Libenzi   epoll cleanups: e...
873
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
874
  }
d47de16c7   Davide Libenzi   fix epoll single ...
875
876
  static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
  			  int maxevents)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
877
  {
6192bd536   Davide Libenzi   epoll: optimizati...
878
  	int eventcnt, error = -EFAULT, pwake = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
879
  	unsigned int revents;
6192bd536   Davide Libenzi   epoll: optimizati...
880
  	unsigned long flags;
d47de16c7   Davide Libenzi   fix epoll single ...
881
882
883
884
  	struct epitem *epi, *nepi;
  	struct list_head txlist;
  
  	INIT_LIST_HEAD(&txlist);
6192bd536   Davide Libenzi   epoll: optimizati...
885

d47de16c7   Davide Libenzi   fix epoll single ...
886
887
888
889
890
891
892
893
894
895
896
897
898
  	/*
  	 * We need to lock this because we could be hit by
  	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
  	 */
  	mutex_lock(&ep->mtx);
  
  	/*
  	 * Steal the ready list, and re-init the original one to the
  	 * empty list. Also, set ep->ovflist to NULL so that events
  	 * happening while looping w/out locks, are not lost. We cannot
  	 * have the poll callback to queue directly on ep->rdllist,
  	 * because we are doing it in the loop below, in a lockless way.
  	 */
c7ea76302   Davide Libenzi   epoll locks chang...
899
  	spin_lock_irqsave(&ep->lock, flags);
d47de16c7   Davide Libenzi   fix epoll single ...
900
901
902
  	list_splice(&ep->rdllist, &txlist);
  	INIT_LIST_HEAD(&ep->rdllist);
  	ep->ovflist = NULL;
c7ea76302   Davide Libenzi   epoll locks chang...
903
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
904
905
906
  
  	/*
  	 * We can loop without lock because this is a task private list.
6192bd536   Davide Libenzi   epoll: optimizati...
907
  	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
d47de16c7   Davide Libenzi   fix epoll single ...
908
  	 * Items cannot vanish during the loop because we are holding "mtx".
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
909
  	 */
d47de16c7   Davide Libenzi   fix epoll single ...
910
911
912
913
  	for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
  		epi = list_first_entry(&txlist, struct epitem, rdllink);
  
  		list_del_init(&epi->rdllink);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
914
915
916
  
  		/*
  		 * Get the ready file event set. We can safely use the file
d47de16c7   Davide Libenzi   fix epoll single ...
917
918
  		 * because we are holding the "mtx" and this will guarantee
  		 * that both the file and the item will not vanish.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
919
920
  		 */
  		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
6192bd536   Davide Libenzi   epoll: optimizati...
921
  		revents &= epi->event.events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
922
923
  
  		/*
6192bd536   Davide Libenzi   epoll: optimizati...
924
925
  		 * Is the event mask intersect the caller-requested one,
  		 * deliver the event to userspace. Again, we are holding
d47de16c7   Davide Libenzi   fix epoll single ...
926
927
  		 * "mtx", so no operations coming from userspace can change
  		 * the item.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
928
  		 */
6192bd536   Davide Libenzi   epoll: optimizati...
929
930
  		if (revents) {
  			if (__put_user(revents,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
931
932
933
  				       &events[eventcnt].events) ||
  			    __put_user(epi->event.data,
  				       &events[eventcnt].data))
6192bd536   Davide Libenzi   epoll: optimizati...
934
  				goto errxit;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
935
936
937
938
  			if (epi->event.events & EPOLLONESHOT)
  				epi->event.events &= EP_PRIVATE_BITS;
  			eventcnt++;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
939
  		/*
d47de16c7   Davide Libenzi   fix epoll single ...
940
941
942
  		 * At this point, noone can insert into ep->rdllist besides
  		 * us. The epoll_ctl() callers are locked out by us holding
  		 * "mtx" and the poll callback will queue them in ep->ovflist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
943
  		 */
6192bd536   Davide Libenzi   epoll: optimizati...
944
  		if (!(epi->event.events & EPOLLET) &&
d47de16c7   Davide Libenzi   fix epoll single ...
945
946
  		    (revents & epi->event.events))
  			list_add_tail(&epi->rdllink, &ep->rdllist);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
947
  	}
6192bd536   Davide Libenzi   epoll: optimizati...
948
  	error = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
949

d47de16c7   Davide Libenzi   fix epoll single ...
950
  errxit:
6192bd536   Davide Libenzi   epoll: optimizati...
951

c7ea76302   Davide Libenzi   epoll locks chang...
952
  	spin_lock_irqsave(&ep->lock, flags);
d47de16c7   Davide Libenzi   fix epoll single ...
953
954
955
  	/*
  	 * During the time we spent in the loop above, some other events
  	 * might have been queued by the poll callback. We re-insert them
f337b9c58   Davide Libenzi   epoll: drop unnec...
956
  	 * inside the main ready-list here.
d47de16c7   Davide Libenzi   fix epoll single ...
957
958
  	 */
  	for (nepi = ep->ovflist; (epi = nepi) != NULL;
9ce209d64   Davide Libenzi   epoll: avoid doub...
959
960
961
962
963
964
965
966
967
  	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
  		/*
  		 * If the above loop quit with errors, the epoll item might still
  		 * be linked to "txlist", and the list_splice() done below will
  		 * take care of those cases.
  		 */
  		if (!ep_is_linked(&epi->rdllink))
  			list_add_tail(&epi->rdllink, &ep->rdllist);
  	}
6192bd536   Davide Libenzi   epoll: optimizati...
968
  	/*
d47de16c7   Davide Libenzi   fix epoll single ...
969
970
971
  	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
  	 * releasing the lock, events will be queued in the normal way inside
  	 * ep->rdllist.
6192bd536   Davide Libenzi   epoll: optimizati...
972
  	 */
d47de16c7   Davide Libenzi   fix epoll single ...
973
  	ep->ovflist = EP_UNACTIVE_PTR;
6192bd536   Davide Libenzi   epoll: optimizati...
974

d47de16c7   Davide Libenzi   fix epoll single ...
975
  	/*
67647d0fb   Davide Libenzi   epoll: fix some c...
976
977
978
  	 * In case of error in the event-send loop, or in case the number of
  	 * ready events exceeds the userspace limit, we need to splice the
  	 * "txlist" back inside ep->rdllist.
d47de16c7   Davide Libenzi   fix epoll single ...
979
980
981
982
  	 */
  	list_splice(&txlist, &ep->rdllist);
  
  	if (!list_empty(&ep->rdllist)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
983
  		/*
d47de16c7   Davide Libenzi   fix epoll single ...
984
  		 * Wake up (if active) both the eventpoll wait list and the ->poll()
67647d0fb   Davide Libenzi   epoll: fix some c...
985
  		 * wait list (delayed after we release the lock).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
986
987
  		 */
  		if (waitqueue_active(&ep->wq))
4a6e9e2ce   Matthew Wilcox   Use wake_up_locke...
988
  			wake_up_locked(&ep->wq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
989
990
  		if (waitqueue_active(&ep->poll_wait))
  			pwake++;
6192bd536   Davide Libenzi   epoll: optimizati...
991
  	}
c7ea76302   Davide Libenzi   epoll locks chang...
992
  	spin_unlock_irqrestore(&ep->lock, flags);
d47de16c7   Davide Libenzi   fix epoll single ...
993
994
  
  	mutex_unlock(&ep->mtx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
995
996
997
998
  
  	/* We have to call this outside the lock */
  	if (pwake)
  		ep_poll_safewake(&psw, &ep->poll_wait);
6192bd536   Davide Libenzi   epoll: optimizati...
999
1000
  
  	return eventcnt == 0 ? error: eventcnt;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1001
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
  static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  		   int maxevents, long timeout)
  {
  	int res, eavail;
  	unsigned long flags;
  	long jtimeout;
  	wait_queue_t wait;
  
  	/*
  	 * Calculate the timeout by checking for the "infinite" value ( -1 )
  	 * and the overflow condition. The passed timeout is in milliseconds,
  	 * that why (t * HZ) / 1000.
  	 */
e3306dd5f   Davide Libenzi   [PATCH] epoll: ha...
1015
1016
  	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
  		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1017
1018
  
  retry:
c7ea76302   Davide Libenzi   epoll locks chang...
1019
  	spin_lock_irqsave(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1020
1021
1022
1023
1024
1025
1026
1027
1028
  
  	res = 0;
  	if (list_empty(&ep->rdllist)) {
  		/*
  		 * We don't have any available event to return to the caller.
  		 * We need to sleep here, and we will be wake up by
  		 * ep_poll_callback() when events will become available.
  		 */
  		init_waitqueue_entry(&wait, current);
d47de16c7   Davide Libenzi   fix epoll single ...
1029
  		wait.flags |= WQ_FLAG_EXCLUSIVE;
3419b23a9   Davide Libenzi   [PATCH] epoll: us...
1030
  		__add_wait_queue(&ep->wq, &wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
  
  		for (;;) {
  			/*
  			 * We don't want to sleep if the ep_poll_callback() sends us
  			 * a wakeup in between. That's why we set the task state
  			 * to TASK_INTERRUPTIBLE before doing the checks.
  			 */
  			set_current_state(TASK_INTERRUPTIBLE);
  			if (!list_empty(&ep->rdllist) || !jtimeout)
  				break;
  			if (signal_pending(current)) {
  				res = -EINTR;
  				break;
  			}
c7ea76302   Davide Libenzi   epoll locks chang...
1045
  			spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1046
  			jtimeout = schedule_timeout(jtimeout);
c7ea76302   Davide Libenzi   epoll locks chang...
1047
  			spin_lock_irqsave(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1048
  		}
3419b23a9   Davide Libenzi   [PATCH] epoll: us...
1049
  		__remove_wait_queue(&ep->wq, &wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1050
1051
1052
1053
1054
1055
  
  		set_current_state(TASK_RUNNING);
  	}
  
  	/* Is it worth to try to dig for events ? */
  	eavail = !list_empty(&ep->rdllist);
c7ea76302   Davide Libenzi   epoll locks chang...
1056
  	spin_unlock_irqrestore(&ep->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1057
1058
1059
1060
1061
1062
1063
  
  	/*
  	 * Try to transfer events to user space. In case we get 0 events and
  	 * there's still timeout left over, we go trying again in search of
  	 * more luck.
  	 */
  	if (!res && eavail &&
d47de16c7   Davide Libenzi   fix epoll single ...
1064
  	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1065
1066
1067
1068
  		goto retry;
  
  	return res;
  }
7699acd13   Davide Libenzi   epoll cleanups: e...
1069
  /*
523723bb5   Andrew Morton   fs/eventpoll.c: f...
1070
   * Open an eventpoll file descriptor.
7699acd13   Davide Libenzi   epoll cleanups: e...
1071
   */
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1072
  asmlinkage long sys_epoll_create1(int flags)
7699acd13   Davide Libenzi   epoll cleanups: e...
1073
1074
1075
  {
  	int error, fd = -1;
  	struct eventpoll *ep;
7699acd13   Davide Libenzi   epoll cleanups: e...
1076

e38b36f32   Ulrich Drepper   flag parameters: ...
1077
1078
  	/* Check the EPOLL_* constant for consistency.  */
  	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
a0998b50c   Ulrich Drepper   flag parameters: ...
1079
1080
  	if (flags & ~EPOLL_CLOEXEC)
  		return -EINVAL;
7699acd13   Davide Libenzi   epoll cleanups: e...
1081
1082
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)
  ",
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1083
  		     current, flags));
7699acd13   Davide Libenzi   epoll cleanups: e...
1084
1085
  
  	/*
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1086
  	 * Create the internal data structure ( "struct eventpoll" ).
7699acd13   Davide Libenzi   epoll cleanups: e...
1087
  	 */
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1088
1089
  	error = ep_alloc(&ep);
  	if (error < 0) {
2030a42ce   Al Viro   [PATCH] sanitize ...
1090
  		fd = error;
7699acd13   Davide Libenzi   epoll cleanups: e...
1091
  		goto error_return;
2030a42ce   Al Viro   [PATCH] sanitize ...
1092
  	}
7699acd13   Davide Libenzi   epoll cleanups: e...
1093
1094
1095
  
  	/*
  	 * Creates all the items needed to setup an eventpoll file. That is,
2030a42ce   Al Viro   [PATCH] sanitize ...
1096
  	 * a file structure and a free file descriptor.
7699acd13   Davide Libenzi   epoll cleanups: e...
1097
  	 */
a0998b50c   Ulrich Drepper   flag parameters: ...
1098
1099
  	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
  			      flags & O_CLOEXEC);
2030a42ce   Al Viro   [PATCH] sanitize ...
1100
1101
  	if (fd < 0)
  		ep_free(ep);
7ef9964e6   Davide Libenzi   epoll: introduce ...
1102
  	atomic_inc(&ep->user->epoll_devs);
7699acd13   Davide Libenzi   epoll cleanups: e...
1103

2030a42ce   Al Viro   [PATCH] sanitize ...
1104
  error_return:
7699acd13   Davide Libenzi   epoll cleanups: e...
1105
1106
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d
  ",
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1107
  		     current, flags, fd));
7699acd13   Davide Libenzi   epoll cleanups: e...
1108
1109
  
  	return fd;
7699acd13   Davide Libenzi   epoll cleanups: e...
1110
  }
a0998b50c   Ulrich Drepper   flag parameters: ...
1111
1112
  asmlinkage long sys_epoll_create(int size)
  {
9fe5ad9c8   Ulrich Drepper   flag parameters a...
1113
1114
1115
1116
  	if (size < 0)
  		return -EINVAL;
  
  	return sys_epoll_create1(0);
a0998b50c   Ulrich Drepper   flag parameters: ...
1117
  }
7699acd13   Davide Libenzi   epoll cleanups: e...
1118
1119
1120
  /*
   * The following function implements the controller interface for
   * the eventpoll file that enables the insertion/removal/change of
67647d0fb   Davide Libenzi   epoll: fix some c...
1121
   * file descriptors inside the interest set.
7699acd13   Davide Libenzi   epoll cleanups: e...
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
   */
  asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
  			      struct epoll_event __user *event)
  {
  	int error;
  	struct file *file, *tfile;
  	struct eventpoll *ep;
  	struct epitem *epi;
  	struct epoll_event epds;
  
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)
  ",
  		     current, epfd, op, fd, event));
  
  	error = -EFAULT;
  	if (ep_op_has_event(op) &&
  	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
  		goto error_return;
  
  	/* Get the "struct file *" for the eventpoll file */
  	error = -EBADF;
  	file = fget(epfd);
  	if (!file)
  		goto error_return;
  
  	/* Get the "struct file *" for the target file */
  	tfile = fget(fd);
  	if (!tfile)
  		goto error_fput;
  
  	/* The target file descriptor must support poll */
  	error = -EPERM;
  	if (!tfile->f_op || !tfile->f_op->poll)
  		goto error_tgt_fput;
  
  	/*
  	 * We have to check that the file structure underneath the file descriptor
  	 * the user passed to us _is_ an eventpoll file. And also we do not permit
  	 * adding an epoll file descriptor inside itself.
  	 */
  	error = -EINVAL;
  	if (file == tfile || !is_file_epoll(file))
  		goto error_tgt_fput;
  
  	/*
  	 * At this point it is safe to assume that the "private_data" contains
  	 * our own data structure.
  	 */
  	ep = file->private_data;
d47de16c7   Davide Libenzi   fix epoll single ...
1171
  	mutex_lock(&ep->mtx);
7699acd13   Davide Libenzi   epoll cleanups: e...
1172

67647d0fb   Davide Libenzi   epoll: fix some c...
1173
1174
1175
1176
1177
  	/*
  	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
  	 * above, we can be sure to be able to use the item looked up by
  	 * ep_find() till we release the mutex.
  	 */
7699acd13   Davide Libenzi   epoll cleanups: e...
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
  	epi = ep_find(ep, tfile, fd);
  
  	error = -EINVAL;
  	switch (op) {
  	case EPOLL_CTL_ADD:
  		if (!epi) {
  			epds.events |= POLLERR | POLLHUP;
  
  			error = ep_insert(ep, &epds, tfile, fd);
  		} else
  			error = -EEXIST;
  		break;
  	case EPOLL_CTL_DEL:
  		if (epi)
  			error = ep_remove(ep, epi);
  		else
  			error = -ENOENT;
  		break;
  	case EPOLL_CTL_MOD:
  		if (epi) {
  			epds.events |= POLLERR | POLLHUP;
  			error = ep_modify(ep, epi, &epds);
  		} else
  			error = -ENOENT;
  		break;
  	}
d47de16c7   Davide Libenzi   fix epoll single ...
1204
  	mutex_unlock(&ep->mtx);
7699acd13   Davide Libenzi   epoll cleanups: e...
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
  
  error_tgt_fput:
  	fput(tfile);
  error_fput:
  	fput(file);
  error_return:
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d
  ",
  		     current, epfd, op, fd, event, error));
  
  	return error;
  }
  
  /*
   * Implement the event wait interface for the eventpoll file. It is the kernel
   * part of the user space epoll_wait(2).
   */
  asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
  			       int maxevents, int timeout)
  {
  	int error;
  	struct file *file;
  	struct eventpoll *ep;
  
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)
  ",
  		     current, epfd, events, maxevents, timeout));
  
  	/* The maximum number of event must be greater than zero */
  	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
  		return -EINVAL;
  
  	/* Verify that the area passed by the user is writeable */
  	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
  		error = -EFAULT;
  		goto error_return;
  	}
  
  	/* Get the "struct file *" for the eventpoll file */
  	error = -EBADF;
  	file = fget(epfd);
  	if (!file)
  		goto error_return;
  
  	/*
  	 * We have to check that the file structure underneath the fd
  	 * the user passed to us _is_ an eventpoll file.
  	 */
  	error = -EINVAL;
  	if (!is_file_epoll(file))
  		goto error_fput;
  
  	/*
  	 * At this point it is safe to assume that the "private_data" contains
  	 * our own data structure.
  	 */
  	ep = file->private_data;
  
  	/* Time to fish for events ... */
  	error = ep_poll(ep, events, maxevents, timeout);
  
  error_fput:
  	fput(file);
  error_return:
  	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d
  ",
  		     current, epfd, events, maxevents, timeout, error));
  
  	return error;
  }
f3de272b8   Roland McGrath   signals: use HAVE...
1275
  #ifdef HAVE_SET_RESTORE_SIGMASK
7699acd13   Davide Libenzi   epoll cleanups: e...
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
  
  /*
   * Implement the event wait interface for the eventpoll file. It is the kernel
   * part of the user space epoll_pwait(2).
   */
  asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
  		int maxevents, int timeout, const sigset_t __user *sigmask,
  		size_t sigsetsize)
  {
  	int error;
  	sigset_t ksigmask, sigsaved;
  
  	/*
  	 * If the caller wants a certain signal mask to be set during the wait,
  	 * we apply it here.
  	 */
  	if (sigmask) {
  		if (sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
  			return -EFAULT;
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}
  
  	error = sys_epoll_wait(epfd, events, maxevents, timeout);
  
  	/*
  	 * If we changed the signal mask, we need to restore the original one.
  	 * In case we've got a signal while waiting, we do not restore the
  	 * signal mask yet, and we allow do_signal() to deliver the signal on
  	 * the way back to userspace, before the signal mask is restored.
  	 */
  	if (sigmask) {
  		if (error == -EINTR) {
  			memcpy(&current->saved_sigmask, &sigsaved,
c7ea76302   Davide Libenzi   epoll locks chang...
1312
  			       sizeof(sigsaved));
4e4c22c71   Roland McGrath   signals: add set_...
1313
  			set_restore_sigmask();
7699acd13   Davide Libenzi   epoll cleanups: e...
1314
1315
1316
1317
1318
1319
  		} else
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  	}
  
  	return error;
  }
f3de272b8   Roland McGrath   signals: use HAVE...
1320
  #endif /* HAVE_SET_RESTORE_SIGMASK */
7699acd13   Davide Libenzi   epoll cleanups: e...
1321

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1322
1323
  static int __init eventpoll_init(void)
  {
7ef9964e6   Davide Libenzi   epoll: introduce ...
1324
1325
1326
1327
1328
1329
  	struct sysinfo si;
  
  	si_meminfo(&si);
  	max_user_instances = 128;
  	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
  		EP_ITEM_COST;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1330
1331
1332
1333
1334
1335
1336
  
  	/* Initialize the structure used to perform safe poll wait head wake ups */
  	ep_poll_safewake_init(&psw);
  
  	/* Allocates slab cache used to allocate "struct epitem" items */
  	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
  			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
20c2df83d   Paul Mundt   mm: Remove slab d...
1337
  			NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1338
1339
1340
1341
  
  	/* Allocates slab cache used to allocate "struct eppoll_entry" */
  	pwq_cache = kmem_cache_create("eventpoll_pwq",
  			sizeof(struct eppoll_entry), 0,
20c2df83d   Paul Mundt   mm: Remove slab d...
1342
  			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1343

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1344
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
  }
cea692418   Davide Libenzi   epoll cleanups: e...
1346
  fs_initcall(eventpoll_init);