Blame view

fs/eventfd.c 11 KB
e1ad7468c   Davide Libenzi   signal/timer/even...
1
2
3
4
5
6
7
8
9
10
11
12
13
  /*
   *  fs/eventfd.c
   *
   *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   *
   */
  
  #include <linux/file.h>
  #include <linux/poll.h>
  #include <linux/init.h>
  #include <linux/fs.h>
  #include <linux/sched.h>
  #include <linux/kernel.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
14
  #include <linux/slab.h>
e1ad7468c   Davide Libenzi   signal/timer/even...
15
16
17
  #include <linux/list.h>
  #include <linux/spinlock.h>
  #include <linux/anon_inodes.h>
7747cdb2f   Adrian Bunk   fs/eventfd.c shou...
18
  #include <linux/syscalls.h>
5718607bb   Rusty Russell   eventfd: export e...
19
  #include <linux/module.h>
133890103   Davide Libenzi   eventfd: revised ...
20
21
  #include <linux/kref.h>
  #include <linux/eventfd.h>
e1ad7468c   Davide Libenzi   signal/timer/even...
22
23
  
  struct eventfd_ctx {
133890103   Davide Libenzi   eventfd: revised ...
24
  	struct kref kref;
e1ad7468c   Davide Libenzi   signal/timer/even...
25
26
27
28
29
30
  	wait_queue_head_t wqh;
  	/*
  	 * Every time that a write(2) is performed on an eventfd, the
  	 * value of the __u64 being written is added to "count" and a
  	 * wakeup is performed on "wqh". A read(2) will return the "count"
  	 * value to userspace, and will reset "count" to zero. The kernel
133890103   Davide Libenzi   eventfd: revised ...
31
  	 * side eventfd_signal() also, adds to the "count" counter and
e1ad7468c   Davide Libenzi   signal/timer/even...
32
33
34
  	 * issue a wakeup.
  	 */
  	__u64 count;
bcd0b235b   Davide Libenzi   eventfd: improve ...
35
  	unsigned int flags;
e1ad7468c   Davide Libenzi   signal/timer/even...
36
  };
133890103   Davide Libenzi   eventfd: revised ...
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
  /**
   * eventfd_signal - Adds @n to the eventfd counter.
   * @ctx: [in] Pointer to the eventfd context.
   * @n: [in] Value of the counter to be added to the eventfd internal counter.
   *          The value cannot be negative.
   *
   * This function is supposed to be called by the kernel in paths that do not
   * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
   * value, and we signal this as overflow condition by returining a POLLERR
   * to poll(2).
   *
   * Returns @n in case of success, a non-negative number lower than @n in case
   * of overflow, or the following error codes:
   *
   * -EINVAL    : The value of @n is negative.
e1ad7468c   Davide Libenzi   signal/timer/even...
52
   */
133890103   Davide Libenzi   eventfd: revised ...
53
  int eventfd_signal(struct eventfd_ctx *ctx, int n)
e1ad7468c   Davide Libenzi   signal/timer/even...
54
  {
e1ad7468c   Davide Libenzi   signal/timer/even...
55
56
57
58
  	unsigned long flags;
  
  	if (n < 0)
  		return -EINVAL;
d48eb2331   Davide Libenzi   eventfd use waitq...
59
  	spin_lock_irqsave(&ctx->wqh.lock, flags);
e1ad7468c   Davide Libenzi   signal/timer/even...
60
61
62
63
  	if (ULLONG_MAX - ctx->count < n)
  		n = (int) (ULLONG_MAX - ctx->count);
  	ctx->count += n;
  	if (waitqueue_active(&ctx->wqh))
395108880   Davide Libenzi   epoll keyed wakeu...
64
  		wake_up_locked_poll(&ctx->wqh, POLLIN);
d48eb2331   Davide Libenzi   eventfd use waitq...
65
  	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
e1ad7468c   Davide Libenzi   signal/timer/even...
66
67
68
  
  	return n;
  }
5718607bb   Rusty Russell   eventfd: export e...
69
  EXPORT_SYMBOL_GPL(eventfd_signal);
e1ad7468c   Davide Libenzi   signal/timer/even...
70

562787a5c   Davide Libenzi   anonfd: split int...
71
72
73
74
  /* Release the memory backing an eventfd context. */
  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
  {
  	kfree(ctx);
  }
133890103   Davide Libenzi   eventfd: revised ...
75
76
77
  /*
   * kref release callback: maps the embedded kref back to its enclosing
   * eventfd context and frees that context.
   */
  static void eventfd_free(struct kref *kref)
  {
  	eventfd_free_ctx(container_of(kref, struct eventfd_ctx, kref));
  }
  
  /**
   * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
   * @ctx: [in] Pointer to the eventfd context.
   *
   * Bumps the context reference count by one.
   *
   * Returns: the same @ctx pointer that was passed in, so the call can be
   * used directly in an assignment.
   */
  struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
  {
  	struct eventfd_ctx *held = ctx;
  
  	kref_get(&held->kref);
  
  	return held;
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_get);
  
  /**
   * eventfd_ctx_put - Releases a reference to the internal eventfd context.
   * @ctx: [in] Pointer to eventfd context.
   *
   * The reference being dropped must have been obtained earlier through
   * eventfd_ctx_get() or eventfd_ctx_fdget(). When the last reference goes
   * away, eventfd_free() is invoked on the context.
   */
  void eventfd_ctx_put(struct eventfd_ctx *ctx)
  {
  	/* eventfd_free() is the release callback for the final put. */
  	kref_put(&ctx->kref, eventfd_free);
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_put);
e1ad7468c   Davide Libenzi   signal/timer/even...
106
107
  static int eventfd_release(struct inode *inode, struct file *file)
  {
133890103   Davide Libenzi   eventfd: revised ...
108
109
110
111
  	struct eventfd_ctx *ctx = file->private_data;
  
  	wake_up_poll(&ctx->wqh, POLLHUP);
  	eventfd_ctx_put(ctx);
e1ad7468c   Davide Libenzi   signal/timer/even...
112
113
114
115
116
117
118
119
120
121
  	return 0;
  }
  
  /*
   * ->poll() handler: registers on the eventfd wait queue and derives the
   * ready mask from the counter value:
   *   POLLIN  - the counter is greater than zero;
   *   POLLERR - the counter is exactly ULLONG_MAX;
   *   POLLOUT - the counter is lower than ULLONG_MAX - 1.
   */
  static unsigned int eventfd_poll(struct file *file, poll_table *wait)
  {
  	struct eventfd_ctx *ctx = file->private_data;
  	unsigned long long snapshot;
  	unsigned long irqflags;
  	unsigned int mask = 0;
  
  	poll_wait(file, &ctx->wqh, wait);
  
  	/* Sample the counter under the lock, then classify the sample. */
  	spin_lock_irqsave(&ctx->wqh.lock, irqflags);
  	snapshot = ctx->count;
  	spin_unlock_irqrestore(&ctx->wqh.lock, irqflags);
  
  	if (snapshot > 0)
  		mask |= POLLIN;
  	if (snapshot == ULLONG_MAX)
  		mask |= POLLERR;
  	if (snapshot < ULLONG_MAX - 1)
  		mask |= POLLOUT;
  
  	return mask;
  }
cb289d624   Davide Libenzi   eventfd - allow a...
133
134
135
136
137
138
139
140
141
142
  /*
   * Consume from the counter: store in *cnt either 1 (EFD_SEMAPHORE set) or
   * the whole counter value, then subtract that amount from the counter.
   * Every caller in this file invokes it with ctx->wqh.lock held.
   */
  static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  {
  	if (ctx->flags & EFD_SEMAPHORE)
  		*cnt = 1;
  	else
  		*cnt = ctx->count;
  
  	ctx->count -= *cnt;
  }
  
  /**
   * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
   * @ctx: [in] Pointer to eventfd context.
   * @wait: [in] Wait queue to be removed.
   * @cnt: [out] Pointer to the 64-bit counter value.
   *
   * Returns %0 if successful, or the following error codes:
   *
   * -EAGAIN      : The operation would have blocked.
   *
   * Under a single hold of the wait queue lock this reads/resets the counter
   * and unlinks @wait from the eventfd wait queue head, so the two steps are
   * atomic with respect to other users of the eventfd. If a non-zero value
   * was consumed, the remaining waiters are woken with POLLOUT.
   */
  int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
  				  __u64 *cnt)
  {
  	unsigned long irqflags;
  	int ret = -EAGAIN;
  
  	spin_lock_irqsave(&ctx->wqh.lock, irqflags);
  	eventfd_ctx_do_read(ctx, cnt);
  	__remove_wait_queue(&ctx->wqh, wait);
  	if (*cnt != 0) {
  		ret = 0;
  		if (waitqueue_active(&ctx->wqh))
  			wake_up_locked_poll(&ctx->wqh, POLLOUT);
  	}
  	spin_unlock_irqrestore(&ctx->wqh.lock, irqflags);
  
  	return ret;
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
  
  /**
   * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
   * @ctx: [in] Pointer to eventfd context.
   * @no_wait: [in] Different from zero if the operation should not block.
   * @cnt: [out] Pointer to the 64-bit counter value.
   *
   * Returns %0 if successful, or the following error codes:
   *
   * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
   * -ERESTARTSYS : A signal interrupted the wait operation.
   *
   * If @no_wait is zero, the function might sleep until the eventfd internal
   * counter becomes greater than zero. @cnt is set to zero first, so it reads
   * as zero whenever an error is returned.
   */
  ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
  {
  	ssize_t ret = -EAGAIN;
  	DECLARE_WAITQUEUE(self, current);
  
  	spin_lock_irq(&ctx->wqh.lock);
  	*cnt = 0;
  	if (ctx->count > 0) {
  		ret = 0;
  	} else if (!no_wait) {
  		__add_wait_queue(&ctx->wqh, &self);
  		while (1) {
  			/*
  			 * Mark ourselves sleeping before re-checking the
  			 * counter; the lock is only dropped around schedule().
  			 */
  			set_current_state(TASK_INTERRUPTIBLE);
  			if (ctx->count > 0) {
  				ret = 0;
  				break;
  			}
  			if (signal_pending(current)) {
  				ret = -ERESTARTSYS;
  				break;
  			}
  			spin_unlock_irq(&ctx->wqh.lock);
  			schedule();
  			spin_lock_irq(&ctx->wqh.lock);
  		}
  		__remove_wait_queue(&ctx->wqh, &self);
  		__set_current_state(TASK_RUNNING);
  	}
  
  	if (likely(ret == 0)) {
  		/* Consume the value and tell waiters the counter dropped. */
  		eventfd_ctx_do_read(ctx, cnt);
  		if (waitqueue_active(&ctx->wqh))
  			wake_up_locked_poll(&ctx->wqh, POLLOUT);
  	}
  	spin_unlock_irq(&ctx->wqh.lock);
  
  	return ret;
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_read);
  
  /*
   * ->read() handler. The user buffer must hold at least one __u64,
   * otherwise -EINVAL is returned. Blocks unless the file has O_NONBLOCK
   * set. Errors from eventfd_ctx_read() are passed through; -EFAULT is
   * returned when the value cannot be stored in the user buffer. On success
   * the number of bytes produced, sizeof(__u64), is returned.
   */
  static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
  			    loff_t *ppos)
  {
  	__u64 value;
  	ssize_t err;
  
  	if (count < sizeof(value))
  		return -EINVAL;
  
  	err = eventfd_ctx_read(file->private_data, file->f_flags & O_NONBLOCK,
  			       &value);
  	if (err < 0)
  		return err;
  
  	if (put_user(value, (__u64 __user *) buf))
  		return -EFAULT;
  
  	return sizeof(value);
  }
e1ad7468c   Davide Libenzi   signal/timer/even...
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
  
  /*
   * ->write() handler: adds the __u64 taken from the user buffer to the
   * counter.
   *
   * Returns sizeof(__u64) on success, or:
   *   -EINVAL      : the buffer is shorter than a __u64, or the value is
   *                  ULLONG_MAX;
   *   -EFAULT      : the value could not be copied from userspace;
   *   -EAGAIN      : the addition does not fit and the file is O_NONBLOCK;
   *   -ERESTARTSYS : a signal interrupted the wait for room.
   *
   * The addition is accepted only while ULLONG_MAX - count is strictly
   * greater than the value, so a write never brings the counter to
   * ULLONG_MAX.
   */
  static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
  			     loff_t *ppos)
  {
  	struct eventfd_ctx *ctx = file->private_data;
  	ssize_t res = -EAGAIN;
  	__u64 ucnt;
  	DECLARE_WAITQUEUE(self, current);
  
  	if (count < sizeof(ucnt))
  		return -EINVAL;
  	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
  		return -EFAULT;
  	if (ucnt == ULLONG_MAX)
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->wqh.lock);
  	if (ULLONG_MAX - ctx->count > ucnt) {
  		res = sizeof(ucnt);
  	} else if (!(file->f_flags & O_NONBLOCK)) {
  		__add_wait_queue(&ctx->wqh, &self);
  		while (1) {
  			/*
  			 * Mark ourselves sleeping before re-checking for
  			 * room; the lock is only dropped around schedule().
  			 */
  			set_current_state(TASK_INTERRUPTIBLE);
  			if (ULLONG_MAX - ctx->count > ucnt) {
  				res = sizeof(ucnt);
  				break;
  			}
  			if (signal_pending(current)) {
  				res = -ERESTARTSYS;
  				break;
  			}
  			spin_unlock_irq(&ctx->wqh.lock);
  			schedule();
  			spin_lock_irq(&ctx->wqh.lock);
  		}
  		__remove_wait_queue(&ctx->wqh, &self);
  		__set_current_state(TASK_RUNNING);
  	}
  
  	if (likely(res > 0)) {
  		/* Apply the addition and tell waiters data is available. */
  		ctx->count += ucnt;
  		if (waitqueue_active(&ctx->wqh))
  			wake_up_locked_poll(&ctx->wqh, POLLIN);
  	}
  	spin_unlock_irq(&ctx->wqh.lock);
  
  	return res;
  }
  
  static const struct file_operations eventfd_fops = {
  	.release	= eventfd_release,
  	.poll		= eventfd_poll,
  	.read		= eventfd_read,
  	.write		= eventfd_write,
6038f373a   Arnd Bergmann   llseek: automatic...
289
  	.llseek		= noop_llseek,
e1ad7468c   Davide Libenzi   signal/timer/even...
290
  };
133890103   Davide Libenzi   eventfd: revised ...
291
292
293
294
295
296
297
298
299
300
  /**
   * eventfd_fget - Acquire a reference of an eventfd file descriptor.
   * @fd: [in] Eventfd file descriptor.
   *
   * Returns a pointer to the eventfd file structure in case of success, or the
   * following error pointer:
   *
   * -EBADF    : Invalid @fd file descriptor.
   * -EINVAL   : The @fd file descriptor is not an eventfd file.
   */
e1ad7468c   Davide Libenzi   signal/timer/even...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
  struct file *eventfd_fget(int fd)
  {
  	struct file *file;
  
  	file = fget(fd);
  	if (!file)
  		return ERR_PTR(-EBADF);
  	if (file->f_op != &eventfd_fops) {
  		fput(file);
  		return ERR_PTR(-EINVAL);
  	}
  
  	return file;
  }
5718607bb   Rusty Russell   eventfd: export e...
315
  EXPORT_SYMBOL_GPL(eventfd_fget);
e1ad7468c   Davide Libenzi   signal/timer/even...
316

133890103   Davide Libenzi   eventfd: revised ...
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
  /**
   * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
   * @fd: [in] Eventfd file descriptor.
   *
   * Resolves @fd through eventfd_fget(), takes a reference on the context
   * stored in the file, and then drops the temporary file reference.
   *
   * Returns a pointer to the internal eventfd context, otherwise the error
   * pointers returned by the following functions:
   *
   * eventfd_fget
   */
  struct eventfd_ctx *eventfd_ctx_fdget(int fd)
  {
  	struct file *efile = eventfd_fget(fd);
  	struct eventfd_ctx *ctx;
  
  	if (!IS_ERR(efile)) {
  		ctx = eventfd_ctx_get(efile->private_data);
  		fput(efile);
  	} else {
  		/* Hand the encoded error back under the context type. */
  		ctx = (struct eventfd_ctx *) efile;
  	}
  
  	return ctx;
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
  
  /**
   * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
   * @file: [in] Eventfd file pointer.
   *
   * Returns a pointer to the internal eventfd context, otherwise the error
   * pointer:
   *
   * -EINVAL   : The @file file pointer is not an eventfd file.
   */
  struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
  {
  	struct eventfd_ctx *ctx = ERR_PTR(-EINVAL);
  
  	/* Only files that use the eventfd operations carry a context. */
  	if (file->f_op == &eventfd_fops)
  		ctx = eventfd_ctx_get(file->private_data);
  
  	return ctx;
  }
  EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
562787a5c   Davide Libenzi   anonfd: split int...
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
  /**
   * eventfd_file_create - Creates an eventfd file pointer.
   * @count: Initial eventfd counter value.
   * @flags: Flags for the eventfd file.
   *
   * This function creates an eventfd file pointer, w/out installing it into
   * the fd table. This is useful when the eventfd file is used during the
   * initialization of data structures that require extra setup after the eventfd
   * creation. So the eventfd creation is split into the file pointer creation
   * phase, and the file descriptor installation phase.
   * In this way races with userspace closing the newly installed file descriptor
   * can be avoided.
   * Returns an eventfd file pointer, or a proper error pointer.
   */
  struct file *eventfd_file_create(unsigned int count, int flags)
e1ad7468c   Davide Libenzi   signal/timer/even...
373
  {
562787a5c   Davide Libenzi   anonfd: split int...
374
  	struct file *file;
e1ad7468c   Davide Libenzi   signal/timer/even...
375
  	struct eventfd_ctx *ctx;
e1ad7468c   Davide Libenzi   signal/timer/even...
376

e38b36f32   Ulrich Drepper   flag parameters: ...
377
378
379
  	/* Check the EFD_* constants for consistency.  */
  	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
  	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
bcd0b235b   Davide Libenzi   eventfd: improve ...
380
  	if (flags & ~EFD_FLAGS_SET)
562787a5c   Davide Libenzi   anonfd: split int...
381
  		return ERR_PTR(-EINVAL);
b087498eb   Ulrich Drepper   flag parameters: ...
382

e1ad7468c   Davide Libenzi   signal/timer/even...
383
384
  	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
  	if (!ctx)
562787a5c   Davide Libenzi   anonfd: split int...
385
  		return ERR_PTR(-ENOMEM);
e1ad7468c   Davide Libenzi   signal/timer/even...
386

133890103   Davide Libenzi   eventfd: revised ...
387
  	kref_init(&ctx->kref);
e1ad7468c   Davide Libenzi   signal/timer/even...
388
  	init_waitqueue_head(&ctx->wqh);
e1ad7468c   Davide Libenzi   signal/timer/even...
389
  	ctx->count = count;
bcd0b235b   Davide Libenzi   eventfd: improve ...
390
  	ctx->flags = flags;
e1ad7468c   Davide Libenzi   signal/timer/even...
391

562787a5c   Davide Libenzi   anonfd: split int...
392
  	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
628ff7c1d   Roland Dreier   anonfd: Allow mak...
393
  				  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
562787a5c   Davide Libenzi   anonfd: split int...
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
  	if (IS_ERR(file))
  		eventfd_free_ctx(ctx);
  
  	return file;
  }
  
  /*
   * eventfd2(2): reserve a file descriptor, create the eventfd file and
   * install it in the reserved slot. Returns the new descriptor, or a
   * negative error code from either the descriptor reservation or the file
   * creation.
   */
  SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
  {
  	struct file *file;
  	int fd;
  
  	fd = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
  	if (fd < 0)
  		return fd;
  
  	file = eventfd_file_create(count, flags);
  	if (IS_ERR(file)) {
  		int error = PTR_ERR(file);
  
  		/* Give the reserved descriptor back before failing. */
  		put_unused_fd(fd);
  		return error;
  	}
  
  	fd_install(fd, file);
  
  	return fd;
  }
d4e82042c   Heiko Carstens   [CVE-2009-0029] S...
423
  /* eventfd(2): the flag-less variant, forwarded to eventfd2 with no flags. */
  SYSCALL_DEFINE1(eventfd, unsigned int, count)
  {
  	const int no_flags = 0;
  
  	return sys_eventfd2(count, no_flags);
  }
bcd0b235b   Davide Libenzi   eventfd: improve ...
427