Blame view
fs/file_table.c
13.2 KB
1da177e4c
|
1 2 3 4 5 6 7 8 9 10 |
/* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> |
9f3acc314
|
11 |
#include <linux/fdtable.h> |
1da177e4c
|
12 13 |
#include <linux/init.h> #include <linux/module.h> |
1da177e4c
|
14 15 16 |
#include <linux/fs.h> #include <linux/security.h> #include <linux/eventpoll.h> |
ab2af1f50
|
17 |
#include <linux/rcupdate.h> |
1da177e4c
|
18 |
#include <linux/mount.h> |
16f7e0fe2
|
19 |
#include <linux/capability.h> |
1da177e4c
|
20 |
#include <linux/cdev.h> |
0eeca2830
|
21 |
#include <linux/fsnotify.h> |
529bf6be5
|
22 |
#include <linux/sysctl.h> |
6416ccb78
|
23 |
#include <linux/lglock.h> |
529bf6be5
|
24 |
#include <linux/percpu_counter.h> |
6416ccb78
|
25 |
#include <linux/percpu.h> |
0552f879d
|
26 |
#include <linux/ima.h> |
529bf6be5
|
27 |
|
60063497a
|
28 |
#include <linux/atomic.h> |
1da177e4c
|
29 |
|
e81e3f4dc
|
30 |
#include "internal.h" |
1da177e4c
|
31 32 33 34 |
/* sysctl tunables... */
/* max_files starts at NR_FILE; files_init() may raise it at boot. */
struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/*
 * Lock covering the superblock file lists: taken locally when a file is
 * added to or removed from a list, and globally by the code that walks
 * every list of a superblock.
 */
DECLARE_LGLOCK(files_lglock);
DEFINE_LGLOCK(files_lglock);

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;

/*
 * Number of allocated file structures: incremented in get_empty_filp()
 * and decremented in file_free().
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
1da177e4c
|
41 |
|
529bf6be5
|
42 |
static inline void file_free_rcu(struct rcu_head *head) |
1da177e4c
|
43 |
{ |
d76b0d9b2
|
44 45 46 |
struct file *f = container_of(head, struct file, f_u.fu_rcuhead); put_cred(f->f_cred); |
529bf6be5
|
47 |
kmem_cache_free(filp_cachep, f); |
1da177e4c
|
48 |
} |
529bf6be5
|
49 |
static inline void file_free(struct file *f) |
1da177e4c
|
50 |
{ |
529bf6be5
|
51 |
percpu_counter_dec(&nr_files); |
ad775f5a8
|
52 |
file_check_state(f); |
529bf6be5
|
53 |
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); |
1da177e4c
|
54 |
} |
529bf6be5
|
55 56 57 |
/* * Return the total number of open files in the system */ |
518de9b39
|
58 |
static long get_nr_files(void) |
1da177e4c
|
59 |
{ |
529bf6be5
|
60 |
return percpu_counter_read_positive(&nr_files); |
1da177e4c
|
61 |
} |
529bf6be5
|
62 63 64 |
/* * Return the maximum number of open files in the system */ |
518de9b39
|
65 |
unsigned long get_max_files(void) |
ab2af1f50
|
66 |
{ |
529bf6be5
|
67 |
return files_stat.max_files; |
ab2af1f50
|
68 |
} |
529bf6be5
|
69 70 71 72 73 74 |
EXPORT_SYMBOL_GPL(get_max_files); /* * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) |
8d65af789
|
75 |
int proc_nr_files(ctl_table *table, int write, |
529bf6be5
|
76 77 78 |
void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); |
518de9b39
|
79 |
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
529bf6be5
|
80 81 |
} #else |
8d65af789
|
82 |
int proc_nr_files(ctl_table *table, int write, |
529bf6be5
|
83 84 85 86 87 |
void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } #endif |
ab2af1f50
|
88 |
|
1da177e4c
|
89 90 91 |
/* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or * we run out of memory. |
430e285e0
|
92 93 94 95 96 97 |
* * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. |
1da177e4c
|
98 99 100 |
*/ struct file *get_empty_filp(void) { |
86a264abe
|
101 |
const struct cred *cred = current_cred(); |
518de9b39
|
102 |
static long old_max; |
1da177e4c
|
103 104 105 106 107 |
struct file * f; /* * Privileged users can go above max_files */ |
529bf6be5
|
108 109 110 111 112 |
if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ |
52d9f3b40
|
113 |
if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) |
529bf6be5
|
114 115 |
goto over; } |
af4d2ecbf
|
116 |
|
4975e45ff
|
117 |
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); |
af4d2ecbf
|
118 119 |
if (f == NULL) goto fail; |
529bf6be5
|
120 |
percpu_counter_inc(&nr_files); |
78d297887
|
121 |
f->f_cred = get_cred(cred); |
af4d2ecbf
|
122 123 |
if (security_file_alloc(f)) goto fail_sec; |
1da177e4c
|
124 |
|
5a6b7951b
|
125 |
INIT_LIST_HEAD(&f->f_u.fu_list); |
516e0cc56
|
126 |
atomic_long_set(&f->f_count, 1); |
af4d2ecbf
|
127 |
rwlock_init(&f->f_owner.lock); |
684999149
|
128 |
spin_lock_init(&f->f_lock); |
5a6b7951b
|
129 |
eventpoll_init_file(f); |
af4d2ecbf
|
130 |
/* f->f_version: 0 */ |
af4d2ecbf
|
131 132 133 |
return f; over: |
1da177e4c
|
134 |
/* Ran out of filps - report that */ |
529bf6be5
|
135 |
if (get_nr_files() > old_max) { |
518de9b39
|
136 137 |
pr_info("VFS: file-max limit %lu reached ", get_max_files()); |
529bf6be5
|
138 |
old_max = get_nr_files(); |
1da177e4c
|
139 |
} |
af4d2ecbf
|
140 141 142 143 |
goto fail; fail_sec: file_free(f); |
1da177e4c
|
144 145 146 |
fail: return NULL; } |
ce8d2cdf3
|
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
/** * alloc_file - allocate and initialize a 'struct file' * @mnt: the vfsmount on which the file will reside * @dentry: the dentry representing the new file * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file * * Use this instead of get_empty_filp() to get a new * 'struct file'. Do so because of the same initialization * pitfalls reasons listed for init_file(). This is a * preferred interface to using init_file(). * * If all the callers of init_file() are eliminated, its * code should be moved into this function. */ |
2c48b9c45
|
162 163 |
struct file *alloc_file(struct path *path, fmode_t mode, const struct file_operations *fop) |
ce8d2cdf3
|
164 165 |
{ struct file *file; |
ce8d2cdf3
|
166 167 168 169 |
file = get_empty_filp(); if (!file) return NULL; |
2c48b9c45
|
170 171 |
file->f_path = *path; file->f_mapping = path->dentry->d_inode->i_mapping; |
ce8d2cdf3
|
172 173 |
file->f_mode = mode; file->f_op = fop; |
4a3fd211c
|
174 175 176 177 178 179 180 |
/* * These mounts don't really matter in practice * for r/o bind mounts. They aren't userspace- * visible. We do this for consistency, and so * that we can do debugging checks at __fput() */ |
2c48b9c45
|
181 |
if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { |
ad775f5a8
|
182 |
file_take_write(file); |
385e3ed4f
|
183 |
WARN_ON(mnt_clone_write(path->mnt)); |
4a3fd211c
|
184 |
} |
890275b5e
|
185 186 |
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); |
3d1e46315
|
187 |
return file; |
ce8d2cdf3
|
188 |
} |
73efc4681
|
189 |
EXPORT_SYMBOL(alloc_file); |
ce8d2cdf3
|
190 |
|
aceaf78da
|
191 192 193 194 195 196 197 198 199 200 |
/** * drop_file_write_access - give up ability to write to a file * @file: the file to which we will stop writing * * This is a central place which will give up the ability * to write to @file, along with access to write through * its vfsmount. */ void drop_file_write_access(struct file *file) { |
4a3fd211c
|
201 |
struct vfsmount *mnt = file->f_path.mnt; |
aceaf78da
|
202 203 204 205 |
struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; put_write_access(inode); |
ad775f5a8
|
206 207 208 209 210 211 212 |
if (special_file(inode->i_mode)) return; if (file_check_writeable(file) != 0) return; mnt_drop_write(mnt); file_release_write(file); |
aceaf78da
|
213 214 |
} EXPORT_SYMBOL_GPL(drop_file_write_access); |
d7065da03
|
215 |
/* the real guts of fput() - releasing the last reference to file |
1da177e4c
|
216 |
*/ |
d7065da03
|
217 |
static void __fput(struct file *file) |
1da177e4c
|
218 |
{ |
0f7fc9e4d
|
219 220 |
struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; |
1da177e4c
|
221 222 223 |
struct inode *inode = dentry->d_inode; might_sleep(); |
0eeca2830
|
224 225 |
fsnotify_close(file); |
1da177e4c
|
226 227 228 229 230 231 |
/* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_flock(file); |
233e70f42
|
232 233 234 235 |
if (unlikely(file->f_flags & FASYNC)) { if (file->f_op && file->f_op->fasync) file->f_op->fasync(-1, file, 0); } |
1da177e4c
|
236 237 238 |
if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); |
89068c576
|
239 |
ima_file_free(file); |
60ed8cf78
|
240 241 |
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(file->f_mode & FMODE_PATH))) { |
1da177e4c
|
242 |
cdev_put(inode->i_cdev); |
60ed8cf78
|
243 |
} |
1da177e4c
|
244 |
fops_put(file->f_op); |
609d7fa95
|
245 |
put_pid(file->f_owner.pid); |
ee2ffa0df
|
246 |
file_sb_list_del(file); |
890275b5e
|
247 248 |
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); |
aceaf78da
|
249 250 |
if (file->f_mode & FMODE_WRITE) drop_file_write_access(file); |
0f7fc9e4d
|
251 252 |
file->f_path.dentry = NULL; file->f_path.mnt = NULL; |
1da177e4c
|
253 254 255 256 |
file_free(file); dput(dentry); mntput(mnt); } |
d7065da03
|
257 258 259 260 261 262 263 |
/*
 * Drop one reference on @file; whoever drops the last one runs
 * __fput() to tear the file down.
 */
void fput(struct file *file)
{
	if (!atomic_long_dec_and_test(&file->f_count))
		return;

	__fput(file);
}

EXPORT_SYMBOL(fput);
fc9b52cd8
|
264 |
struct file *fget(unsigned int fd) |
1da177e4c
|
265 266 267 |
{ struct file *file; struct files_struct *files = current->files; |
ab2af1f50
|
268 |
rcu_read_lock(); |
1da177e4c
|
269 |
file = fcheck_files(files, fd); |
ab2af1f50
|
270 |
if (file) { |
1abf0c718
|
271 272 273 274 |
/* File object ref couldn't be taken */ if (file->f_mode & FMODE_PATH || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; |
ab2af1f50
|
275 276 |
} rcu_read_unlock(); |
1da177e4c
|
277 278 279 280 |
return file; } EXPORT_SYMBOL(fget); |
1abf0c718
|
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
/*
 * Same lookup as fget(), except that files with FMODE_PATH set are
 * returned as well.  Returns NULL if there is no such file or if its
 * reference count had already dropped to zero.
 */
struct file *fget_raw(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	rcu_read_lock();
	file = fcheck_files(files, fd);
	/* File object ref couldn't be taken */
	if (file && !atomic_long_inc_not_zero(&file->f_count))
		file = NULL;
	rcu_read_unlock();

	return file;
}

EXPORT_SYMBOL(fget_raw);
1da177e4c
|
298 |
/* |
58939473b
|
299 300 301 302 303 304 305 306 307 308 309 310 311 312 |
* Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. |
1da177e4c
|
313 |
*/ |
fc9b52cd8
|
314 |
struct file *fget_light(unsigned int fd, int *fput_needed) |
1da177e4c
|
315 316 317 318 319 |
{ struct file *file; struct files_struct *files = current->files; *fput_needed = 0; |
3bc0ba430
|
320 |
if (atomic_read(&files->count) == 1) { |
1da177e4c
|
321 |
file = fcheck_files(files, fd); |
1abf0c718
|
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
if (file && (file->f_mode & FMODE_PATH)) file = NULL; } else { rcu_read_lock(); file = fcheck_files(files, fd); if (file) { if (!(file->f_mode & FMODE_PATH) && atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ file = NULL; } rcu_read_unlock(); } return file; } struct file *fget_raw_light(unsigned int fd, int *fput_needed) { struct file *file; struct files_struct *files = current->files; *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); |
1da177e4c
|
349 |
} else { |
ab2af1f50
|
350 |
rcu_read_lock(); |
1da177e4c
|
351 352 |
file = fcheck_files(files, fd); if (file) { |
516e0cc56
|
353 |
if (atomic_long_inc_not_zero(&file->f_count)) |
ab2af1f50
|
354 355 356 357 |
*fput_needed = 1; else /* Didn't get the reference, someone's freed */ file = NULL; |
1da177e4c
|
358 |
} |
ab2af1f50
|
359 |
rcu_read_unlock(); |
1da177e4c
|
360 |
} |
ab2af1f50
|
361 |
|
1da177e4c
|
362 363 |
return file; } |
1da177e4c
|
364 365 |
void put_filp(struct file *file) { |
516e0cc56
|
366 |
if (atomic_long_dec_and_test(&file->f_count)) { |
1da177e4c
|
367 |
security_file_free(file); |
ee2ffa0df
|
368 |
file_sb_list_del(file); |
1da177e4c
|
369 370 371 |
file_free(file); } } |
6416ccb78
|
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 |
static inline int file_list_cpu(struct file *file) { #ifdef CONFIG_SMP return file->f_sb_list_cpu; #else return smp_processor_id(); #endif } /* helper for file_sb_list_add to reduce ifdefs */ static inline void __file_sb_list_add(struct file *file, struct super_block *sb) { struct list_head *list; #ifdef CONFIG_SMP int cpu; cpu = smp_processor_id(); file->f_sb_list_cpu = cpu; list = per_cpu_ptr(sb->s_files, cpu); #else list = &sb->s_files; #endif list_add(&file->f_u.fu_list, list); } /** * file_sb_list_add - add a file to the sb's file list * @file: file to add * @sb: sb to add it to * * Use this function to associate a file with the superblock of the inode it * refers to. */ |
ee2ffa0df
|
404 |
void file_sb_list_add(struct file *file, struct super_block *sb) |
1da177e4c
|
405 |
{ |
6416ccb78
|
406 407 408 |
lg_local_lock(files_lglock); __file_sb_list_add(file, sb); lg_local_unlock(files_lglock); |
1da177e4c
|
409 |
} |
6416ccb78
|
410 411 412 413 414 415 416 |
/** * file_sb_list_del - remove a file from the sb's file list * @file: file to remove * @sb: sb to remove it from * * Use this function to remove a file from its superblock. */ |
ee2ffa0df
|
417 |
void file_sb_list_del(struct file *file) |
1da177e4c
|
418 |
{ |
2f5120166
|
419 |
if (!list_empty(&file->f_u.fu_list)) { |
6416ccb78
|
420 |
lg_local_lock_cpu(files_lglock, file_list_cpu(file)); |
2f5120166
|
421 |
list_del_init(&file->f_u.fu_list); |
6416ccb78
|
422 |
lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); |
1da177e4c
|
423 424 |
} } |
6416ccb78
|
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 |
/*
 * do_file_list_for_each_entry(__sb, __file) { ... } while_file_list_for_each_entry;
 *
 * Bracket a loop body that is run with @__file pointing at each file on
 * the file list(s) of superblock @__sb.  The opening macro leaves one or
 * two scopes open; while_file_list_for_each_entry closes them, so the two
 * must always be used as a pair.
 */
#ifdef CONFIG_SMP

/*
 * These macros iterate all files on all CPUs for a given superblock.
 * files_lglock must be held globally.
 */
#define do_file_list_for_each_entry(__sb, __file)		\
{								\
	int i;							\
	for_each_possible_cpu(i) {				\
		struct list_head *list;				\
		list = per_cpu_ptr((__sb)->s_files, i);		\
		list_for_each_entry((__file), list, f_u.fu_list)

#define while_file_list_for_each_entry				\
	}							\
}

#else

/*
 * Single list per superblock.  Expand the macro's own @__sb argument
 * rather than relying on the caller having a variable named 'sb'.
 */
#define do_file_list_for_each_entry(__sb, __file)		\
{								\
	struct list_head *list;					\
	list = &(__sb)->s_files;				\
	list_for_each_entry((__file), list, f_u.fu_list)

#define while_file_list_for_each_entry				\
}

#endif
1da177e4c
|
455 456 |
/*
 * Decide whether @sb may be remounted read-only.
 *
 * Returns 0 if any file on the superblock's file list refers to an
 * inode with no links left or is a regular file opened for writing,
 * 1 otherwise.  The lists are walked with files_lglock held globally.
 */
int fs_may_remount_ro(struct super_block *sb)
{
	struct file *file;
	int allowed = 1;	/* Tis' cool bro. */

	/* Check that no files are currently opened for writing. */
	lg_global_lock(files_lglock);
	do_file_list_for_each_entry(sb, file) {
		struct inode *inode = file->f_path.dentry->d_inode;

		/* File with pending delete?  Writeable file? */
		if (inode->i_nlink == 0 ||
		    (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))) {
			allowed = 0;
			goto out;
		}
	} while_file_list_for_each_entry;
out:
	lg_global_unlock(files_lglock);
	return allowed;
}
864d7c4c0
|
477 478 479 480 481 482 483 484 485 486 487 488 |
/**
 *	mark_files_ro - mark all files read-only
 *	@sb: superblock in question
 *
 *	All files are marked read-only.  We don't care about pending
 *	delete files so this should be used in 'force' mode only.
 *
 *	Only regular files that are still referenced and opened for
 *	write are touched.  Whenever a mount's write count has to be
 *	dropped, files_lglock is released first and the whole walk is
 *	restarted afterwards.
 */
void mark_files_ro(struct super_block *sb)
{
	struct file *f;

retry:
	lg_global_lock(files_lglock);
	do_file_list_for_each_entry(sb, f) {
		struct vfsmount *mnt;

		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode) ||
		    !file_count(f) ||
		    !(f->f_mode & FMODE_WRITE))
			continue;

		spin_lock(&f->f_lock);
		f->f_mode &= ~FMODE_WRITE;
		spin_unlock(&f->f_lock);

		if (file_check_writeable(f) != 0)
			continue;
		file_release_write(f);
		mnt = mntget(f->f_path.mnt);
		/* This can sleep, so we can't hold the spinlock. */
		lg_global_unlock(files_lglock);
		mnt_drop_write(mnt);
		mntput(mnt);
		goto retry;
	} while_file_list_for_each_entry;
	lg_global_unlock(files_lglock);
}
1da177e4c
|
513 514 |
/*
 * Boot-time setup: create the "filp" slab cache, derive the default
 * files_stat.max_files from @mempages, and initialise the fd-table
 * deferral code, files_lglock and the nr_files counter.
 */
void __init files_init(unsigned long mempages)
{
	unsigned long mem_kb;
	unsigned long dflt_max;

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/*
	 * One file with associated inode and dcache is very roughly 1K.
	 * Per default don't use more than 10% of our memory for files.
	 */
	mem_kb = mempages * (PAGE_SIZE / 1024);
	dflt_max = mem_kb / 10;

	/* Never go below the compile-time NR_FILE. */
	files_stat.max_files = max_t(unsigned long, dflt_max, NR_FILE);

	files_defer_init();
	lg_lock_init(files_lglock);
	percpu_counter_init(&nr_files, 0);
}