Commit ab2af1f5005069321c5d130f09cce577b03f43ef

Authored by Dipankar Sarma
Committed by Linus Torvalds
1 parent 6e72ad2c58

[PATCH] files: files struct with RCU

Patch to eliminate the struct files_struct.file_lock spinlock on the reader
side and use the RCU-based refcounting rcuref_xxx API for the f_count
refcounter.  Updates to the fdtable are done by allocating a new fdtable
structure and setting files->fdt to point to it.  The fdtable structure is
protected by RCU, thereby allowing lock-free lookup.  For fd arrays/sets that
are vmalloced, we use keventd to free them, since RCU callbacks can't sleep.
A global list of fdtables to be freed is not scalable, so we use a per-cpu
list.  If keventd is already handling the current CPU's work, we use a timer
to defer queueing of that work.
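
In outline, the lock-free reader side then looks like this (a condensed
sketch of the fget()/fcheck_files() changes in the diff below; the
function name is illustrative, not part of the patch):

	struct file *sketch_fget(struct files_struct *files, unsigned int fd)
	{
		struct file *file = NULL;
		struct fdtable *fdt;

		rcu_read_lock();
		fdt = rcu_dereference(files->fdt);	/* files_fdtable() */
		if (fd < fdt->max_fds)
			file = rcu_dereference(fdt->fd[fd]);
		/* lock-free get; fails if f_count already hit zero */
		if (file && !rcuref_inc_lf(&file->f_count))
			file = NULL;
		rcu_read_unlock();
		return file;
	}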

Since the last publication, this patch has been rewritten to avoid explicit
memory barriers and to use the rcu_assign_pointer() and rcu_dereference()
primitives instead.  This required keeping the fd information in a separate
structure (fdtable) and updating it atomically.
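
The update side pairs with that read path roughly as follows (a sketch
condensed from expand_fdtable() in the diff below; the helper name is
illustrative and error handling is elided):

	/* caller holds files->file_lock; nfdt comes from alloc_fdtable() */
	static void sketch_install_fdtable(struct files_struct *files,
					   struct fdtable *nfdt)
	{
		struct fdtable *fdt = files_fdtable(files);

		copy_fdtable(nfdt, fdt);		/* fill in before publish */
		rcu_assign_pointer(files->fdt, nfdt);	/* publish to readers */
		free_fdtable(fdt);			/* defers to call_rcu() if needed */
	}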

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 10 changed files with 345 additions and 166 deletions

fs/aio.c
... ... @@ -29,6 +29,7 @@
29 29 #include <linux/highmem.h>
30 30 #include <linux/workqueue.h>
31 31 #include <linux/security.h>
  32 +#include <linux/rcuref.h>
32 33  
33 34 #include <asm/kmap_types.h>
34 35 #include <asm/uaccess.h>
... ... @@ -499,7 +500,7 @@
499 500 /* Must be done under the lock to serialise against cancellation.
500 501 * Call this aio_fput as it duplicates fput via the fput_work.
501 502 */
502   - if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
  503 + if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
503 504 get_ioctx(ctx);
504 505 spin_lock(&fput_lock);
505 506 list_add(&req->ki_list, &fput_head);
fs/fcntl.c
... ... @@ -16,6 +16,7 @@
16 16 #include <linux/security.h>
17 17 #include <linux/ptrace.h>
18 18 #include <linux/signal.h>
  19 +#include <linux/rcupdate.h>
19 20  
20 21 #include <asm/poll.h>
21 22 #include <asm/siginfo.h>
22 23  
... ... @@ -64,8 +65,8 @@
64 65 if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
65 66 goto out;
66 67  
67   - fdt = files_fdtable(files);
68 68 repeat:
  69 + fdt = files_fdtable(files);
69 70 /*
70 71 * Someone might have closed fd's in the range
71 72 * orig_start..fdt->next_fd
72 73 */
... ... @@ -95,9 +96,15 @@
95 96 if (error)
96 97 goto repeat;
97 98  
  99 + /*
  100 + * We reacquired files_lock, so we are safe as long as
  101 + * we reacquire the fdtable pointer and use it while holding
  102 + * the lock; no one can free it during that time.
  103 + */
  104 + fdt = files_fdtable(files);
98 105 if (start <= fdt->next_fd)
99 106 fdt->next_fd = newfd + 1;
100   -
  107 +
101 108 error = newfd;
102 109  
103 110 out:
... ... @@ -163,7 +170,7 @@
163 170 if (!tofree && FD_ISSET(newfd, fdt->open_fds))
164 171 goto out_fput;
165 172  
166   - fdt->fd[newfd] = file;
  173 + rcu_assign_pointer(fdt->fd[newfd], file);
167 174 FD_SET(newfd, fdt->open_fds);
168 175 FD_CLR(newfd, fdt->close_on_exec);
169 176 spin_unlock(&files->file_lock);
fs/file.c
... ... @@ -13,9 +13,28 @@
13 13 #include <linux/vmalloc.h>
14 14 #include <linux/file.h>
15 15 #include <linux/bitops.h>
  16 +#include <linux/interrupt.h>
  17 +#include <linux/spinlock.h>
  18 +#include <linux/rcupdate.h>
  19 +#include <linux/workqueue.h>
16 20  
  21 +struct fdtable_defer {
  22 + spinlock_t lock;
  23 + struct work_struct wq;
  24 + struct timer_list timer;
  25 + struct fdtable *next;
  26 +};
17 27  
18 28 /*
  29 + * We use this list to defer freeing fdtables that have vmalloced
  30 + * sets/arrays. By keeping a per-cpu list, we avoid embedding the
  31 + * work_struct in fdtable itself, which would mean a 64 byte (i386)
  32 + * increase in this per-task structure.
  33 + */
  34 +static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
  35 +
  36 +
  37 +/*
19 38 * Allocate an fd array, using kmalloc or vmalloc.
20 39 * Note: the array isn't cleared at allocation time.
21 40 */
... ... @@ -48,85 +67,143 @@
48 67 vfree(array);
49 68 }
50 69  
51   -/*
52   - * Expand the fd array in the files_struct. Called with the files
53   - * spinlock held for write.
54   - */
  70 +static void __free_fdtable(struct fdtable *fdt)
  71 +{
  72 + int fdset_size, fdarray_size;
55 73  
56   -static int expand_fd_array(struct files_struct *files, int nr)
57   - __releases(files->file_lock)
58   - __acquires(files->file_lock)
  74 + fdset_size = fdt->max_fdset / 8;
  75 + fdarray_size = fdt->max_fds * sizeof(struct file *);
  76 + free_fdset(fdt->open_fds, fdset_size);
  77 + free_fdset(fdt->close_on_exec, fdset_size);
  78 + free_fd_array(fdt->fd, fdarray_size);
  79 + kfree(fdt);
  80 +}
  81 +
  82 +static void fdtable_timer(unsigned long data)
59 83 {
60   - struct file **new_fds;
61   - int error, nfds;
62   - struct fdtable *fdt;
  84 + struct fdtable_defer *fddef = (struct fdtable_defer *)data;
63 85  
64   -
65   - error = -EMFILE;
66   - fdt = files_fdtable(files);
67   - if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN)
  86 + spin_lock(&fddef->lock);
  87 + /*
  88 + * If someone already emptied the queue, return.
  89 + */
  90 + if (!fddef->next)
68 91 goto out;
  92 + if (!schedule_work(&fddef->wq))
  93 + mod_timer(&fddef->timer, 5);
  94 +out:
  95 + spin_unlock(&fddef->lock);
  96 +}
69 97  
70   - nfds = fdt->max_fds;
71   - spin_unlock(&files->file_lock);
  98 +static void free_fdtable_work(struct fdtable_defer *f)
  99 +{
  100 + struct fdtable *fdt;
72 101  
73   - /*
74   - * Expand to the max in easy steps, and keep expanding it until
75   - * we have enough for the requested fd array size.
76   - */
  102 + spin_lock_bh(&f->lock);
  103 + fdt = f->next;
  104 + f->next = NULL;
  105 + spin_unlock_bh(&f->lock);
  106 + while(fdt) {
  107 + struct fdtable *next = fdt->next;
  108 + __free_fdtable(fdt);
  109 + fdt = next;
  110 + }
  111 +}
77 112  
78   - do {
79   -#if NR_OPEN_DEFAULT < 256
80   - if (nfds < 256)
81   - nfds = 256;
82   - else
83   -#endif
84   - if (nfds < (PAGE_SIZE / sizeof(struct file *)))
85   - nfds = PAGE_SIZE / sizeof(struct file *);
86   - else {
87   - nfds = nfds * 2;
88   - if (nfds > NR_OPEN)
89   - nfds = NR_OPEN;
90   - }
91   - } while (nfds <= nr);
  113 +static void free_fdtable_rcu(struct rcu_head *rcu)
  114 +{
  115 + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
  116 + int fdset_size, fdarray_size;
  117 + struct fdtable_defer *fddef;
92 118  
93   - error = -ENOMEM;
94   - new_fds = alloc_fd_array(nfds);
95   - spin_lock(&files->file_lock);
96   - if (!new_fds)
97   - goto out;
  119 + BUG_ON(!fdt);
  120 + fdset_size = fdt->max_fdset / 8;
  121 + fdarray_size = fdt->max_fds * sizeof(struct file *);
98 122  
99   - /* Copy the existing array and install the new pointer */
100   - fdt = files_fdtable(files);
  123 + if (fdt->free_files) {
  124 + /*
  125 + * The this fdtable was embedded in the files structure
  126 + * and the files structure itself was getting destroyed.
  127 + * It is now safe to free the files structure.
  128 + */
  129 + kmem_cache_free(files_cachep, fdt->free_files);
  130 + return;
  131 + }
  132 + if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
  133 + /*
  134 + * The fdtable was embedded
  135 + */
  136 + return;
  137 + }
  138 + if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
  139 + kfree(fdt->open_fds);
  140 + kfree(fdt->close_on_exec);
  141 + kfree(fdt->fd);
  142 + kfree(fdt);
  143 + } else {
  144 + fddef = &get_cpu_var(fdtable_defer_list);
  145 + spin_lock(&fddef->lock);
  146 + fdt->next = fddef->next;
  147 + fddef->next = fdt;
  148 + /*
  149 + * vmallocs must be freed from process context, so they are
  150 + * handled from the workqueue. If the work is already queued
  151 + * on this cpu, we retry scheduling it via a timer.
  152 + */
  153 + if (!schedule_work(&fddef->wq))
  154 + mod_timer(&fddef->timer, 5);
  155 + spin_unlock(&fddef->lock);
  156 + put_cpu_var(fdtable_defer_list);
  157 + }
  158 +}
101 159  
102   - if (nfds > fdt->max_fds) {
103   - struct file **old_fds;
104   - int i;
105   -
106   - old_fds = xchg(&fdt->fd, new_fds);
107   - i = xchg(&fdt->max_fds, nfds);
  160 +void free_fdtable(struct fdtable *fdt)
  161 +{
  162 + if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
  163 + fdt->max_fds > NR_OPEN_DEFAULT)
  164 + call_rcu(&fdt->rcu, free_fdtable_rcu);
  165 +}
108 166  
109   - /* Don't copy/clear the array if we are creating a new
110   - fd array for fork() */
111   - if (i) {
112   - memcpy(new_fds, old_fds, i * sizeof(struct file *));
113   - /* clear the remainder of the array */
114   - memset(&new_fds[i], 0,
115   - (nfds-i) * sizeof(struct file *));
  167 +/*
  168 + * Copy the existing fd arrays and fdsets from one fdtable into
  169 + * another, larger one. Called with the files spinlock held for write.
  170 + */
  171 +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
  172 +{
  173 + int i;
  174 + int count;
116 175  
117   - spin_unlock(&files->file_lock);
118   - free_fd_array(old_fds, i);
119   - spin_lock(&files->file_lock);
120   - }
121   - } else {
122   - /* Somebody expanded the array while we slept ... */
123   - spin_unlock(&files->file_lock);
124   - free_fd_array(new_fds, nfds);
125   - spin_lock(&files->file_lock);
  176 + BUG_ON(nfdt->max_fdset < fdt->max_fdset);
  177 + BUG_ON(nfdt->max_fds < fdt->max_fds);
  178 + /* Copy the existing tables and install the new pointers */
  179 +
  180 + i = fdt->max_fdset / (sizeof(unsigned long) * 8);
  181 + count = (nfdt->max_fdset - fdt->max_fdset) / 8;
  182 +
  183 + /*
  184 + * Don't copy the entire array if the current fdset is
  185 + * not yet initialised.
  186 + */
  187 + if (i) {
  188 + memcpy (nfdt->open_fds, fdt->open_fds,
  189 + fdt->max_fdset/8);
  190 + memcpy (nfdt->close_on_exec, fdt->close_on_exec,
  191 + fdt->max_fdset/8);
  192 + memset (&nfdt->open_fds->fds_bits[i], 0, count);
  193 + memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
126 194 }
127   - error = 0;
128   -out:
129   - return error;
  195 +
  196 + /* Don't copy/clear the array if we are creating a new
  197 + fd array for fork() */
  198 + if (fdt->max_fds) {
  199 + memcpy(nfdt->fd, fdt->fd,
  200 + fdt->max_fds * sizeof(struct file *));
  201 + /* clear the remainder of the array */
  202 + memset(&nfdt->fd[fdt->max_fds], 0,
  203 + (nfdt->max_fds - fdt->max_fds) *
  204 + sizeof(struct file *));
  205 + }
  206 + nfdt->next_fd = fdt->next_fd;
130 207 }
... ... @@ -157,28 +234,21 @@
157 234 vfree(array);
158 235 }
159 236  
160   -/*
161   - * Expand the fdset in the files_struct. Called with the files spinlock
162   - * held for write.
163   - */
164   -static int expand_fdset(struct files_struct *files, int nr)
165   - __releases(file->file_lock)
166   - __acquires(file->file_lock)
  237 +static struct fdtable *alloc_fdtable(int nr)
167 238 {
168   - fd_set *new_openset = NULL, *new_execset = NULL;
169   - int error, nfds = 0;
170   - struct fdtable *fdt;
  239 + struct fdtable *fdt = NULL;
  240 + int nfds = 0;
  241 + fd_set *new_openset = NULL, *new_execset = NULL;
  242 + struct file **new_fds;
171 243  
172   - error = -EMFILE;
173   - fdt = files_fdtable(files);
174   - if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN)
175   - goto out;
  244 + fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
  245 + if (!fdt)
  246 + goto out;
  247 + memset(fdt, 0, sizeof(*fdt));
176 248  
177   - nfds = fdt->max_fdset;
178   - spin_unlock(&files->file_lock);
179   -
180   - /* Expand to the max in easy steps */
181   - do {
  249 + nfds = __FD_SETSIZE;
  250 + /* Expand to the max in easy steps */
  251 + do {
182 252 if (nfds < (PAGE_SIZE * 8))
183 253 nfds = PAGE_SIZE * 8;
184 254 else {
185 255 nfds = nfds * 2;
186 256 if (nfds > NR_OPEN)
187 257 nfds = NR_OPEN;
... ... @@ -188,50 +258,88 @@
188 258 }
189 259 } while (nfds <= nr);
190 260  
191   - error = -ENOMEM;
192   - new_openset = alloc_fdset(nfds);
193   - new_execset = alloc_fdset(nfds);
194   - spin_lock(&files->file_lock);
195   - if (!new_openset || !new_execset)
  261 + new_openset = alloc_fdset(nfds);
  262 + new_execset = alloc_fdset(nfds);
  263 + if (!new_openset || !new_execset)
  264 + goto out;
  265 + fdt->open_fds = new_openset;
  266 + fdt->close_on_exec = new_execset;
  267 + fdt->max_fdset = nfds;
  268 +
  269 + nfds = NR_OPEN_DEFAULT;
  270 + /*
  271 + * Expand to the max in easy steps, and keep expanding it until
  272 + * we have enough for the requested fd array size.
  273 + */
  274 + do {
  275 +#if NR_OPEN_DEFAULT < 256
  276 + if (nfds < 256)
  277 + nfds = 256;
  278 + else
  279 +#endif
  280 + if (nfds < (PAGE_SIZE / sizeof(struct file *)))
  281 + nfds = PAGE_SIZE / sizeof(struct file *);
  282 + else {
  283 + nfds = nfds * 2;
  284 + if (nfds > NR_OPEN)
  285 + nfds = NR_OPEN;
  286 + }
  287 + } while (nfds <= nr);
  288 + new_fds = alloc_fd_array(nfds);
  289 + if (!new_fds)
196 290 goto out;
  291 + fdt->fd = new_fds;
  292 + fdt->max_fds = nfds;
  293 + fdt->free_files = NULL;
  294 + return fdt;
  295 +out:
  296 + if (new_openset)
  297 + free_fdset(new_openset, nfds);
  298 + if (new_execset)
  299 + free_fdset(new_execset, nfds);
  300 + kfree(fdt);
  301 + return NULL;
  302 +}
197 303  
198   - error = 0;
199   -
200   - /* Copy the existing tables and install the new pointers */
  304 +/*
  305 + * Expand the file descriptor table - this allocates a new fdtable
  306 + * together with a new fd array and fdsets. It is expected to be
  307 + * called with files->file_lock held.
  308 + */
  309 +static int expand_fdtable(struct files_struct *files, int nr)
  310 + __releases(files->file_lock)
  311 + __acquires(files->file_lock)
  312 +{
  313 + int error = 0;
  314 + struct fdtable *fdt;
  315 + struct fdtable *nfdt = NULL;
  316 +
  317 + spin_unlock(&files->file_lock);
  318 + nfdt = alloc_fdtable(nr);
  319 + if (!nfdt) {
  320 + error = -ENOMEM;
  321 + spin_lock(&files->file_lock);
  322 + goto out;
  323 + }
  324 +
  325 + spin_lock(&files->file_lock);
201 326 fdt = files_fdtable(files);
202   - if (nfds > fdt->max_fdset) {
203   - int i = fdt->max_fdset / (sizeof(unsigned long) * 8);
204   - int count = (nfds - fdt->max_fdset) / 8;
205   -
206   - /*
207   - * Don't copy the entire array if the current fdset is
208   - * not yet initialised.
209   - */
210   - if (i) {
211   - memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8);
212   - memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8);
213   - memset (&new_openset->fds_bits[i], 0, count);
214   - memset (&new_execset->fds_bits[i], 0, count);
215   - }
216   -
217   - nfds = xchg(&fdt->max_fdset, nfds);
218   - new_openset = xchg(&fdt->open_fds, new_openset);
219   - new_execset = xchg(&fdt->close_on_exec, new_execset);
  327 + /*
  328 + * Check again since another task may have expanded the
  329 + * fd table while we dropped the lock
  330 + */
  331 + if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
  332 + copy_fdtable(nfdt, fdt);
  333 + } else {
  334 + /* Somebody expanded while we dropped file_lock */
220 335 spin_unlock(&files->file_lock);
221   - free_fdset (new_openset, nfds);
222   - free_fdset (new_execset, nfds);
  336 + __free_fdtable(nfdt);
223 337 spin_lock(&files->file_lock);
224   - return 0;
225   - }
226   - /* Somebody expanded the array while we slept ... */
227   -
  338 + goto out;
  339 + }
  340 + rcu_assign_pointer(files->fdt, nfdt);
  341 + free_fdtable(fdt);
228 342 out:
229   - spin_unlock(&files->file_lock);
230   - if (new_openset)
231   - free_fdset(new_openset, nfds);
232   - if (new_execset)
233   - free_fdset(new_execset, nfds);
234   - spin_lock(&files->file_lock);
235 343 return error;
236 344 }
... ... @@ -246,18 +354,37 @@
246 354 struct fdtable *fdt;
247 355  
248 356 fdt = files_fdtable(files);
249   - if (nr >= fdt->max_fdset) {
250   - expand = 1;
251   - if ((err = expand_fdset(files, nr)))
  357 + if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
  358 + if (fdt->max_fdset >= NR_OPEN ||
  359 + fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
  360 + err = -EMFILE;
252 361 goto out;
253   - }
254   - if (nr >= fdt->max_fds) {
  362 + }
255 363 expand = 1;
256   - if ((err = expand_fd_array(files, nr)))
  364 + if ((err = expand_fdtable(files, nr)))
257 365 goto out;
258 366 }
259 367 err = expand;
260 368 out:
261 369 return err;
  370 +}
  371 +
  372 +static void __devinit fdtable_defer_list_init(int cpu)
  373 +{
  374 + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
  375 + spin_lock_init(&fddef->lock);
  376 + INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
  377 + init_timer(&fddef->timer);
  378 + fddef->timer.data = (unsigned long)fddef;
  379 + fddef->timer.function = fdtable_timer;
  380 + fddef->next = NULL;
  381 +}
  382 +
  383 +void __init files_defer_init(void)
  384 +{
  385 + int i;
  386 + /* Really early - can't use for_each_cpu */
  387 + for (i = 0; i < NR_CPUS; i++)
  388 + fdtable_defer_list_init(i);
262 389 }
fs/file_table.c
... ... @@ -14,6 +14,7 @@
14 14 #include <linux/fs.h>
15 15 #include <linux/security.h>
16 16 #include <linux/eventpoll.h>
  17 +#include <linux/rcupdate.h>
17 18 #include <linux/mount.h>
18 19 #include <linux/cdev.h>
19 20 #include <linux/fsnotify.h>
... ... @@ -53,11 +54,17 @@
53 54 spin_unlock_irqrestore(&filp_count_lock, flags);
54 55 }
55 56  
56   -static inline void file_free(struct file *f)
  57 +static inline void file_free_rcu(struct rcu_head *head)
57 58 {
  59 + struct file *f = container_of(head, struct file, f_rcuhead);
58 60 kmem_cache_free(filp_cachep, f);
59 61 }
60 62  
  63 +static inline void file_free(struct file *f)
  64 +{
  65 + call_rcu(&f->f_rcuhead, file_free_rcu);
  66 +}
  67 +
61 68 /* Find an unused file structure and return a pointer to it.
62 69 * Returns NULL, if there are no more free file structures or
63 70 * we run out of memory.
... ... @@ -110,7 +117,7 @@
110 117  
111 118 void fastcall fput(struct file *file)
112 119 {
113   - if (atomic_dec_and_test(&file->f_count))
  120 + if (rcuref_dec_and_test(&file->f_count))
114 121 __fput(file);
115 122 }
... ... @@ -156,11 +163,17 @@
156 163 struct file *file;
157 164 struct files_struct *files = current->files;
158 165  
159   - spin_lock(&files->file_lock);
  166 + rcu_read_lock();
160 167 file = fcheck_files(files, fd);
161   - if (file)
162   - get_file(file);
163   - spin_unlock(&files->file_lock);
  168 + if (file) {
  169 + if (!rcuref_inc_lf(&file->f_count)) {
  170 + /* File object ref couldn't be taken */
  171 + rcu_read_unlock();
  172 + return NULL;
  173 + }
  174 + }
  175 + rcu_read_unlock();
  176 +
164 177 return file;
165 178 }
166 179  
... ... @@ -182,21 +195,25 @@
182 195 if (likely((atomic_read(&files->count) == 1))) {
183 196 file = fcheck_files(files, fd);
184 197 } else {
185   - spin_lock(&files->file_lock);
  198 + rcu_read_lock();
186 199 file = fcheck_files(files, fd);
187 200 if (file) {
188   - get_file(file);
189   - *fput_needed = 1;
  201 + if (rcuref_inc_lf(&file->f_count))
  202 + *fput_needed = 1;
  203 + else
  204 + /* Didn't get the reference, someone's freed */
  205 + file = NULL;
190 206 }
191   - spin_unlock(&files->file_lock);
  207 + rcu_read_unlock();
192 208 }
  209 +
193 210 return file;
194 211 }
195 212  
196 213  
197 214 void put_filp(struct file *file)
198 215 {
199   - if (atomic_dec_and_test(&file->f_count)) {
  216 + if (rcuref_dec_and_test(&file->f_count)) {
200 217 security_file_free(file);
201 218 file_kill(file);
202 219 file_free(file);
... ... @@ -257,5 +274,6 @@
257 274 files_stat.max_files = n;
258 275 if (files_stat.max_files < NR_FILE)
259 276 files_stat.max_files = NR_FILE;
  277 + files_defer_init();
260 278 }
fs/open.c
... ... @@ -24,6 +24,7 @@
24 24 #include <linux/personality.h>
25 25 #include <linux/pagemap.h>
26 26 #include <linux/syscalls.h>
  27 +#include <linux/rcupdate.h>
27 28  
28 29 #include <asm/unistd.h>
29 30  
... ... @@ -930,9 +931,8 @@
930 931 struct fdtable *fdt;
931 932 spin_lock(&files->file_lock);
932 933 fdt = files_fdtable(files);
933   - if (unlikely(fdt->fd[fd] != NULL))
934   - BUG();
935   - fdt->fd[fd] = file;
  934 + BUG_ON(fdt->fd[fd] != NULL);
  935 + rcu_assign_pointer(fdt->fd[fd], file);
936 936 spin_unlock(&files->file_lock);
937 937 }
938 938  
... ... @@ -1024,7 +1024,7 @@
1024 1024 filp = fdt->fd[fd];
1025 1025 if (!filp)
1026 1026 goto out_unlock;
1027   - fdt->fd[fd] = NULL;
  1027 + rcu_assign_pointer(fdt->fd[fd], NULL);
1028 1028 FD_CLR(fd, fdt->close_on_exec);
1029 1029 __put_unused_fd(files, fd);
1030 1030 spin_unlock(&files->file_lock);
include/linux/file.h
... ... @@ -9,6 +9,7 @@
9 9 #include <linux/posix_types.h>
10 10 #include <linux/compiler.h>
11 11 #include <linux/spinlock.h>
  12 +#include <linux/rcupdate.h>
12 13  
13 14 /*
14 15 * The default fd array needs to be at least BITS_PER_LONG,
... ... @@ -23,6 +24,9 @@
23 24 struct file ** fd; /* current fd array */
24 25 fd_set *close_on_exec;
25 26 fd_set *open_fds;
  27 + struct rcu_head rcu;
  28 + struct files_struct *free_files;
  29 + struct fdtable *next;
26 30 };
27 31  
28 32 /*
29 33  
... ... @@ -31,13 +35,14 @@
31 35 struct files_struct {
32 36 atomic_t count;
33 37 spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */
  38 + struct fdtable *fdt;
34 39 struct fdtable fdtab;
35 40 fd_set close_on_exec_init;
36 41 fd_set open_fds_init;
37 42 struct file * fd_array[NR_OPEN_DEFAULT];
38 43 };
39 44  
40   -#define files_fdtable(files) (&(files)->fdtab)
  45 +#define files_fdtable(files) (rcu_dereference((files)->fdt))
41 46  
42 47 extern void FASTCALL(__fput(struct file *));
43 48 extern void FASTCALL(fput(struct file *));
... ... @@ -65,6 +70,8 @@
65 70 extern void free_fdset(fd_set *, int);
66 71  
67 72 extern int expand_files(struct files_struct *, int nr);
  73 +extern void free_fdtable(struct fdtable *fdt);
  74 +extern void __init files_defer_init(void);
68 75  
69 76 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
70 77 {
... ... @@ -72,7 +79,7 @@
72 79 struct fdtable *fdt = files_fdtable(files);
73 80  
74 81 if (fd < fdt->max_fds)
75   - file = fdt->fd[fd];
  82 + file = rcu_dereference(fdt->fd[fd]);
76 83 return file;
77 84 }
78 85  
include/linux/fs.h
... ... @@ -9,6 +9,7 @@
9 9 #include <linux/config.h>
10 10 #include <linux/limits.h>
11 11 #include <linux/ioctl.h>
  12 +#include <linux/rcuref.h>
12 13  
13 14 /*
14 15 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
15 16 * the file limit at runtime and only root can increase the per-process
... ... @@ -597,12 +598,13 @@
597 598 spinlock_t f_ep_lock;
598 599 #endif /* #ifdef CONFIG_EPOLL */
599 600 struct address_space *f_mapping;
  601 + struct rcu_head f_rcuhead;
600 602 };
601 603 extern spinlock_t files_lock;
602 604 #define file_list_lock() spin_lock(&files_lock);
603 605 #define file_list_unlock() spin_unlock(&files_lock);
604 606  
605   -#define get_file(x) atomic_inc(&(x)->f_count)
  607 +#define get_file(x) rcuref_inc(&(x)->f_count)
606 608 #define file_count(x) atomic_read(&(x)->f_count)
607 609  
608 610 #define MAX_NON_LFS ((1UL<<31) - 1)
include/linux/init_task.h
... ... @@ -2,6 +2,7 @@
2 2 #define _LINUX__INIT_TASK_H
3 3  
4 4 #include <linux/file.h>
  5 +#include <linux/rcupdate.h>
5 6  
6 7 #define INIT_FDTABLE \
7 8 { \
8 9  
... ... @@ -11,12 +12,16 @@
11 12 .fd = &init_files.fd_array[0], \
12 13 .close_on_exec = &init_files.close_on_exec_init, \
13 14 .open_fds = &init_files.open_fds_init, \
  15 + .rcu = RCU_HEAD_INIT, \
  16 + .free_files = NULL, \
  17 + .next = NULL, \
14 18 }
15 19  
16 20 #define INIT_FILES \
17 21 { \
18 22 .count = ATOMIC_INIT(1), \
19 23 .file_lock = SPIN_LOCK_UNLOCKED, \
  24 + .fdt = &init_files.fdtab, \
20 25 .fdtab = INIT_FDTABLE, \
21 26 .close_on_exec_init = { { 0, } }, \
22 27 .open_fds_init = { { 0, } }, \
kernel/exit.c
... ... @@ -411,15 +411,16 @@
411 411 close_files(files);
412 412 /*
413 413 * Free the fd and fdset arrays if we expanded them.
  414 + * If the fdtable was embedded, pass the files structure for
  415 + * freeing at the end of the RCU grace period. Otherwise,
  416 + * the files structure can be freed immediately.
414 417 */
415 418 fdt = files_fdtable(files);
416   - if (fdt->fd != &files->fd_array[0])
417   - free_fd_array(fdt->fd, fdt->max_fds);
418   - if (fdt->max_fdset > __FD_SETSIZE) {
419   - free_fdset(fdt->open_fds, fdt->max_fdset);
420   - free_fdset(fdt->close_on_exec, fdt->max_fdset);
421   - }
422   - kmem_cache_free(files_cachep, files);
  419 + if (fdt == &files->fdtab)
  420 + fdt->free_files = files;
  421 + else
  422 + kmem_cache_free(files_cachep, files);
  423 + free_fdtable(fdt);
423 424 }
kernel/fork.c
... ... @@ -35,6 +35,7 @@
35 35 #include <linux/syscalls.h>
36 36 #include <linux/jiffies.h>
37 37 #include <linux/futex.h>
  38 +#include <linux/rcupdate.h>
38 39 #include <linux/ptrace.h>
39 40 #include <linux/mount.h>
40 41 #include <linux/audit.h>
... ... @@ -565,13 +566,12 @@
565 566 return 0;
566 567 }
567 568  
568   -static int count_open_files(struct files_struct *files, int size)
  569 +static int count_open_files(struct fdtable *fdt)
569 570 {
  571 + int size = fdt->max_fdset;
570 572 int i;
571   - struct fdtable *fdt;
572 573  
573 574 /* Find the last open fd */
574   - fdt = files_fdtable(files);
575 575 for (i = size/(8*sizeof(long)); i > 0; ) {
576 576 if (fdt->open_fds->fds_bits[--i])
577 577 break;
578 578 }
... ... @@ -592,13 +592,17 @@
592 592 atomic_set(&newf->count, 1);
593 593  
594 594 spin_lock_init(&newf->file_lock);
595   - fdt = files_fdtable(newf);
  595 + fdt = &newf->fdtab;
596 596 fdt->next_fd = 0;
597 597 fdt->max_fds = NR_OPEN_DEFAULT;
598 598 fdt->max_fdset = __FD_SETSIZE;
599 599 fdt->close_on_exec = &newf->close_on_exec_init;
600 600 fdt->open_fds = &newf->open_fds_init;
601 601 fdt->fd = &newf->fd_array[0];
  602 + INIT_RCU_HEAD(&fdt->rcu);
  603 + fdt->free_files = NULL;
  604 + fdt->next = NULL;
  605 + rcu_assign_pointer(newf->fdt, fdt);
602 606 out:
603 607 return newf;
604 608 }
... ... @@ -637,7 +641,7 @@
637 641 old_fdt = files_fdtable(oldf);
638 642 new_fdt = files_fdtable(newf);
639 643 size = old_fdt->max_fdset;
640   - open_files = count_open_files(oldf, old_fdt->max_fdset);
  644 + open_files = count_open_files(old_fdt);
641 645 expand = 0;
642 646  
... ... @@ -661,7 +665,14 @@
661 665 spin_unlock(&newf->file_lock);
662 666 if (error < 0)
663 667 goto out_release;
  668 + new_fdt = files_fdtable(newf);
  669 + /*
  670 + * Reacquire the oldf lock and a pointer to its fd table;
  671 + * it may have a new, bigger fd table by now. We need
  672 + * the latest pointer.
  673 + */
664 674 spin_lock(&oldf->file_lock);
  675 + old_fdt = files_fdtable(oldf);
665 676 }
666 677  
667 678 old_fds = old_fdt->fd;
... ... @@ -683,7 +694,7 @@
683 694 */
684 695 FD_CLR(open_files - i, new_fdt->open_fds);
685 696 }
686   - *new_fds++ = f;
  697 + rcu_assign_pointer(*new_fds++, f);
687 698 }
688 699 spin_unlock(&oldf->file_lock);
689 700