Commit ab2af1f5005069321c5d130f09cce577b03f43ef
Committed by
Linus Torvalds
1 parent
6e72ad2c58
Exists in
master
and in
4 other branches
[PATCH] files: files struct with RCU
Patch to eliminate struct files_struct.file_lock spinlock on the reader side and use rcu refcounting rcuref_xxx api for the f_count refcounter. The updates to the fdtable are done by allocating a new fdtable structure and setting files->fdt to point to the new structure. The fdtable structure is protected by RCU thereby allowing lock-free lookup. For fd arrays/sets that are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A global list of fdtable to be freed is not scalable, so we use a per-cpu list. If keventd is already handling the current cpu's work, we use a timer to defer queueing of that work. Since the last publication, this patch has been re-written to avoid using explicit memory barriers and use rcu_assign_pointer(), rcu_dereference() premitives instead. This required that the fd information is kept in a separate structure (fdtable) and updated atomically. Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 10 changed files with 345 additions and 166 deletions Side-by-side Diff
fs/aio.c
... | ... | @@ -29,6 +29,7 @@ |
29 | 29 | #include <linux/highmem.h> |
30 | 30 | #include <linux/workqueue.h> |
31 | 31 | #include <linux/security.h> |
32 | +#include <linux/rcuref.h> | |
32 | 33 | |
33 | 34 | #include <asm/kmap_types.h> |
34 | 35 | #include <asm/uaccess.h> |
... | ... | @@ -499,7 +500,7 @@ |
499 | 500 | /* Must be done under the lock to serialise against cancellation. |
500 | 501 | * Call this aio_fput as it duplicates fput via the fput_work. |
501 | 502 | */ |
502 | - if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { | |
503 | + if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { | |
503 | 504 | get_ioctx(ctx); |
504 | 505 | spin_lock(&fput_lock); |
505 | 506 | list_add(&req->ki_list, &fput_head); |
fs/fcntl.c
... | ... | @@ -16,6 +16,7 @@ |
16 | 16 | #include <linux/security.h> |
17 | 17 | #include <linux/ptrace.h> |
18 | 18 | #include <linux/signal.h> |
19 | +#include <linux/rcupdate.h> | |
19 | 20 | |
20 | 21 | #include <asm/poll.h> |
21 | 22 | #include <asm/siginfo.h> |
22 | 23 | |
... | ... | @@ -64,8 +65,8 @@ |
64 | 65 | if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) |
65 | 66 | goto out; |
66 | 67 | |
67 | - fdt = files_fdtable(files); | |
68 | 68 | repeat: |
69 | + fdt = files_fdtable(files); | |
69 | 70 | /* |
70 | 71 | * Someone might have closed fd's in the range |
71 | 72 | * orig_start..fdt->next_fd |
72 | 73 | |
... | ... | @@ -95,9 +96,15 @@ |
95 | 96 | if (error) |
96 | 97 | goto repeat; |
97 | 98 | |
99 | + /* | |
100 | + * We reacquired files_lock, so we are safe as long as | |
101 | + * we reacquire the fdtable pointer and use it while holding | |
102 | + * the lock, no one can free it during that time. | |
103 | + */ | |
104 | + fdt = files_fdtable(files); | |
98 | 105 | if (start <= fdt->next_fd) |
99 | 106 | fdt->next_fd = newfd + 1; |
100 | - | |
107 | + | |
101 | 108 | error = newfd; |
102 | 109 | |
103 | 110 | out: |
... | ... | @@ -163,7 +170,7 @@ |
163 | 170 | if (!tofree && FD_ISSET(newfd, fdt->open_fds)) |
164 | 171 | goto out_fput; |
165 | 172 | |
166 | - fdt->fd[newfd] = file; | |
173 | + rcu_assign_pointer(fdt->fd[newfd], file); | |
167 | 174 | FD_SET(newfd, fdt->open_fds); |
168 | 175 | FD_CLR(newfd, fdt->close_on_exec); |
169 | 176 | spin_unlock(&files->file_lock); |
fs/file.c
... | ... | @@ -13,9 +13,28 @@ |
13 | 13 | #include <linux/vmalloc.h> |
14 | 14 | #include <linux/file.h> |
15 | 15 | #include <linux/bitops.h> |
16 | +#include <linux/interrupt.h> | |
17 | +#include <linux/spinlock.h> | |
18 | +#include <linux/rcupdate.h> | |
19 | +#include <linux/workqueue.h> | |
16 | 20 | |
21 | +struct fdtable_defer { | |
22 | + spinlock_t lock; | |
23 | + struct work_struct wq; | |
24 | + struct timer_list timer; | |
25 | + struct fdtable *next; | |
26 | +}; | |
17 | 27 | |
18 | 28 | /* |
29 | + * We use this list to defer freeing fdtables that have vmalloced | |
30 | + * sets/arrays. By keeping a per-cpu list, we avoid having to embed | |
31 | + * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in | |
32 | + * this per-task structure. | |
33 | + */ | |
34 | +static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); | |
35 | + | |
36 | + | |
37 | +/* | |
19 | 38 | * Allocate an fd array, using kmalloc or vmalloc. |
20 | 39 | * Note: the array isn't cleared at allocation time. |
21 | 40 | */ |
22 | 41 | |
23 | 42 | |
24 | 43 | |
25 | 44 | |
26 | 45 | |
27 | 46 | |
28 | 47 | |
29 | 48 | |
30 | 49 | |
31 | 50 | |
32 | 51 | |
33 | 52 | |
34 | 53 | |
... | ... | @@ -48,85 +67,143 @@ |
48 | 67 | vfree(array); |
49 | 68 | } |
50 | 69 | |
51 | -/* | |
52 | - * Expand the fd array in the files_struct. Called with the files | |
53 | - * spinlock held for write. | |
54 | - */ | |
70 | +static void __free_fdtable(struct fdtable *fdt) | |
71 | +{ | |
72 | + int fdset_size, fdarray_size; | |
55 | 73 | |
56 | -static int expand_fd_array(struct files_struct *files, int nr) | |
57 | - __releases(files->file_lock) | |
58 | - __acquires(files->file_lock) | |
74 | + fdset_size = fdt->max_fdset / 8; | |
75 | + fdarray_size = fdt->max_fds * sizeof(struct file *); | |
76 | + free_fdset(fdt->open_fds, fdset_size); | |
77 | + free_fdset(fdt->close_on_exec, fdset_size); | |
78 | + free_fd_array(fdt->fd, fdarray_size); | |
79 | + kfree(fdt); | |
80 | +} | |
81 | + | |
82 | +static void fdtable_timer(unsigned long data) | |
59 | 83 | { |
60 | - struct file **new_fds; | |
61 | - int error, nfds; | |
62 | - struct fdtable *fdt; | |
84 | + struct fdtable_defer *fddef = (struct fdtable_defer *)data; | |
63 | 85 | |
64 | - | |
65 | - error = -EMFILE; | |
66 | - fdt = files_fdtable(files); | |
67 | - if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) | |
86 | + spin_lock(&fddef->lock); | |
87 | + /* | |
88 | + * If someone already emptied the queue return. | |
89 | + */ | |
90 | + if (!fddef->next) | |
68 | 91 | goto out; |
92 | + if (!schedule_work(&fddef->wq)) | |
93 | + mod_timer(&fddef->timer, 5); | |
94 | +out: | |
95 | + spin_unlock(&fddef->lock); | |
96 | +} | |
69 | 97 | |
70 | - nfds = fdt->max_fds; | |
71 | - spin_unlock(&files->file_lock); | |
98 | +static void free_fdtable_work(struct fdtable_defer *f) | |
99 | +{ | |
100 | + struct fdtable *fdt; | |
72 | 101 | |
73 | - /* | |
74 | - * Expand to the max in easy steps, and keep expanding it until | |
75 | - * we have enough for the requested fd array size. | |
76 | - */ | |
102 | + spin_lock_bh(&f->lock); | |
103 | + fdt = f->next; | |
104 | + f->next = NULL; | |
105 | + spin_unlock_bh(&f->lock); | |
106 | + while(fdt) { | |
107 | + struct fdtable *next = fdt->next; | |
108 | + __free_fdtable(fdt); | |
109 | + fdt = next; | |
110 | + } | |
111 | +} | |
77 | 112 | |
78 | - do { | |
79 | -#if NR_OPEN_DEFAULT < 256 | |
80 | - if (nfds < 256) | |
81 | - nfds = 256; | |
82 | - else | |
83 | -#endif | |
84 | - if (nfds < (PAGE_SIZE / sizeof(struct file *))) | |
85 | - nfds = PAGE_SIZE / sizeof(struct file *); | |
86 | - else { | |
87 | - nfds = nfds * 2; | |
88 | - if (nfds > NR_OPEN) | |
89 | - nfds = NR_OPEN; | |
90 | - } | |
91 | - } while (nfds <= nr); | |
113 | +static void free_fdtable_rcu(struct rcu_head *rcu) | |
114 | +{ | |
115 | + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); | |
116 | + int fdset_size, fdarray_size; | |
117 | + struct fdtable_defer *fddef; | |
92 | 118 | |
93 | - error = -ENOMEM; | |
94 | - new_fds = alloc_fd_array(nfds); | |
95 | - spin_lock(&files->file_lock); | |
96 | - if (!new_fds) | |
97 | - goto out; | |
119 | + BUG_ON(!fdt); | |
120 | + fdset_size = fdt->max_fdset / 8; | |
121 | + fdarray_size = fdt->max_fds * sizeof(struct file *); | |
98 | 122 | |
99 | - /* Copy the existing array and install the new pointer */ | |
100 | - fdt = files_fdtable(files); | |
123 | + if (fdt->free_files) { | |
124 | + /* | |
124 | + * This fdtable was embedded in the files structure | |
126 | + * and the files structure itself was getting destroyed. | |
127 | + * It is now safe to free the files structure. | |
128 | + */ | |
129 | + kmem_cache_free(files_cachep, fdt->free_files); | |
130 | + return; | |
131 | + } | |
132 | + if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) { | |
133 | + /* | |
134 | + * The fdtable was embedded | |
135 | + */ | |
136 | + return; | |
137 | + } | |
138 | + if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { | |
139 | + kfree(fdt->open_fds); | |
140 | + kfree(fdt->close_on_exec); | |
141 | + kfree(fdt->fd); | |
142 | + kfree(fdt); | |
143 | + } else { | |
144 | + fddef = &get_cpu_var(fdtable_defer_list); | |
145 | + spin_lock(&fddef->lock); | |
146 | + fdt->next = fddef->next; | |
147 | + fddef->next = fdt; | |
148 | + /* | |
149 | + * vmallocs are handled from the workqueue context. | |
150 | + * If the per-cpu workqueue is running, then we | |
151 | + * defer work scheduling through a timer. | |
152 | + */ | |
153 | + if (!schedule_work(&fddef->wq)) | |
154 | + mod_timer(&fddef->timer, 5); | |
155 | + spin_unlock(&fddef->lock); | |
156 | + put_cpu_var(fdtable_defer_list); | |
157 | + } | |
158 | +} | |
101 | 159 | |
102 | - if (nfds > fdt->max_fds) { | |
103 | - struct file **old_fds; | |
104 | - int i; | |
105 | - | |
106 | - old_fds = xchg(&fdt->fd, new_fds); | |
107 | - i = xchg(&fdt->max_fds, nfds); | |
160 | +void free_fdtable(struct fdtable *fdt) | |
161 | +{ | |
162 | + if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE || | |
163 | + fdt->max_fds > NR_OPEN_DEFAULT) | |
164 | + call_rcu(&fdt->rcu, free_fdtable_rcu); | |
165 | +} | |
108 | 166 | |
109 | - /* Don't copy/clear the array if we are creating a new | |
110 | - fd array for fork() */ | |
111 | - if (i) { | |
112 | - memcpy(new_fds, old_fds, i * sizeof(struct file *)); | |
113 | - /* clear the remainder of the array */ | |
114 | - memset(&new_fds[i], 0, | |
115 | - (nfds-i) * sizeof(struct file *)); | |
167 | +/* | |
168 | + * Expand the fdset in the files_struct. Called with the files spinlock | |
169 | + * held for write. | |
170 | + */ | |
171 | +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) | |
172 | +{ | |
173 | + int i; | |
174 | + int count; | |
116 | 175 | |
117 | - spin_unlock(&files->file_lock); | |
118 | - free_fd_array(old_fds, i); | |
119 | - spin_lock(&files->file_lock); | |
120 | - } | |
121 | - } else { | |
122 | - /* Somebody expanded the array while we slept ... */ | |
123 | - spin_unlock(&files->file_lock); | |
124 | - free_fd_array(new_fds, nfds); | |
125 | - spin_lock(&files->file_lock); | |
176 | + BUG_ON(nfdt->max_fdset < fdt->max_fdset); | |
177 | + BUG_ON(nfdt->max_fds < fdt->max_fds); | |
178 | + /* Copy the existing tables and install the new pointers */ | |
179 | + | |
180 | + i = fdt->max_fdset / (sizeof(unsigned long) * 8); | |
181 | + count = (nfdt->max_fdset - fdt->max_fdset) / 8; | |
182 | + | |
183 | + /* | |
184 | + * Don't copy the entire array if the current fdset is | |
185 | + * not yet initialised. | |
186 | + */ | |
187 | + if (i) { | |
188 | + memcpy (nfdt->open_fds, fdt->open_fds, | |
189 | + fdt->max_fdset/8); | |
190 | + memcpy (nfdt->close_on_exec, fdt->close_on_exec, | |
191 | + fdt->max_fdset/8); | |
192 | + memset (&nfdt->open_fds->fds_bits[i], 0, count); | |
193 | + memset (&nfdt->close_on_exec->fds_bits[i], 0, count); | |
126 | 194 | } |
127 | - error = 0; | |
128 | -out: | |
129 | - return error; | |
195 | + | |
196 | + /* Don't copy/clear the array if we are creating a new | |
197 | + fd array for fork() */ | |
198 | + if (fdt->max_fds) { | |
199 | + memcpy(nfdt->fd, fdt->fd, | |
200 | + fdt->max_fds * sizeof(struct file *)); | |
201 | + /* clear the remainder of the array */ | |
202 | + memset(&nfdt->fd[fdt->max_fds], 0, | |
203 | + (nfdt->max_fds - fdt->max_fds) * | |
204 | + sizeof(struct file *)); | |
205 | + } | |
206 | + nfdt->next_fd = fdt->next_fd; | |
130 | 207 | } |
131 | 208 | |
132 | 209 | /* |
133 | 210 | |
134 | 211 | |
135 | 212 | |
... | ... | @@ -157,28 +234,21 @@ |
157 | 234 | vfree(array); |
158 | 235 | } |
159 | 236 | |
160 | -/* | |
161 | - * Expand the fdset in the files_struct. Called with the files spinlock | |
162 | - * held for write. | |
163 | - */ | |
164 | -static int expand_fdset(struct files_struct *files, int nr) | |
165 | - __releases(file->file_lock) | |
166 | - __acquires(file->file_lock) | |
237 | +static struct fdtable *alloc_fdtable(int nr) | |
167 | 238 | { |
168 | - fd_set *new_openset = NULL, *new_execset = NULL; | |
169 | - int error, nfds = 0; | |
170 | - struct fdtable *fdt; | |
239 | + struct fdtable *fdt = NULL; | |
240 | + int nfds = 0; | |
241 | + fd_set *new_openset = NULL, *new_execset = NULL; | |
242 | + struct file **new_fds; | |
171 | 243 | |
172 | - error = -EMFILE; | |
173 | - fdt = files_fdtable(files); | |
174 | - if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN) | |
175 | - goto out; | |
244 | + fdt = kmalloc(sizeof(*fdt), GFP_KERNEL); | |
245 | + if (!fdt) | |
246 | + goto out; | |
247 | + memset(fdt, 0, sizeof(*fdt)); | |
176 | 248 | |
177 | - nfds = fdt->max_fdset; | |
178 | - spin_unlock(&files->file_lock); | |
179 | - | |
180 | - /* Expand to the max in easy steps */ | |
181 | - do { | |
249 | + nfds = __FD_SETSIZE; | |
250 | + /* Expand to the max in easy steps */ | |
251 | + do { | |
182 | 252 | if (nfds < (PAGE_SIZE * 8)) |
183 | 253 | nfds = PAGE_SIZE * 8; |
184 | 254 | else { |
185 | 255 | |
186 | 256 | |
187 | 257 | |
188 | 258 | |
189 | 259 | |
190 | 260 | |
... | ... | @@ -188,50 +258,88 @@ |
188 | 258 | } |
189 | 259 | } while (nfds <= nr); |
190 | 260 | |
191 | - error = -ENOMEM; | |
192 | - new_openset = alloc_fdset(nfds); | |
193 | - new_execset = alloc_fdset(nfds); | |
194 | - spin_lock(&files->file_lock); | |
195 | - if (!new_openset || !new_execset) | |
261 | + new_openset = alloc_fdset(nfds); | |
262 | + new_execset = alloc_fdset(nfds); | |
263 | + if (!new_openset || !new_execset) | |
264 | + goto out; | |
265 | + fdt->open_fds = new_openset; | |
266 | + fdt->close_on_exec = new_execset; | |
267 | + fdt->max_fdset = nfds; | |
268 | + | |
269 | + nfds = NR_OPEN_DEFAULT; | |
270 | + /* | |
271 | + * Expand to the max in easy steps, and keep expanding it until | |
272 | + * we have enough for the requested fd array size. | |
273 | + */ | |
274 | + do { | |
275 | +#if NR_OPEN_DEFAULT < 256 | |
276 | + if (nfds < 256) | |
277 | + nfds = 256; | |
278 | + else | |
279 | +#endif | |
280 | + if (nfds < (PAGE_SIZE / sizeof(struct file *))) | |
281 | + nfds = PAGE_SIZE / sizeof(struct file *); | |
282 | + else { | |
283 | + nfds = nfds * 2; | |
284 | + if (nfds > NR_OPEN) | |
285 | + nfds = NR_OPEN; | |
286 | + } | |
287 | + } while (nfds <= nr); | |
288 | + new_fds = alloc_fd_array(nfds); | |
289 | + if (!new_fds) | |
196 | 290 | goto out; |
291 | + fdt->fd = new_fds; | |
292 | + fdt->max_fds = nfds; | |
293 | + fdt->free_files = NULL; | |
294 | + return fdt; | |
295 | +out: | |
296 | + if (new_openset) | |
297 | + free_fdset(new_openset, nfds); | |
298 | + if (new_execset) | |
299 | + free_fdset(new_execset, nfds); | |
300 | + kfree(fdt); | |
301 | + return NULL; | |
302 | +} | |
197 | 303 | |
198 | - error = 0; | |
199 | - | |
200 | - /* Copy the existing tables and install the new pointers */ | |
304 | +/* | |
305 | + * Expands the file descriptor table - it will allocate a new fdtable and | |
306 | + * both fd array and fdset. It is expected to be called with the | |
307 | + * files_lock held. | |
308 | + */ | |
309 | +static int expand_fdtable(struct files_struct *files, int nr) | |
310 | + __releases(files->file_lock) | |
311 | + __acquires(files->file_lock) | |
312 | +{ | |
313 | + int error = 0; | |
314 | + struct fdtable *fdt; | |
315 | + struct fdtable *nfdt = NULL; | |
316 | + | |
317 | + spin_unlock(&files->file_lock); | |
318 | + nfdt = alloc_fdtable(nr); | |
319 | + if (!nfdt) { | |
320 | + error = -ENOMEM; | |
321 | + spin_lock(&files->file_lock); | |
322 | + goto out; | |
323 | + } | |
324 | + | |
325 | + spin_lock(&files->file_lock); | |
201 | 326 | fdt = files_fdtable(files); |
202 | - if (nfds > fdt->max_fdset) { | |
203 | - int i = fdt->max_fdset / (sizeof(unsigned long) * 8); | |
204 | - int count = (nfds - fdt->max_fdset) / 8; | |
205 | - | |
206 | - /* | |
207 | - * Don't copy the entire array if the current fdset is | |
208 | - * not yet initialised. | |
209 | - */ | |
210 | - if (i) { | |
211 | - memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8); | |
212 | - memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8); | |
213 | - memset (&new_openset->fds_bits[i], 0, count); | |
214 | - memset (&new_execset->fds_bits[i], 0, count); | |
215 | - } | |
216 | - | |
217 | - nfds = xchg(&fdt->max_fdset, nfds); | |
218 | - new_openset = xchg(&fdt->open_fds, new_openset); | |
219 | - new_execset = xchg(&fdt->close_on_exec, new_execset); | |
327 | + /* | |
328 | + * Check again since another task may have expanded the | |
329 | + * fd table while we dropped the lock | |
330 | + */ | |
331 | + if (nr >= fdt->max_fds || nr >= fdt->max_fdset) { | |
332 | + copy_fdtable(nfdt, fdt); | |
333 | + } else { | |
334 | + /* Somebody expanded while we dropped file_lock */ | |
220 | 335 | spin_unlock(&files->file_lock); |
221 | - free_fdset (new_openset, nfds); | |
222 | - free_fdset (new_execset, nfds); | |
336 | + __free_fdtable(nfdt); | |
223 | 337 | spin_lock(&files->file_lock); |
224 | - return 0; | |
225 | - } | |
226 | - /* Somebody expanded the array while we slept ... */ | |
227 | - | |
338 | + goto out; | |
339 | + } | |
340 | + rcu_assign_pointer(files->fdt, nfdt); | |
341 | + free_fdtable(fdt); | |
228 | 342 | out: |
229 | - spin_unlock(&files->file_lock); | |
230 | - if (new_openset) | |
231 | - free_fdset(new_openset, nfds); | |
232 | - if (new_execset) | |
233 | - free_fdset(new_execset, nfds); | |
234 | - spin_lock(&files->file_lock); | |
235 | 343 | return error; |
236 | 344 | } |
237 | 345 | |
238 | 346 | |
239 | 347 | |
240 | 348 | |
... | ... | @@ -246,18 +354,37 @@ |
246 | 354 | struct fdtable *fdt; |
247 | 355 | |
248 | 356 | fdt = files_fdtable(files); |
249 | - if (nr >= fdt->max_fdset) { | |
250 | - expand = 1; | |
251 | - if ((err = expand_fdset(files, nr))) | |
357 | + if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { | |
358 | + if (fdt->max_fdset >= NR_OPEN || | |
359 | + fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { | |
360 | + err = -EMFILE; | |
252 | 361 | goto out; |
253 | - } | |
254 | - if (nr >= fdt->max_fds) { | |
362 | + } | |
255 | 363 | expand = 1; |
256 | - if ((err = expand_fd_array(files, nr))) | |
364 | + if ((err = expand_fdtable(files, nr))) | |
257 | 365 | goto out; |
258 | 366 | } |
259 | 367 | err = expand; |
260 | 368 | out: |
261 | 369 | return err; |
370 | +} | |
371 | + | |
372 | +static void __devinit fdtable_defer_list_init(int cpu) | |
373 | +{ | |
374 | + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); | |
375 | + spin_lock_init(&fddef->lock); | |
376 | + INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef); | |
377 | + init_timer(&fddef->timer); | |
378 | + fddef->timer.data = (unsigned long)fddef; | |
379 | + fddef->timer.function = fdtable_timer; | |
380 | + fddef->next = NULL; | |
381 | +} | |
382 | + | |
383 | +void __init files_defer_init(void) | |
384 | +{ | |
385 | + int i; | |
386 | + /* Really early - can't use for_each_cpu */ | |
387 | + for (i = 0; i < NR_CPUS; i++) | |
388 | + fdtable_defer_list_init(i); | |
262 | 389 | } |
fs/file_table.c
... | ... | @@ -14,6 +14,7 @@ |
14 | 14 | #include <linux/fs.h> |
15 | 15 | #include <linux/security.h> |
16 | 16 | #include <linux/eventpoll.h> |
17 | +#include <linux/rcupdate.h> | |
17 | 18 | #include <linux/mount.h> |
18 | 19 | #include <linux/cdev.h> |
19 | 20 | #include <linux/fsnotify.h> |
20 | 21 | |
21 | 22 | |
... | ... | @@ -53,11 +54,17 @@ |
53 | 54 | spin_unlock_irqrestore(&filp_count_lock, flags); |
54 | 55 | } |
55 | 56 | |
56 | -static inline void file_free(struct file *f) | |
57 | +static inline void file_free_rcu(struct rcu_head *head) | |
57 | 58 | { |
59 | + struct file *f = container_of(head, struct file, f_rcuhead); | |
58 | 60 | kmem_cache_free(filp_cachep, f); |
59 | 61 | } |
60 | 62 | |
63 | +static inline void file_free(struct file *f) | |
64 | +{ | |
65 | + call_rcu(&f->f_rcuhead, file_free_rcu); | |
66 | +} | |
67 | + | |
61 | 68 | /* Find an unused file structure and return a pointer to it. |
62 | 69 | * Returns NULL, if there are no more free file structures or |
63 | 70 | * we run out of memory. |
... | ... | @@ -110,7 +117,7 @@ |
110 | 117 | |
111 | 118 | void fastcall fput(struct file *file) |
112 | 119 | { |
113 | - if (atomic_dec_and_test(&file->f_count)) | |
120 | + if (rcuref_dec_and_test(&file->f_count)) | |
114 | 121 | __fput(file); |
115 | 122 | } |
116 | 123 | |
117 | 124 | |
... | ... | @@ -156,11 +163,17 @@ |
156 | 163 | struct file *file; |
157 | 164 | struct files_struct *files = current->files; |
158 | 165 | |
159 | - spin_lock(&files->file_lock); | |
166 | + rcu_read_lock(); | |
160 | 167 | file = fcheck_files(files, fd); |
161 | - if (file) | |
162 | - get_file(file); | |
163 | - spin_unlock(&files->file_lock); | |
168 | + if (file) { | |
169 | + if (!rcuref_inc_lf(&file->f_count)) { | |
170 | + /* File object ref couldn't be taken */ | |
171 | + rcu_read_unlock(); | |
172 | + return NULL; | |
173 | + } | |
174 | + } | |
175 | + rcu_read_unlock(); | |
176 | + | |
164 | 177 | return file; |
165 | 178 | } |
166 | 179 | |
167 | 180 | |
168 | 181 | |
169 | 182 | |
170 | 183 | |
... | ... | @@ -182,21 +195,25 @@ |
182 | 195 | if (likely((atomic_read(&files->count) == 1))) { |
183 | 196 | file = fcheck_files(files, fd); |
184 | 197 | } else { |
185 | - spin_lock(&files->file_lock); | |
198 | + rcu_read_lock(); | |
186 | 199 | file = fcheck_files(files, fd); |
187 | 200 | if (file) { |
188 | - get_file(file); | |
189 | - *fput_needed = 1; | |
201 | + if (rcuref_inc_lf(&file->f_count)) | |
202 | + *fput_needed = 1; | |
203 | + else | |
204 | + /* Didn't get the reference, someone's freed */ | |
205 | + file = NULL; | |
190 | 206 | } |
191 | - spin_unlock(&files->file_lock); | |
207 | + rcu_read_unlock(); | |
192 | 208 | } |
209 | + | |
193 | 210 | return file; |
194 | 211 | } |
195 | 212 | |
196 | 213 | |
197 | 214 | void put_filp(struct file *file) |
198 | 215 | { |
199 | - if (atomic_dec_and_test(&file->f_count)) { | |
216 | + if (rcuref_dec_and_test(&file->f_count)) { | |
200 | 217 | security_file_free(file); |
201 | 218 | file_kill(file); |
202 | 219 | file_free(file); |
... | ... | @@ -257,5 +274,6 @@ |
257 | 274 | files_stat.max_files = n; |
258 | 275 | if (files_stat.max_files < NR_FILE) |
259 | 276 | files_stat.max_files = NR_FILE; |
277 | + files_defer_init(); | |
260 | 278 | } |
fs/open.c
... | ... | @@ -24,6 +24,7 @@ |
24 | 24 | #include <linux/personality.h> |
25 | 25 | #include <linux/pagemap.h> |
26 | 26 | #include <linux/syscalls.h> |
27 | +#include <linux/rcupdate.h> | |
27 | 28 | |
28 | 29 | #include <asm/unistd.h> |
29 | 30 | |
... | ... | @@ -930,9 +931,8 @@ |
930 | 931 | struct fdtable *fdt; |
931 | 932 | spin_lock(&files->file_lock); |
932 | 933 | fdt = files_fdtable(files); |
933 | - if (unlikely(fdt->fd[fd] != NULL)) | |
934 | - BUG(); | |
935 | - fdt->fd[fd] = file; | |
934 | + BUG_ON(fdt->fd[fd] != NULL); | |
935 | + rcu_assign_pointer(fdt->fd[fd], file); | |
936 | 936 | spin_unlock(&files->file_lock); |
937 | 937 | } |
938 | 938 | |
... | ... | @@ -1024,7 +1024,7 @@ |
1024 | 1024 | filp = fdt->fd[fd]; |
1025 | 1025 | if (!filp) |
1026 | 1026 | goto out_unlock; |
1027 | - fdt->fd[fd] = NULL; | |
1027 | + rcu_assign_pointer(fdt->fd[fd], NULL); | |
1028 | 1028 | FD_CLR(fd, fdt->close_on_exec); |
1029 | 1029 | __put_unused_fd(files, fd); |
1030 | 1030 | spin_unlock(&files->file_lock); |
include/linux/file.h
... | ... | @@ -9,6 +9,7 @@ |
9 | 9 | #include <linux/posix_types.h> |
10 | 10 | #include <linux/compiler.h> |
11 | 11 | #include <linux/spinlock.h> |
12 | +#include <linux/rcupdate.h> | |
12 | 13 | |
13 | 14 | /* |
14 | 15 | * The default fd array needs to be at least BITS_PER_LONG, |
... | ... | @@ -23,6 +24,9 @@ |
23 | 24 | struct file ** fd; /* current fd array */ |
24 | 25 | fd_set *close_on_exec; |
25 | 26 | fd_set *open_fds; |
27 | + struct rcu_head rcu; | |
28 | + struct files_struct *free_files; | |
29 | + struct fdtable *next; | |
26 | 30 | }; |
27 | 31 | |
28 | 32 | /* |
29 | 33 | |
... | ... | @@ -31,13 +35,14 @@ |
31 | 35 | struct files_struct { |
32 | 36 | atomic_t count; |
33 | 37 | spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ |
38 | + struct fdtable *fdt; | |
34 | 39 | struct fdtable fdtab; |
35 | 40 | fd_set close_on_exec_init; |
36 | 41 | fd_set open_fds_init; |
37 | 42 | struct file * fd_array[NR_OPEN_DEFAULT]; |
38 | 43 | }; |
39 | 44 | |
40 | -#define files_fdtable(files) (&(files)->fdtab) | |
45 | +#define files_fdtable(files) (rcu_dereference((files)->fdt)) | |
41 | 46 | |
42 | 47 | extern void FASTCALL(__fput(struct file *)); |
43 | 48 | extern void FASTCALL(fput(struct file *)); |
... | ... | @@ -65,6 +70,8 @@ |
65 | 70 | extern void free_fdset(fd_set *, int); |
66 | 71 | |
67 | 72 | extern int expand_files(struct files_struct *, int nr); |
73 | +extern void free_fdtable(struct fdtable *fdt); | |
74 | +extern void __init files_defer_init(void); | |
68 | 75 | |
69 | 76 | static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) |
70 | 77 | { |
... | ... | @@ -72,7 +79,7 @@ |
72 | 79 | struct fdtable *fdt = files_fdtable(files); |
73 | 80 | |
74 | 81 | if (fd < fdt->max_fds) |
75 | - file = fdt->fd[fd]; | |
82 | + file = rcu_dereference(fdt->fd[fd]); | |
76 | 83 | return file; |
77 | 84 | } |
78 | 85 |
include/linux/fs.h
... | ... | @@ -9,6 +9,7 @@ |
9 | 9 | #include <linux/config.h> |
10 | 10 | #include <linux/limits.h> |
11 | 11 | #include <linux/ioctl.h> |
12 | +#include <linux/rcuref.h> | |
12 | 13 | |
13 | 14 | /* |
14 | 15 | * It's silly to have NR_OPEN bigger than NR_FILE, but you can change |
15 | 16 | |
... | ... | @@ -597,12 +598,13 @@ |
597 | 598 | spinlock_t f_ep_lock; |
598 | 599 | #endif /* #ifdef CONFIG_EPOLL */ |
599 | 600 | struct address_space *f_mapping; |
601 | + struct rcu_head f_rcuhead; | |
600 | 602 | }; |
601 | 603 | extern spinlock_t files_lock; |
602 | 604 | #define file_list_lock() spin_lock(&files_lock); |
603 | 605 | #define file_list_unlock() spin_unlock(&files_lock); |
604 | 606 | |
605 | -#define get_file(x) atomic_inc(&(x)->f_count) | |
607 | +#define get_file(x) rcuref_inc(&(x)->f_count) | |
606 | 608 | #define file_count(x) atomic_read(&(x)->f_count) |
607 | 609 | |
608 | 610 | #define MAX_NON_LFS ((1UL<<31) - 1) |
include/linux/init_task.h
... | ... | @@ -2,6 +2,7 @@ |
2 | 2 | #define _LINUX__INIT_TASK_H |
3 | 3 | |
4 | 4 | #include <linux/file.h> |
5 | +#include <linux/rcupdate.h> | |
5 | 6 | |
6 | 7 | #define INIT_FDTABLE \ |
7 | 8 | { \ |
8 | 9 | |
... | ... | @@ -11,12 +12,16 @@ |
11 | 12 | .fd = &init_files.fd_array[0], \ |
12 | 13 | .close_on_exec = &init_files.close_on_exec_init, \ |
13 | 14 | .open_fds = &init_files.open_fds_init, \ |
15 | + .rcu = RCU_HEAD_INIT, \ | |
16 | + .free_files = NULL, \ | |
17 | + .next = NULL, \ | |
14 | 18 | } |
15 | 19 | |
16 | 20 | #define INIT_FILES \ |
17 | 21 | { \ |
18 | 22 | .count = ATOMIC_INIT(1), \ |
19 | 23 | .file_lock = SPIN_LOCK_UNLOCKED, \ |
24 | + .fdt = &init_files.fdtab, \ | |
20 | 25 | .fdtab = INIT_FDTABLE, \ |
21 | 26 | .close_on_exec_init = { { 0, } }, \ |
22 | 27 | .open_fds_init = { { 0, } }, \ |
kernel/exit.c
... | ... | @@ -411,15 +411,16 @@ |
411 | 411 | close_files(files); |
412 | 412 | /* |
413 | 413 | * Free the fd and fdset arrays if we expanded them. |
414 | + * If the fdtable was embedded, pass files for freeing | |
415 | + * at the end of the RCU grace period. Otherwise, | |
416 | + * you can free files immediately. | |
414 | 417 | */ |
415 | 418 | fdt = files_fdtable(files); |
416 | - if (fdt->fd != &files->fd_array[0]) | |
417 | - free_fd_array(fdt->fd, fdt->max_fds); | |
418 | - if (fdt->max_fdset > __FD_SETSIZE) { | |
419 | - free_fdset(fdt->open_fds, fdt->max_fdset); | |
420 | - free_fdset(fdt->close_on_exec, fdt->max_fdset); | |
421 | - } | |
422 | - kmem_cache_free(files_cachep, files); | |
419 | + if (fdt == &files->fdtab) | |
420 | + fdt->free_files = files; | |
421 | + else | |
422 | + kmem_cache_free(files_cachep, files); | |
423 | + free_fdtable(fdt); | |
423 | 424 | } |
424 | 425 | } |
425 | 426 |
kernel/fork.c
... | ... | @@ -35,6 +35,7 @@ |
35 | 35 | #include <linux/syscalls.h> |
36 | 36 | #include <linux/jiffies.h> |
37 | 37 | #include <linux/futex.h> |
38 | +#include <linux/rcupdate.h> | |
38 | 39 | #include <linux/ptrace.h> |
39 | 40 | #include <linux/mount.h> |
40 | 41 | #include <linux/audit.h> |
41 | 42 | |
42 | 43 | |
43 | 44 | |
... | ... | @@ -565,13 +566,12 @@ |
565 | 566 | return 0; |
566 | 567 | } |
567 | 568 | |
568 | -static int count_open_files(struct files_struct *files, int size) | |
569 | +static int count_open_files(struct fdtable *fdt) | |
569 | 570 | { |
571 | + int size = fdt->max_fdset; | |
570 | 572 | int i; |
571 | - struct fdtable *fdt; | |
572 | 573 | |
573 | 574 | /* Find the last open fd */ |
574 | - fdt = files_fdtable(files); | |
575 | 575 | for (i = size/(8*sizeof(long)); i > 0; ) { |
576 | 576 | if (fdt->open_fds->fds_bits[--i]) |
577 | 577 | break; |
578 | 578 | |
... | ... | @@ -592,13 +592,17 @@ |
592 | 592 | atomic_set(&newf->count, 1); |
593 | 593 | |
594 | 594 | spin_lock_init(&newf->file_lock); |
595 | - fdt = files_fdtable(newf); | |
595 | + fdt = &newf->fdtab; | |
596 | 596 | fdt->next_fd = 0; |
597 | 597 | fdt->max_fds = NR_OPEN_DEFAULT; |
598 | 598 | fdt->max_fdset = __FD_SETSIZE; |
599 | 599 | fdt->close_on_exec = &newf->close_on_exec_init; |
600 | 600 | fdt->open_fds = &newf->open_fds_init; |
601 | 601 | fdt->fd = &newf->fd_array[0]; |
602 | + INIT_RCU_HEAD(&fdt->rcu); | |
603 | + fdt->free_files = NULL; | |
604 | + fdt->next = NULL; | |
605 | + rcu_assign_pointer(newf->fdt, fdt); | |
602 | 606 | out: |
603 | 607 | return newf; |
604 | 608 | } |
... | ... | @@ -637,7 +641,7 @@ |
637 | 641 | old_fdt = files_fdtable(oldf); |
638 | 642 | new_fdt = files_fdtable(newf); |
639 | 643 | size = old_fdt->max_fdset; |
640 | - open_files = count_open_files(oldf, old_fdt->max_fdset); | |
644 | + open_files = count_open_files(old_fdt); | |
641 | 645 | expand = 0; |
642 | 646 | |
643 | 647 | /* |
644 | 648 | |
... | ... | @@ -661,7 +665,14 @@ |
661 | 665 | spin_unlock(&newf->file_lock); |
662 | 666 | if (error < 0) |
663 | 667 | goto out_release; |
668 | + new_fdt = files_fdtable(newf); | |
669 | + /* | |
670 | + * Reacquire the oldf lock and a pointer to its fd table | |
671 | + * who knows it may have a new bigger fd table. We need | |
672 | + * the latest pointer. | |
673 | + */ | |
664 | 674 | spin_lock(&oldf->file_lock); |
675 | + old_fdt = files_fdtable(oldf); | |
665 | 676 | } |
666 | 677 | |
667 | 678 | old_fds = old_fdt->fd; |
... | ... | @@ -683,7 +694,7 @@ |
683 | 694 | */ |
684 | 695 | FD_CLR(open_files - i, new_fdt->open_fds); |
685 | 696 | } |
686 | - *new_fds++ = f; | |
697 | + rcu_assign_pointer(*new_fds++, f); | |
687 | 698 | } |
688 | 699 | spin_unlock(&oldf->file_lock); |
689 | 700 |