Commit 39be95e9c8c0b5668c9f8806ffe29bf9f4bc0f40
Committed by: Alex Elder
Parent: c420276a53
Exists in: smarc-l5.0.0_1.0.0-ga and 5 other branches
ceph: ceph_pagelist_append might sleep while atomic
Ceph's encode_caps_cb() worked hard to not call __page_cache_alloc() while holding a lock, but it's spoiled because ceph_pagelist_addpage() always calls kmap(), which might sleep. Here's the result:

[13439.295457] ceph: mds0 reconnect start
[13439.300572] BUG: sleeping function called from invalid context at include/linux/highmem.h:58
[13439.309243] in_atomic(): 1, irqs_disabled(): 0, pid: 12059, name: kworker/1:1
. . .
[13439.376225] Call Trace:
[13439.378757] [<ffffffff81076f4c>] __might_sleep+0xfc/0x110
[13439.384353] [<ffffffffa03f4ce0>] ceph_pagelist_append+0x120/0x1b0 [libceph]
[13439.391491] [<ffffffffa0448fe9>] ceph_encode_locks+0x89/0x190 [ceph]
[13439.398035] [<ffffffff814ee849>] ? _raw_spin_lock+0x49/0x50
[13439.403775] [<ffffffff811cadf5>] ? lock_flocks+0x15/0x20
[13439.409277] [<ffffffffa045e2af>] encode_caps_cb+0x41f/0x4a0 [ceph]
[13439.415622] [<ffffffff81196748>] ? igrab+0x28/0x70
[13439.420610] [<ffffffffa045e9f8>] ? iterate_session_caps+0xe8/0x250 [ceph]
[13439.427584] [<ffffffffa045ea25>] iterate_session_caps+0x115/0x250 [ceph]
[13439.434499] [<ffffffffa045de90>] ? set_request_path_attr+0x2d0/0x2d0 [ceph]
[13439.441646] [<ffffffffa0462888>] send_mds_reconnect+0x238/0x450 [ceph]
[13439.448363] [<ffffffffa0464542>] ? ceph_mdsmap_decode+0x5e2/0x770 [ceph]
[13439.455250] [<ffffffffa0462e42>] check_new_map+0x352/0x500 [ceph]
[13439.461534] [<ffffffffa04631ad>] ceph_mdsc_handle_map+0x1bd/0x260 [ceph]
[13439.468432] [<ffffffff814ebc7e>] ? mutex_unlock+0xe/0x10
[13439.473934] [<ffffffffa043c612>] extra_mon_dispatch+0x22/0x30 [ceph]
[13439.480464] [<ffffffffa03f6c2c>] dispatch+0xbc/0x110 [libceph]
[13439.486492] [<ffffffffa03eec3d>] process_message+0x1ad/0x1d0 [libceph]
[13439.493190] [<ffffffffa03f1498>] ? read_partial_message+0x3e8/0x520 [libceph]
. . .
[13439.587132] ceph: mds0 reconnect success
[13490.720032] ceph: mds0 caps stale
[13501.235257] ceph: mds0 recovery completed
[13501.300419] ceph: mds0 caps renewed

Fix it up by encoding locks into a buffer first, and when the number of encoded locks is stable, copy that into a ceph_pagelist.

[elder@inktank.com: abbreviated the stack info a bit.]

Cc: stable@vger.kernel.org # 3.4+
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@inktank.com>
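Distilled from the encode_caps_cb() hunk in fs/ceph/mds_client.c below, the new flow is a count / allocate / encode / copy sequence. The identifiers are the real ones from the diff, but the surrounding declarations, the record-header append, and the out_free error path are abbreviated here, so treat this as a sketch rather than the verbatim function body:

encode_again:
	/* Phase 1: count existing locks under lock_flocks(); nothing here sleeps. */
	lock_flocks();
	ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
	unlock_flocks();

	/* Phase 2: allocate a flat buffer outside the lock, where sleeping is fine. */
	flocks = kmalloc((num_fcntl_locks + num_flock_locks) *
			 sizeof(struct ceph_filelock), GFP_NOFS);
	if (!flocks)
		return -ENOMEM;	/* the real code sets err and goes to out_free */

	/* Phase 3: re-take the lock and encode; no allocation or kmap() occurs here. */
	lock_flocks();
	err = ceph_encode_locks_to_buffer(inode, flocks,
					  num_fcntl_locks, num_flock_locks);
	unlock_flocks();
	if (err == -ENOSPC) {
		/* Locks were added between counting and encoding; recount and retry. */
		kfree(flocks);
		goto encode_again;
	}

	/*
	 * Phase 4: the number of encoded locks is stable, so copying into the
	 * pagelist (which may sleep in kmap()) is now safe.
	 */
	if (!err)
		err = ceph_locks_to_pagelist(flocks, pagelist,
					     num_fcntl_locks, num_flock_locks);
	kfree(flocks);

The key point is that everything that can sleep (kmalloc, ceph_pagelist_append) now happens with lock_flocks() dropped, while everything done under the lock (counting and encoding into the flat buffer) is non-blocking.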
Showing 3 changed files with 89 additions and 61 deletions
fs/ceph/locks.c
... | ... | @@ -191,29 +191,23 @@ |
191 | 191 | } |
192 | 192 | |
193 | 193 | /** |
194 | - * Encode the flock and fcntl locks for the given inode into the pagelist. | |
195 | - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, | |
196 | - * sequential flock locks. | |
197 | - * Must be called with lock_flocks() already held. | |
198 | - * If we encounter more of a specific lock type than expected, | |
199 | - * we return the value 1. | |
194 | + * Encode the flock and fcntl locks for the given inode into the ceph_filelock | |
195 | + * array. Must be called with lock_flocks() already held. | |
196 | + * If we encounter more of a specific lock type than expected, return -ENOSPC. | |
200 | 197 | */ |
201 | -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |
202 | - int num_fcntl_locks, int num_flock_locks) | |
198 | +int ceph_encode_locks_to_buffer(struct inode *inode, | |
199 | + struct ceph_filelock *flocks, | |
200 | + int num_fcntl_locks, int num_flock_locks) | |
203 | 201 | { |
204 | 202 | struct file_lock *lock; |
205 | - struct ceph_filelock cephlock; | |
206 | 203 | int err = 0; |
207 | 204 | int seen_fcntl = 0; |
208 | 205 | int seen_flock = 0; |
209 | - __le32 nlocks; | |
206 | + int l = 0; | |
210 | 207 | |
211 | 208 | dout("encoding %d flock and %d fcntl locks", num_flock_locks, |
212 | 209 | num_fcntl_locks); |
213 | - nlocks = cpu_to_le32(num_fcntl_locks); | |
214 | - err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | |
215 | - if (err) | |
216 | - goto fail; | |
210 | + | |
217 | 211 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
218 | 212 | if (lock->fl_flags & FL_POSIX) { |
219 | 213 | ++seen_fcntl; |
220 | 214 | |
221 | 215 | |
222 | 216 | |
... | ... | @@ -221,20 +215,12 @@ |
221 | 215 | err = -ENOSPC; |
222 | 216 | goto fail; |
223 | 217 | } |
224 | - err = lock_to_ceph_filelock(lock, &cephlock); | |
218 | + err = lock_to_ceph_filelock(lock, &flocks[l]); | |
225 | 219 | if (err) |
226 | 220 | goto fail; |
227 | - err = ceph_pagelist_append(pagelist, &cephlock, | |
228 | - sizeof(struct ceph_filelock)); | |
221 | + ++l; | |
229 | 222 | } |
230 | - if (err) | |
231 | - goto fail; | |
232 | 223 | } |
233 | - | |
234 | - nlocks = cpu_to_le32(num_flock_locks); | |
235 | - err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | |
236 | - if (err) | |
237 | - goto fail; | |
238 | 224 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
239 | 225 | if (lock->fl_flags & FL_FLOCK) { |
240 | 226 | ++seen_flock; |
241 | 227 | |
242 | 228 | |
243 | 229 | |
... | ... | @@ -242,16 +228,48 @@ |
242 | 228 | err = -ENOSPC; |
243 | 229 | goto fail; |
244 | 230 | } |
245 | - err = lock_to_ceph_filelock(lock, &cephlock); | |
231 | + err = lock_to_ceph_filelock(lock, &flocks[l]); | |
246 | 232 | if (err) |
247 | 233 | goto fail; |
248 | - err = ceph_pagelist_append(pagelist, &cephlock, | |
249 | - sizeof(struct ceph_filelock)); | |
234 | + ++l; | |
250 | 235 | } |
251 | - if (err) | |
252 | - goto fail; | |
253 | 236 | } |
254 | 237 | fail: |
238 | + return err; | |
239 | +} | |
240 | + | |
241 | +/** | |
242 | + * Copy the encoded flock and fcntl locks into the pagelist. | |
243 | + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, | |
244 | + * sequential flock locks. | |
245 | + * Returns zero on success. | |
246 | + */ | |
247 | +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, | |
248 | + struct ceph_pagelist *pagelist, | |
249 | + int num_fcntl_locks, int num_flock_locks) | |
250 | +{ | |
251 | + int err = 0; | |
252 | + __le32 nlocks; | |
253 | + | |
254 | + nlocks = cpu_to_le32(num_fcntl_locks); | |
255 | + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | |
256 | + if (err) | |
257 | + goto out_fail; | |
258 | + | |
259 | + err = ceph_pagelist_append(pagelist, flocks, | |
260 | + num_fcntl_locks * sizeof(*flocks)); | |
261 | + if (err) | |
262 | + goto out_fail; | |
263 | + | |
264 | + nlocks = cpu_to_le32(num_flock_locks); | |
265 | + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | |
266 | + if (err) | |
267 | + goto out_fail; | |
268 | + | |
269 | + err = ceph_pagelist_append(pagelist, | |
270 | + &flocks[num_fcntl_locks], | |
271 | + num_flock_locks * sizeof(*flocks)); | |
272 | +out_fail: | |
255 | 273 | return err; |
256 | 274 | } |
257 | 275 |
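For reference, the pagelist payload appended by the new ceph_locks_to_pagelist() keeps the wire format that ceph_encode_locks() used to produce; schematically, in order:

	__le32 num_fcntl_locks
	struct ceph_filelock fcntl_locks[num_fcntl_locks]
	__le32 num_flock_locks
	struct ceph_filelock flock_locks[num_flock_locks]

This layout is why the caller in encode_caps_cb() (next hunk) sizes rec.v2.flock_len as 2*sizeof(u32) + (num_fcntl_locks + num_flock_locks) * sizeof(struct ceph_filelock).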
fs/ceph/mds_client.c
... | ... | @@ -2478,39 +2478,44 @@ |
2478 | 2478 | |
2479 | 2479 | if (recon_state->flock) { |
2480 | 2480 | int num_fcntl_locks, num_flock_locks; |
2481 | - struct ceph_pagelist_cursor trunc_point; | |
2481 | + struct ceph_filelock *flocks; | |
2482 | 2482 | |
2483 | - ceph_pagelist_set_cursor(pagelist, &trunc_point); | |
2484 | - do { | |
2485 | - lock_flocks(); | |
2486 | - ceph_count_locks(inode, &num_fcntl_locks, | |
2487 | - &num_flock_locks); | |
2488 | - rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + | |
2489 | - (num_fcntl_locks+num_flock_locks) * | |
2490 | - sizeof(struct ceph_filelock)); | |
2491 | - unlock_flocks(); | |
2492 | - | |
2493 | - /* pre-alloc pagelist */ | |
2494 | - ceph_pagelist_truncate(pagelist, &trunc_point); | |
2495 | - err = ceph_pagelist_append(pagelist, &rec, reclen); | |
2496 | - if (!err) | |
2497 | - err = ceph_pagelist_reserve(pagelist, | |
2498 | - rec.v2.flock_len); | |
2499 | - | |
2500 | - /* encode locks */ | |
2501 | - if (!err) { | |
2502 | - lock_flocks(); | |
2503 | - err = ceph_encode_locks(inode, | |
2504 | - pagelist, | |
2505 | - num_fcntl_locks, | |
2506 | - num_flock_locks); | |
2507 | - unlock_flocks(); | |
2508 | - } | |
2509 | - } while (err == -ENOSPC); | |
2483 | +encode_again: | |
2484 | + lock_flocks(); | |
2485 | + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); | |
2486 | + unlock_flocks(); | |
2487 | + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * | |
2488 | + sizeof(struct ceph_filelock), GFP_NOFS); | |
2489 | + if (!flocks) { | |
2490 | + err = -ENOMEM; | |
2491 | + goto out_free; | |
2492 | + } | |
2493 | + lock_flocks(); | |
2494 | + err = ceph_encode_locks_to_buffer(inode, flocks, | |
2495 | + num_fcntl_locks, | |
2496 | + num_flock_locks); | |
2497 | + unlock_flocks(); | |
2498 | + if (err) { | |
2499 | + kfree(flocks); | |
2500 | + if (err == -ENOSPC) | |
2501 | + goto encode_again; | |
2502 | + goto out_free; | |
2503 | + } | |
2504 | + /* | |
2505 | + * number of encoded locks is stable, so copy to pagelist | |
2506 | + */ | |
2507 | + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + | |
2508 | + (num_fcntl_locks+num_flock_locks) * | |
2509 | + sizeof(struct ceph_filelock)); | |
2510 | + err = ceph_pagelist_append(pagelist, &rec, reclen); | |
2511 | + if (!err) | |
2512 | + err = ceph_locks_to_pagelist(flocks, pagelist, | |
2513 | + num_fcntl_locks, | |
2514 | + num_flock_locks); | |
2515 | + kfree(flocks); | |
2510 | 2516 | } else { |
2511 | 2517 | err = ceph_pagelist_append(pagelist, &rec, reclen); |
2512 | 2518 | } |
2513 | - | |
2514 | 2519 | out_free: |
2515 | 2520 | kfree(path); |
2516 | 2521 | out_dput: |
fs/ceph/super.h
... | ... | @@ -822,8 +822,13 @@ |
822 | 822 | extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); |
823 | 823 | extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); |
824 | 824 | extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); |
825 | -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, | |
826 | - int p_locks, int f_locks); | |
825 | +extern int ceph_encode_locks_to_buffer(struct inode *inode, | |
826 | + struct ceph_filelock *flocks, | |
827 | + int num_fcntl_locks, | |
828 | + int num_flock_locks); | |
829 | +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, | |
830 | + struct ceph_pagelist *pagelist, | |
831 | + int num_fcntl_locks, int num_flock_locks); | |
827 | 832 | extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); |
828 | 833 | |
829 | 834 | /* debugfs.c */ |