commit 39be95e9c8c0b5668c9f8806ffe29bf9f4bc0f40
Author:    Jim Schutt <jaschut@sandia.gov>
Committer: Alex Elder <elder@inktank.com>
Parent:    c420276a53

ceph: ceph_pagelist_append might sleep while atomic

Ceph's encode_caps_cb() worked hard to not call __page_cache_alloc()
while holding a lock, but it's spoiled because ceph_pagelist_addpage()
always calls kmap(), which might sleep.  Here's the result:

[13439.295457] ceph: mds0 reconnect start
[13439.300572] BUG: sleeping function called from invalid context at include/linux/highmem.h:58
[13439.309243] in_atomic(): 1, irqs_disabled(): 0, pid: 12059, name: kworker/1:1
    . . .
[13439.376225] Call Trace:
[13439.378757]  [<ffffffff81076f4c>] __might_sleep+0xfc/0x110
[13439.384353]  [<ffffffffa03f4ce0>] ceph_pagelist_append+0x120/0x1b0 [libceph]
[13439.391491]  [<ffffffffa0448fe9>] ceph_encode_locks+0x89/0x190 [ceph]
[13439.398035]  [<ffffffff814ee849>] ? _raw_spin_lock+0x49/0x50
[13439.403775]  [<ffffffff811cadf5>] ? lock_flocks+0x15/0x20
[13439.409277]  [<ffffffffa045e2af>] encode_caps_cb+0x41f/0x4a0 [ceph]
[13439.415622]  [<ffffffff81196748>] ? igrab+0x28/0x70
[13439.420610]  [<ffffffffa045e9f8>] ? iterate_session_caps+0xe8/0x250 [ceph]
[13439.427584]  [<ffffffffa045ea25>] iterate_session_caps+0x115/0x250 [ceph]
[13439.434499]  [<ffffffffa045de90>] ? set_request_path_attr+0x2d0/0x2d0 [ceph]
[13439.441646]  [<ffffffffa0462888>] send_mds_reconnect+0x238/0x450 [ceph]
[13439.448363]  [<ffffffffa0464542>] ? ceph_mdsmap_decode+0x5e2/0x770 [ceph]
[13439.455250]  [<ffffffffa0462e42>] check_new_map+0x352/0x500 [ceph]
[13439.461534]  [<ffffffffa04631ad>] ceph_mdsc_handle_map+0x1bd/0x260 [ceph]
[13439.468432]  [<ffffffff814ebc7e>] ? mutex_unlock+0xe/0x10
[13439.473934]  [<ffffffffa043c612>] extra_mon_dispatch+0x22/0x30 [ceph]
[13439.480464]  [<ffffffffa03f6c2c>] dispatch+0xbc/0x110 [libceph]
[13439.486492]  [<ffffffffa03eec3d>] process_message+0x1ad/0x1d0 [libceph]
[13439.493190]  [<ffffffffa03f1498>] ? read_partial_message+0x3e8/0x520 [libceph]
    . . .
[13439.587132] ceph: mds0 reconnect success
[13490.720032] ceph: mds0 caps stale
[13501.235257] ceph: mds0 recovery completed
[13501.300419] ceph: mds0 caps renewed
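
For context: in kernels of this vintage, lock_flocks() is a thin wrapper
around a spinlock, so everything up to the matching unlock_flocks() runs
in atomic context.  Reduced to a sketch, the sequence the trace above
complains about is:

	lock_flocks();			/* spin_lock(): atomic from here on */
	...
	ceph_pagelist_append(pagelist, ...);
		/* may call ceph_pagelist_addpage(), which always calls
		 * kmap(), which might sleep -> the __might_sleep() splat */
	...
	unlock_flocks();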

Fix it up by encoding locks into a buffer first, and when the number
of encoded locks is stable, copy that into a ceph_pagelist.
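
The split is visible in the new prototypes (from fs/ceph/super.h, the
last file in the diff below):

	/* pass 1: runs under lock_flocks(); writes only into the caller's
	 * preallocated array, so it never sleeps; returns -ENOSPC if more
	 * locks turned up than were counted */
	int ceph_encode_locks_to_buffer(struct inode *inode,
					struct ceph_filelock *flocks,
					int num_fcntl_locks, int num_flock_locks);

	/* pass 2: called with no spinlocks held; ceph_pagelist_append()
	 * may kmap() and therefore sleep, which is now harmless */
	int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
				   struct ceph_pagelist *pagelist,
				   int num_fcntl_locks, int num_flock_locks);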

[elder@inktank.com: abbreviated the stack info a bit.]

Cc: stable@vger.kernel.org # 3.4+
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@inktank.com>

 3 files changed, 89 insertions(+), 61 deletions(-)

fs/ceph/locks.c
@@ -191,29 +191,23 @@
 }
 
 /**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
  */
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
-		      int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+				struct ceph_filelock *flocks,
+				int num_fcntl_locks, int num_flock_locks)
 {
 	struct file_lock *lock;
-	struct ceph_filelock cephlock;
 	int err = 0;
 	int seen_fcntl = 0;
 	int seen_flock = 0;
-	__le32 nlocks;
+	int l = 0;
 
 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
 	     num_fcntl_locks);
-	nlocks = cpu_to_le32(num_fcntl_locks);
-	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
-	if (err)
-		goto fail;
+
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_POSIX) {
 			++seen_fcntl;
@@ -221,20 +215,12 @@
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-						   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
-
-	nlocks = cpu_to_le32(num_flock_locks);
-	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
-	if (err)
-		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_FLOCK) {
 			++seen_flock;
@@ -242,16 +228,48 @@
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-						   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
 fail:
+	return err;
+}
+
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+			   struct ceph_pagelist *pagelist,
+			   int num_fcntl_locks, int num_flock_locks)
+{
+	int err = 0;
+	__le32 nlocks;
+
+	nlocks = cpu_to_le32(num_fcntl_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist, flocks,
+				   num_fcntl_locks * sizeof(*flocks));
+	if (err)
+		goto out_fail;
+
+	nlocks = cpu_to_le32(num_flock_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist,
+				   &flocks[num_fcntl_locks],
+				   num_flock_locks * sizeof(*flocks));
+out_fail:
 	return err;
 }
 
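For reference, the on-wire layout produced by ceph_locks_to_pagelist()
is unchanged from the old single-pass encoder:

	__le32			number of fcntl locks
	struct ceph_filelock	the fcntl locks, back to back
	__le32			number of flock locks
	struct ceph_filelock	the flock locks, back to back

which is why encode_caps_cb() (next file) sizes the blob as

	rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
			    (num_fcntl_locks+num_flock_locks) *
			    sizeof(struct ceph_filelock));
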
fs/ceph/mds_client.c
@@ -2478,39 +2478,44 @@
 
 	if (recon_state->flock) {
 		int num_fcntl_locks, num_flock_locks;
-		struct ceph_pagelist_cursor trunc_point;
+		struct ceph_filelock *flocks;
 
-		ceph_pagelist_set_cursor(pagelist, &trunc_point);
-		do {
-			lock_flocks();
-			ceph_count_locks(inode, &num_fcntl_locks,
-					 &num_flock_locks);
-			rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
-					    (num_fcntl_locks+num_flock_locks) *
-					    sizeof(struct ceph_filelock));
-			unlock_flocks();
-
-			/* pre-alloc pagelist */
-			ceph_pagelist_truncate(pagelist, &trunc_point);
-			err = ceph_pagelist_append(pagelist, &rec, reclen);
-			if (!err)
-				err = ceph_pagelist_reserve(pagelist,
-							    rec.v2.flock_len);
-
-			/* encode locks */
-			if (!err) {
-				lock_flocks();
-				err = ceph_encode_locks(inode,
-							pagelist,
-							num_fcntl_locks,
-							num_flock_locks);
-				unlock_flocks();
-			}
-		} while (err == -ENOSPC);
+encode_again:
+		lock_flocks();
+		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+		unlock_flocks();
+		flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+				 sizeof(struct ceph_filelock), GFP_NOFS);
+		if (!flocks) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		lock_flocks();
+		err = ceph_encode_locks_to_buffer(inode, flocks,
+						  num_fcntl_locks,
+						  num_flock_locks);
+		unlock_flocks();
+		if (err) {
+			kfree(flocks);
+			if (err == -ENOSPC)
+				goto encode_again;
+			goto out_free;
+		}
+		/*
+		 * number of encoded locks is stable, so copy to pagelist
+		 */
+		rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+				    (num_fcntl_locks+num_flock_locks) *
+				    sizeof(struct ceph_filelock));
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+		if (!err)
+			err = ceph_locks_to_pagelist(flocks, pagelist,
+						     num_fcntl_locks,
+						     num_flock_locks);
+		kfree(flocks);
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
-
 out_free:
 	kfree(path);
 out_dput:
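Two details of the new flow are easy to miss.  GFP_NOFS keeps the
kmalloc() from recursing into filesystem reclaim in the middle of an MDS
reconnect.  And the encode_again label replaces the old
do/while (err == -ENOSPC) loop: the lock count is sampled and used in
two separate critical sections, so it can go stale in between.  An
illustrative timeline of the race the retry absorbs (names shortened):

	lock_flocks();
	ceph_count_locks(inode, &nf, &nl);	/* sees N locks */
	unlock_flocks();
		/* another task acquires lock N+1 here */
	flocks = kmalloc(...);			/* sized for N; sleeping is safe */
	lock_flocks();
	err = ceph_encode_locks_to_buffer(...);	/* finds N+1 locks: -ENOSPC */
	unlock_flocks();
		/* kfree() the buffer, goto encode_again with a fresh count */
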
fs/ceph/super.h
@@ -822,8 +822,13 @@
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
 extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
-			     int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+				       struct ceph_filelock *flocks,
+				       int num_fcntl_locks,
+				       int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+				  struct ceph_pagelist *pagelist,
+				  int num_fcntl_locks, int num_flock_locks);
 extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 
 /* debugfs.c */
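
A minimal sketch (not part of the patch) of the calling convention the
split API expects, distilled from the encode_caps_cb() change above;
the helper name encode_inode_locks is hypothetical and error handling
is trimmed:

	static int encode_inode_locks(struct inode *inode,
				      struct ceph_pagelist *pagelist)
	{
		struct ceph_filelock *flocks;
		int num_fcntl_locks, num_flock_locks;
		int err;

		lock_flocks();
		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
		unlock_flocks();

		/* may sleep: no spinlock held here */
		flocks = kmalloc((num_fcntl_locks + num_flock_locks) *
				 sizeof(*flocks), GFP_NOFS);
		if (!flocks)
			return -ENOMEM;

		lock_flocks();
		err = ceph_encode_locks_to_buffer(inode, flocks,
						  num_fcntl_locks,
						  num_flock_locks);
		unlock_flocks();

		/* may sleep: pagelist append can kmap(); no spinlock held */
		if (!err)
			err = ceph_locks_to_pagelist(flocks, pagelist,
						     num_fcntl_locks,
						     num_flock_locks);
		kfree(flocks);
		return err;	/* caller retries on -ENOSPC with a fresh count */
	}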