Commit d4400156d415540086c34a06e5d233122d6bf56a

Authored by David Teigland
Committed by Steven Whitehouse
1 parent 435618b75b

[DLM] fix requestqueue race

Red Hat BZ 211914

There's a race between dlm_recoverd (1) enabling locking and (2) clearing
out the requestqueue, and dlm_recvd (1) checking if locking is enabled and
(2) adding a message to the requestqueue.  An order of recoverd(1),
recvd(1), recvd(2), recoverd(2) will result in a message being left on the
requestqueue.  The fix is to have dlm_recvd check if dlm_recoverd has
enabled locking after taking the mutex for the requestqueue and if it has
processing the message instead of queueing it.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

Showing 3 changed files with 29 additions and 9 deletions Side-by-side Diff

... ... @@ -3028,10 +3028,17 @@
3028 3028  
3029 3029 while (1) {
3030 3030 if (dlm_locking_stopped(ls)) {
3031   - if (!recovery)
3032   - dlm_add_requestqueue(ls, nodeid, hd);
3033   - error = -EINTR;
3034   - goto out;
  3031 + if (recovery) {
  3032 + error = -EINTR;
  3033 + goto out;
  3034 + }
  3035 + error = dlm_add_requestqueue(ls, nodeid, hd);
  3036 + if (error == -EAGAIN)
  3037 + continue;
  3038 + else {
  3039 + error = -EINTR;
  3040 + goto out;
  3041 + }
3035 3042 }
3036 3043  
3037 3044 if (lock_recovery_try(ls))
fs/dlm/requestqueue.c
... ... @@ -30,26 +30,39 @@
30 30 * lockspace is enabled on some while still suspended on others.
31 31 */
32 32  
33   -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
  33 +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34 34 {
35 35 struct rq_entry *e;
36 36 int length = hd->h_length;
  37 + int rv = 0;
37 38  
38 39 if (dlm_is_removed(ls, nodeid))
39   - return;
  40 + return 0;
40 41  
41 42 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 43 if (!e) {
43 44 log_print("dlm_add_requestqueue: out of memory\n");
44   - return;
  45 + return 0;
45 46 }
46 47  
47 48 e->nodeid = nodeid;
48 49 memcpy(e->request, hd, length);
49 50  
  51 + /* We need to check dlm_locking_stopped() after taking the mutex to
  52 + avoid a race where dlm_recoverd enables locking and runs
  53 + process_requestqueue between our earlier dlm_locking_stopped check
  54 + and this addition to the requestqueue. */
  55 +
50 56 mutex_lock(&ls->ls_requestqueue_mutex);
51   - list_add_tail(&e->list, &ls->ls_requestqueue);
  57 + if (dlm_locking_stopped(ls))
  58 + list_add_tail(&e->list, &ls->ls_requestqueue);
  59 + else {
  60 + log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
  61 + kfree(e);
  62 + rv = -EAGAIN;
  63 + }
52 64 mutex_unlock(&ls->ls_requestqueue_mutex);
  65 + return rv;
53 66 }
54 67  
55 68 int dlm_process_requestqueue(struct dlm_ls *ls)
fs/dlm/requestqueue.h
... ... @@ -13,7 +13,7 @@
13 13 #ifndef __REQUESTQUEUE_DOT_H__
14 14 #define __REQUESTQUEUE_DOT_H__
15 15  
16   -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
  16 +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17 17 int dlm_process_requestqueue(struct dlm_ls *ls);
18 18 void dlm_wait_requestqueue(struct dlm_ls *ls);
19 19 void dlm_purge_requestqueue(struct dlm_ls *ls);