[DLM] fix requestqueue race

Red Hat BZ 211914. There's a race between dlm_recoverd (1) enabling locking and (2) clearing out the requestqueue, and dlm_recvd (1) checking if locking is enabled and (2) adding a message to the requestqueue. An order of recoverd(1), recvd(1), recvd(2), recoverd(2) will result in a message being left on the requestqueue. The fix is to have dlm_recvd check whether dlm_recoverd has enabled locking after taking the mutex for the requestqueue and, if it has, process the message instead of queueing it. Signed-off-by: David Teigland <teigland@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

[DLM] fix requestqueue race
Red Hat BZ 211914. There's a race between dlm_recoverd (1) enabling locking and (2) clearing out the requestqueue, and dlm_recvd (1) checking if locking is enabled and (2) adding a message to the requestqueue. An order of recoverd(1), recvd(1), recvd(2), recoverd(2) will result in a message being left on the requestqueue. The fix is to have dlm_recvd check whether dlm_recoverd has enabled locking after taking the mutex for the requestqueue and, if it has, process the message instead of queueing it. Signed-off-by: David Teigland <teigland@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
David Teigland · Steven Whitehouse
1 parent 435618b75b
Showing 3 changed files with 29 additions and 9 deletions Side-by-side Diff
fs/dlm/lock.c
fs/dlm/requestqueue.c
fs/dlm/requestqueue.h
@@ -3028,10 +3028,17 @@
  
 	while (1) {
 		if (dlm_locking_stopped(ls)) {
-			if (!recovery)
-				dlm_add_requestqueue(ls, nodeid, hd);
-			error = -EINTR;
-			goto out;
+			if (recovery) {
+				error = -EINTR;
+				goto out;
+			}
+			error = dlm_add_requestqueue(ls, nodeid, hd);
+			if (error == -EAGAIN)
+				continue;
+			else {
+				error = -EINTR;
+				goto out;
+			}
 		}
  
 		if (lock_recovery_try(ls))
@@ -30,26 +30,39 @@
  * lockspace is enabled on some while still suspended on others.
  */
  
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 {
 	struct rq_entry *e;
 	int length = hd->h_length;
+	int rv = 0;
  
 	if (dlm_is_removed(ls, nodeid))
-		return;
+		return 0;
  
 	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory\n");
-		return;
+		return 0;
 	}
  
 	e->nodeid = nodeid;
 	memcpy(e->request, hd, length);
  
+	/* We need to check dlm_locking_stopped() after taking the mutex to
+	   avoid a race where dlm_recoverd enables locking and runs
+	   process_requestqueue between our earlier dlm_locking_stopped check
+	   and this addition to the requestqueue. */
+
 	mutex_lock(&ls->ls_requestqueue_mutex);
-	list_add_tail(&e->list, &ls->ls_requestqueue);
+	if (dlm_locking_stopped(ls))
+		list_add_tail(&e->list, &ls->ls_requestqueue);
+	else {
+		log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
+		kfree(e);
+		rv = -EAGAIN;
+	}
 	mutex_unlock(&ls->ls_requestqueue_mutex);
+	return rv;
 }
  
 int dlm_process_requestqueue(struct dlm_ls *ls)
@@ -13,7 +13,7 @@
 #ifndef __REQUESTQUEUE_DOT_H__
 #define __REQUESTQUEUE_DOT_H__
  
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
 int dlm_process_requestqueue(struct dlm_ls *ls);
 void dlm_wait_requestqueue(struct dlm_ls *ls);
 void dlm_purge_requestqueue(struct dlm_ls *ls);
...	...	@@ -3028,10 +3028,17 @@
3028	3028
3029	3029	while (1) {
3030	3030	if (dlm_locking_stopped(ls)) {
3031		- if (!recovery)
3032		- dlm_add_requestqueue(ls, nodeid, hd);
3033		- error = -EINTR;
3034		- goto out;
	3031	+ if (recovery) {
	3032	+ error = -EINTR;
	3033	+ goto out;
	3034	+ }
	3035	+ error = dlm_add_requestqueue(ls, nodeid, hd);
	3036	+ if (error == -EAGAIN)
	3037	+ continue;
	3038	+ else {
	3039	+ error = -EINTR;
	3040	+ goto out;
	3041	+ }
3035	3042	}
3036	3043
3037	3044	if (lock_recovery_try(ls))
...	...	@@ -30,26 +30,39 @@
30	30	* lockspace is enabled on some while still suspended on others.
31	31	*/
32	32
33		-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
	33	+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34	34	{
35	35	struct rq_entry *e;
36	36	int length = hd->h_length;
	37	+ int rv = 0;
37	38
38	39	if (dlm_is_removed(ls, nodeid))
39		- return;
	40	+ return 0;
40	41
41	42	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42	43	if (!e) {
43	44	log_print("dlm_add_requestqueue: out of memory\n");
44		- return;
	45	+ return 0;
45	46	}
46	47
47	48	e->nodeid = nodeid;
48	49	memcpy(e->request, hd, length);
49	50
	51	+ /* We need to check dlm_locking_stopped() after taking the mutex to
	52	+ avoid a race where dlm_recoverd enables locking and runs
	53	+ process_requestqueue between our earlier dlm_locking_stopped check
	54	+ and this addition to the requestqueue. */
	55	+
50	56	mutex_lock(&ls->ls_requestqueue_mutex);
51		- list_add_tail(&e->list, &ls->ls_requestqueue);
	57	+ if (dlm_locking_stopped(ls))
	58	+ list_add_tail(&e->list, &ls->ls_requestqueue);
	59	+ else {
	60	+ log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
	61	+ kfree(e);
	62	+ rv = -EAGAIN;
	63	+ }
52	64	mutex_unlock(&ls->ls_requestqueue_mutex);
	65	+ return rv;
53	66	}
54	67
55	68	int dlm_process_requestqueue(struct dlm_ls *ls)
...	...	@@ -13,7 +13,7 @@
13	13	#ifndef __REQUESTQUEUE_DOT_H__
14	14	#define __REQUESTQUEUE_DOT_H__
15	15
16		-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
	16	+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17	17	int dlm_process_requestqueue(struct dlm_ls *ls);
18	18	void dlm_wait_requestqueue(struct dlm_ls *ls);
19	19	void dlm_purge_requestqueue(struct dlm_ls *ls);