/* fs/dlm/recoverd.c */
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
  
  #include "dlm_internal.h"
  #include "lockspace.h"
  #include "member.h"
  #include "dir.h"
  #include "ast.h"
  #include "recover.h"
  #include "lowcomms.h"
  #include "lock.h"
  #include "requestqueue.h"
  #include "recoverd.h"
  
  
/* If the start for which we're re-enabling locking (seq) has been superseded
   by a newer stop (ls_recover_seq), we need to leave locking disabled.

   We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
   locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
   enables locking and clears the requestqueue between a and b. */

/*
 * Re-enable normal locking after the recovery identified by @seq.
 * Returns 0 if locking was enabled, -EINTR if a newer stop (a changed
 * ls_recover_seq) superseded this recovery and locking must stay disabled.
 */
static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
	int error = -EINTR;

	/* block dlm_recv while we check/flip the running state (see above) */
	down_write(&ls->ls_recv_active);

	spin_lock(&ls->ls_recover_lock);
	if (ls->ls_recover_seq == seq) {
		set_bit(LSFL_RUNNING, &ls->ls_flags);
		/* unblocks processes waiting to enter the dlm */
		up_write(&ls->ls_in_recovery);
		error = 0;
	}
	spin_unlock(&ls->ls_recover_lock);

	up_write(&ls->ls_recv_active);
	return error;
}
  
/*
 * Perform one complete recovery pass for the lockspace, driven by the
 * dlm_recover event @rv (member changes plus a sequence number).  The steps
 * below run in a fixed order; the *_wait() calls appear to act as cross-node
 * barriers keyed by the DLM_RS_* status values (NOTE(review): confirm barrier
 * semantics against dlm_set_recover_status/dlm_recover_*_wait).  Serialized
 * against suspend/resume via ls_recoverd_active.  Returns 0 on success or a
 * nonzero error from the failing step.
 */
static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
	unsigned long start;
	int error, neg = 0;

	log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);

	mutex_lock(&ls->ls_recoverd_active);

	/* hold back lock/unlock callbacks until recovery completes */
	dlm_callback_suspend(ls);

	/*
	 * Free non-master tossed rsb's.  Master rsb's are kept on toss
	 * list and put on root list to be included in resdir recovery.
	 */

	dlm_clear_toss_list(ls);

	/*
	 * This list of root rsb's will be the basis of most of the recovery
	 * routines.
	 */

	dlm_create_root_list(ls);

	/*
	 * Add or remove nodes from the lockspace's ls_nodes list.
	 */

	error = dlm_recover_members(ls, rv, &neg);
	if (error) {
		log_debug(ls, "dlm_recover_members error %d", error);
		goto fail;
	}

	dlm_set_recover_status(ls, DLM_RS_NODES);

	error = dlm_recover_members_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_members_wait error %d", error);
		goto fail;
	}

	/* timestamp for the elapsed-time report in the final log_debug */
	start = jiffies;

	/*
	 * Rebuild our own share of the directory by collecting from all other
	 * nodes their master rsb names that hash to us.
	 */

	error = dlm_recover_directory(ls);
	if (error) {
		log_debug(ls, "dlm_recover_directory error %d", error);
		goto fail;
	}

	dlm_set_recover_status(ls, DLM_RS_DIR);

	error = dlm_recover_directory_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_directory_wait error %d", error);
		goto fail;
	}

	/*
	 * We may have outstanding operations that are waiting for a reply from
	 * a failed node.  Mark these to be resent after recovery.  Unlock and
	 * cancel ops can just be completed.
	 */

	dlm_recover_waiters_pre(ls);

	/* a newer stop may have aborted this recovery; bail out if so */
	error = dlm_recovery_stopped(ls);
	if (error)
		goto fail;

	/* neg: at least one node departed; the full remastering pass is also
	   taken when the lockspace runs without a resource directory */
	if (neg || dlm_no_directory(ls)) {
		/*
		 * Clear lkb's for departed nodes.
		 */

		dlm_purge_locks(ls);

		/*
		 * Get new master nodeid's for rsb's that were mastered on
		 * departed nodes.
		 */

		error = dlm_recover_masters(ls);
		if (error) {
			log_debug(ls, "dlm_recover_masters error %d", error);
			goto fail;
		}

		/*
		 * Send our locks on remastered rsb's to the new masters.
		 */

		error = dlm_recover_locks(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks error %d", error);
			goto fail;
		}

		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}

		/*
		 * Finalize state in master rsb's now that all locks can be
		 * checked.  This includes conversion resolution and lvb
		 * settings.
		 */

		dlm_recover_rsbs(ls);
	} else {
		/*
		 * Other lockspace members may be going through the "neg" steps
		 * while also adding us to the lockspace, in which case they'll
		 * be doing the recover_locks (RS_LOCKS) barrier.
		 */
		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}
	}

	dlm_release_root_list(ls);

	/*
	 * Purge directory-related requests that are saved in requestqueue.
	 * All dir requests from before recovery are invalid now due to the dir
	 * rebuild and will be resent by the requesting nodes.
	 */

	dlm_purge_requestqueue(ls);

	dlm_set_recover_status(ls, DLM_RS_DONE);

	error = dlm_recover_done_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_done_wait error %d", error);
		goto fail;
	}

	dlm_clear_members_gone(ls);

	dlm_adjust_timeouts(ls);

	dlm_callback_resume(ls);

	error = enable_locking(ls, rv->seq);
	if (error) {
		log_debug(ls, "enable_locking error %d", error);
		goto fail;
	}

	/* replay the messages queued while locking was stopped */
	error = dlm_process_requestqueue(ls);
	if (error) {
		log_debug(ls, "dlm_process_requestqueue error %d", error);
		goto fail;
	}

	error = dlm_recover_waiters_post(ls);
	if (error) {
		log_debug(ls, "dlm_recover_waiters_post error %d", error);
		goto fail;
	}

	dlm_grant_after_purge(ls);

	log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
		  (unsigned long long)rv->seq, ls->ls_generation,
		  jiffies_to_msecs(jiffies - start));
	mutex_unlock(&ls->ls_recoverd_active);

	dlm_lsop_recover_done(ls);
	return 0;

 fail:
	dlm_release_root_list(ls);
	log_debug(ls, "dlm_recover %llx error %d",
		  (unsigned long long)rv->seq, error);
	mutex_unlock(&ls->ls_recoverd_active);
	return error;
}
/* The dlm_ls_start() that created the rv we take here may already have been
   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
   flag set. */

/*
 * Take the pending recovery arguments (if any) from the lockspace under
 * ls_recover_lock and run a recovery pass with them.  The args are consumed:
 * the rv and its node array are freed here after ls_recover() returns.
 */
static void do_ls_recovery(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL;

	spin_lock(&ls->ls_recover_lock);
	rv = ls->ls_recover_args;
	ls->ls_recover_args = NULL;
	/* only clear the stop flag if no newer stop superseded this start */
	if (rv && ls->ls_recover_seq == rv->seq)
		clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
	spin_unlock(&ls->ls_recover_lock);

	if (rv) {
		ls_recover(ls, rv);
		kfree(rv->nodes);
		kfree(rv);
	}
}
  
/*
 * Per-lockspace recovery daemon (kthread body; @arg identifies the
 * lockspace).  Sleeps until dlm_recoverd_kick() sets LSFL_WORK, then runs
 * do_ls_recovery(); exits when kthread_stop() is called.  Returns 0, or -1
 * if the lockspace lookup fails at startup.
 */
static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_local(arg);
	if (!ls) {
		log_print("dlm_recoverd: no lockspace %p", arg);
		return -1;
	}

	while (!kthread_should_stop()) {
		/* set state before testing the flag so a concurrent kick's
		   wake_up_process() is not lost between test and schedule() */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(LSFL_WORK, &ls->ls_flags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	/* drop the reference taken by dlm_find_lockspace_local() */
	dlm_put_lockspace(ls);
	return 0;
}
  
/*
 * Request a recovery pass: set the work flag first, then wake the daemon,
 * so the flag is already visible when dlm_recoverd re-tests it.
 */
void dlm_recoverd_kick(struct dlm_ls *ls)
{
	set_bit(LSFL_WORK, &ls->ls_flags);
	wake_up_process(ls->ls_recoverd_task);
}
  
  int dlm_recoverd_start(struct dlm_ls *ls)
  {
  	struct task_struct *p;
  	int error = 0;
  
  	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
  	if (IS_ERR(p))
  		error = PTR_ERR(p);
  	else
                  ls->ls_recoverd_task = p;
  	return error;
  }
  
/* Stop the recovery daemon thread; kthread_stop() waits for it to exit. */
void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}
  
/*
 * Block the recovery daemon by taking ls_recoverd_active.  The wake_up()
 * is done first, presumably to abort a wait inside a running recovery so
 * recoverd can reach a point where the mutex is released — TODO confirm
 * against the ls_wait_general users.
 */
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	wake_up(&ls->ls_wait_general);
	mutex_lock(&ls->ls_recoverd_active);
}
  
/* Allow the recovery daemon to run again (pairs with dlm_recoverd_suspend). */
void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}