Blame view

fs/dlm/recoverd.c 7.94 KB
e7fd41792   David Teigland   [DLM] The core of...
1
2
3
4
  /******************************************************************************
  *******************************************************************************
  **
  **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
60f98d183   David Teigland   dlm: add recovery...
5
  **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
e7fd41792   David Teigland   [DLM] The core of...
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
  **
  **  This copyrighted material is made available to anyone wishing to use,
  **  modify, copy, or redistribute it subject to the terms and conditions
  **  of the GNU General Public License v.2.
  **
  *******************************************************************************
  ******************************************************************************/
  
  #include "dlm_internal.h"
  #include "lockspace.h"
  #include "member.h"
  #include "dir.h"
  #include "ast.h"
  #include "recover.h"
  #include "lowcomms.h"
  #include "lock.h"
  #include "requestqueue.h"
  #include "recoverd.h"
  
  
  /* If the start for which we're re-enabling locking (seq) has been superseded
c36258b59   David Teigland   [DLM] block dlm_r...
27
28
29
30
31
     by a newer stop (ls_recover_seq), we need to leave locking disabled.
  
     We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
     locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
     enables locking and clears the requestqueue between a and b. */
e7fd41792   David Teigland   [DLM] The core of...
32
33
34
35
  
  static int enable_locking(struct dlm_ls *ls, uint64_t seq)
  {
  	int error = -EINTR;
c36258b59   David Teigland   [DLM] block dlm_r...
36
  	down_write(&ls->ls_recv_active);
e7fd41792   David Teigland   [DLM] The core of...
37
38
39
  	spin_lock(&ls->ls_recover_lock);
  	if (ls->ls_recover_seq == seq) {
  		set_bit(LSFL_RUNNING, &ls->ls_flags);
c36258b59   David Teigland   [DLM] block dlm_r...
40
  		/* unblocks processes waiting to enter the dlm */
e7fd41792   David Teigland   [DLM] The core of...
41
  		up_write(&ls->ls_in_recovery);
475f230c6   David Teigland   dlm: fix unlock b...
42
  		clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
e7fd41792   David Teigland   [DLM] The core of...
43
44
45
  		error = 0;
  	}
  	spin_unlock(&ls->ls_recover_lock);
c36258b59   David Teigland   [DLM] block dlm_r...
46
47
  
  	up_write(&ls->ls_recv_active);
e7fd41792   David Teigland   [DLM] The core of...
48
49
50
51
52
53
54
  	return error;
  }
  
  static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
  {
  	unsigned long start;
  	int error, neg = 0;
075f01775   David Teigland   dlm: use INFO for...
55
  	log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
e7fd41792   David Teigland   [DLM] The core of...
56

901359256   David Teigland   [DLM] Update DLM ...
57
  	mutex_lock(&ls->ls_recoverd_active);
e7fd41792   David Teigland   [DLM] The core of...
58

23e8e1aaa   David Teigland   dlm: use workqueu...
59
  	dlm_callback_suspend(ls);
e7fd41792   David Teigland   [DLM] The core of...
60

c04fecb4d   David Teigland   dlm: use rsbtbl a...
61
  	dlm_clear_toss(ls);
e7fd41792   David Teigland   [DLM] The core of...
62
63
  
  	/*
85f0379aa   David Teigland   dlm: keep cached ...
64
65
  	 * This list of root rsb's will be the basis of most of the recovery
  	 * routines.
e7fd41792   David Teigland   [DLM] The core of...
66
  	 */
85f0379aa   David Teigland   dlm: keep cached ...
67
  	dlm_create_root_list(ls);
e7fd41792   David Teigland   [DLM] The core of...
68
69
70
  
  	/*
  	 * Add or remove nodes from the lockspace's ls_nodes list.
e7fd41792   David Teigland   [DLM] The core of...
71
72
73
74
  	 */
  
  	error = dlm_recover_members(ls, rv, &neg);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
75
  		log_rinfo(ls, "dlm_recover_members error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
76
77
  		goto fail;
  	}
f95a34c66   David Teigland   dlm: move recover...
78

c04fecb4d   David Teigland   dlm: use rsbtbl a...
79
80
81
82
  	dlm_recover_dir_nodeid(ls);
  
  	ls->ls_recover_dir_sent_res = 0;
  	ls->ls_recover_dir_sent_msg = 0;
4875647a0   David Teigland   dlm: fixes for no...
83
  	ls->ls_recover_locks_in = 0;
f95a34c66   David Teigland   dlm: move recover...
84
85
86
87
  	dlm_set_recover_status(ls, DLM_RS_NODES);
  
  	error = dlm_recover_members_wait(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
88
  		log_rinfo(ls, "dlm_recover_members_wait error %d", error);
f95a34c66   David Teigland   dlm: move recover...
89
90
  		goto fail;
  	}
e7fd41792   David Teigland   [DLM] The core of...
91
92
93
94
95
96
97
98
99
  	start = jiffies;
  
  	/*
  	 * Rebuild our own share of the directory by collecting from all other
  	 * nodes their master rsb names that hash to us.
  	 */
  
  	error = dlm_recover_directory(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
100
  		log_rinfo(ls, "dlm_recover_directory error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
101
102
  		goto fail;
  	}
f95a34c66   David Teigland   dlm: move recover...
103
  	dlm_set_recover_status(ls, DLM_RS_DIR);
e7fd41792   David Teigland   [DLM] The core of...
104
105
106
  
  	error = dlm_recover_directory_wait(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
107
  		log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
108
109
  		goto fail;
  	}
075f01775   David Teigland   dlm: use INFO for...
110
  	log_rinfo(ls, "dlm_recover_directory %u out %u messages",
c04fecb4d   David Teigland   dlm: use rsbtbl a...
111
  		  ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
e7fd41792   David Teigland   [DLM] The core of...
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  	/*
  	 * We may have outstanding operations that are waiting for a reply from
  	 * a failed node.  Mark these to be resent after recovery.  Unlock and
  	 * cancel ops can just be completed.
  	 */
  
  	dlm_recover_waiters_pre(ls);
  
  	error = dlm_recovery_stopped(ls);
  	if (error)
  		goto fail;
  
  	if (neg || dlm_no_directory(ls)) {
  		/*
  		 * Clear lkb's for departed nodes.
  		 */
4875647a0   David Teigland   dlm: fixes for no...
128
  		dlm_recover_purge(ls);
e7fd41792   David Teigland   [DLM] The core of...
129
130
131
132
133
134
135
136
  
  		/*
  		 * Get new master nodeid's for rsb's that were mastered on
  		 * departed nodes.
  		 */
  
  		error = dlm_recover_masters(ls);
  		if (error) {
075f01775   David Teigland   dlm: use INFO for...
137
  			log_rinfo(ls, "dlm_recover_masters error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
138
139
140
141
142
143
144
145
146
  			goto fail;
  		}
  
  		/*
  		 * Send our locks on remastered rsb's to the new masters.
  		 */
  
  		error = dlm_recover_locks(ls);
  		if (error) {
075f01775   David Teigland   dlm: use INFO for...
147
  			log_rinfo(ls, "dlm_recover_locks error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
148
149
  			goto fail;
  		}
f95a34c66   David Teigland   dlm: move recover...
150
  		dlm_set_recover_status(ls, DLM_RS_LOCKS);
e7fd41792   David Teigland   [DLM] The core of...
151
152
  		error = dlm_recover_locks_wait(ls);
  		if (error) {
075f01775   David Teigland   dlm: use INFO for...
153
  			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
154
155
  			goto fail;
  		}
075f01775   David Teigland   dlm: use INFO for...
156
  		log_rinfo(ls, "dlm_recover_locks %u in",
4875647a0   David Teigland   dlm: fixes for no...
157
  			  ls->ls_recover_locks_in);
e7fd41792   David Teigland   [DLM] The core of...
158
159
160
161
162
163
164
  		/*
  		 * Finalize state in master rsb's now that all locks can be
  		 * checked.  This includes conversion resolution and lvb
  		 * settings.
  		 */
  
  		dlm_recover_rsbs(ls);
91c0dc93a   David Teigland   [DLM] fix aborted...
165
166
167
168
  	} else {
  		/*
  		 * Other lockspace members may be going through the "neg" steps
  		 * while also adding us to the lockspace, in which case they'll
4b77f2c93   David Teigland   [DLM] do full rec...
169
  		 * be doing the recover_locks (RS_LOCKS) barrier.
91c0dc93a   David Teigland   [DLM] fix aborted...
170
171
  		 */
  		dlm_set_recover_status(ls, DLM_RS_LOCKS);
4b77f2c93   David Teigland   [DLM] do full rec...
172
173
174
  
  		error = dlm_recover_locks_wait(ls);
  		if (error) {
075f01775   David Teigland   dlm: use INFO for...
175
  			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
4b77f2c93   David Teigland   [DLM] do full rec...
176
177
  			goto fail;
  		}
e7fd41792   David Teigland   [DLM] The core of...
178
179
180
  	}
  
  	dlm_release_root_list(ls);
2896ee37c   David Teigland   [DLM] fix add_req...
181
182
183
184
185
186
187
  	/*
  	 * Purge directory-related requests that are saved in requestqueue.
  	 * All dir requests from before recovery are invalid now due to the dir
  	 * rebuild and will be resent by the requesting nodes.
  	 */
  
  	dlm_purge_requestqueue(ls);
e7fd41792   David Teigland   [DLM] The core of...
188
  	dlm_set_recover_status(ls, DLM_RS_DONE);
f95a34c66   David Teigland   dlm: move recover...
189

e7fd41792   David Teigland   [DLM] The core of...
190
191
  	error = dlm_recover_done_wait(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
192
  		log_rinfo(ls, "dlm_recover_done_wait error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
193
194
195
196
  		goto fail;
  	}
  
  	dlm_clear_members_gone(ls);
3ae1acf93   David Teigland   [DLM] add lock ti...
197
  	dlm_adjust_timeouts(ls);
23e8e1aaa   David Teigland   dlm: use workqueu...
198
  	dlm_callback_resume(ls);
e7fd41792   David Teigland   [DLM] The core of...
199
200
  	error = enable_locking(ls, rv->seq);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
201
  		log_rinfo(ls, "enable_locking error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
202
203
204
205
206
  		goto fail;
  	}
  
  	error = dlm_process_requestqueue(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
207
  		log_rinfo(ls, "dlm_process_requestqueue error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
208
209
210
211
212
  		goto fail;
  	}
  
  	error = dlm_recover_waiters_post(ls);
  	if (error) {
075f01775   David Teigland   dlm: use INFO for...
213
  		log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
e7fd41792   David Teigland   [DLM] The core of...
214
215
  		goto fail;
  	}
4875647a0   David Teigland   dlm: fixes for no...
216
  	dlm_recover_grant(ls);
e7fd41792   David Teigland   [DLM] The core of...
217

075f01775   David Teigland   dlm: use INFO for...
218
  	log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
60f98d183   David Teigland   dlm: add recovery...
219
  		  (unsigned long long)rv->seq, ls->ls_generation,
e7fd41792   David Teigland   [DLM] The core of...
220
  		  jiffies_to_msecs(jiffies - start));
901359256   David Teigland   [DLM] Update DLM ...
221
  	mutex_unlock(&ls->ls_recoverd_active);
e7fd41792   David Teigland   [DLM] The core of...
222

60f98d183   David Teigland   dlm: add recovery...
223
  	dlm_lsop_recover_done(ls);
e7fd41792   David Teigland   [DLM] The core of...
224
225
226
227
  	return 0;
  
   fail:
  	dlm_release_root_list(ls);
075f01775   David Teigland   dlm: use INFO for...
228
  	log_rinfo(ls, "dlm_recover %llu error %d",
57adf7eed   Ryusuke Konishi   [DLM] fix format ...
229
  		  (unsigned long long)rv->seq, error);
901359256   David Teigland   [DLM] Update DLM ...
230
  	mutex_unlock(&ls->ls_recoverd_active);
e7fd41792   David Teigland   [DLM] The core of...
231
232
  	return error;
  }
2cdc98aaf   David Teigland   [DLM] fix stoppin...
233
234
235
  /* The dlm_ls_start() that created the rv we take here may already have been
     stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
     flag set. */
e7fd41792   David Teigland   [DLM] The core of...
236
237
238
239
240
241
242
  static void do_ls_recovery(struct dlm_ls *ls)
  {
  	struct dlm_recover *rv = NULL;
  
  	spin_lock(&ls->ls_recover_lock);
  	rv = ls->ls_recover_args;
  	ls->ls_recover_args = NULL;
2cdc98aaf   David Teigland   [DLM] fix stoppin...
243
  	if (rv && ls->ls_recover_seq == rv->seq)
475f230c6   David Teigland   dlm: fix unlock b...
244
  		clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
e7fd41792   David Teigland   [DLM] The core of...
245
246
247
248
  	spin_unlock(&ls->ls_recover_lock);
  
  	if (rv) {
  		ls_recover(ls, rv);
60f98d183   David Teigland   dlm: add recovery...
249
  		kfree(rv->nodes);
e7fd41792   David Teigland   [DLM] The core of...
250
251
252
253
254
255
256
257
258
  		kfree(rv);
  	}
  }
  
  static int dlm_recoverd(void *arg)
  {
  	struct dlm_ls *ls;
  
  	ls = dlm_find_lockspace_local(arg);
5f88f1ea1   David Teigland   [DLM] add new loc...
259
260
261
262
  	if (!ls) {
  		log_print("dlm_recoverd: no lockspace %p", arg);
  		return -1;
  	}
e7fd41792   David Teigland   [DLM] The core of...
263

475f230c6   David Teigland   dlm: fix unlock b...
264
265
266
  	down_write(&ls->ls_in_recovery);
  	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
  	wake_up(&ls->ls_recover_lock_wait);
e7fd41792   David Teigland   [DLM] The core of...
267
268
  	while (!kthread_should_stop()) {
  		set_current_state(TASK_INTERRUPTIBLE);
475f230c6   David Teigland   dlm: fix unlock b...
269
270
  		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
  		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
e7fd41792   David Teigland   [DLM] The core of...
271
272
  			schedule();
  		set_current_state(TASK_RUNNING);
475f230c6   David Teigland   dlm: fix unlock b...
273
274
275
276
277
278
279
  		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
  			down_write(&ls->ls_in_recovery);
  			set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
  			wake_up(&ls->ls_recover_lock_wait);
  		}
  
  		if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
e7fd41792   David Teigland   [DLM] The core of...
280
281
  			do_ls_recovery(ls);
  	}
475f230c6   David Teigland   dlm: fix unlock b...
282
283
  	if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
  		up_write(&ls->ls_in_recovery);
e7fd41792   David Teigland   [DLM] The core of...
284
285
286
  	dlm_put_lockspace(ls);
  	return 0;
  }
e7fd41792   David Teigland   [DLM] The core of...
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
  /*
   * dlm_recoverd_start - create and start the recovery thread of a lockspace
   * @ls: lockspace the thread will serve; passed to dlm_recoverd() as its
   *      argument
   *
   * On success the new task is stored in ls->ls_recoverd_task.
   *
   * Returns 0 on success, or the error encoded in the pointer returned by
   * kthread_run() on failure (ls->ls_recoverd_task is then left untouched).
   */
  int dlm_recoverd_start(struct dlm_ls *ls)
  {
  	struct task_struct *task;

  	task = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
  	if (IS_ERR(task))
  		return PTR_ERR(task);

  	ls->ls_recoverd_task = task;
  	return 0;
  }
  
  /*
   * dlm_recoverd_stop - stop the recovery thread of a lockspace
   * @ls: lockspace whose ls_recoverd_task (set by dlm_recoverd_start()) is
   *      to be stopped
   *
   * NOTE(review): ls_recoverd_task is used unchecked, so this assumes a
   * prior successful dlm_recoverd_start() -- confirm against callers.
   */
  void dlm_recoverd_stop(struct dlm_ls *ls)
  {
  	kthread_stop(ls->ls_recoverd_task);
  }
  
  void dlm_recoverd_suspend(struct dlm_ls *ls)
  {
f6db1b8e7   David Teigland   [DLM] abort recov...
307
  	wake_up(&ls->ls_wait_general);
901359256   David Teigland   [DLM] Update DLM ...
308
  	mutex_lock(&ls->ls_recoverd_active);
e7fd41792   David Teigland   [DLM] The core of...
309
310
311
312
  }
  
  void dlm_recoverd_resume(struct dlm_ls *ls)
  {
901359256   David Teigland   [DLM] Update DLM ...
313
  	mutex_unlock(&ls->ls_recoverd_active);
e7fd41792   David Teigland   [DLM] The core of...
314
  }