Blame view
fs/dlm/recoverd.c
7.39 KB
e7fd41792 [DLM] The core of... |
1 2 3 4 |
/****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
60f98d183 dlm: add recovery... |
5 |
** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. |
e7fd41792 [DLM] The core of... |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions ** of the GNU General Public License v.2. ** ******************************************************************************* ******************************************************************************/ #include "dlm_internal.h" #include "lockspace.h" #include "member.h" #include "dir.h" #include "ast.h" #include "recover.h" #include "lowcomms.h" #include "lock.h" #include "requestqueue.h" #include "recoverd.h" /* If the start for which we're re-enabling locking (seq) has been superseded |
c36258b59 [DLM] block dlm_r... |
27 28 29 30 31 |
by a newer stop (ls_recover_seq), we need to leave locking disabled. We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees locking stopped and b) adds a message to the requestqueue, but dlm_recoverd enables locking and clears the requestqueue between a and b. */ |
e7fd41792 [DLM] The core of... |
32 33 34 35 |
/*
 * enable_locking - re-enable normal locking after a recovery pass
 * @ls:  the lockspace
 * @seq: the sequence number of the recovery start we are completing
 *
 * Returns 0 if locking was re-enabled, or -EINTR if this start has been
 * superseded by a newer stop (ls_recover_seq != seq), in which case
 * locking stays disabled.  dlm_recv is held off via ls_recv_active so it
 * cannot queue a message between us enabling locking and clearing the
 * requestqueue (see the comment above this function).
 */
static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
	int error = -EINTR;

	/* blocks dlm_recv while we change the running/recovery state */
	down_write(&ls->ls_recv_active);

	spin_lock(&ls->ls_recover_lock);
	if (ls->ls_recover_seq == seq) {
		set_bit(LSFL_RUNNING, &ls->ls_flags);
		/* unblocks processes waiting to enter the dlm */
		up_write(&ls->ls_in_recovery);
		error = 0;
	}
	spin_unlock(&ls->ls_recover_lock);

	up_write(&ls->ls_recv_active);
	return error;
}

/*
 * ls_recover - run one complete recovery sequence for the lockspace
 * @ls: the lockspace to recover
 * @rv: recovery args (event sequence number rv->seq and new node list)
 *
 * Holds ls_recoverd_active for the whole sequence.  The major phases are
 * barriered against the other lockspace members through the
 * dlm_set_recover_status()/dlm_*_wait() pairs (DLM_RS_NODES, DLM_RS_DIR,
 * DLM_RS_LOCKS, DLM_RS_DONE).  Returns 0 on success, or the first error
 * encountered (any failing step jumps to the common fail path).
 */
static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
	unsigned long start;
	int error, neg = 0;

	log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);

	mutex_lock(&ls->ls_recoverd_active);

	/* hold back lock completion callbacks until recovery is done */
	dlm_callback_suspend(ls);

	/*
	 * Free non-master tossed rsb's.  Master rsb's are kept on toss
	 * list and put on root list to be included in resdir recovery.
	 */

	dlm_clear_toss_list(ls);

	/*
	 * This list of root rsb's will be the basis of most of the recovery
	 * routines.
	 */

	dlm_create_root_list(ls);

	/*
	 * Add or remove nodes from the lockspace's ls_nodes list.
	 */

	error = dlm_recover_members(ls, rv, &neg);
	if (error) {
		log_debug(ls, "dlm_recover_members error %d", error);
		goto fail;
	}

	/* barrier: all members must agree on the new membership (neg counts
	   departed nodes) before the directory is rebuilt */
	dlm_set_recover_status(ls, DLM_RS_NODES);

	error = dlm_recover_members_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_members_wait error %d", error);
		goto fail;
	}

	/* timestamp for the "done: %u ms" log line below */
	start = jiffies;

	/*
	 * Rebuild our own share of the directory by collecting from all other
	 * nodes their master rsb names that hash to us.
	 */

	error = dlm_recover_directory(ls);
	if (error) {
		log_debug(ls, "dlm_recover_directory error %d", error);
		goto fail;
	}

	/* barrier: directory rebuild complete on all nodes */
	dlm_set_recover_status(ls, DLM_RS_DIR);

	error = dlm_recover_directory_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_directory_wait error %d", error);
		goto fail;
	}

	/*
	 * We may have outstanding operations that are waiting for a reply from
	 * a failed node.  Mark these to be resent after recovery.  Unlock and
	 * cancel ops can just be completed.
	 */

	dlm_recover_waiters_pre(ls);

	/* a newer stop may have arrived; abort rather than continue with
	   stale recovery state */
	error = dlm_recovery_stopped(ls);
	if (error)
		goto fail;

	if (neg || dlm_no_directory(ls)) {
		/*
		 * Clear lkb's for departed nodes.
		 */

		dlm_purge_locks(ls);

		/*
		 * Get new master nodeid's for rsb's that were mastered on
		 * departed nodes.
		 */

		error = dlm_recover_masters(ls);
		if (error) {
			log_debug(ls, "dlm_recover_masters error %d", error);
			goto fail;
		}

		/*
		 * Send our locks on remastered rsb's to the new masters.
		 */

		error = dlm_recover_locks(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks error %d", error);
			goto fail;
		}

		/* barrier: lock remastering complete on all nodes */
		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}

		/*
		 * Finalize state in master rsb's now that all locks can be
		 * checked.  This includes conversion resolution and lvb
		 * settings.
		 */

		dlm_recover_rsbs(ls);
	} else {
		/*
		 * Other lockspace members may be going through the "neg" steps
		 * while also adding us to the lockspace, in which case they'll
		 * be doing the recover_locks (RS_LOCKS) barrier.
		 */
		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls);
		if (error) {
			log_debug(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}
	}

	dlm_release_root_list(ls);

	/*
	 * Purge directory-related requests that are saved in requestqueue.
	 * All dir requests from before recovery are invalid now due to the dir
	 * rebuild and will be resent by the requesting nodes.
	 */

	dlm_purge_requestqueue(ls);

	/* final barrier: every member has finished its recovery steps */
	dlm_set_recover_status(ls, DLM_RS_DONE);

	error = dlm_recover_done_wait(ls);
	if (error) {
		log_debug(ls, "dlm_recover_done_wait error %d", error);
		goto fail;
	}

	dlm_clear_members_gone(ls);

	dlm_adjust_timeouts(ls);

	/* resume delivery of callbacks held back since dlm_callback_suspend */
	dlm_callback_resume(ls);

	/* fails with -EINTR if a newer stop superseded this start */
	error = enable_locking(ls, rv->seq);
	if (error) {
		log_debug(ls, "enable_locking error %d", error);
		goto fail;
	}

	/* replay messages that dlm_recv queued while locking was stopped */
	error = dlm_process_requestqueue(ls);
	if (error) {
		log_debug(ls, "dlm_process_requestqueue error %d", error);
		goto fail;
	}

	/* resend the operations marked by dlm_recover_waiters_pre above */
	error = dlm_recover_waiters_post(ls);
	if (error) {
		log_debug(ls, "dlm_recover_waiters_post error %d", error);
		goto fail;
	}

	dlm_grant_after_purge(ls);

	log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
		  (unsigned long long)rv->seq, ls->ls_generation,
		  jiffies_to_msecs(jiffies - start));
	mutex_unlock(&ls->ls_recoverd_active);

	/* notify the registered lockspace ops that recovery completed */
	dlm_lsop_recover_done(ls);
	return 0;

 fail:
	dlm_release_root_list(ls);
	log_debug(ls, "dlm_recover %llx error %d",
		  (unsigned long long)rv->seq, error);
	mutex_unlock(&ls->ls_recoverd_active);
	return error;
}
2cdc98aaf [DLM] fix stoppin... |
230 231 232 |
/* The dlm_ls_start() that created the rv we take here may already have been
   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
   flag set. */

/*
 * do_ls_recovery - take the queued recovery args and run one recovery
 * @ls: the lockspace
 *
 * Atomically detaches ls_recover_args under ls_recover_lock; RECOVERY_STOP
 * is cleared only if rv->seq still matches ls_recover_seq (i.e. no newer
 * stop has arrived — see the comment above).  Frees rv and rv->nodes when
 * the recovery pass returns.
 */
static void do_ls_recovery(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL;

	spin_lock(&ls->ls_recover_lock);
	rv = ls->ls_recover_args;
	ls->ls_recover_args = NULL;
	if (rv && ls->ls_recover_seq == rv->seq)
		clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
	spin_unlock(&ls->ls_recover_lock);

	if (rv) {
		ls_recover(ls, rv);
		kfree(rv->nodes);
		kfree(rv);
	}
}

/*
 * dlm_recoverd - per-lockspace recovery kthread main loop
 * @arg: lockspace identifier resolved via dlm_find_lockspace_local()
 *
 * Sleeps until LSFL_WORK is set by dlm_recoverd_kick(), then runs
 * do_ls_recovery().  Returns -1 if the lockspace cannot be found, 0 on a
 * normal kthread_stop() shutdown.
 */
static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_local(arg);
	if (!ls) {
		log_print("dlm_recoverd: no lockspace %p", arg);
		return -1;
	}

	while (!kthread_should_stop()) {
		/* TASK_INTERRUPTIBLE is set BEFORE testing LSFL_WORK so a
		   wake_up_process() from dlm_recoverd_kick() between the test
		   and schedule() is not lost — do not reorder */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(LSFL_WORK, &ls->ls_flags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	/* drop the reference taken by dlm_find_lockspace_local() above */
	dlm_put_lockspace(ls);
	return 0;
}

/* ask the recovery thread to run a recovery pass; the flag is set before
   the wakeup so the thread's wait loop cannot miss it */
void dlm_recoverd_kick(struct dlm_ls *ls)
{
	set_bit(LSFL_WORK, &ls->ls_flags);
	wake_up_process(ls->ls_recoverd_task);
}

/* spawn the per-lockspace recovery thread; returns 0 or the PTR_ERR from
   kthread_run() */
int dlm_recoverd_start(struct dlm_ls *ls)
{
	struct task_struct *p;
	int error = 0;

	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
	if (IS_ERR(p))
		error = PTR_ERR(p);
	else
		ls->ls_recoverd_task = p;
	return error;
}

/* synchronously stop the recovery thread (blocks until it exits) */
void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}

/* block dlm_recoverd between recovery passes: the wake_up nudges any
   recovery step sleeping on ls_wait_general so ls_recoverd_active can be
   released and taken here; held until dlm_recoverd_resume() */
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	wake_up(&ls->ls_wait_general);
	mutex_lock(&ls->ls_recoverd_active);
}

/* release the mutex taken by dlm_recoverd_suspend() */
void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}