Commit 96006ea6d4eea73466e90ef353bf34e507724e77
1 parent
c503a62103
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
dlm: fix missing dir remove
I don't know exactly how, but in some cases, a dir record is not removed, or a new one is created when it shouldn't be. The result is that the dir node lookup returns a master node where the rsb does not exist. In this case, the master node will repeatedly return -EBADR for requests, and the lock requests will be stuck. Until all possible ways for this to happen can be eliminated, a simple and effective way to recover from this situation is for the supposed master node to send a standard remove message to the dir node when it receives a request for a resource it has no rsb for. Signed-off-by: David Teigland <teigland@redhat.com>
Showing 1 changed file with 68 additions and 2 deletions Side-by-side Diff
fs/dlm/lock.c
... | ... | @@ -4000,12 +4000,70 @@ |
4000 | 4000 | return error; |
4001 | 4001 | } |
4002 | 4002 | |
4003 | +static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) | |
4004 | +{ | |
4005 | + char name[DLM_RESNAME_MAXLEN + 1]; | |
4006 | + struct dlm_message *ms; | |
4007 | + struct dlm_mhandle *mh; | |
4008 | + struct dlm_rsb *r; | |
4009 | + uint32_t hash, b; | |
4010 | + int rv, dir_nodeid; | |
4011 | + | |
4012 | + memset(name, 0, sizeof(name)); | |
4013 | + memcpy(name, ms_name, len); | |
4014 | + | |
4015 | + hash = jhash(name, len, 0); | |
4016 | + b = hash & (ls->ls_rsbtbl_size - 1); | |
4017 | + | |
4018 | + dir_nodeid = dlm_hash2nodeid(ls, hash); | |
4019 | + | |
4020 | + log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); | |
4021 | + | |
4022 | + spin_lock(&ls->ls_rsbtbl[b].lock); | |
4023 | + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); | |
4024 | + if (!rv) { | |
4025 | + spin_unlock(&ls->ls_rsbtbl[b].lock); | |
4026 | + log_error(ls, "repeat_remove on keep %s", name); | |
4027 | + return; | |
4028 | + } | |
4029 | + | |
4030 | + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); | |
4031 | + if (!rv) { | |
4032 | + spin_unlock(&ls->ls_rsbtbl[b].lock); | |
4033 | + log_error(ls, "repeat_remove on toss %s", name); | |
4034 | + return; | |
4035 | + } | |
4036 | + | |
4037 | + /* use ls->remove_name2 to avoid conflict with shrink? */ | |
4038 | + | |
4039 | + spin_lock(&ls->ls_remove_spin); | |
4040 | + ls->ls_remove_len = len; | |
4041 | + memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); | |
4042 | + spin_unlock(&ls->ls_remove_spin); | |
4043 | + spin_unlock(&ls->ls_rsbtbl[b].lock); | |
4044 | + | |
4045 | + rv = _create_message(ls, sizeof(struct dlm_message) + len, | |
4046 | + dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); | |
4047 | + if (rv) | |
4048 | + return; | |
4049 | + | |
4050 | + memcpy(ms->m_extra, name, len); | |
4051 | + ms->m_hash = hash; | |
4052 | + | |
4053 | + send_message(mh, ms); | |
4054 | + | |
4055 | + spin_lock(&ls->ls_remove_spin); | |
4056 | + ls->ls_remove_len = 0; | |
4057 | + memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); | |
4058 | + spin_unlock(&ls->ls_remove_spin); | |
4059 | +} | |
4060 | + | |
4003 | 4061 | static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) |
4004 | 4062 | { |
4005 | 4063 | struct dlm_lkb *lkb; |
4006 | 4064 | struct dlm_rsb *r; |
4007 | 4065 | int from_nodeid; |
4008 | - int error, namelen; | |
4066 | + int error, namelen = 0; | |
4009 | 4067 | |
4010 | 4068 | from_nodeid = ms->m_header.h_nodeid; |
4011 | 4069 | |
4012 | 4070 | |
4013 | 4071 | |
... | ... | @@ -4073,11 +4131,19 @@ |
4073 | 4131 | delayed in being sent/arriving/being processed on the dir node. |
4074 | 4132 | Another node would repeatedly lookup up the master, and the dir |
4075 | 4133 | node would continue returning our nodeid until our send_remove |
4076 | - took effect. */ | |
4134 | + took effect. | |
4077 | 4135 | |
4136 | + We send another remove message in case our previous send_remove | |
4137 | + was lost/ignored/missed somehow. */ | |
4138 | + | |
4078 | 4139 | if (error != -ENOTBLK) { |
4079 | 4140 | log_limit(ls, "receive_request %x from %d %d", |
4080 | 4141 | ms->m_lkid, from_nodeid, error); |
4142 | + } | |
4143 | + | |
4144 | + if (namelen && error == -EBADR) { | |
4145 | + send_repeat_remove(ls, ms->m_extra, namelen); | |
4146 | + msleep(1000); | |
4081 | 4147 | } |
4082 | 4148 | |
4083 | 4149 | setup_stub_lkb(ls, ms); |