Commit 0db638f44e7db9732d9c5704ca837f57ce061f42
1 parent
4ba63adce0
Exists in master and in 7 other branches
ocfs2: warn the user on a dead timeout mismatch
Print a warning to the user when a node with a different dead count joins the region.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Showing 2 changed files with 21 additions and 0 deletions (side-by-side diff)
fs/ocfs2/cluster/heartbeat.c
... | ... | @@ -517,6 +517,7 @@ |
517 | 517 | hb_block->hb_seq = cpu_to_le64(cputime); |
518 | 518 | hb_block->hb_node = node_num; |
519 | 519 | hb_block->hb_generation = cpu_to_le64(generation); |
520 | + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); | |
520 | 521 | |
521 | 522 | /* This step must always happen last! */ |
522 | 523 | hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, |
... | ... | @@ -645,6 +646,8 @@ |
645 | 646 | struct o2nm_node *node; |
646 | 647 | struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; |
647 | 648 | u64 cputime; |
649 | + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; | |
650 | + unsigned int slot_dead_ms; | |
648 | 651 | |
649 | 652 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); |
650 | 653 | |
... | ... | @@ -733,6 +736,23 @@ |
733 | 736 | &o2hb_live_slots[slot->ds_node_num]); |
734 | 737 | |
735 | 738 | slot->ds_equal_samples = 0; |
739 | + | |
740 | + /* We want to be sure that all nodes agree on the | |
741 | + * number of milliseconds before a node will be | |
742 | + * considered dead. The self-fencing timeout is | |
743 | + * computed from this value, and a discrepancy might | |
744 | + * result in heartbeat calling a node dead when it | |
745 | + * hasn't self-fenced yet. */ | |
746 | + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); | |
747 | + if (slot_dead_ms && slot_dead_ms != dead_ms) { | |
748 | + /* TODO: Perhaps we can fail the region here. */ | |
749 | + mlog(ML_ERROR, "Node %d on device %s has a dead count " | |
750 | + "of %u ms, but our count is %u ms.\n" | |
751 | + "Please double check your configuration values " | |
752 | + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", | |
753 | + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, | |
754 | + dead_ms); | |
755 | + } | |
736 | 756 | goto out; |
737 | 757 | } |
738 | 758 |