Commit 0db638f44e7db9732d9c5704ca837f57ce061f42

Authored by Mark Fasheh
1 parent 4ba63adce0

ocfs2: warn the user on a dead timeout mismatch

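Store our dead timeout in milliseconds (o2hb_dead_threshold *
O2HB_REGION_TIMEOUT_MS) in the new hb_dead_ms field of the heartbeat
block, and print a warning to the user when a node that reports a
different, nonzero dead timeout joins the region.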

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Showing 2 changed files with 21 additions and 0 deletions Side-by-side Diff

fs/ocfs2/cluster/heartbeat.c
... ... @@ -517,6 +517,7 @@
517 517 hb_block->hb_seq = cpu_to_le64(cputime);
518 518 hb_block->hb_node = node_num;
519 519 hb_block->hb_generation = cpu_to_le64(generation);
  520 + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);
520 521  
521 522 /* This step must always happen last! */
522 523 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
... ... @@ -645,6 +646,8 @@
645 646 struct o2nm_node *node;
646 647 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
647 648 u64 cputime;
  649 + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
  650 + unsigned int slot_dead_ms;
648 651  
649 652 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
650 653  
... ... @@ -733,6 +736,23 @@
733 736 &o2hb_live_slots[slot->ds_node_num]);
734 737  
735 738 slot->ds_equal_samples = 0;
  739 +
  740 + /* We want to be sure that all nodes agree on the
  741 + * number of milliseconds before a node will be
  742 + * considered dead. The self-fencing timeout is
  743 + * computed from this value, and a discrepancy might
  744 + * result in heartbeat calling a node dead when it
  745 + * hasn't self-fenced yet. */
  746 + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
  747 + if (slot_dead_ms && slot_dead_ms != dead_ms) {
  748 + /* TODO: Perhaps we can fail the region here. */
  749 + mlog(ML_ERROR, "Node %d on device %s has a dead count "
  750 + "of %u ms, but our count is %u ms.\n"
  751 + "Please double check your configuration values "
  752 + "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
  753 + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
  754 + dead_ms);
  755 + }
736 756 goto out;
737 757 }
738 758  
fs/ocfs2/cluster/ocfs2_heartbeat.h
... ... @@ -32,6 +32,7 @@
32 32 __u8 hb_pad1[3];
33 33 __le32 hb_cksum;
34 34 __le64 hb_generation;
  35 + __le32 hb_dead_ms;
35 36 };
36 37  
37 38 #endif /* _OCFS2_HEARTBEAT_H */