[patch 3/3] OCFS2 Configurable timeouts - Protocol changes

Modify the OCFS2 handshake to ensure essential timeouts are configured identically on all nodes. Only allow changes when there are no connected peers. Improves the logic in o2net_advance_rx(), which broke now that sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg). Included is the field for userspace-heartbeat timeout to avoid the need for further protocol changes. Uses a global spinlock to ensure the decisions to update configfs entries are made on the correct value. The region covered by the spinlock when incrementing the counter is much larger, as this is the more critical case. Small cleanup contributed by Adrian Bunk <bunk@stusta.de>. Signed-off-by: Andrew Beekhof <abeekhof@suse.de> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

[patch 3/3] OCFS2 Configurable timeouts - Protocol changes
Modify the OCFS2 handshake to ensure essential timeouts are configured identically on all nodes. Only allow changes when there are no connected peers. Improves the logic in o2net_advance_rx(), which broke now that sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg). Included is the field for userspace-heartbeat timeout to avoid the need for further protocol changes. Uses a global spinlock to ensure the decisions to update configfs entries are made on the correct value. The region covered by the spinlock when incrementing the counter is much larger, as this is the more critical case. Small cleanup contributed by Adrian Bunk <bunk@stusta.de>. Signed-off-by: Andrew Beekhof <abeekhof@suse.de> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Andrew Beekhof · Mark Fasheh
1 parent b5dd80304d
Showing 4 changed files with 116 additions and 16 deletions Side-by-side Diff
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/cluster/tcp.c
fs/ocfs2/cluster/tcp.h
fs/ocfs2/cluster/tcp_internal.h
@@ -573,12 +573,21 @@
 	ret =  o2nm_cluster_attr_write(page, count, &val);
  
 	if (ret > 0) {
-		if (val <= cluster->cl_keepalive_delay_ms) {
+		if (cluster->cl_idle_timeout_ms != val
+			&& o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change idle timeout after "
+			     "the first peer has agreed to it."
+			     "  %d connected peers\n",
+			     o2net_num_connected_peers());
+			ret = -EINVAL;
+		} else if (val <= cluster->cl_keepalive_delay_ms) {
 			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
 			     "than keepalive delay\n");
-			return -EINVAL;
+			ret = -EINVAL;
+		} else {
+			cluster->cl_idle_timeout_ms = val;
 		}
-		cluster->cl_idle_timeout_ms = val;
 	}
  
 	return ret;
  
  
@@ -599,12 +608,21 @@
 	ret =  o2nm_cluster_attr_write(page, count, &val);
  
 	if (ret > 0) {
-		if (val >= cluster->cl_idle_timeout_ms) {
+		if (cluster->cl_keepalive_delay_ms != val
+		    && o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change keepalive delay after"
+			     " the first peer has agreed to it."
+			     "  %d connected peers\n",
+			     o2net_num_connected_peers());
+			ret = -EINVAL;
+		} else if (val >= cluster->cl_idle_timeout_ms) {
 			mlog(ML_NOTICE, "o2net: keepalive delay must be "
 			     "smaller than idle timeout\n");
-			return -EINVAL;
+			ret = -EINVAL;
+		} else {
+			cluster->cl_keepalive_delay_ms = val;
 		}
-		cluster->cl_keepalive_delay_ms = val;
 	}
  
 	return ret;
@@ -380,6 +380,13 @@
 		sc_put(sc);
 }
  
+static atomic_t o2net_connected_peers = ATOMIC_INIT(0);
+
+int o2net_num_connected_peers(void)
+{
+	return atomic_read(&o2net_connected_peers);
+}
+
 static void o2net_set_nn_state(struct o2net_node *nn,
 			       struct o2net_sock_container *sc,
 			       unsigned valid, int err)
@@ -390,6 +397,11 @@
  
 	assert_spin_locked(&nn->nn_lock);
  
+	if (old_sc && !sc)
+		atomic_dec(&o2net_connected_peers);
+	else if (!old_sc && sc)
+		atomic_inc(&o2net_connected_peers);
+
 	/* the node num comparison and single connect/accept path should stop
 	 * an non-null sc from being overwritten with another */
 	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
@@ -1123,6 +1135,44 @@
 		return -1;
 	}
  
+	/*
+	 * Ensure timeouts are consistent with other nodes, otherwise
+	 * we can end up with one node thinking that the other must be down,
+	 * but isn't. This can ultimately cause corruption.
+	 */
+	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+				o2net_idle_timeout(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_idle_timeout_ms),
+		     o2net_idle_timeout(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+			o2net_keepalive_delay(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		     o2net_keepalive_delay(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
+			O2HB_MAX_WRITE_TIMEOUT_MS) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+		     O2HB_MAX_WRITE_TIMEOUT_MS);
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
 	sc->sc_handshake_ok = 1;
  
 	spin_lock(&nn->nn_lock);
@@ -1155,6 +1205,23 @@
 	sclog(sc, "receiving\n");
 	do_gettimeofday(&sc->sc_tv_advance_start);
  
+	if (unlikely(sc->sc_handshake_ok == 0)) {
+		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
+			data = page_address(sc->sc_page) + sc->sc_page_off;
+			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
+			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+			if (ret > 0)
+				sc->sc_page_off += ret;
+		}
+
+		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
+			o2net_check_handshake(sc);
+			if (unlikely(sc->sc_handshake_ok == 0))
+				ret = -EPROTO;
+		}
+		goto out;
+	}
+
 	/* do we need more header? */
 	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
 		data = page_address(sc->sc_page) + sc->sc_page_off;
@@ -1162,15 +1229,6 @@
 		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
 		if (ret > 0) {
 			sc->sc_page_off += ret;
-
-			/* this working relies on the handshake being
-			 * smaller than the normal message header */
-			if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
-			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
-				ret = -EPROTO;
-				goto out;
-			}
-
 			/* only swab incoming here.. we can
 			 * only get here once as we cross from
 			 * being under to over */
@@ -1272,6 +1330,18 @@
 	return ret;
 }
  
+static void o2net_initialize_handshake(void)
+{
+	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
+		O2HB_MAX_WRITE_TIMEOUT_MS);
+	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
+		o2net_idle_timeout(NULL));
+	o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
+		o2net_keepalive_delay(NULL));
+	o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
+		o2net_reconnect_delay(NULL));
+}
+
 /* ------------------------------------------------------------ */
  
 /* called when a connect completes and after a sock is accepted.  the
@@ -1286,6 +1356,7 @@
               (unsigned long long)O2NET_PROTOCOL_VERSION,
 	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
  
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 	sc_put(sc);
 }
@@ -1514,6 +1585,8 @@
  
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
+
+	BUG_ON(atomic_read(&o2net_connected_peers) < 0);
 }
  
 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
@@ -1677,6 +1750,7 @@
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 	o2net_sc_queue_work(sc, &sc->sc_rx_work);
  
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
  
 out:
@@ -108,6 +108,7 @@
 int o2net_start_listening(struct o2nm_node *node);
 void o2net_stop_listening(struct o2nm_node *node);
 void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
  
 int o2net_init(void);
 void o2net_exit(void);
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 5:
+ * 	- Network timeout checking protocol
+ *
  * New in version 4:
  * 	- Remove i_generation from lock names for better stat performance.
  *
  
@@ -48,10 +51,14 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 4ULL
+#define O2NET_PROTOCOL_VERSION 5ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
+	__be32  o2hb_heartbeat_timeout_ms;
+	__be32  o2net_idle_timeout_ms;
+	__be32  o2net_keepalive_delay_ms;
+	__be32  o2net_reconnect_delay_ms;
 };
  
 struct o2net_node {
...	...	@@ -573,12 +573,21 @@
573	573	ret = o2nm_cluster_attr_write(page, count, &val);
574	574
575	575	if (ret > 0) {
576		- if (val <= cluster->cl_keepalive_delay_ms) {
	576	+ if (cluster->cl_idle_timeout_ms != val
	577	+ && o2net_num_connected_peers()) {
	578	+ mlog(ML_NOTICE,
	579	+ "o2net: cannot change idle timeout after "
	580	+ "the first peer has agreed to it."
	581	+ " %d connected peers\n",
	582	+ o2net_num_connected_peers());
	583	+ ret = -EINVAL;
	584	+ } else if (val <= cluster->cl_keepalive_delay_ms) {
577	585	mlog(ML_NOTICE, "o2net: idle timeout must be larger "
578	586	"than keepalive delay\n");
579		- return -EINVAL;
	587	+ ret = -EINVAL;
	588	+ } else {
	589	+ cluster->cl_idle_timeout_ms = val;
580	590	}
581		- cluster->cl_idle_timeout_ms = val;
582	591	}
583	592
584	593	return ret;
585	594
586	595
...	...	@@ -599,12 +608,21 @@
599	608	ret = o2nm_cluster_attr_write(page, count, &val);
600	609
601	610	if (ret > 0) {
602		- if (val >= cluster->cl_idle_timeout_ms) {
	611	+ if (cluster->cl_keepalive_delay_ms != val
	612	+ && o2net_num_connected_peers()) {
	613	+ mlog(ML_NOTICE,
	614	+ "o2net: cannot change keepalive delay after"
	615	+ " the first peer has agreed to it."
	616	+ " %d connected peers\n",
	617	+ o2net_num_connected_peers());
	618	+ ret = -EINVAL;
	619	+ } else if (val >= cluster->cl_idle_timeout_ms) {
603	620	mlog(ML_NOTICE, "o2net: keepalive delay must be "
604	621	"smaller than idle timeout\n");
605		- return -EINVAL;
	622	+ ret = -EINVAL;
	623	+ } else {
	624	+ cluster->cl_keepalive_delay_ms = val;
606	625	}
607		- cluster->cl_keepalive_delay_ms = val;
608	626	}
609	627
610	628	return ret;
...	...	@@ -380,6 +380,13 @@
380	380	sc_put(sc);
381	381	}
382	382
	383	+static atomic_t o2net_connected_peers = ATOMIC_INIT(0);
	384	+
	385	+int o2net_num_connected_peers(void)
	386	+{
	387	+ return atomic_read(&o2net_connected_peers);
	388	+}
	389	+
383	390	static void o2net_set_nn_state(struct o2net_node *nn,
384	391	struct o2net_sock_container *sc,
385	392	unsigned valid, int err)
...	...	@@ -390,6 +397,11 @@
390	397
391	398	assert_spin_locked(&nn->nn_lock);
392	399
	400	+ if (old_sc && !sc)
	401	+ atomic_dec(&o2net_connected_peers);
	402	+ else if (!old_sc && sc)
	403	+ atomic_inc(&o2net_connected_peers);
	404	+
393	405	/* the node num comparison and single connect/accept path should stop
394	406	* an non-null sc from being overwritten with another */
395	407	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
...	...	@@ -1123,6 +1135,44 @@
1123	1135	return -1;
1124	1136	}
1125	1137
	1138	+ /*
	1139	+ * Ensure timeouts are consistent with other nodes, otherwise
	1140	+ * we can end up with one node thinking that the other must be down,
	1141	+ * but isn't. This can ultimately cause corruption.
	1142	+ */
	1143	+ if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
	1144	+ o2net_idle_timeout(sc->sc_node)) {
	1145	+ mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
	1146	+ "%u ms, but we use %u ms locally. disconnecting\n",
	1147	+ SC_NODEF_ARGS(sc),
	1148	+ be32_to_cpu(hand->o2net_idle_timeout_ms),
	1149	+ o2net_idle_timeout(sc->sc_node));
	1150	+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
	1151	+ return -1;
	1152	+ }
	1153	+
	1154	+ if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
	1155	+ o2net_keepalive_delay(sc->sc_node)) {
	1156	+ mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
	1157	+ "%u ms, but we use %u ms locally. disconnecting\n",
	1158	+ SC_NODEF_ARGS(sc),
	1159	+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
	1160	+ o2net_keepalive_delay(sc->sc_node));
	1161	+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
	1162	+ return -1;
	1163	+ }
	1164	+
	1165	+ if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
	1166	+ O2HB_MAX_WRITE_TIMEOUT_MS) {
	1167	+ mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
	1168	+ "%u ms, but we use %u ms locally. disconnecting\n",
	1169	+ SC_NODEF_ARGS(sc),
	1170	+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
	1171	+ O2HB_MAX_WRITE_TIMEOUT_MS);
	1172	+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
	1173	+ return -1;
	1174	+ }
	1175	+
1126	1176	sc->sc_handshake_ok = 1;
1127	1177
1128	1178	spin_lock(&nn->nn_lock);
...	...	@@ -1155,6 +1205,23 @@
1155	1205	sclog(sc, "receiving\n");
1156	1206	do_gettimeofday(&sc->sc_tv_advance_start);
1157	1207
	1208	+ if (unlikely(sc->sc_handshake_ok == 0)) {
	1209	+ if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
	1210	+ data = page_address(sc->sc_page) + sc->sc_page_off;
	1211	+ datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
	1212	+ ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
	1213	+ if (ret > 0)
	1214	+ sc->sc_page_off += ret;
	1215	+ }
	1216	+
	1217	+ if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
	1218	+ o2net_check_handshake(sc);
	1219	+ if (unlikely(sc->sc_handshake_ok == 0))
	1220	+ ret = -EPROTO;
	1221	+ }
	1222	+ goto out;
	1223	+ }
	1224	+
1158	1225	/* do we need more header? */
1159	1226	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1160	1227	data = page_address(sc->sc_page) + sc->sc_page_off;
...	...	@@ -1162,15 +1229,6 @@
1162	1229	ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1163	1230	if (ret > 0) {
1164	1231	sc->sc_page_off += ret;
1165		-
1166		- /* this working relies on the handshake being
1167		- * smaller than the normal message header */
1168		- if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
1169		- !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
1170		- ret = -EPROTO;
1171		- goto out;
1172		- }
1173		-
1174	1232	/* only swab incoming here.. we can
1175	1233	* only get here once as we cross from
1176	1234	* being under to over */
...	...	@@ -1272,6 +1330,18 @@
1272	1330	return ret;
1273	1331	}
1274	1332
	1333	+static void o2net_initialize_handshake(void)
	1334	+{
	1335	+ o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
	1336	+ O2HB_MAX_WRITE_TIMEOUT_MS);
	1337	+ o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
	1338	+ o2net_idle_timeout(NULL));
	1339	+ o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
	1340	+ o2net_keepalive_delay(NULL));
	1341	+ o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
	1342	+ o2net_reconnect_delay(NULL));
	1343	+}
	1344	+
1275	1345	/* ------------------------------------------------------------ */
1276	1346
1277	1347	/* called when a connect completes and after a sock is accepted. the
...	...	@@ -1286,6 +1356,7 @@
1286	1356	(unsigned long long)O2NET_PROTOCOL_VERSION,
1287	1357	(unsigned long long)be64_to_cpu(o2net_hand->connector_id));
1288	1358
	1359	+ o2net_initialize_handshake();
1289	1360	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1290	1361	sc_put(sc);
1291	1362	}
...	...	@@ -1514,6 +1585,8 @@
1514	1585
1515	1586	if (node_num != o2nm_this_node())
1516	1587	o2net_disconnect_node(node);
	1588	+
	1589	+ BUG_ON(atomic_read(&o2net_connected_peers) < 0);
1517	1590	}
1518	1591
1519	1592	static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
...	...	@@ -1677,6 +1750,7 @@
1677	1750	o2net_register_callbacks(sc->sc_sock->sk, sc);
1678	1751	o2net_sc_queue_work(sc, &sc->sc_rx_work);
1679	1752
	1753	+ o2net_initialize_handshake();
1680	1754	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1681	1755
1682	1756	out:
...	...	@@ -108,6 +108,7 @@
108	108	int o2net_start_listening(struct o2nm_node *node);
109	109	void o2net_stop_listening(struct o2nm_node *node);
110	110	void o2net_disconnect_node(struct o2nm_node *node);
	111	+int o2net_num_connected_peers(void);
111	112
112	113	int o2net_init(void);
113	114	void o2net_exit(void);
...	...	@@ -38,6 +38,9 @@
38	38	* locking semantics of the file system using the protocol. It should
39	39	* be somewhere else, I'm sure, but right now it isn't.
40	40	*
	41	+ * New in version 5:
	42	+ * - Network timeout checking protocol
	43	+ *
41	44	* New in version 4:
42	45	* - Remove i_generation from lock names for better stat performance.
43	46	*
44	47
...	...	@@ -48,10 +51,14 @@
48	51	* - full 64 bit i_size in the metadata lock lvbs
49	52	* - introduction of "rw" lock and pushing meta/data locking down
50	53	*/
51		-#define O2NET_PROTOCOL_VERSION 4ULL
	54	+#define O2NET_PROTOCOL_VERSION 5ULL
52	55	struct o2net_handshake {
53	56	__be64 protocol_version;
54	57	__be64 connector_id;
	58	+ __be32 o2hb_heartbeat_timeout_ms;
	59	+ __be32 o2net_idle_timeout_ms;
	60	+ __be32 o2net_keepalive_delay_ms;
	61	+ __be32 o2net_reconnect_delay_ms;
55	62	};
56	63
57	64	struct o2net_node {