Gitweb:
http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=444a46c80e3...
Commit: 444a46c80e3ee12544eb8960fdea9f7cefc78d1e
Parent: a1e906313b85a6d0451feda7a6222cfd4386caf5
Author: Christine Caulfield <ccaulfie(a)redhat.com>
AuthorDate: Fri Jan 11 13:45:50 2013 +0000
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Wed Mar 20 17:25:16 2013 +0100
cman|fenced: Fix node killing in case of a 2node cluster that suffers brief network
outage
This patch fixes a rare but nasty condition in cman and fenced. In a 2node cluster,
if the network splits for a period of time longer than the token timeout but
shorter than the time needed to fence a node, then both nodes can send 'kill'
packets to each other, with the effect that both nodes' cmans will quit,
leaving no operational cluster.
This patch adds a check for a 2node cluster and, in that case, only sends a 'kill'
packet to the node with the higher nodeid (so the node with the lower nodeid
survives), thus ensuring a predictable response to such events and ensuring that
services can continue to run.
rhbz#923861
Signed-off-by: Christine Caulfield <ccaulfie(a)redhat.com>
Acked-By: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/daemon/commands.c | 27 ++++++++++++++++++++++++---
1 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/cman/daemon/commands.c b/cman/daemon/commands.c
index 1fafdac..6f3ec4e 100644
--- a/cman/daemon/commands.c
+++ b/cman/daemon/commands.c
@@ -1829,10 +1829,31 @@ static void do_process_transition(int nodeid, char *data, int
len)
/* Don't duplicate messages */
if (node->state != NODESTATE_AISONLY) {
if (cluster_is_quorate) {
- P_MEMB("Killing node %s because it has rejoined the cluster with existing
state", node->name);
- log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with
existing state", node->name);
node->state = NODESTATE_AISONLY;
- send_kill(nodeid, CLUSTER_KILL_REJOIN);
+ /* Oh, this gets even more complicated. Don't send a KILL message if we are in a
two_node
+ * cluster and that node has a lower node ID than us.
+ * This allows fencing time to start up and caters for the situation where
+ * a node rejoins REALLY quickly, before fencing has had time to work.
+ * I've split this up a bit partly for clarity, but mainly to allow us to
+ * print out helpful messages as to what we are up to here.
+ */
+ if (two_node) {
+ if (node->node_id > us->node_id) {
+ log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with
existing state and has higher node ID", node->name);
+ P_MEMB("Killing node %s because it has rejoined the cluster with existing
state and has higher node ID", node->name);
+ send_kill(nodeid, CLUSTER_KILL_REJOIN);
+ }
+ else {
+ log_printf(LOG_CRIT, "Not killing node %s despite it rejoining the cluster
with existing state, it has a lower node ID", node->name);
+ P_MEMB("Not killing node %s despite it rejoining the cluster with existing
state, it has a lower node ID", node->name);
+ }
+ }
+ else {
+ log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with
existing state", node->name);
+ P_MEMB("Killing node %s because it has rejoined the cluster with existing
state", node->name);
+ send_kill(nodeid, CLUSTER_KILL_REJOIN);
+ }
+
}
else {
P_MEMB("Node %s not joined to cman because it has existing state",
node->name);