Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=75c20f80a14595... Commit: 75c20f80a14595f912814bb78e66fc5702c82296 Parent: b0c42d287f7f749794a61796f016c2e56148d64b Author: Christine Caulfield ccaulfie@redhat.com AuthorDate: Fri Jan 11 13:45:50 2013 +0000 Committer: Christine Caulfield ccaulfie@redhat.com CommitterDate: Fri Jan 11 13:45:50 2013 +0000
cman|fenced: Fix node killing in case of a 2node cluster that suffers brief network outage
This patch fixes a rare but nasty condition in cman and fenced. In a two-node cluster, if the network splits for a period longer than the token timeout but shorter than the time needed to fence a node, both nodes can send 'kill' packets to each other, with the effect that the cman daemon on both nodes quits, leaving no operational cluster.
This patch adds a check for a two-node cluster and sends a 'kill' packet only to the node with the higher node ID, thus ensuring a predictable response to such events and allowing services to continue to run.
rhbz#876731
Signed-off-by: Christine Caulfield ccaulfie@redhat.com Acked-By: Fabio M. Di Nitto fdinitto@redhat.com --- cman/daemon/commands.c | 27 ++++++++++++++++++++++++--- 1 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/cman/daemon/commands.c b/cman/daemon/commands.c index 1fafdac..6f3ec4e 100644 --- a/cman/daemon/commands.c +++ b/cman/daemon/commands.c @@ -1829,10 +1829,31 @@ static void do_process_transition(int nodeid, char *data, int len) /* Don't duplicate messages */ if (node->state != NODESTATE_AISONLY) { if (cluster_is_quorate) { - P_MEMB("Killing node %s because it has rejoined the cluster with existing state", node->name); - log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with existing state", node->name); node->state = NODESTATE_AISONLY; - send_kill(nodeid, CLUSTER_KILL_REJOIN); + /* Oh, this gets even more complicated. Don't send a KILL message if we are in a two_node + * cluster and that node has a lower node ID than us. + * This allows fencing time to startup and caters for the situation where + * a node rejoins REALLY quickly, before fencing has had time to work. + * I've split this up a bit partly for clarity, but mainly so allow us to + * print out helpful messages as to what we are up to here. 
+ */ + if (two_node) { + if (node->node_id > us->node_id) { + log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with existing state and has higher node ID", node->name); + P_MEMB("Killing node %s because it has rejoined the cluster with existing state and has higher node ID", node->name); + send_kill(nodeid, CLUSTER_KILL_REJOIN); + } + else { + log_printf(LOG_CRIT, "Not killing node %s despite it rejoining the cluster with existing state, it has a lower node ID", node->name); + P_MEMB("Not killing node %s despite it rejoining the cluster with existing state, it has a lower node ID", node->name); + } + } + else { + log_printf(LOG_CRIT, "Killing node %s because it has rejoined the cluster with existing state", node->name); + P_MEMB("Killing node %s because it has rejoined the cluster with existing state", node->name); + send_kill(nodeid, CLUSTER_KILL_REJOIN); + } + } else { P_MEMB("Node %s not joined to cman because it has existing state", node->name);