Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=904... Commit: 904a3db6bfe5262d5cc1394756911e98a40b2157 Parent: 6dd65cf344d05730cbccb99ce5265e84f762bfde Author: David Teigland teigland@redhat.com AuthorDate: Tue Aug 17 16:42:30 2010 -0500 Committer: David Teigland teigland@redhat.com CommitterDate: Mon Nov 15 13:22:53 2010 -0600
fenced: use post_join_delay after cluster join
When the cluster has lost quorum due to a node failure, the next event is generally a cluster node join, which gives the cluster quorum again. Once quorate, fenced begins fencing any failed nodes, applying post_fail_delay because the last cpg event was a node failure. In this case, however, post_join_delay is more appropriate, since chances are good that the nodes being fenced will be joining. Detect the case where a node join gives the cluster quorum, and use post_join_delay.
bz 624844
Signed-off-by: David Teigland <teigland@redhat.com> --- fence/fenced/fd.h | 1 + fence/fenced/main.c | 1 + fence/fenced/member_cman.c | 12 ++++++++++++ fence/fenced/recover.c | 5 ++++- 4 files changed, 18 insertions(+), 1 deletions(-)
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h index 39a34ad..a5a78bf 100644 --- a/fence/fenced/fd.h +++ b/fence/fenced/fd.h @@ -64,6 +64,7 @@ extern int daemon_quit; extern int cluster_down; extern struct list_head domains; extern int cluster_quorate; +extern int cluster_quorate_from_last_update; extern uint32_t cluster_ringid_seq; extern uint64_t quorate_time; extern int our_nodeid; diff --git a/fence/fenced/main.c b/fence/fenced/main.c index a371dc8..e5ab568 100644 --- a/fence/fenced/main.c +++ b/fence/fenced/main.c @@ -1069,6 +1069,7 @@ int daemon_quit; int cluster_down; struct list_head domains; int cluster_quorate; +int cluster_quorate_from_last_update; uint32_t cluster_ringid_seq; uint64_t quorate_time; int our_nodeid; diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c index b9d8341..0919b8e 100644 --- a/fence/fenced/member_cman.c +++ b/fence/fenced/member_cman.c @@ -150,6 +150,7 @@ static void update_cluster(void) { cman_cluster_t info; int quorate = cluster_quorate; + int removed = 0, added = 0; int i, rv;
rv = cman_get_cluster(ch, &info); @@ -183,6 +184,7 @@ static void update_cluster(void) old_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_remove(old_nodes[i].cn_nodeid); + removed++; } }
@@ -194,8 +196,18 @@ static void update_cluster(void) cman_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_add(cman_nodes[i].cn_nodeid); + added++; } } + + if (removed) { + cluster_quorate_from_last_update = 0; + } else if (added) { + if (!quorate && cluster_quorate) + cluster_quorate_from_last_update = 1; + else + cluster_quorate_from_last_update = 0; + } }
/* Note: in fence delay loop we aren't processing callbacks so won't diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c index d3bf35f..a7ca047 100644 --- a/fence/fenced/recover.c +++ b/fence/fenced/recover.c @@ -181,7 +181,7 @@ void delay_fencing(struct fd *fd, int node_join) if (list_empty(&fd->victims)) return;
- if (node_join) { + if (node_join || cluster_quorate_from_last_update) { delay = cfgd_post_join_delay; delay_type = "post_join_delay"; } else { @@ -189,6 +189,9 @@ void delay_fencing(struct fd *fd, int node_join) delay_type = "post_fail_delay"; }
+ log_debug("delay %s %d quorate_from_last_update %d", + delay_type, delay, cluster_quorate_from_last_update); + if (delay == 0) goto out;