Gitweb:
http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: d8a76837ecd24a5f7ab9a2ebf72745e6c6752c0b
Parent: 6bcd9db82232332f40667a6dd409829f7c15a1a0
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Aug 17 16:42:30 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Aug 17 16:42:30 2010 -0500
fenced: use post_join_delay after cluster join
When the cluster has lost quorum due to a node failure,
the next event is generally a cluster node join, which
gives the cluster quorum again. With quorum, fenced
begins fencing any failed nodes, applying post_fail_delay
since the last cpg event was a node failure. In this
case, however, post_join_delay is more appropriate since
the chances are good that nodes being fenced will be joining.
Detect the case where a node joining the cluster gives the
cluster quorum, and use post_join_delay in that case.
bz 624844
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/fd.h | 1 +
fence/fenced/main.c | 1 +
fence/fenced/member_cman.c | 12 ++++++++++++
fence/fenced/recover.c | 5 ++++-
4 files changed, 18 insertions(+), 1 deletions(-)
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 39a34ad..a5a78bf 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -64,6 +64,7 @@ extern int daemon_quit;
extern int cluster_down;
extern struct list_head domains;
extern int cluster_quorate;
+extern int cluster_quorate_from_last_update;
extern uint32_t cluster_ringid_seq;
extern uint64_t quorate_time;
extern int our_nodeid;
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index a371dc8..e5ab568 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -1069,6 +1069,7 @@ int daemon_quit;
int cluster_down;
struct list_head domains;
int cluster_quorate;
+int cluster_quorate_from_last_update;
uint32_t cluster_ringid_seq;
uint64_t quorate_time;
int our_nodeid;
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index b9d8341..0919b8e 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -150,6 +150,7 @@ static void update_cluster(void)
{
cman_cluster_t info;
int quorate = cluster_quorate;
+ int removed = 0, added = 0;
int i, rv;
rv = cman_get_cluster(ch, &info);
@@ -183,6 +184,7 @@ static void update_cluster(void)
old_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_remove(old_nodes[i].cn_nodeid);
+ removed++;
}
}
@@ -194,8 +196,18 @@ static void update_cluster(void)
cman_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_add(cman_nodes[i].cn_nodeid);
+ added++;
}
}
+
+ if (removed) {
+ cluster_quorate_from_last_update = 0;
+ } else if (added) {
+ if (!quorate && cluster_quorate)
+ cluster_quorate_from_last_update = 1;
+ else
+ cluster_quorate_from_last_update = 0;
+ }
}
/* Note: in fence delay loop we aren't processing callbacks so won't
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index d3bf35f..a7ca047 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -181,7 +181,7 @@ void delay_fencing(struct fd *fd, int node_join)
if (list_empty(&fd->victims))
return;
- if (node_join) {
+ if (node_join || cluster_quorate_from_last_update) {
delay = cfgd_post_join_delay;
delay_type = "post_join_delay";
} else {
@@ -189,6 +189,9 @@ void delay_fencing(struct fd *fd, int node_join)
delay_type = "post_fail_delay";
}
+ log_debug("delay %s %d quorate_from_last_update %d",
+ delay_type, delay, cluster_quorate_from_last_update);
+
if (delay == 0)
goto out;