Gitweb:
http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=0fc5da5fdc190ed50e94640136fea177a900ea57
Commit: 0fc5da5fdc190ed50e94640136fea177a900ea57
Parent: c5311da33ef0b2558f3a75d1a9ab763e8ac8d365
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Aug 5 17:05:26 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Aug 17 14:51:40 2010 -0500
fenced: use post_join_delay after cluster join
When the cluster has lost quorum due to a node failure,
the next event is generally a cluster node join, which
gives the cluster quorum again. Once quorate, fenced
begins fencing any failed nodes and applies post_fail_delay,
because the last cpg event was a node failure. In this
case, however, post_join_delay is more appropriate, since
chances are good that the nodes being fenced are about to join.
Detect the case where a node joining gives the cluster
quorum, and use post_join_delay.
bz 575952
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/member_cman.c | 70 ++++++++++++++++++++++++++++++++++++++++++--
fence/fenced/recover.c | 6 +++-
2 files changed, 72 insertions(+), 4 deletions(-)
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index 9e22ece..6ef74c3 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -17,9 +17,14 @@
#define BUFLEN 128
static cman_handle_t ch;
-static int cman_quorate;
+static cman_node_t old_nodes[MAX_NODES];
+static int old_node_count;
+static int old_quorate;
static cman_node_t cman_nodes[MAX_NODES];
static int cman_node_count;
+static int cman_quorate;
+int cman_quorate_from_last_change;
+
static char name_buf[CMAN_MAX_NODENAME_LEN+1];
extern struct list_head domains;
@@ -27,6 +32,26 @@ extern struct list_head domains;
char *our_name;
int our_nodeid;
+static int _is_member(cman_node_t *node_list, int count, int nodeid)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if (node_list[i].cn_nodeid == nodeid)
+ return node_list[i].cn_member;
+ }
+ return 0;
+}
+
+static int is_old_member(int nodeid)
+{
+ return _is_member(old_nodes, old_node_count, nodeid);
+}
+
+static int is_cman_member(int nodeid)
+{
+ return _is_member(cman_nodes, cman_node_count, nodeid);
+}
static int name_equal(char *name1, char *name2)
{
@@ -89,15 +114,54 @@ static cman_node_t *find_cluster_node_name(char *name)
static void statechange(void)
{
- int rv;
+ int i, rv;
+ int removed = 0, added = 0;
+
+ old_quorate = cman_quorate;
+ old_node_count = cman_node_count;
+ memcpy(&old_nodes, &cman_nodes, sizeof(old_nodes));
cman_quorate = cman_is_quorate(ch);
cman_node_count = 0;
memset(&cman_nodes, 0, sizeof(cman_nodes));
-
rv = cman_get_nodes(ch, MAX_NODES, &cman_node_count, cman_nodes);
if (rv < 0)
log_error("cman_get_nodes error %d %d", rv, errno);
+
+ /* Never allow node ID 0 to be considered a member #315711 */
+ for (i = 0; i < cman_node_count; i++) {
+ if (cman_nodes[i].cn_nodeid == 0) {
+ cman_nodes[i].cn_member = 0;
+ break;
+ }
+ }
+
+ for (i = 0; i < old_node_count; i++) {
+ if (old_nodes[i].cn_member &&
+ !is_cman_member(old_nodes[i].cn_nodeid)) {
+ removed++;
+ log_debug("cman: node %d removed",
+ old_nodes[i].cn_nodeid);
+ }
+ }
+
+ for (i = 0; i < cman_node_count; i++) {
+ if (cman_nodes[i].cn_member &&
+ !is_old_member(cman_nodes[i].cn_nodeid)) {
+ added++;
+ log_debug("cman: node %d added",
+ cman_nodes[i].cn_nodeid);
+ }
+ }
+
+ if (removed) {
+ cman_quorate_from_last_change = 0;
+ } else if (added) {
+ if (!old_quorate && cman_quorate)
+ cman_quorate_from_last_change = 1;
+ else
+ cman_quorate_from_last_change = 0;
+ }
}
static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index 7f8aace..732ec81 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -18,6 +18,7 @@
extern int our_nodeid;
extern commandline_t comline;
+extern int cman_quorate_from_last_change;
/* Fencing recovery algorithm
@@ -302,7 +303,7 @@ static void delay_fencing(fd_t *fd, int start_type)
fd_node_t *node;
char *delay_type;
- if (start_type == GROUP_NODE_JOIN) {
+ if ((start_type == GROUP_NODE_JOIN) || cman_quorate_from_last_change) {
delay = comline.post_join_delay;
delay_type = "post_join_delay";
} else {
@@ -310,6 +311,9 @@ static void delay_fencing(fd_t *fd, int start_type)
delay_type = "post_fail_delay";
}
+ log_debug("delay_fencing %s %d quorate_from_last_change %d",
+ delay_type, delay, cman_quorate_from_last_change);
+
if (delay == 0)
goto out;