fence: master - fenced: use cpg ringid
by David Teigland
Gitweb: http://git.fedorahosted.org/git/fence.git?p=fence.git;a=commitdiff;h=f8c7...
Commit: f8c79f99a281f93807a9cb3289a8da6897b17253
Parent: 740a1a8ba973894cce3dfa8e69db2b4cc87cc5df
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Nov 16 10:26:52 2010 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Nov 16 10:26:52 2010 -0600
fenced: use cpg ringid
bz 584140
Use the new totem cpg callback to synchronize cman events and cpg events.
Without this fix, the following two test cases cause fenced to become
confused and stuck.
test 1
------
- nodes 1,2,3,4
- configure no fencing for all, or force fencing to fail
- service cman start on all
- use iptables to add network partition: 1 | 2,3,4
- remove iptables partition resulting in merge: 1,2,3,4
- nodes 2,3,4 should kill corosync on node 1 automatically
- reboot node 1 if any dlm or gfs were being used
(otherwise just verify all cluster daemons have exited)
- service cman start on node 1
- fence_tool ls on 1,2,3,4 should all show normal state
(4 members, 0 victims, wait state none, members 1,2,3,4)
fence domain
member count 4
victim count 0
victim now 0
master nodeid 2
wait state none
members 1 2 3 4
test 2
------
- nodes 1,2,3,4
- configure no fencing for all, or force fencing to fail
- service cman start on all
- use iptables to add network partition: 1,2 | 3,4
- remove iptables partition resulting in merge: 1,2,3,4
- reboot nodes 1 and 2 if any dlm or gfs were being used
(otherwise just verify all cluster daemons have exited)
(a variation of this test reboots nodes 3 and 4 instead)
- service cman start on nodes 1 and 2
- fence_tool ls on 1,2,3,4 should all show normal state
(4 members, 0 victims, wait state none, members 1,2,3,4)
fence domain
member count 4
victim count 0
victim now 0
master nodeid 2
wait state none
members 1 2 3 4
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/cpg.c | 285 ++++++++++++++++++++++++++++++++++----------
fence/fenced/fd.h | 7 +
fence/fenced/main.c | 1 +
fence/fenced/member_cman.c | 7 +-
4 files changed, 237 insertions(+), 63 deletions(-)
diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c
index 4ecef4a..eec2ba6 100644
--- a/fence/fenced/cpg.c
+++ b/fence/fenced/cpg.c
@@ -75,7 +75,11 @@ struct id_info {
};
static cpg_handle_t cpg_handle_daemon;
+static cpg_handle_t cpg_handle_domain;
+static struct cpg_name group_name_daemon;
+static struct cpg_name group_name_domain;
static int cpg_fd_daemon;
+static int cpg_fd_domain;
static struct protocol our_protocol;
static struct list_head daemon_nodes;
static struct cpg_address daemon_member[MAX_NODES];
@@ -134,6 +138,38 @@ static void log_config(const struct cpg_name *group_name,
m_buf, j_buf, l_buf);
}
+static void log_ringid(cpg_handle_t handle,
+ struct cpg_ring_id *ringid,
+ const uint32_t *member_list,
+ size_t member_list_entries)
+{
+ char m_buf[128];
+ size_t i, len, pos;
+ int ret;
+ const char *name = "unknown";
+
+ if (handle == cpg_handle_domain)
+ name = group_name_domain.value;
+ else if (handle == cpg_handle_daemon)
+ name = group_name_daemon.value;
+
+ memset(m_buf, 0, sizeof(m_buf));
+
+ len = sizeof(m_buf);
+ pos = 0;
+ for (i = 0; i < member_list_entries; i++) {
+ ret = snprintf(m_buf + pos, len - pos, " %u",
+ member_list[i]);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+
+ log_debug("%s ring %u:%llu %zu memb%s",
+ name, ringid->nodeid, (unsigned long long)ringid->seq,
+ member_list_entries, m_buf);
+}
+
static void fd_info_in(struct fd_info *fi)
{
fi->fd_info_size = le32_to_cpu(fi->fd_info_size);
@@ -374,7 +410,7 @@ static void node_history_start(struct fd *fd, int nodeid)
node->add_time = time(NULL);
}
-static void node_history_left(struct fd *fd, int nodeid)
+static void node_history_left(struct fd *fd, int nodeid, uint32_t seq)
{
struct node_history *node;
@@ -385,9 +421,10 @@ static void node_history_left(struct fd *fd, int nodeid)
}
node->left_time = time(NULL);
+ node->left_seq = seq;
}
-static void node_history_fail(struct fd *fd, int nodeid)
+static void node_history_fail(struct fd *fd, int nodeid, uint32_t seq)
{
struct node_history *node;
@@ -398,6 +435,7 @@ static void node_history_fail(struct fd *fd, int nodeid)
}
node->fail_time = time(NULL);
+ node->fail_seq = seq;
node->check_quorum = 1;
}
@@ -648,36 +686,11 @@ static void receive_victim_done(struct fd *fd, struct fd_header *hd, int len)
free(node);
}
+/* we know that the quorum value here is consistent with the cpg events
+ because the ringids are in sync per the previous check_ringid_done */
+
static int check_quorum_done(struct fd *fd)
{
- struct node_history *node;
- int wait_count = 0;
-
- /* We don't want to trust the cluster_quorate value until we know
- that cman has seen the same nodes fail that we have. So, we
- first make sure that all nodes we've seen fail are also
- failed in cman, then we can just check cluster_quorate. This
- assumes that we'll get to this function to do all the checks
- before any of the failed nodes can actually rejoin and become
- cman members again (if that assumption doesn't hold, perhaps
- do something with timestamps of join/fail). */
-
- list_for_each_entry(node, &fd->node_history, list) {
- if (!node->check_quorum)
- continue;
-
- if (!is_cluster_member_reread(node->nodeid)) {
- node->check_quorum = 0;
- } else {
- log_debug("check_quorum %d is_cluster_member",
- node->nodeid);
- wait_count++;
- }
- }
-
- if (wait_count)
- return 0;
-
if (!cluster_quorate) {
log_debug("check_quorum not quorate");
return 0;
@@ -687,8 +700,28 @@ static int check_quorum_done(struct fd *fd)
return 1;
}
+/* wait for cman ringid and cpg ringid to be the same so we know our
+ information from each service is based on the same node state */
+
+static int check_ringid_done(struct fd *fd)
+{
+ if (cluster_ringid_seq != (uint32_t)fd->cpg_ringid.seq) {
+ log_debug("check_ringid cluster %u cpg %u:%llu",
+ cluster_ringid_seq, fd->cpg_ringid.nodeid,
+ (unsigned long long)fd->cpg_ringid.seq);
+ return 0;
+ }
+
+ log_debug("check_ringid done cluster %u cpg %u:%llu",
+ cluster_ringid_seq, fd->cpg_ringid.nodeid,
+ (unsigned long long)fd->cpg_ringid.seq);
+ return 1;
+}
+
static int wait_conditions_done(struct fd *fd)
{
+ if (!check_ringid_done(fd))
+ return 0;
if (!check_quorum_done(fd))
return 0;
return 1;
@@ -831,6 +864,25 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
return 0;
}
+ /* this start message couldn't have been sent for a cg preceding
+ a confchg when the sending node failed or left */
+
+ if ((node->fail_seq > cg->seq) || (node->left_seq > cg->seq)) {
+ log_debug("match_change %d:%u skip cg %u fail cg %u left cg %u",
+ hd->nodeid, seq, cg->seq,
+ node->fail_seq, node->left_seq);
+ return 0;
+ }
+
+ /* if we matched the last start message from this node against our
+ cg N, then don't match this start message against an earlier cg */
+
+ if (node->last_match_seq > cg->seq) {
+ log_debug("match_change %d:%u skip cg %u last matched cg %u",
+ hd->nodeid, seq, cg->seq, node->last_match_seq);
+ return 0;
+ }
+
/* verify this is the right change by matching the counts
and the nodeids of the current members */
@@ -865,6 +917,8 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
if (members_mismatch)
return 0;
+ node->last_match_seq = cg->seq;
+
log_debug("match_change %d:%u matches cg %u", hd->nodeid, seq, cg->seq);
return 1;
}
@@ -919,19 +973,45 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
is > cpg ringid, then return 0 for conditions_done so we won't send
start and will wait until the most recent cpg confchg (matching the
current cman one) to send a start. Waits for cpg to catch up with cman.
+
+ Final solution is the patch adding check_ringid_done() that waits for
+ cman and cpg to both be on the same ringid before going ahead to check
+ quorum and send starts.
*/
static struct change *find_change(struct fd *fd, struct fd_header *hd,
struct fd_info *fi, struct id_info *ids)
{
struct change *cg;
+ struct change *cg1 = NULL, *cg2 = NULL;
list_for_each_entry_reverse(cg, &fd->changes, list) {
if (!match_change(fd, cg, hd, fi, ids))
continue;
- return cg;
+
+ if (!(hd->flags & FD_MFLG_DUPLICATE_CG))
+ return cg;
+
+ /* this start message is for the second of two matching cg's */
+
+ if (!cg1) {
+ cg1 = cg;
+ log_debug("find_change %d:%u match1 %u look for dup",
+ hd->nodeid, hd->msgdata, cg1->seq);
+ continue;
+ } else {
+ cg2 = cg;
+ log_debug("find_change %d:%u match1 %u match2 %u",
+ hd->nodeid, hd->msgdata, cg1->seq, cg2->seq);
+ break;
+ }
}
+ if (cg1 && cg2)
+ return cg2;
+ if (cg1)
+ return cg1;
+
log_debug("find_change %d:%u no match", hd->nodeid, hd->msgdata);
return NULL;
}
@@ -1064,19 +1144,17 @@ static int count_ids(struct fd *fd)
return count;
}
-static void send_info(struct fd *fd, int type)
+static void send_info(struct fd *fd, struct change *cg, int type,
+ uint32_t flags)
{
- struct change *cg;
struct fd_header *hd;
struct fd_info *fi;
struct id_info *id;
struct node_history *node;
char *buf;
- uint32_t flags;
+ uint32_t idflags;
int len, id_count;
- cg = list_first_entry(&fd->changes, struct change, list);
-
id_count = count_ids(fd);
len = sizeof(struct fd_header) + sizeof(struct fd_info) +
@@ -1097,6 +1175,8 @@ static void send_info(struct fd *fd, int type)
hd->type = type;
hd->msgdata = cg->seq;
+ hd->flags = flags;
+
if (cg->we_joined)
hd->flags |= FD_MFLG_JOINING;
if (fd->init_complete || fd->local_init_complete)
@@ -1116,11 +1196,11 @@ static void send_info(struct fd *fd, int type)
/* fill in id_info entries */
list_for_each_entry(node, &fd->node_history, list) {
- flags = 0;
+ idflags = 0;
if (find_memb(cg, node->nodeid))
- flags = IDI_NODEID_IS_MEMBER;
+ idflags = IDI_NODEID_IS_MEMBER;
- id->flags = cpu_to_le32(flags);
+ id->flags = cpu_to_le32(idflags);
id->nodeid = cpu_to_le32(node->nodeid);
id->fence_external_node= cpu_to_le32(node->fence_external_node);
id->fence_master = cpu_to_le32(node->fence_master);
@@ -1130,8 +1210,8 @@ static void send_info(struct fd *fd, int type)
id++;
}
- log_debug("send_%s cg %u flags %x counts %u %d %d %d %d",
- type == FD_MSG_START ? "start" : "complete",
+ log_debug("send_%s %d:%u flags %x started %u m %d j %d r %d f %d",
+ type == FD_MSG_START ? "start" : "complete", our_nodeid,
cg->seq, hd->flags, fd->started_count, cg->member_count,
cg->joined_count, cg->remove_count, cg->failed_count);
@@ -1140,9 +1220,45 @@ static void send_info(struct fd *fd, int type)
free(buf);
}
+static int same_members(struct change *cg1, struct change *cg2)
+{
+ struct member *memb;
+
+ list_for_each_entry(memb, &cg1->members, list) {
+ if (!find_memb(cg2, memb->nodeid))
+ return 0;
+ }
+ return 1;
+}
+
static void send_start(struct fd *fd)
{
- send_info(fd, FD_MSG_START);
+ struct change *cg = list_first_entry(&fd->changes, struct change, list);
+ struct change *cgtmp;
+ uint32_t flags = 0;
+
+ /* look for a previous matching cg that we don't want others to
+ confuse for this one */
+
+ list_for_each_entry(cgtmp, &fd->changes, list) {
+ if (cgtmp->sent_start)
+ continue;
+
+ if (cgtmp->seq < cg->seq &&
+ cgtmp->member_count == cg->member_count &&
+ cgtmp->joined_count == cg->joined_count &&
+ cgtmp->remove_count == cg->remove_count &&
+ cgtmp->failed_count == cg->failed_count &&
+ same_members(cgtmp, cg)) {
+ log_debug("duplicate old cg %u new cg %u",
+ cgtmp->seq, cg->seq);
+ flags = FD_MFLG_DUPLICATE_CG;
+ }
+ }
+
+ cg->sent_start = 1;
+
+ send_info(fd, cg, FD_MSG_START, flags);
}
/* same content as a start message, a new (incomplete) node will look for
@@ -1151,7 +1267,9 @@ static void send_start(struct fd *fd)
static void send_complete(struct fd *fd)
{
- send_info(fd, FD_MSG_COMPLETE);
+ struct change *cg = list_first_entry(&fd->changes, struct change, list);
+
+ send_info(fd, cg, FD_MSG_COMPLETE, 0);
}
/* FIXME: better to just look in victims list for any nodes with init_victim? */
@@ -1321,9 +1439,9 @@ static int add_change(struct fd *fd,
list_add_tail(&memb->list, &cg->removed);
if (memb->failed)
- node_history_fail(fd, memb->nodeid);
+ node_history_fail(fd, memb->nodeid, cg->seq);
else
- node_history_left(fd, memb->nodeid);
+ node_history_left(fd, memb->nodeid, cg->seq);
log_debug("add_change cg %u remove nodeid %d reason %d",
cg->seq, memb->nodeid, left_list[i].reason);
@@ -1354,8 +1472,8 @@ static int add_change(struct fd *fd,
list_for_each_entry(memb, &cg->members, list)
node_history_init(fd, memb->nodeid);
- log_debug("add_change cg %u counts member %d joined %d remove %d "
- "failed %d", cg->seq, cg->member_count, cg->joined_count,
+ log_debug("add_change cg %u m %d j %d r %d f %d",
+ cg->seq, cg->member_count, cg->joined_count,
cg->remove_count, cg->failed_count);
list_add(&cg->list, &fd->changes);
@@ -1530,9 +1648,36 @@ static void deliver_cb_domain(cpg_handle_t handle,
apply_changes(fd);
}
-static cpg_callbacks_t cpg_callbacks_domain = {
+/* save ringid to compare with cman's.
+ also save member_list to double check with cman's member list?
+ they should match */
+
+static void totem_cb_domain(cpg_handle_t handle,
+ struct cpg_ring_id ring_id,
+ uint32_t member_list_entries,
+ const uint32_t *member_list)
+{
+ struct fd *fd;
+
+ log_ringid(handle, &ring_id, member_list, member_list_entries);
+
+ fd = find_fd_handle(handle);
+ if (!fd) {
+ log_error("totem_cb no fence domain for handle");
+ return;
+ }
+
+ fd->cpg_ringid.nodeid = ring_id.nodeid;
+ fd->cpg_ringid.seq = ring_id.seq;
+
+ apply_changes(fd);
+}
+
+static cpg_model_v1_data_t cpg_callbacks_domain = {
.cpg_deliver_fn = deliver_cb_domain,
.cpg_confchg_fn = confchg_cb_domain,
+ .cpg_totem_confchg_fn = totem_cb_domain,
+ .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF,
};
static void process_cpg_domain(int ci)
@@ -1556,32 +1701,35 @@ static void process_cpg_domain(int ci)
int fd_join(struct fd *fd)
{
cpg_error_t error;
- cpg_handle_t h;
struct cpg_name name;
- int i = 0, f, ci;
+ int i = 0, ci;
- error = cpg_initialize(&h, &cpg_callbacks_domain);
+ error = cpg_model_initialize(&cpg_handle_domain, CPG_MODEL_V1,
+ (cpg_model_data_t *)&cpg_callbacks_domain,
+ NULL);
if (error != CPG_OK) {
- log_error("cpg_initialize error %d", error);
+ log_error("cpg_model_initialize error %d", error);
goto fail_free;
}
- cpg_fd_get(h, &f);
+ cpg_fd_get(cpg_handle_domain, &cpg_fd_domain);
- ci = client_add(f, process_cpg_domain, NULL);
+ ci = client_add(cpg_fd_domain, process_cpg_domain, NULL);
list_add(&fd->list, &domains);
- fd->cpg_handle = h;
+ fd->cpg_handle = cpg_handle_domain;
fd->cpg_client = ci;
- fd->cpg_fd = f;
+ fd->cpg_fd = cpg_fd_domain;
fd->joining_group = 1;
memset(&name, 0, sizeof(name));
sprintf(name.value, "fenced:%s", fd->name);
name.length = strlen(name.value) + 1;
+ memcpy(&group_name_domain, &name, sizeof(struct cpg_name));
+ log_debug("cpg_join %s ...", name.value);
retry:
- error = cpg_join(h, &name);
+ error = cpg_join(cpg_handle_domain, &name);
if (error == CPG_ERR_TRY_AGAIN) {
sleep(1);
if (!(++i % 10))
@@ -1598,7 +1746,7 @@ int fd_join(struct fd *fd)
fail:
list_del(&fd->list);
client_dead(ci);
- cpg_finalize(h);
+ cpg_finalize(cpg_handle_domain);
fail_free:
free(fd);
return error;
@@ -1616,6 +1764,7 @@ int fd_leave(struct fd *fd)
sprintf(name.value, "fenced:%s", fd->name);
name.length = strlen(name.value) + 1;
+ log_debug("cpg_leave %s ...", name.value);
retry:
error = cpg_leave(fd->cpg_handle, &name);
if (error == CPG_ERR_TRY_AGAIN) {
@@ -2088,9 +2237,19 @@ static void confchg_cb_daemon(cpg_handle_t handle,
}
}
-static cpg_callbacks_t cpg_callbacks_daemon = {
+static void totem_cb_daemon(cpg_handle_t handle,
+ struct cpg_ring_id ring_id,
+ uint32_t member_list_entries,
+ const uint32_t *member_list)
+{
+ log_ringid(handle, &ring_id, member_list, member_list_entries);
+}
+
+static cpg_model_v1_data_t cpg_callbacks_daemon = {
.cpg_deliver_fn = deliver_cb_daemon,
.cpg_confchg_fn = confchg_cb_daemon,
+ .cpg_totem_confchg_fn = totem_cb_daemon,
+ .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF,
};
void process_cpg_daemon(int ci)
@@ -2149,9 +2308,11 @@ int setup_cpg_daemon(void)
our_protocol.daemon_max[1] = 1;
our_protocol.daemon_max[2] = 1;
- error = cpg_initialize(&cpg_handle_daemon, &cpg_callbacks_daemon);
+ error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1,
+ (cpg_model_data_t *)&cpg_callbacks_daemon,
+ NULL);
if (error != CPG_OK) {
- log_error("daemon cpg_initialize error %d", error);
+ log_error("daemon cpg_model_initialize error %d", error);
goto ret;
}
@@ -2160,6 +2321,7 @@ int setup_cpg_daemon(void)
memset(&name, 0, sizeof(name));
sprintf(name.value, "fenced:daemon");
name.length = strlen(name.value) + 1;
+ memcpy(&group_name_daemon, &name, sizeof(struct cpg_name));
log_debug("cpg_join %s ...", name.value);
retry:
@@ -2200,6 +2362,7 @@ void close_cpg_daemon(void)
sprintf(name.value, "fenced:daemon");
name.length = strlen(name.value) + 1;
+ log_debug("cpg_leave %s ...", name.value);
retry:
error = cpg_leave(cpg_handle_daemon, &name);
if (error == CPG_ERR_TRY_AGAIN) {
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index f026973..f3c3696 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -59,6 +59,7 @@ extern int daemon_quit;
extern int cluster_down;
extern struct list_head domains;
extern int cluster_quorate;
+extern uint32_t cluster_ringid_seq;
extern uint64_t quorate_time;
extern int our_nodeid;
extern char daemon_debug_buf[256];
@@ -88,6 +89,7 @@ do { \
#define FD_MFLG_JOINING 1 /* accompanies start, we are joining */
#define FD_MFLG_COMPLETE 2 /* accompanies start, we have complete info */
+#define FD_MFLG_DUPLICATE_CG 4
struct fd_header {
uint16_t version[3];
@@ -115,6 +117,7 @@ struct change {
int failed_count;
int state; /* CGST_ */
int we_joined;
+ int sent_start;
uint32_t seq; /* just used as a reference when debugging */
uint64_t create_time;
};
@@ -139,6 +142,9 @@ struct node_history {
int fence_external_node;
int fence_master;
int fence_how; /* VIC_DONE_ */
+ uint32_t last_match_seq;
+ uint32_t fail_seq;
+ uint32_t left_seq;
};
struct node {
@@ -165,6 +171,7 @@ struct fd {
struct list_head node_history;
int init_complete;
int local_init_complete;
+ struct cpg_ring_id cpg_ringid;
/* general domain membership */
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index fb9fe67..adbac6b 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -1016,6 +1016,7 @@ int daemon_quit;
int cluster_down;
struct list_head domains;
int cluster_quorate;
+uint32_t cluster_ringid_seq;
uint64_t quorate_time;
int our_nodeid;
char daemon_debug_buf[256];
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index 111612d..e97794d 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -56,6 +56,7 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate,
int i;
cluster_quorate = quorate;
+ cluster_ringid_seq = (uint32_t)ring_seq;
if (!prev_quorate && cluster_quorate)
quorate_time = time(NULL);
@@ -71,14 +72,16 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate,
for (i = 0; i < old_node_count; i++) {
if (!is_cluster_member(old_nodes[i])) {
- log_debug("cluster node %d removed", old_nodes[i]);
+ log_debug("cluster node %d removed seq %u",
+ old_nodes[i], cluster_ringid_seq);
node_history_cluster_remove(old_nodes[i]);
}
}
for (i = 0; i < quorum_node_count; i++) {
if (!is_old_member(quorum_nodes[i])) {
- log_debug("cluster node %d added", quorum_nodes[i]);
+ log_debug("cluster node %d added seq %u",
+ quorum_nodes[i], cluster_ringid_seq);
node_history_cluster_add(quorum_nodes[i]);
}
}
13 years, 7 months
cluster: RHEL4 - GFS: Clean up posix locks after withdraw
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: e6ce10a475c1d9967a62e79546d23d5fedf936c1
Parent: 371a1a0eaff5c89cafe49f51d7494d4a10676a1a
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Tue Nov 16 09:10:06 2010 -0600
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Tue Nov 16 09:10:06 2010 -0600
GFS: Clean up posix locks after withdraw
This patch allows GFS to clean up its posix locks after a withdraw
by calling posix_lock_file_wait rather than returning an error.
rhbz#642339
---
gfs-kernel/src/gfs/lm.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/gfs-kernel/src/gfs/lm.c b/gfs-kernel/src/gfs/lm.c
index e1307ae..d1b3fb2 100644
--- a/gfs-kernel/src/gfs/lm.c
+++ b/gfs-kernel/src/gfs/lm.c
@@ -467,7 +467,7 @@ gfs_lm_punlock(struct gfs_sbd *sdp,
atomic_inc(&sdp->sd_lm_outstanding);
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = -EIO;
+ error = posix_lock_file_wait(file, fl);
else
error = sdp->sd_lockstruct.ls_ops->lm_punlock(
sdp->sd_lockstruct.ls_lockspace,
13 years, 7 months
cluster: STABLE31 - cman init: fix NetworkManager and distro detection
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 27c8333b40ede1e29155774bf9cb7822c9e71f8e
Parent: 368725a9d88c4cb8eb0feee585c6ece51fe09fc8
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Tue Nov 16 10:03:42 2010 +0100
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Tue Nov 16 10:03:42 2010 +0100
cman init: fix NetworkManager and distro detection
This change should finally allow Debian to use our init script almost pristine
and get the script to work on rpm based distro again.
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/init.d/cman.in | 11 ++++++-----
1 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/cman/init.d/cman.in b/cman/init.d/cman.in
index 8314673..33f5428 100644
--- a/cman/init.d/cman.in
+++ b/cman/init.d/cman.in
@@ -19,7 +19,7 @@
# set secure PATH
PATH="/bin:/usr/bin:/sbin:/usr/sbin:@SBINDIR@"
-local_chkconfig()
+chkconfig2()
{
case "$1" in
--levels)
@@ -61,14 +61,15 @@ if [ -d /etc/sysconfig ]; then
[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster
[ -f /etc/sysconfig/cman ] && . /etc/sysconfig/cman
[ -z "$LOCK_FILE" ] && LOCK_FILE="/var/lock/subsys/cman"
+ netmanager=NetworkManager
fi
# deb based distros
-if [ -d /etc/default ]; then
+if [ ! -d /etc/sysconfig ]; then
[ -f /etc/default/cluster ] && . /etc/default/cluster
[ -f /etc/default/cman ] && . /etc/default/cman
[ -z "$LOCK_FILE" ] && LOCK_FILE="/var/lock/cman"
- type chkconfig > /dev/null 2>&1 || alias chkconfig=local_chkconfig
+ netmanager=network-manager
fi
# CMAN_CLUSTER_TIMEOUT -- amount of time to wait for joining a cluster
@@ -299,7 +300,7 @@ sshd_enabled()
network_manager_enabled()
{
if status NetworkManager > /dev/null 2>&1 || \
- chkconfig NetworkManager; then
+ chkconfig2 $netmanager; then
errmsg="\nNetwork Manager is either running or configured to run. Please disable it in the cluster."
return 1
fi
@@ -368,7 +369,7 @@ xend_bridged_net_enabled() {
[ -z "$current_runlevel" ] && return 1
# xend doesn't start at this runlevel.
- ! chkconfig --levels "$current_runlevel" xend 2>/dev/null && return 1
+ ! chkconfig2 --levels "$current_runlevel" xend 2>/dev/null && return 1
# xend isn't configured to use bridged networking.
[ ! -f /etc/xen/xend-config.sxp ] && return 1
13 years, 7 months
cluster: STABLE31 - build: reinstate group_tool build
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 368725a9d88c4cb8eb0feee585c6ece51fe09fc8
Parent: 904a3db6bfe5262d5cc1394756911e98a40b2157
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Tue Nov 16 09:23:45 2010 +0100
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Tue Nov 16 09:23:45 2010 +0100
build: reinstate group_tool build
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
group/Makefile | 3 +--
group/tool/Makefile | 7 -------
2 files changed, 1 insertions(+), 9 deletions(-)
diff --git a/group/Makefile b/group/Makefile
index 0113501..6a7cecf 100644
--- a/group/Makefile
+++ b/group/Makefile
@@ -1,5 +1,4 @@
include ../make/defines.mk
include $(OBJDIR)/make/passthrough.mk
-#SUBDIRS = lib dlm_controld tool daemon man
-SUBDIRS = lib dlm_controld daemon man
+SUBDIRS = lib dlm_controld tool daemon man
diff --git a/group/tool/Makefile b/group/tool/Makefile
index d6421f6..3004821 100644
--- a/group/tool/Makefile
+++ b/group/tool/Makefile
@@ -13,20 +13,13 @@ include $(OBJDIR)/make/uninstall.mk
OBJS= main.o
CFLAGS += -I$(S) -I$(S)/../daemon/ -I$(S)/../lib/
-CFLAGS += -I${dlmcontrolincdir}
-CFLAGS += -I${fencedincdir}
-CFLAGS += -I$(S)/../libgfscontrol
CFLAGS += -I${incdir}
CFLAGS += -I${KERNEL_SRC}/include/
-LDFLAGS += -L${dlmcontrollibdir} -ldlmcontrol
-LDFLAGS += -L${fencedlibdir} -lfenced
LDFLAGS += -L../lib -lgroup
-LDFLAGS += -L../libgfscontrol -lgfscontrol
LDFLAGS += -L${libdir}
LDDEPS += ../lib/libgroup.a
-LDDEPS += ../libgfscontrol/libgfscontrol.a
${TARGET}: ${OBJS} ${LDDEPS}
$(CC) -o $@ $^ $(LDFLAGS)
13 years, 7 months
cluster: RHEL6 - gfs_controld: fix plock owner in unmount
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 6dc56fc71790f1eac600012d030f8da26a6d62e5
Parent: fb3ce7fd9b9bf8fa2b89e1525cd73bd4f832ecb4
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Aug 17 15:39:40 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Nov 15 13:28:08 2010 -0600
gfs_controld: fix plock owner in unmount
When a node owns any plock resources on a file system and that
fs is unmounted, the remaining nodes do nothing to change the
owner value on those resources. Any process that attempts to
access those plock resources will become stuck and require a
reboot. The fix is to change the owner to 0 (unowned) on any
resources owned by a node that unmounts.
bz 624822
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/cpg-old.c | 9 +++++++++
group/gfs_controld/gfs_daemon.h | 1 +
group/gfs_controld/plock.c | 31 +++++++++++++++++++++++++++++++
3 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c
index 5342025..55353d0 100644
--- a/group/gfs_controld/cpg-old.c
+++ b/group/gfs_controld/cpg-old.c
@@ -2123,6 +2123,14 @@ static void reset_unfinished_recoveries(struct mountgroup *mg)
}
}
+static void reset_plock_resources(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+
+ list_for_each_entry(memb, &mg->members_gone, list)
+ remove_resource_owner(mg, memb->nodeid);
+}
+
/*
old method:
A is rw mount, B mounts rw
@@ -2170,6 +2178,7 @@ void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
recover_members(mg, member_count, nodeids, &pos, &neg);
reset_unfinished_recoveries(mg);
+ reset_plock_resources(mg);
if (mg->init) {
if (member_count == 1)
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index af7ed45..db8b7d9 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -317,6 +317,7 @@ void retrieve_plocks(struct mountgroup *mg);
void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
int fill_plock_dump_buf(struct mountgroup *mg);
int setup_misc_devices(void);
+void remove_resource_owner(struct mountgroup *mg, int nodeid);
/* util.c */
int we_are_in_fence_domain(void);
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index e487d41..4de793d 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2205,6 +2205,37 @@ void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
unlink_checkpoint(mg);
}
+/* when a node unmounts we need to remove it as the owner of any resources */
+
+void remove_resource_owner(struct mountgroup *mg, int nodeid)
+{
+ struct resource *r, *r2;
+ int rem = 0;
+
+ if (!cfgd_plock_ownership)
+ return;
+
+ list_for_each_entry_safe(r, r2, &mg->plock_resources, list) {
+ if (r->owner == nodeid) {
+ log_plock(mg, "rem owner %d from %llu",
+ nodeid, (unsigned long long)r->number);
+ r->owner = 0;
+ r->flags |= R_GOT_UNOWN;
+ rem++;
+
+ /* should probably wait to do this until after
+ the finish barrier when we know everyone has
+ changed owner to 0 */
+ send_pending_plocks(mg, r);
+ }
+ }
+
+ if (rem)
+ mg->last_plock_time = time(NULL);
+
+ log_group(mg, "removed owner %d from %d resources", nodeid, rem);
+}
+
int fill_plock_dump_buf(struct mountgroup *mg)
{
struct posix_lock *po;
13 years, 7 months
cluster: STABLE31 - fenced: use post_join_delay after cluster join
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 904a3db6bfe5262d5cc1394756911e98a40b2157
Parent: 6dd65cf344d05730cbccb99ce5265e84f762bfde
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Aug 17 16:42:30 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Nov 15 13:22:53 2010 -0600
fenced: use post_join_delay after cluster join
When the cluster has lost quorum due to a node failure,
the next event is generally a cluster node join which
gives the cluster quorum again. With quorum, fenced
begins fencing any failed nodes, applying post_fail_delay
since the last cpg event was a node failure. In this
case, however, post_join_delay is more appropriate since
the chances are good that nodes being fenced will be joining.
Detect this case where a node joins the cluster giving it
quorum, and use post_join_delay.
bz 624844
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/fd.h | 1 +
fence/fenced/main.c | 1 +
fence/fenced/member_cman.c | 12 ++++++++++++
fence/fenced/recover.c | 5 ++++-
4 files changed, 18 insertions(+), 1 deletions(-)
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 39a34ad..a5a78bf 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -64,6 +64,7 @@ extern int daemon_quit;
extern int cluster_down;
extern struct list_head domains;
extern int cluster_quorate;
+extern int cluster_quorate_from_last_update;
extern uint32_t cluster_ringid_seq;
extern uint64_t quorate_time;
extern int our_nodeid;
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index a371dc8..e5ab568 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -1069,6 +1069,7 @@ int daemon_quit;
int cluster_down;
struct list_head domains;
int cluster_quorate;
+int cluster_quorate_from_last_update;
uint32_t cluster_ringid_seq;
uint64_t quorate_time;
int our_nodeid;
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index b9d8341..0919b8e 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -150,6 +150,7 @@ static void update_cluster(void)
{
cman_cluster_t info;
int quorate = cluster_quorate;
+ int removed = 0, added = 0;
int i, rv;
rv = cman_get_cluster(ch, &info);
@@ -183,6 +184,7 @@ static void update_cluster(void)
old_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_remove(old_nodes[i].cn_nodeid);
+ removed++;
}
}
@@ -194,8 +196,18 @@ static void update_cluster(void)
cman_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_add(cman_nodes[i].cn_nodeid);
+ added++;
}
}
+
+ if (removed) {
+ cluster_quorate_from_last_update = 0;
+ } else if (added) {
+ if (!quorate && cluster_quorate)
+ cluster_quorate_from_last_update = 1;
+ else
+ cluster_quorate_from_last_update = 0;
+ }
}
/* Note: in fence delay loop we aren't processing callbacks so won't
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index d3bf35f..a7ca047 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -181,7 +181,7 @@ void delay_fencing(struct fd *fd, int node_join)
if (list_empty(&fd->victims))
return;
- if (node_join) {
+ if (node_join || cluster_quorate_from_last_update) {
delay = cfgd_post_join_delay;
delay_type = "post_join_delay";
} else {
@@ -189,6 +189,9 @@ void delay_fencing(struct fd *fd, int node_join)
delay_type = "post_fail_delay";
}
+ log_debug("delay %s %d quorate_from_last_update %d",
+ delay_type, delay, cluster_quorate_from_last_update);
+
if (delay == 0)
goto out;
13 years, 7 months
cluster: STABLE31 - fenced: use cpg ringid
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 6dd65cf344d05730cbccb99ce5265e84f762bfde
Parent: 85cfa7555b6f098a34757481346cdaa0625530b7
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Fri Mar 19 11:10:13 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Nov 15 13:20:37 2010 -0600
fenced: use cpg ringid
bz 584140
Use the new totem cpg callback to synchronize cman events and cpg events.
Without this fix, the following two test cases cause fenced to become
confused and stuck.
test 1
------
- nodes 1,2,3,4
- configure no fencing for all, or force fencing to fail
- service cman start on all
- use iptables to add network partition: 1 | 2,3,4
- remove iptables partition resulting in merge: 1,2,3,4
- nodes 2,3,4 should kill corosync on node 1 automatically
- reboot node 1 if any dlm or gfs were being used
(otherwise just verify all cluster daemons have exited)
- service cman start on node 1
- fence_tool ls on 1,2,3,4 should all show normal state
(4 members, 0 victims, wait state none, members 1,2,3,4)
fence domain
member count 4
victim count 0
victim now 0
master nodeid 2
wait state none
members 1 2 3 4
test 2
------
- nodes 1,2,3,4
- configure no fencing for all, or force fencing to fail
- service cman start on all
- use iptables to add network partition: 1,2 | 3,4
- remove iptables partition resulting in merge: 1,2,3,4
- reboot nodes 1 and 2 if any dlm or gfs were being used
(otherwise just verify all cluster daemons have exited)
(a variation of this test reboots nodes 3 and 4 instead)
- service cman start on nodes 1 and 2
- fence_tool ls on 1,2,3,4 should all show normal state
(4 members, 0 victims, wait state none, members 1,2,3,4)
fence domain
member count 4
victim count 0
victim now 0
master nodeid 2
wait state none
members 1 2 3 4
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/cpg.c | 285 ++++++++++++++++++++++++++++++++++----------
fence/fenced/fd.h | 7 +
fence/fenced/main.c | 1 +
fence/fenced/member_cman.c | 16 ++-
4 files changed, 244 insertions(+), 65 deletions(-)
diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c
index c9d86f3..a8629b9 100644
--- a/fence/fenced/cpg.c
+++ b/fence/fenced/cpg.c
@@ -73,7 +73,11 @@ struct id_info {
};
static cpg_handle_t cpg_handle_daemon;
+static cpg_handle_t cpg_handle_domain;
+static struct cpg_name group_name_daemon;
+static struct cpg_name group_name_domain;
static int cpg_fd_daemon;
+static int cpg_fd_domain;
static struct protocol our_protocol;
static struct list_head daemon_nodes;
static struct cpg_address daemon_member[MAX_NODES];
@@ -132,6 +136,38 @@ static void log_config(const struct cpg_name *group_name,
m_buf, j_buf, l_buf);
}
+static void log_ringid(cpg_handle_t handle,
+ struct cpg_ring_id *ringid,
+ const uint32_t *member_list,
+ size_t member_list_entries)
+{
+ char m_buf[128];
+ size_t i, len, pos;
+ int ret;
+ const char *name = "unknown";
+
+ if (handle == cpg_handle_domain)
+ name = group_name_domain.value;
+ else if (handle == cpg_handle_daemon)
+ name = group_name_daemon.value;
+
+ memset(m_buf, 0, sizeof(m_buf));
+
+ len = sizeof(m_buf);
+ pos = 0;
+ for (i = 0; i < member_list_entries; i++) {
+ ret = snprintf(m_buf + pos, len - pos, " %u",
+ member_list[i]);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+
+ log_debug("%s ring %u:%llu %zu memb%s",
+ name, ringid->nodeid, (unsigned long long)ringid->seq,
+ member_list_entries, m_buf);
+}
+
static void fd_info_in(struct fd_info *fi)
{
fi->fd_info_size = le32_to_cpu(fi->fd_info_size);
@@ -372,7 +408,7 @@ static void node_history_start(struct fd *fd, int nodeid)
node->add_time = time(NULL);
}
-static void node_history_left(struct fd *fd, int nodeid)
+static void node_history_left(struct fd *fd, int nodeid, uint32_t seq)
{
struct node_history *node;
@@ -383,9 +419,10 @@ static void node_history_left(struct fd *fd, int nodeid)
}
node->left_time = time(NULL);
+ node->left_seq = seq;
}
-static void node_history_fail(struct fd *fd, int nodeid)
+static void node_history_fail(struct fd *fd, int nodeid, uint32_t seq)
{
struct node_history *node;
@@ -396,6 +433,7 @@ static void node_history_fail(struct fd *fd, int nodeid)
}
node->fail_time = time(NULL);
+ node->fail_seq = seq;
node->check_quorum = 1;
}
@@ -646,36 +684,11 @@ static void receive_victim_done(struct fd *fd, struct fd_header *hd, int len)
free(node);
}
+/* we know that the quorum value here is consistent with the cpg events
+ because the ringids are in sync per the previous check_ringid_done */
+
static int check_quorum_done(struct fd *fd)
{
- struct node_history *node;
- int wait_count = 0;
-
- /* We don't want to trust the cluster_quorate value until we know
- that cman has seen the same nodes fail that we have. So, we
- first make sure that all nodes we've seen fail are also
- failed in cman, then we can just check cluster_quorate. This
- assumes that we'll get to this function to do all the checks
- before any of the failed nodes can actually rejoin and become
- cman members again (if that assumption doesn't hold, perhaps
- do something with timestamps of join/fail). */
-
- list_for_each_entry(node, &fd->node_history, list) {
- if (!node->check_quorum)
- continue;
-
- if (!is_cluster_member_reread(node->nodeid)) {
- node->check_quorum = 0;
- } else {
- log_debug("check_quorum %d is_cluster_member",
- node->nodeid);
- wait_count++;
- }
- }
-
- if (wait_count)
- return 0;
-
if (!cluster_quorate) {
log_debug("check_quorum not quorate");
return 0;
@@ -685,8 +698,28 @@ static int check_quorum_done(struct fd *fd)
return 1;
}
+/* wait for cman ringid and cpg ringid to be the same so we know our
+ information from each service is based on the same node state */
+
+static int check_ringid_done(struct fd *fd)
+{
+ if (cluster_ringid_seq != (uint32_t)fd->cpg_ringid.seq) {
+ log_debug("check_ringid cluster %u cpg %u:%llu",
+ cluster_ringid_seq, fd->cpg_ringid.nodeid,
+ (unsigned long long)fd->cpg_ringid.seq);
+ return 0;
+ }
+
+ log_debug("check_ringid done cluster %u cpg %u:%llu",
+ cluster_ringid_seq, fd->cpg_ringid.nodeid,
+ (unsigned long long)fd->cpg_ringid.seq);
+ return 1;
+}
+
static int wait_conditions_done(struct fd *fd)
{
+ if (!check_ringid_done(fd))
+ return 0;
if (!check_quorum_done(fd))
return 0;
return 1;
@@ -829,6 +862,25 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
return 0;
}
+ /* this start message couldn't have been sent for a cg preceding
+ a confchg when the sending node failed or left */
+
+ if ((node->fail_seq > cg->seq) || (node->left_seq > cg->seq)) {
+ log_debug("match_change %d:%u skip cg %u fail cg %u left cg %u",
+ hd->nodeid, seq, cg->seq,
+ node->fail_seq, node->left_seq);
+ return 0;
+ }
+
+ /* if we matched the last start message from this node against our
+ cg N, then don't match this start message against an earlier cg */
+
+ if (node->last_match_seq > cg->seq) {
+ log_debug("match_change %d:%u skip cg %u last matched cg %u",
+ hd->nodeid, seq, cg->seq, node->last_match_seq);
+ return 0;
+ }
+
/* verify this is the right change by matching the counts
and the nodeids of the current members */
@@ -863,6 +915,8 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
if (members_mismatch)
return 0;
+ node->last_match_seq = cg->seq;
+
log_debug("match_change %d:%u matches cg %u", hd->nodeid, seq, cg->seq);
return 1;
}
@@ -917,19 +971,45 @@ static int match_change(struct fd *fd, struct change *cg, struct fd_header *hd,
is > cpg ringid, then return 0 for conditions_done so we won't send
start and will wait until the most recent cpg confchg (matching the
current cman one) to send a start. Waits for cpg to catch up with cman.
+
+ Final solution is the patch adding check_ringid_done() that waits for
+ cman and cpg to both be on the same ringid before going ahead to check
+ quorum and send starts.
*/
static struct change *find_change(struct fd *fd, struct fd_header *hd,
struct fd_info *fi, struct id_info *ids)
{
struct change *cg;
+ struct change *cg1 = NULL, *cg2 = NULL;
list_for_each_entry_reverse(cg, &fd->changes, list) {
if (!match_change(fd, cg, hd, fi, ids))
continue;
- return cg;
+
+ if (!(hd->flags & FD_MFLG_DUPLICATE_CG))
+ return cg;
+
+ /* this start message is for the second of two matching cg's */
+
+ if (!cg1) {
+ cg1 = cg;
+ log_debug("find_change %d:%u match1 %u look for dup",
+ hd->nodeid, hd->msgdata, cg1->seq);
+ continue;
+ } else {
+ cg2 = cg;
+ log_debug("find_change %d:%u match1 %u match2 %u",
+ hd->nodeid, hd->msgdata, cg1->seq, cg2->seq);
+ break;
+ }
}
+ if (cg1 && cg2)
+ return cg2;
+ if (cg1)
+ return cg1;
+
log_debug("find_change %d:%u no match", hd->nodeid, hd->msgdata);
return NULL;
}
@@ -1062,19 +1142,17 @@ static int count_ids(struct fd *fd)
return count;
}
-static void send_info(struct fd *fd, int type)
+static void send_info(struct fd *fd, struct change *cg, int type,
+ uint32_t flags)
{
- struct change *cg;
struct fd_header *hd;
struct fd_info *fi;
struct id_info *id;
struct node_history *node;
char *buf;
- uint32_t flags;
+ uint32_t idflags;
int len, id_count;
- cg = list_first_entry(&fd->changes, struct change, list);
-
id_count = count_ids(fd);
len = sizeof(struct fd_header) + sizeof(struct fd_info) +
@@ -1095,6 +1173,8 @@ static void send_info(struct fd *fd, int type)
hd->type = type;
hd->msgdata = cg->seq;
+ hd->flags = flags;
+
if (cg->we_joined)
hd->flags |= FD_MFLG_JOINING;
if (fd->init_complete || fd->local_init_complete)
@@ -1114,11 +1194,11 @@ static void send_info(struct fd *fd, int type)
/* fill in id_info entries */
list_for_each_entry(node, &fd->node_history, list) {
- flags = 0;
+ idflags = 0;
if (find_memb(cg, node->nodeid))
- flags = IDI_NODEID_IS_MEMBER;
+ idflags = IDI_NODEID_IS_MEMBER;
- id->flags = cpu_to_le32(flags);
+ id->flags = cpu_to_le32(idflags);
id->nodeid = cpu_to_le32(node->nodeid);
id->fence_external_node= cpu_to_le32(node->fence_external_node);
id->fence_master = cpu_to_le32(node->fence_master);
@@ -1128,8 +1208,8 @@ static void send_info(struct fd *fd, int type)
id++;
}
- log_debug("send_%s cg %u flags %x counts %u %d %d %d %d",
- type == FD_MSG_START ? "start" : "complete",
+ log_debug("send_%s %d:%u flags %x started %u m %d j %d r %d f %d",
+ type == FD_MSG_START ? "start" : "complete", our_nodeid,
cg->seq, hd->flags, fd->started_count, cg->member_count,
cg->joined_count, cg->remove_count, cg->failed_count);
@@ -1138,9 +1218,45 @@ static void send_info(struct fd *fd, int type)
free(buf);
}
+static int same_members(struct change *cg1, struct change *cg2)
+{
+ struct member *memb;
+
+ list_for_each_entry(memb, &cg1->members, list) {
+ if (!find_memb(cg2, memb->nodeid))
+ return 0;
+ }
+ return 1;
+}
+
static void send_start(struct fd *fd)
{
- send_info(fd, FD_MSG_START);
+ struct change *cg = list_first_entry(&fd->changes, struct change, list);
+ struct change *cgtmp;
+ uint32_t flags = 0;
+
+ /* look for a previous matching cg that we don't want others to
+ confuse for this one */
+
+ list_for_each_entry(cgtmp, &fd->changes, list) {
+ if (cgtmp->sent_start)
+ continue;
+
+ if (cgtmp->seq < cg->seq &&
+ cgtmp->member_count == cg->member_count &&
+ cgtmp->joined_count == cg->joined_count &&
+ cgtmp->remove_count == cg->remove_count &&
+ cgtmp->failed_count == cg->failed_count &&
+ same_members(cgtmp, cg)) {
+ log_debug("duplicate old cg %u new cg %u",
+ cgtmp->seq, cg->seq);
+ flags = FD_MFLG_DUPLICATE_CG;
+ }
+ }
+
+ cg->sent_start = 1;
+
+ send_info(fd, cg, FD_MSG_START, flags);
}
/* same content as a start message, a new (incomplete) node will look for
@@ -1149,7 +1265,9 @@ static void send_start(struct fd *fd)
static void send_complete(struct fd *fd)
{
- send_info(fd, FD_MSG_COMPLETE);
+ struct change *cg = list_first_entry(&fd->changes, struct change, list);
+
+ send_info(fd, cg, FD_MSG_COMPLETE, 0);
}
/* FIXME: better to just look in victims list for any nodes with init_victim? */
@@ -1317,9 +1435,9 @@ static int add_change(struct fd *fd,
list_add_tail(&memb->list, &cg->removed);
if (memb->failed)
- node_history_fail(fd, memb->nodeid);
+ node_history_fail(fd, memb->nodeid, cg->seq);
else
- node_history_left(fd, memb->nodeid);
+ node_history_left(fd, memb->nodeid, cg->seq);
log_debug("add_change cg %u remove nodeid %d reason %d",
cg->seq, memb->nodeid, left_list[i].reason);
@@ -1350,8 +1468,8 @@ static int add_change(struct fd *fd,
list_for_each_entry(memb, &cg->members, list)
node_history_init(fd, memb->nodeid);
- log_debug("add_change cg %u counts member %d joined %d remove %d "
- "failed %d", cg->seq, cg->member_count, cg->joined_count,
+ log_debug("add_change cg %u m %d j %d r %d f %d",
+ cg->seq, cg->member_count, cg->joined_count,
cg->remove_count, cg->failed_count);
list_add(&cg->list, &fd->changes);
@@ -1526,9 +1644,36 @@ static void deliver_cb_domain(cpg_handle_t handle,
apply_changes(fd);
}
-static cpg_callbacks_t cpg_callbacks_domain = {
+/* save ringid to compare with cman's.
+ also save member_list to double check with cman's member list?
+ they should match */
+
+static void totem_cb_domain(cpg_handle_t handle,
+ struct cpg_ring_id ring_id,
+ uint32_t member_list_entries,
+ const uint32_t *member_list)
+{
+ struct fd *fd;
+
+ log_ringid(handle, &ring_id, member_list, member_list_entries);
+
+ fd = find_fd_handle(handle);
+ if (!fd) {
+ log_error("totem_cb no fence domain for handle");
+ return;
+ }
+
+ fd->cpg_ringid.nodeid = ring_id.nodeid;
+ fd->cpg_ringid.seq = ring_id.seq;
+
+ apply_changes(fd);
+}
+
+static cpg_model_v1_data_t cpg_callbacks_domain = {
.cpg_deliver_fn = deliver_cb_domain,
.cpg_confchg_fn = confchg_cb_domain,
+ .cpg_totem_confchg_fn = totem_cb_domain,
+ .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF,
};
static void process_cpg_domain(int ci)
@@ -1552,32 +1697,35 @@ static void process_cpg_domain(int ci)
int fd_join(struct fd *fd)
{
cpg_error_t error;
- cpg_handle_t h;
struct cpg_name name;
- int i = 0, f, ci;
+ int i = 0, ci;
- error = cpg_initialize(&h, &cpg_callbacks_domain);
+ error = cpg_model_initialize(&cpg_handle_domain, CPG_MODEL_V1,
+ (cpg_model_data_t *)&cpg_callbacks_domain,
+ NULL);
if (error != CPG_OK) {
- log_error("cpg_initialize error %d", error);
+ log_error("cpg_model_initialize error %d", error);
goto fail_free;
}
- cpg_fd_get(h, &f);
+ cpg_fd_get(cpg_handle_domain, &cpg_fd_domain);
- ci = client_add(f, process_cpg_domain, NULL);
+ ci = client_add(cpg_fd_domain, process_cpg_domain, NULL);
list_add(&fd->list, &domains);
- fd->cpg_handle = h;
+ fd->cpg_handle = cpg_handle_domain;
fd->cpg_client = ci;
- fd->cpg_fd = f;
+ fd->cpg_fd = cpg_fd_domain;
fd->joining_group = 1;
memset(&name, 0, sizeof(name));
sprintf(name.value, "fenced:%s", fd->name);
name.length = strlen(name.value) + 1;
+ memcpy(&group_name_domain, &name, sizeof(struct cpg_name));
+ log_debug("cpg_join %s ...", name.value);
retry:
- error = cpg_join(h, &name);
+ error = cpg_join(cpg_handle_domain, &name);
if (error == CPG_ERR_TRY_AGAIN) {
sleep(1);
if (!(++i % 10))
@@ -1594,7 +1742,7 @@ int fd_join(struct fd *fd)
fail:
list_del(&fd->list);
client_dead(ci);
- cpg_finalize(h);
+ cpg_finalize(cpg_handle_domain);
fail_free:
free(fd);
return error;
@@ -1612,6 +1760,7 @@ int fd_leave(struct fd *fd)
sprintf(name.value, "fenced:%s", fd->name);
name.length = strlen(name.value) + 1;
+ log_debug("cpg_leave %s ...", name.value);
retry:
error = cpg_leave(fd->cpg_handle, &name);
if (error == CPG_ERR_TRY_AGAIN) {
@@ -2084,9 +2233,19 @@ static void confchg_cb_daemon(cpg_handle_t handle,
}
}
-static cpg_callbacks_t cpg_callbacks_daemon = {
+static void totem_cb_daemon(cpg_handle_t handle,
+ struct cpg_ring_id ring_id,
+ uint32_t member_list_entries,
+ const uint32_t *member_list)
+{
+ log_ringid(handle, &ring_id, member_list, member_list_entries);
+}
+
+static cpg_model_v1_data_t cpg_callbacks_daemon = {
.cpg_deliver_fn = deliver_cb_daemon,
.cpg_confchg_fn = confchg_cb_daemon,
+ .cpg_totem_confchg_fn = totem_cb_daemon,
+ .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF,
};
void process_cpg_daemon(int ci)
@@ -2145,9 +2304,11 @@ int setup_cpg_daemon(void)
our_protocol.daemon_max[1] = 1;
our_protocol.daemon_max[2] = 1;
- error = cpg_initialize(&cpg_handle_daemon, &cpg_callbacks_daemon);
+ error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1,
+ (cpg_model_data_t *)&cpg_callbacks_daemon,
+ NULL);
if (error != CPG_OK) {
- log_error("daemon cpg_initialize error %d", error);
+ log_error("daemon cpg_model_initialize error %d", error);
goto ret;
}
@@ -2156,6 +2317,7 @@ int setup_cpg_daemon(void)
memset(&name, 0, sizeof(name));
sprintf(name.value, "fenced:daemon");
name.length = strlen(name.value) + 1;
+ memcpy(&group_name_daemon, &name, sizeof(struct cpg_name));
log_debug("cpg_join %s ...", name.value);
retry:
@@ -2196,6 +2358,7 @@ void close_cpg_daemon(void)
sprintf(name.value, "fenced:daemon");
name.length = strlen(name.value) + 1;
+ log_debug("cpg_leave %s ...", name.value);
retry:
error = cpg_leave(cpg_handle_daemon, &name);
if (error == CPG_ERR_TRY_AGAIN) {
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 9f64dff..39a34ad 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -64,6 +64,7 @@ extern int daemon_quit;
extern int cluster_down;
extern struct list_head domains;
extern int cluster_quorate;
+extern uint32_t cluster_ringid_seq;
extern uint64_t quorate_time;
extern int our_nodeid;
extern char our_name[MAX_NODENAME_LEN+1];
@@ -95,6 +96,7 @@ do { \
#define FD_MFLG_JOINING 1 /* accompanies start, we are joining */
#define FD_MFLG_COMPLETE 2 /* accompanies start, we have complete info */
+#define FD_MFLG_DUPLICATE_CG 4
struct fd_header {
uint16_t version[3];
@@ -122,6 +124,7 @@ struct change {
int failed_count;
int state; /* CGST_ */
int we_joined;
+ int sent_start;
uint32_t seq; /* just used as a reference when debugging */
uint64_t create_time;
};
@@ -146,6 +149,9 @@ struct node_history {
int fence_external_node;
int fence_master;
int fence_how; /* VIC_DONE_ */
+ uint32_t last_match_seq;
+ uint32_t fail_seq;
+ uint32_t left_seq;
};
struct node {
@@ -172,6 +178,7 @@ struct fd {
struct list_head node_history;
int init_complete;
int local_init_complete;
+ struct cpg_ring_id cpg_ringid;
/* general domain membership */
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index deb9515..a371dc8 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -1069,6 +1069,7 @@ int daemon_quit;
int cluster_down;
struct list_head domains;
int cluster_quorate;
+uint32_t cluster_ringid_seq;
uint64_t quorate_time;
int our_nodeid;
char our_name[MAX_NODENAME_LEN+1];
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index a245adf..b9d8341 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -148,9 +148,17 @@ int name_to_nodeid(char *name)
static void update_cluster(void)
{
+ cman_cluster_t info;
int quorate = cluster_quorate;
int i, rv;
+ rv = cman_get_cluster(ch, &info);
+ if (rv < 0) {
+ log_error("cman_get_cluster error %d %d", rv, errno);
+ return;
+ }
+ cluster_ringid_seq = info.ci_generation;
+
cluster_quorate = cman_is_quorate(ch);
if (!quorate && cluster_quorate)
@@ -171,8 +179,8 @@ static void update_cluster(void)
if (old_nodes[i].cn_member &&
!is_cluster_member(old_nodes[i].cn_nodeid)) {
- log_debug("cluster node %d removed",
- old_nodes[i].cn_nodeid);
+ log_debug("cluster node %d removed seq %u",
+ old_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_remove(old_nodes[i].cn_nodeid);
}
@@ -182,8 +190,8 @@ static void update_cluster(void)
if (cman_nodes[i].cn_member &&
!is_old_member(cman_nodes[i].cn_nodeid)) {
- log_debug("cluster node %d added",
- cman_nodes[i].cn_nodeid);
+ log_debug("cluster node %d added seq %u",
+ cman_nodes[i].cn_nodeid, cluster_ringid_seq);
node_history_cluster_add(cman_nodes[i].cn_nodeid);
}
13 years, 7 months
cluster: STABLE31 - group_tool: use other dump commands
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 85cfa7555b6f098a34757481346cdaa0625530b7
Parent: 6ca25b8acc12aca6705638a065657882a8b62696
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Nov 15 13:09:09 2010 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Nov 15 13:09:09 2010 -0600
group_tool: use other dump commands
fence_tool/dlm_tool/gfs_control instead of
libfenced/libdlmcontrol/libgfscontrol
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/tool/main.c | 43 ++++---------------------------------------
1 files changed, 4 insertions(+), 39 deletions(-)
diff --git a/group/tool/main.c b/group/tool/main.c
index 33ea9a1..5b21531 100644
--- a/group/tool/main.c
+++ b/group/tool/main.c
@@ -15,9 +15,6 @@
#include "libgroup.h"
#include "groupd.h"
-#include "libfenced.h"
-#include "libdlmcontrol.h"
-#include "libgfscontrol.h"
#include "copyright.cf"
#define GROUP_LIBGROUP 2
@@ -108,7 +105,6 @@ static void print_usage(void)
printf("dump fence Show debug log from fenced (fence_tool dump)\n");
printf("dump dlm Show debug log from dlm_controld (dlm_tool dump)\n");
printf("dump gfs Show debug log from gfs_controld (gfs_control dump)\n");
- printf("dump plocks <name> Show posix locks from dlm_controld for lockspace <name>\n");
printf(" (dlm_tool plocks <name>)\n");
printf("\n");
}
@@ -707,50 +703,19 @@ int main(int argc, char **argv)
case OP_DUMP:
if (opt_ind && opt_ind < argc) {
if (!strncmp(argv[opt_ind], "gfs", 3)) {
- char gbuf[GFSC_DUMP_SIZE];
-
- memset(gbuf, 0, sizeof(gbuf));
-
- printf("dump gfs\n");
- gfsc_dump_debug(gbuf);
-
- do_write(STDOUT_FILENO, gbuf, strlen(gbuf));
+ system("gfs_control dump");
}
if (!strncmp(argv[opt_ind], "dlm", 3)) {
- char dbuf[DLMC_DUMP_SIZE];
-
- memset(dbuf, 0, sizeof(dbuf));
-
- printf("dump dlm\n");
- dlmc_dump_debug(dbuf);
-
- do_write(STDOUT_FILENO, dbuf, strlen(dbuf));
+ system("dlm_tool dump");
}
if (!strncmp(argv[opt_ind], "fence", 5)) {
- char fbuf[FENCED_DUMP_SIZE];
-
- memset(fbuf, 0, sizeof(fbuf));
-
- fenced_dump_debug(fbuf);
-
- do_write(STDOUT_FILENO, fbuf, strlen(fbuf));
+ system("fence_tool dump");
}
if (!strncmp(argv[opt_ind], "plocks", 6)) {
- char pbuf[DLMC_DUMP_SIZE];
-
- if (opt_ind + 1 >= argc) {
- printf("plock dump requires name\n");
- return -1;
- }
-
- memset(pbuf, 0, sizeof(pbuf));
-
- dlmc_dump_plocks(argv[opt_ind + 1], pbuf);
-
- do_write(STDOUT_FILENO, pbuf, strlen(pbuf));
+ fprintf(stderr, "use dlm_tool command\n");
}
} else {
char rbuf[GROUPD_DUMP_SIZE];
13 years, 7 months
cluster: STABLE31 - dlm_controld: Reset fs_notified when check_fs_done
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 6ca25b8acc12aca6705638a065657882a8b62696
Parent: c399f4c0f0d7cc4467a68c41715c404b2afb9425
Author: Jiaju Zhang <jjzhang.linux(a)gmail.com>
AuthorDate: Mon Nov 8 16:07:31 2010 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Fri Nov 12 15:00:21 2010 -0600
dlm_controld: Reset fs_notified when check_fs_done
This situation only seems to arise with ocfs2_controld.
Copying bug description from email
https://www.redhat.com/archives/cluster-devel/2010-November/msg00004.html
> About the issue that dlm_controld and fs_controld sit spinning,
> retrying and replying for the fs_notified check, I have a suspision
> that another scenario may also hit that logic:
>
> If the node->fs_notified has been set to 1 by previous change, when a
> new change comes and needs to check the node->fs_notified, because it
> has not been reset to 0, so check_fs_done will succeed even if
> dlm_controld has not received the notification from fs_controld this
> time.
> For example, given that the following membership changes n, n+1, n+2,
> we see what happens on node X:
> Step 1: cg n: node Y leaves with CPG_REASON_NODEDOWN reason,
> eventually in node X's ls->node_history, node Y's fs_notified
> = 1
> Step 2: cg n+1: node Y joins ...
> Step 3: cg n+2: node Y leaves with CPG_REASON_NODEDOWN reason, one
> possible scenario is: before fs_controld's notification
> arrives, dlm_controld has known node Y is down from CPG
> message and done a lot of work, and it saw node Y's
> fs_notified = 1 (been set in Step 1) then passed the fs check
> wrongly. So node Y's check_fs reset to 0.
> Step 4: fs_controld's notification arrives, it sees node Y's check_fs
> = 0 and assumes dlm_controld has not known node Y is down and
> retries to send the notification. But in fact, dlm_controld
> has already known this and finished all the work, which will
> result in the spinning ...
Signed-off-by: Jiaju Zhang <jjzhang.linux(a)gmail.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/dlm_controld/cpg.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c
index 9b0d223..12cb202 100644
--- a/group/dlm_controld/cpg.c
+++ b/group/dlm_controld/cpg.c
@@ -651,6 +651,7 @@ static int check_fs_done(struct lockspace *ls)
if (node->fs_notified) {
log_group(ls, "check_fs nodeid %d clear", node->nodeid);
node->check_fs = 0;
+ node->fs_notified = 0;
} else {
log_group(ls, "check_fs nodeid %d needs fs notify",
node->nodeid);
13 years, 7 months
cluster: STABLE31 - Cman: Add support for udpu and rdma transport
by Jan Friesse
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: c399f4c0f0d7cc4467a68c41715c404b2afb9425
Parent: d3fc38cdc8b14006c44ed9268a4c9e266218ef07
Author: Jan Friesse <jfriesse(a)redhat.com>
AuthorDate: Thu Oct 7 15:30:54 2010 +0200
Committer: Jan Friesse <jfriesse(a)redhat.com>
CommitterDate: Wed Nov 10 15:33:08 2010 +0100
Cman: Add support for udpu and rdma transport
Handle corosync transport option, so it's possible to use udpu (UDP
unicast) and rdma (infiniband) transport.
---
cman/daemon/cman-preconfig.c | 120 ++++++++++++++++++++++++++++++++++++---
config/tools/xml/cluster.rng.in | 12 ++++-
2 files changed, 123 insertions(+), 9 deletions(-)
diff --git a/cman/daemon/cman-preconfig.c b/cman/daemon/cman-preconfig.c
index 2551bc6..5239d89 100644
--- a/cman/daemon/cman-preconfig.c
+++ b/cman/daemon/cman-preconfig.c
@@ -29,6 +29,13 @@
#define MAX_PATH_LEN PATH_MAX
+enum tx_mech {
+ TX_MECH_UDP,
+ TX_MECH_UDPB,
+ TX_MECH_UDPU,
+ TX_MECH_RDMA,
+};
+
static unsigned int debug;
static int cmanpre_readconfig(struct objdb_iface_ver0 *objdb, const char **error_string);
static int cmanpre_reloadconfig(struct objdb_iface_ver0 *objdb, int flush, const char **error_string);
@@ -219,8 +226,57 @@ static hdb_handle_t find_cman_logger(struct objdb_iface_ver0 *objdb, hdb_handle_
}
+static int add_udpu_members(struct objdb_iface_ver0 *objdb, hdb_handle_t interface_object_handle)
+{
+ char *cur_nodename;
+ hdb_handle_t altname_handle;
+ hdb_handle_t find_handle = 0;
+ hdb_handle_t find_handle2 = 0;
+ hdb_handle_t member_object_handle;
+ hdb_handle_t nodes_handle;
+ int cur_altname_depth;
+
+ nodes_handle = nodeslist_init(objdb, cluster_parent_handle, &find_handle);
+ while (nodes_handle) {
+ if (num_interfaces == 0) {
+ if (objdb_get_string(objdb, nodes_handle, "name", &cur_nodename)) {
+ nodes_handle = nodeslist_next(objdb, find_handle);
+ continue;
+ }
+ } else {
+ objdb->object_find_create(nodes_handle, "altname", strlen("altname"), &find_handle2);
+
+ cur_altname_depth = 0;
+ while (objdb->object_find_next(find_handle2, &altname_handle) == 0 &&
+ cur_altname_depth < num_interfaces)
+ cur_altname_depth++;
-static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, int port, int broadcast)
+ if (cur_altname_depth == num_interfaces) {
+ if (objdb_get_string(objdb, altname_handle, "name", &cur_nodename)) {
+ nodes_handle = nodeslist_next(objdb, find_handle);
+ continue;
+ }
+ } else {
+ nodes_handle = nodeslist_next(objdb, find_handle);
+ continue;
+ }
+ objdb->object_find_destroy(find_handle2);
+ }
+
+ if (objdb->object_create(interface_object_handle, &member_object_handle,
+ "member", strlen("member")) == 0) {
+ objdb->object_key_create_typed(member_object_handle, "memberaddr",
+ cur_nodename, strlen(cur_nodename)+1, OBJDB_VALUETYPE_STRING);
+ }
+
+ nodes_handle = nodeslist_next(objdb, find_handle);
+ }
+ objdb->object_find_destroy(find_handle);
+
+ return 0;
+}
+
+static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, int port, enum tx_mech transport)
{
hdb_handle_t totem_object_handle;
hdb_handle_t find_handle;
@@ -228,6 +284,12 @@ static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr,
struct sockaddr_storage if_addr, localhost, mcast_addr;
char tmp[132];
int ret = 0;
+ const char *tx_mech_to_str[] = {
+ [TX_MECH_UDP] = "udp",
+ [TX_MECH_UDPB] = "udp",
+ [TX_MECH_UDPU] = "udpu",
+ [TX_MECH_RDMA] = "iba",
+ };
/* Check the families match */
if (address_family(mcast, &mcast_addr, 0) !=
@@ -245,9 +307,10 @@ static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr,
objdb->object_find_create(OBJECT_PARENT_HANDLE, "totem", strlen("totem"), &find_handle);
if (objdb->object_find_next(find_handle, &totem_object_handle)) {
-
objdb->object_create(OBJECT_PARENT_HANDLE, &totem_object_handle,
"totem", strlen("totem"));
+ objdb->object_key_create_typed(totem_object_handle, "transport",
+ tx_mech_to_str[transport], strlen(tx_mech_to_str[transport]) + 1, OBJDB_VALUETYPE_STRING);
}
objdb->object_find_destroy(find_handle);
@@ -269,12 +332,22 @@ static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr,
objdb->object_key_create_typed(interface_object_handle, "bindnetaddr",
tmp, strlen(tmp)+1, OBJDB_VALUETYPE_STRING);
- if (broadcast)
+ switch (transport) {
+ case TX_MECH_UDPB:
objdb->object_key_create_typed(interface_object_handle, "broadcast",
"yes", strlen("yes")+1, OBJDB_VALUETYPE_STRING);
- else
+ break;
+ case TX_MECH_UDP:
+ case TX_MECH_RDMA:
objdb->object_key_create_typed(interface_object_handle, "mcastaddr",
mcast, strlen(mcast)+1, OBJDB_VALUETYPE_STRING);
+ break;
+ case TX_MECH_UDPU:
+ objdb->object_key_create_typed(interface_object_handle, "mcastaddr",
+ mcast, strlen(mcast)+1, OBJDB_VALUETYPE_STRING);
+ add_udpu_members(objdb, interface_object_handle);
+ break;
+ }
sprintf(tmp, "%d", port);
objdb->object_key_create_typed(interface_object_handle, "mcastport",
@@ -516,7 +589,7 @@ static int get_nodename(struct objdb_iface_ver0 *objdb)
hdb_handle_t find_handle;
hdb_handle_t node_object_handle;
hdb_handle_t alt_object;
- int broadcast = 0;
+ enum tx_mech transport = TX_MECH_UDP;
char *str;
int error;
@@ -621,12 +694,43 @@ static int get_nodename(struct objdb_iface_ver0 *objdb)
mcast_name = strdup("255.255.255.255");
if (!mcast_name)
return -1;
- broadcast = 1;
+ transport = TX_MECH_UDPB;
}
free(str);
}
- if (add_ifaddr(objdb, mcast_name, nodename, portnum, broadcast)) {
+ /* Check for transport */
+ if (!objdb_get_string(objdb, object_handle, "transport", &str)) {
+ if (strcmp(str, "udp") == 0) {
+ if (transport != TX_MECH_UDPB) {
+ transport = TX_MECH_UDP;
+ }
+ } else if (strcmp(str, "udpb") == 0) {
+ transport = TX_MECH_UDPB;
+ } else if (strcmp(str, "udpu") == 0) {
+ if (transport != TX_MECH_UDPB) {
+ transport = TX_MECH_UDPU;
+ } else {
+ sprintf(error_reason, "Transport and broadcast option are mutually exclusive");
+ write_cman_pipe("Transport and broadcast option are mutually exclusive");
+ return -1;
+ }
+ } else if (strcmp(str, "rdma") == 0) {
+ if (transport != TX_MECH_UDPB) {
+ transport = TX_MECH_RDMA;
+ } else {
+ sprintf(error_reason, "Transport and broadcast option are mutually exclusive");
+ write_cman_pipe("Transport and broadcast option are mutually exclusive");
+ return -1;
+ }
+ } else {
+ sprintf(error_reason, "Transport option value can be one of udp, udpb, udpu, rdma");
+ write_cman_pipe("Transport option value can be one of udp, udpb, udpu, rdma");
+ return -1;
+ }
+ }
+
+ if (add_ifaddr(objdb, mcast_name, nodename, portnum, transport)) {
write_cman_pipe(error_reason);
return -1;
}
@@ -649,7 +753,7 @@ static int get_nodename(struct objdb_iface_ver0 *objdb)
mcast = mcast_name;
}
- if (add_ifaddr(objdb, mcast, node, portnum, broadcast)) {
+ if (add_ifaddr(objdb, mcast, node, portnum, transport)) {
write_cman_pipe(error_reason);
return -1;
}
diff --git a/config/tools/xml/cluster.rng.in b/config/tools/xml/cluster.rng.in
index 15c6172..4b4da2e 100644
--- a/config/tools/xml/cluster.rng.in
+++ b/config/tools/xml/cluster.rng.in
@@ -124,7 +124,17 @@ To validate your cluster.conf against this schema, run:
</optional>
<optional>
<attribute name="broadcast" rha:description="enable cman broadcast" rha:default="no"/>
- </optional>
+ </optional>
+ <optional>
+ <attribute name="transport" rha:description="Specifies transport mechanism to use. Available values are udp (multicast default), udpb (broadcast), udpu (unicast) and rdma (Infiniband). corosync.conf(5)" rha:sample="">
+ <choice>
+ <value>udp</value>
+ <value>udpb</value>
+ <value>udpu</value>
+ <value>rdma</value>
+ </choice>
+ </attribute>
+ </optional>
<optional>
<attribute name="keyfile" rha:description=""/>
</optional>
13 years, 7 months