Gitweb:
http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: f1be533f910238ab5350e1a63b2ee18f548bff6b
Parent: 36279bd1cbed1b4a3a9026df00326148b42e67f6
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Sep 16 14:44:14 2009 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Wed Sep 16 14:44:14 2009 -0500
dlm_controld: copy some fenced changes
from the fenced commit bcc5fdef8473d99399c624a7bc15423a2af645c1
. copy some naming changes
. copy some logging changes
. copy some new checks for accepting start messages,
check for a start already matched to a struct change,
check that a node was a cluster member prior to the
creation of the struct change
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/dlm_controld/cpg.c | 184 ++++++++++++++++++++++++++++++++------
group/dlm_controld/dlm_daemon.h | 8 +-
group/dlm_controld/main.c | 6 +-
group/dlm_controld/member_cman.c | 10 ++-
4 files changed, 173 insertions(+), 35 deletions(-)
diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c
index 1704986..eb46bec 100644
--- a/group/dlm_controld/cpg.c
+++ b/group/dlm_controld/cpg.c
@@ -46,6 +46,8 @@ struct node {
int fs_notified;
uint64_t add_time;
uint64_t fence_time; /* for debug */
+ uint64_t cluster_add_time;
+ uint64_t cluster_remove_time;
uint32_t fence_queries; /* for debug */
uint32_t added_seq; /* for queries */
uint32_t removed_seq; /* for queries */
@@ -71,6 +73,7 @@ struct change {
int we_joined;
uint32_t seq; /* used as a reference for debugging, and for queries */
uint32_t combined_seq; /* for queries */
+ uint64_t create_time;
};
struct ls_info {
@@ -91,13 +94,66 @@ struct id_info {
};
int message_flow_control_on;
-static int daemon_cpg_fd;
-static cpg_handle_t daemon_cpg_handle;
+static cpg_handle_t cpg_handle_daemon;
+static int cpg_fd_daemon;
static struct protocol our_protocol;
static struct list_head daemon_nodes;
static struct cpg_address daemon_member[MAX_NODES];
static int daemon_member_count;
+static void log_config(const struct cpg_name *group_name,
+ const struct cpg_address *member_list,
+ size_t member_list_entries,
+ const struct cpg_address *left_list,
+ size_t left_list_entries,
+ const struct cpg_address *joined_list,
+ size_t joined_list_entries)
+{
+ char m_buf[128];
+ char j_buf[32];
+ char l_buf[32];
+ size_t i, len, pos;
+ int ret;
+
+ memset(m_buf, 0, sizeof(m_buf));
+ memset(j_buf, 0, sizeof(j_buf));
+ memset(l_buf, 0, sizeof(l_buf));
+
+ len = sizeof(m_buf);
+ pos = 0;
+ for (i = 0; i < member_list_entries; i++) {
+ ret = snprintf(m_buf + pos, len - pos, " %d",
+ member_list[i].nodeid);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+
+ len = sizeof(j_buf);
+ pos = 0;
+ for (i = 0; i < joined_list_entries; i++) {
+ ret = snprintf(j_buf + pos, len - pos, " %d",
+ joined_list[i].nodeid);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+
+ len = sizeof(l_buf);
+ pos = 0;
+ for (i = 0; i < left_list_entries; i++) {
+ ret = snprintf(l_buf + pos, len - pos, " %d",
+ left_list[i].nodeid);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+
+ log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value,
+ member_list_entries, joined_list_entries, left_list_entries,
+ m_buf, j_buf, l_buf);
+}
+
static void ls_info_in(struct ls_info *li)
{
li->ls_info_size = le32_to_cpu(li->ls_info_size);
@@ -359,7 +415,44 @@ static void node_history_init(struct lockspace *ls, int nodeid,
node->add_time = 0;
list_add_tail(&node->list, &ls->node_history);
out:
- node->added_seq = cg->seq; /* for queries */
+ if (cg)
+ node->added_seq = cg->seq; /* for queries */
+}
+
+void node_history_cluster_add(int nodeid)
+{
+ struct lockspace *ls;
+ struct node *node;
+
+ list_for_each_entry(ls, &lockspaces, list) {
+ node_history_init(ls, nodeid, NULL);
+
+ node = get_node_history(ls, nodeid);
+ if (!node) {
+ log_error("node_history_cluster_add no nodeid %d",
+ nodeid);
+ return;
+ }
+
+ node->cluster_add_time = time(NULL);
+ }
+}
+
+void node_history_cluster_remove(int nodeid)
+{
+ struct lockspace *ls;
+ struct node *node;
+
+ list_for_each_entry(ls, &lockspaces, list) {
+ node = get_node_history(ls, nodeid);
+ if (!node) {
+ log_error("node_history_cluster_remove no nodeid %d",
+ nodeid);
+ return;
+ }
+
+ node->cluster_remove_time = time(NULL);
+ }
}
static void node_history_start(struct lockspace *ls, int nodeid)
@@ -766,7 +859,7 @@ static void set_plock_ckpt_node(struct lockspace *ls)
}
static struct id_info *get_id_struct(struct id_info *ids, int count, int size,
- int nodeid)
+ int nodeid)
{
struct id_info *id = ids;
int i;
@@ -787,6 +880,7 @@ static int match_change(struct lockspace *ls, struct change *cg,
{
struct id_info *id;
struct member *memb;
+ struct node *node;
uint32_t seq = hd->msgdata;
int i, members_mismatch;
@@ -800,7 +894,7 @@ static int match_change(struct lockspace *ls, struct change *cg,
if (!id) {
log_group(ls, "match_change %d:%u skip %u we are not in members",
hd->nodeid, seq, cg->seq);
- return 0;
+ return 0;
}
memb = find_memb(cg, hd->nodeid);
@@ -810,6 +904,30 @@ static int match_change(struct lockspace *ls, struct change *cg,
return 0;
}
+ if (memb->start && hd->type == DLM_MSG_START) {
+ log_group(ls, "match_change %d:%u skip %u already start",
+ hd->nodeid, seq, cg->seq);
+ return 0;
+ }
+
+ /* a node's start can't match a change if the node joined the cluster
+ more recently than the change was created */
+
+ node = get_node_history(ls, hd->nodeid);
+ if (!node) {
+ log_group(ls, "match_change %d:%u skip cg %u no node history",
+ hd->nodeid, seq, cg->seq);
+ return 0;
+ }
+
+ if (node->cluster_add_time > cg->create_time) {
+ log_debug("match_change %d:%u skip cg %u created %llu "
+ "cluster add %llu", hd->nodeid, seq, cg->seq,
+ (unsigned long long)cg->create_time,
+ (unsigned long long)node->cluster_add_time);
+ return 0;
+ }
+
/* verify this is the right change by matching the counts
and the nodeids of the current members */
@@ -837,6 +955,7 @@ static int match_change(struct lockspace *ls, struct change *cg,
}
id = (struct id_info *)((char *)id + li->id_info_size);
}
+
if (members_mismatch)
return 0;
@@ -924,7 +1043,7 @@ static void receive_start(struct lockspace *ls, struct dlm_header
*hd, int len)
added = is_added(ls, hd->nodeid);
- if (added && li->started_count) {
+ if (added && li->started_count && ls->started_count) {
log_error("receive_start %d:%u add node with started_count %u",
hd->nodeid, seq, li->started_count);
@@ -1166,6 +1285,7 @@ void process_lockspace_changes(void)
poll_fencing = 0;
poll_quorum = 0;
poll_fs = 0;
+
list_for_each_entry_safe(ls, safe, &lockspaces, list) {
if (!list_empty(&ls->changes))
apply_changes(ls);
@@ -1192,6 +1312,7 @@ static int add_change(struct lockspace *ls,
INIT_LIST_HEAD(&cg->members);
INIT_LIST_HEAD(&cg->removed);
cg->state = CGST_WAIT_CONDITIONS;
+ cg->create_time = time(NULL);
cg->seq = ++ls->change_seq;
if (!cg->seq)
cg->seq = ++ls->change_seq;
@@ -1275,7 +1396,8 @@ static int add_change(struct lockspace *ls,
return error;
}
-static int we_left(const struct cpg_address *left_list, size_t left_list_entries)
+static int we_left(const struct cpg_address *left_list,
+ size_t left_list_entries)
{
int i;
@@ -1300,6 +1422,10 @@ static void confchg_cb(cpg_handle_t handle,
struct member *memb;
int rv;
+ log_config(group_name, member_list, member_list_entries,
+ left_list, left_list_entries,
+ joined_list, joined_list_entries);
+
ls = find_ls_handle(handle);
if (!ls) {
log_error("confchg_cb no lockspace for cpg %s",
@@ -1450,7 +1576,7 @@ void update_flow_control_status(void)
cpg_flow_control_state_t flow_control_state;
cpg_error_t error;
- error = cpg_flow_control_state_get(daemon_cpg_handle,
+ error = cpg_flow_control_state_get(cpg_handle_daemon,
&flow_control_state);
if (error != CPG_OK) {
log_error("cpg_flow_control_state_get %d", error);
@@ -1470,7 +1596,7 @@ void update_flow_control_status(void)
}
}
-static void process_lockspace_cpg(int ci)
+static void process_cpg_lockspace(int ci)
{
struct lockspace *ls;
cpg_error_t error;
@@ -1516,7 +1642,7 @@ int dlm_join_lockspace(struct lockspace *ls)
cpg_fd_get(h, &fd);
- ci = client_add(fd, process_lockspace_cpg, NULL);
+ ci = client_add(fd, process_cpg_lockspace, NULL);
list_add(&ls->list, &lockspaces);
@@ -1846,7 +1972,7 @@ static void send_protocol(struct protocol *proto)
memcpy(pr, proto, sizeof(struct protocol));
protocol_out(pr);
- _send_message(daemon_cpg_handle, buf, len, DLM_MSG_PROTOCOL);
+ _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL);
}
int set_protocol(void)
@@ -1857,7 +1983,7 @@ int set_protocol(void)
int rv;
memset(&pollfd, 0, sizeof(pollfd));
- pollfd.fd = daemon_cpg_fd;
+ pollfd.fd = cpg_fd_daemon;
pollfd.events = POLLIN;
while (1) {
@@ -1903,7 +2029,7 @@ int set_protocol(void)
}
if (pollfd.revents & POLLIN)
- process_cpg(0);
+ process_cpg_daemon(0);
if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
log_error("set_protocol poll revents %u",
pollfd.revents);
@@ -1949,6 +2075,8 @@ int set_protocol(void)
our_protocol.kernel_max[0],
our_protocol.kernel_max[1],
our_protocol.kernel_max[2]);
+
+ send_protocol(&our_protocol);
return 0;
}
@@ -1987,6 +2115,10 @@ static void confchg_cb_daemon(cpg_handle_t handle,
{
int i;
+ log_config(group_name, member_list, member_list_entries,
+ left_list, left_list_entries,
+ joined_list, joined_list_entries);
+
if (joined_list_entries)
send_protocol(&our_protocol);
@@ -2004,16 +2136,16 @@ static cpg_callbacks_t cpg_callbacks_daemon = {
.cpg_confchg_fn = confchg_cb_daemon,
};
-void process_cpg(int ci)
+void process_cpg_daemon(int ci)
{
cpg_error_t error;
- error = cpg_dispatch(daemon_cpg_handle, CPG_DISPATCH_ALL);
+ error = cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);
if (error != CPG_OK)
log_error("daemon cpg_dispatch error %d", error);
}
-int setup_cpg(void)
+int setup_cpg_daemon(void)
{
cpg_error_t error;
struct cpg_name name;
@@ -2029,20 +2161,20 @@ int setup_cpg(void)
our_protocol.kernel_max[1] = 1;
our_protocol.kernel_max[2] = 1;
- error = cpg_initialize(&daemon_cpg_handle, &cpg_callbacks_daemon);
+ error = cpg_initialize(&cpg_handle_daemon, &cpg_callbacks_daemon);
if (error != CPG_OK) {
log_error("daemon cpg_initialize error %d", error);
return -1;
}
- cpg_fd_get(daemon_cpg_handle, &daemon_cpg_fd);
+ cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon);
memset(&name, 0, sizeof(name));
sprintf(name.value, "dlm:controld");
name.length = strlen(name.value) + 1;
retry:
- error = cpg_join(daemon_cpg_handle, &name);
+ error = cpg_join(cpg_handle_daemon, &name);
if (error == CPG_ERR_TRY_AGAIN) {
sleep(1);
if (!(++i % 10))
@@ -2054,22 +2186,22 @@ int setup_cpg(void)
goto fail;
}
- log_debug("setup_cpg %d", daemon_cpg_fd);
- return daemon_cpg_fd;
+ log_debug("setup_cpg_daemon %d", cpg_fd_daemon);
+ return cpg_fd_daemon;
fail:
- cpg_finalize(daemon_cpg_handle);
+ cpg_finalize(cpg_handle_daemon);
return -1;
}
-void close_cpg(void)
+void close_cpg_daemon(void)
{
struct lockspace *ls;
cpg_error_t error;
struct cpg_name name;
int i = 0;
- if (!daemon_cpg_handle)
+ if (!cpg_handle_daemon)
return;
if (cluster_down)
goto fin;
@@ -2079,7 +2211,7 @@ void close_cpg(void)
name.length = strlen(name.value) + 1;
retry:
- error = cpg_leave(daemon_cpg_handle, &name);
+ error = cpg_leave(cpg_handle_daemon, &name);
if (error == CPG_ERR_TRY_AGAIN) {
sleep(1);
if (!(++i % 10))
@@ -2093,7 +2225,7 @@ void close_cpg(void)
if (ls->cpg_handle)
cpg_finalize(ls->cpg_handle);
}
- cpg_finalize(daemon_cpg_handle);
+ cpg_finalize(cpg_handle_daemon);
}
/* fs_controld has seen nodedown for nodeid; it's now ok for dlm to do
diff --git a/group/dlm_controld/dlm_daemon.h b/group/dlm_controld/dlm_daemon.h
index 65723d1..8143848 100644
--- a/group/dlm_controld/dlm_daemon.h
+++ b/group/dlm_controld/dlm_daemon.h
@@ -232,9 +232,9 @@ int read_ccs_int(const char *path, int *config_val);
int get_weight(int nodeid, char *lockspace);
/* cpg.c */
-int setup_cpg(void);
-void close_cpg(void);
-void process_cpg(int ci);
+int setup_cpg_daemon(void);
+void close_cpg_daemon(void);
+void process_cpg_daemon(int ci);
int set_protocol(void);
void process_lockspace_changes(void);
void dlm_send_message(struct lockspace *ls, char *buf, int len);
@@ -242,6 +242,8 @@ int dlm_join_lockspace(struct lockspace *ls);
int dlm_leave_lockspace(struct lockspace *ls);
const char *msg_name(int type);
void update_flow_control_status(void);
+void node_history_cluster_add(int nodeid);
+void node_history_cluster_remove(int nodeid);
int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node);
int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace);
int set_lockspaces(int *count, struct dlmc_lockspace **lss_out);
diff --git a/group/dlm_controld/main.c b/group/dlm_controld/main.c
index 75ee55d..12e2592 100644
--- a/group/dlm_controld/main.c
+++ b/group/dlm_controld/main.c
@@ -938,10 +938,10 @@ static void loop(void)
log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat);
if (group_mode == GROUP_LIBCPG) {
- rv = setup_cpg();
+ rv = setup_cpg_daemon();
if (rv < 0)
goto out;
- client_add(rv, process_cpg, cluster_dead);
+ client_add(rv, process_cpg_daemon, cluster_dead);
rv = set_protocol();
if (rv < 0)
@@ -1024,7 +1024,7 @@ static void loop(void)
if (cfgd_groupd_compat)
close_groupd();
if (group_mode == GROUP_LIBCPG) {
- close_cpg();
+ close_cpg_daemon();
close_plocks();
}
clear_configfs();
diff --git a/group/dlm_controld/member_cman.c b/group/dlm_controld/member_cman.c
index 4caaaa5..c6b7cc7 100644
--- a/group/dlm_controld/member_cman.c
+++ b/group/dlm_controld/member_cman.c
@@ -99,8 +99,10 @@ static void statechange(void)
if (old_nodes[i].cn_member &&
!is_cluster_member(old_nodes[i].cn_nodeid)) {
- log_debug("cman: node %d removed",
- old_nodes[i].cn_nodeid);
+ log_debug("cluster node %d removed",
+ old_nodes[i].cn_nodeid);
+
+ node_history_cluster_remove(old_nodes[i].cn_nodeid);
del_configfs_node(old_nodes[i].cn_nodeid);
}
@@ -119,9 +121,11 @@ static void statechange(void)
addrptr = &cman_nodes[i].cn_address;
}
- log_debug("cman: node %d added",
+ log_debug("cluster node %d added",
cman_nodes[i].cn_nodeid);
+ node_history_cluster_add(cman_nodes[i].cn_nodeid);
+
for (j = 0; j < num_addrs; j++) {
add_configfs_node(cman_nodes[i].cn_nodeid,
addrptr[j].cna_address,