Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=d8d... Commit: d8d8a6de00f7d0f9ca1944913a26b805984851d5 Parent: eeb80cd73160856a16f73fc45d1ab48904aa55f0 Author: David Teigland teigland@redhat.com AuthorDate: Mon Feb 27 13:10:25 2012 -0600 Committer: David Teigland teigland@redhat.com CommitterDate: Tue Mar 20 11:24:17 2012 -0500
dlm_controld: fencing and config
New /etc/dlm/dlm.conf with same config options as before, but using "key = val" format. Fencing config can also be specified. Without an explicit fencing config, default "dlm_stonith" agent is run to let stonith handle fencing.
Signed-off-by: David Teigland teigland@redhat.com --- dlm_controld/Makefile | 5 +- dlm_controld/action.c | 4 +- dlm_controld/config.c | 123 ++- dlm_controld/cpg.c | 1082 +----------------------- dlm_controld/daemon_cpg.c | 1921 ++++++++++++++++++++++++++++++++++++++++++ dlm_controld/dlm_controld.h | 1 + dlm_controld/dlm_daemon.h | 85 ++- dlm_controld/fence.c | 145 +++- dlm_controld/fence_config.c | 403 +++++++++ dlm_controld/fence_config.h | 62 ++ dlm_controld/lib.c | 19 + dlm_controld/libdlmcontrol.h | 1 + dlm_controld/main.c | 90 ++- dlm_controld/member.c | 51 +- dlm_controld/plock.c | 10 - dlm_tool/main.c | 17 +- fence/Makefile | 56 ++ fence/stonith_helper.c | 96 +++ 18 files changed, 3007 insertions(+), 1164 deletions(-)
diff --git a/dlm_controld/Makefile b/dlm_controld/Makefile index a648ab1..b60b8df 100644 --- a/dlm_controld/Makefile +++ b/dlm_controld/Makefile @@ -19,7 +19,9 @@ LIB_TARGET = $(LIB_SO).$(LIB_MAJOR).$(LIB_MINOR)
BIN_SOURCE = action.c \ cpg.c \ + daemon_cpg.c \ crc.c \ + fence_config.c \ fence.c \ main.c \ plock.c \ @@ -51,13 +53,10 @@ BIN_CFLAGS += -D_GNU_SOURCE -g \ -fdiagnostics-show-option \
BIN_CFLAGS += -fPIE -DPIE -BIN_CFLAGS += `xml2-config --cflags` BIN_CFLAGS += -I../include -I../libdlm
BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie -BIN_LDFLAGS += `xml2-config --libs` BIN_LDFLAGS += -lpthread -lrt -lcpg -lcmap -lcfg -lquorum -#BIN_LDFLAGS += -lfenced
LIB_CFLAGS += $(BIN_CFLAGS) LIB_LDFLAGS += -Wl,-z,relro -pie diff --git a/dlm_controld/action.c b/dlm_controld/action.c index d4bf11d..6cb34f2 100644 --- a/dlm_controld/action.c +++ b/dlm_controld/action.c @@ -34,10 +34,8 @@ static int detect_protocol(void) }
rv = cmap_get_string(handle, "totem.rrp_mode", &str); - if (rv != CS_OK) { - log_error("cmap_get_string totem.rrp_mode error %d", rv); + if (rv != CS_OK) goto out; - }
log_debug("cmap totem.rrp_mode = '%s'", str);
diff --git a/dlm_controld/config.c b/dlm_controld/config.c index 3bf02c0..f668b86 100644 --- a/dlm_controld/config.c +++ b/dlm_controld/config.c @@ -7,7 +7,6 @@ */
#include "dlm_daemon.h" -#include <libxml/tree.h>
/* TODO: <dlm> @@ -37,66 +36,100 @@ static void proto_val(char *str, int *val) } }
-static void set_val(xmlNode *root, const char *name, int *opt, int *val) +static void set_val(char *line, int *val_out) { - xmlChar *str; + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv;
- str = xmlGetProp(root, BAD_CAST name); - if (str && !(*opt)) { - *val = atoi((char *)str); - log_debug("config %s = %d", name, *val); - } + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + *val_out = atoi(val); + + log_debug("config %s=%d", key, *val_out); +} + +static void get_val(char *line, char *val_out) +{ + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv; + + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + strcpy(val_out, val); }
void setup_config(int update) { - xmlDoc *doc; - xmlNode *root; - xmlChar *str; + FILE *file; + char line[PATH_MAX]; + char str[PATH_MAX];
if (!path_exists(CONF_FILE_PATH)) return;
- doc = xmlParseFile(CONF_FILE_PATH); - if (!doc) { - log_error("xml parse error %d %s", errno, CONF_FILE_PATH); + file = fopen(CONF_FILE_PATH, "r"); + if (!file) return; - }
- root = xmlDocGetRootElement(doc); - if (!root) { - log_error("xml root error %d %s", errno, CONF_FILE_PATH); - xmlFreeDoc(doc); - return; - } + while (fgets(line, PATH_MAX, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!optk_debug && !strncmp(line, "log_debug", strlen("log_debug"))) + set_val(line, &cfgk_debug); + + else if (!optk_timewarn && !strncmp(line, "timewarn", strlen("timewarn")) && !update) + set_val(line, &cfgk_timewarn); + + else if (!optd_enable_fencing && !strncmp(line, "enable_fencing", strlen("enable_fencing")) && !update) + set_val(line, &cfgd_enable_fencing); + + else if (!optd_enable_quorum_fencing && !strncmp(line, "enable_quorum_fencing", strlen("enable_quorum_fencing")) && !update) + set_val(line, &cfgd_enable_quorum_fencing); + + else if (!optd_enable_quorum_lockspace && !strncmp(line, "enable_quorum_lockspace", strlen("enable_quorum_lockspace")) && !update) + set_val(line, &cfgd_enable_quorum_lockspace); + + else if (!optd_enable_fscontrol && !strncmp(line, "enable_fscontrol", strlen("enable_fscontrol")) && !update) + set_val(line, &cfgd_enable_fscontrol); + + else if (!optd_enable_plock && !strncmp(line, "enable_plock", strlen("enable_plock")) && !update) + set_val(line, &cfgd_enable_plock); + + else if (!optd_plock_ownership && !strncmp(line, "plock_ownership", strlen("plock_ownership")) && !update) + set_val(line, &cfgd_plock_ownership); + + else if (!optd_plock_debug && !strncmp(line, "plock_debug", strlen("plock_debug"))) + set_val(line, &cfgd_plock_debug); + + else if (!optd_plock_rate_limit && !strncmp(line, "plock_rate_limit", strlen("plock_rate_limit"))) + set_val(line, &cfgd_plock_rate_limit); + + else if (!optd_drop_resources_time && !strncmp(line, "drop_resources_time", strlen("drop_resources_time"))) + set_val(line, &cfgd_drop_resources_time);
- if (update) - goto do_update; + else if (!optd_drop_resources_count && !strncmp(line, "drop_resources_count", strlen("drop_resources_count"))) + set_val(line, &cfgd_drop_resources_count);
- /* These config values are set from dlm.conf only if they haven't - already been set on the command line. */ + else if (!optd_drop_resources_age && !strncmp(line, "drop_resources_age", strlen("drop_resources_age"))) + set_val(line, &cfgd_drop_resources_age);
- str = xmlGetProp(root, BAD_CAST "protocol"); - if (str && !optk_protocol) { - proto_val((char *)str, &cfgk_protocol); - log_debug("config protocol = %d", cfgk_protocol); + else if (!optk_protocol && !strncmp(line, "protocol", strlen("protocol")) && !update) { + memset(str, 0, sizeof(str)); + get_val(line, str); + proto_val(str, &cfgk_protocol); + log_debug("config protocol = %d", cfgk_protocol); + } }
- set_val(root, "log_debug", &optk_debug, &cfgk_debug); - set_val(root, "timewarn", &optk_timewarn, &cfgk_timewarn); - set_val(root, "enable_fencing", &optd_enable_fencing, &cfgd_enable_fencing); - set_val(root, "enable_quorum", &optd_enable_quorum, &cfgd_enable_quorum); - set_val(root, "enable_fscontrol", &optd_enable_fscontrol, &cfgd_enable_fscontrol); - set_val(root, "enable_plock", &optd_enable_plock, &cfgd_enable_plock); - set_val(root, "plock_ownership", &optd_plock_ownership, &cfgd_plock_ownership); - do_update: - /* The following can be changed while running */ - set_val(root, "plock_debug", &optd_plock_debug, &cfgd_plock_debug); - set_val(root, "plock_rate_limit", &optd_plock_rate_limit, &cfgd_plock_rate_limit); - set_val(root, "drop_resources_time", &optd_drop_resources_time, &cfgd_drop_resources_time); - set_val(root, "drop_resources_count", &optd_drop_resources_count, &cfgd_drop_resources_count); - set_val(root, "drop_resources_age", &optd_drop_resources_age, &cfgd_drop_resources_age); - - xmlFreeDoc(doc); + fclose(file); }
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c index 1360493..1fb5372 100644 --- a/dlm_controld/cpg.c +++ b/dlm_controld/cpg.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2011 Red Hat, Inc. + * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -8,49 +8,6 @@
#include "dlm_daemon.h"
-#define PV_STATEFUL 0x0001 - -struct protocol_version { - uint16_t major; - uint16_t minor; - uint16_t patch; - uint16_t flags; -}; - -struct protocol { - union { - struct protocol_version dm_ver; - uint16_t daemon_max[4]; - }; - union { - struct protocol_version km_ver; - uint16_t kernel_max[4]; - }; - union { - struct protocol_version dr_ver; - uint16_t daemon_run[4]; - }; - union { - struct protocol_version kr_ver; - uint16_t kernel_run[4]; - }; -}; - -/* per dlm_controld cpg: daemon_nodes */ - -struct node_daemon { - struct list_head list; - int nodeid; - - uint64_t daemon_add_time; - uint64_t daemon_rem_time; - int daemon_member; - - int killed; - - struct protocol proto; -}; - /* per lockspace cpg: ls->node_history */
struct node { @@ -71,11 +28,10 @@ struct node { int check_fs; int fs_notified;
- int request_fencing; - int check_fencing; - uint64_t fail_realtime; - uint64_t fence_realtime; + int need_fencing; uint32_t fence_queries; /* for debug */ + uint64_t fail_walltime; + uint64_t fail_monotime; };
/* per lockspace confchg: ls->changes */ @@ -127,93 +83,6 @@ struct id_info { int nodeid; };
-int message_flow_control_on; -static cpg_handle_t cpg_handle_daemon; -static int cpg_fd_daemon; -static struct protocol our_protocol; -static struct list_head daemon_nodes; -static struct cpg_address daemon_member[MAX_NODES]; -static int daemon_member_count; - -static void log_config(const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t joined_list_entries) -{ - char m_buf[128]; - char j_buf[32]; - char l_buf[32]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - memset(j_buf, 0, sizeof(j_buf)); - memset(l_buf, 0, sizeof(l_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %d", - member_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(j_buf); - pos = 0; - for (i = 0; i < joined_list_entries; i++) { - ret = snprintf(j_buf + pos, len - pos, " %d", - joined_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(l_buf); - pos = 0; - for (i = 0; i < left_list_entries; i++) { - ret = snprintf(l_buf + pos, len - pos, " %d", - left_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, - member_list_entries, joined_list_entries, left_list_entries, - m_buf, j_buf, l_buf); -} - -static void log_ringid(const char *name, - struct cpg_ring_id *ringid, - const uint32_t *member_list, - size_t member_list_entries) -{ - char m_buf[128]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %u", - member_list[i]); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s ring %u:%llu %zu memb%s", - name, 
ringid->nodeid, (unsigned long long)ringid->seq, - member_list_entries, m_buf); -} - static void ls_info_in(struct ls_info *li) { li->ls_info_size = le32_to_cpu(li->ls_info_size); @@ -243,93 +112,6 @@ static void ids_in(struct ls_info *li, struct id_info *ids) } }
-const char *msg_name(int type) -{ - switch (type) { - case DLM_MSG_PROTOCOL: - return "protocol"; - case DLM_MSG_START: - return "start"; - case DLM_MSG_PLOCK: - return "plock"; - case DLM_MSG_PLOCK_OWN: - return "plock_own"; - case DLM_MSG_PLOCK_DROP: - return "plock_drop"; - case DLM_MSG_PLOCK_SYNC_LOCK: - return "plock_sync_lock"; - case DLM_MSG_PLOCK_SYNC_WAITER: - return "plock_sync_waiter"; - case DLM_MSG_PLOCKS_DATA: - return "plocks_data"; - case DLM_MSG_PLOCKS_DONE: - return "plocks_done"; - case DLM_MSG_DEADLK_CYCLE_START: - return "deadlk_cycle_start"; - case DLM_MSG_DEADLK_CYCLE_END: - return "deadlk_cycle_end"; - case DLM_MSG_DEADLK_CHECKPOINT_READY: - return "deadlk_checkpoint_ready"; - case DLM_MSG_DEADLK_CANCEL_LOCK: - return "deadlk_cancel_lock"; - default: - return "unknown"; - } -} - -static int _send_message(cpg_handle_t h, void *buf, int len, int type) -{ - struct iovec iov; - cs_error_t error; - int retries = 0; - - iov.iov_base = buf; - iov.iov_len = len; - - retry: - error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); - if (error == CS_ERR_TRY_AGAIN) { - retries++; - usleep(1000); - if (!(retries % 100)) - log_error("cpg_mcast_joined retry %d %s", - retries, msg_name(type)); - goto retry; - } - if (error != CS_OK) { - log_error("cpg_mcast_joined error %d handle %llx %s", - error, (unsigned long long)h, msg_name(type)); - return -1; - } - - if (retries) - log_debug("cpg_mcast_joined retried %d %s", - retries, msg_name(type)); - - return 0; -} - -/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ - -void dlm_send_message(struct lockspace *ls, char *buf, int len) -{ - struct dlm_header *hd = (struct dlm_header *) buf; - int type = hd->type; - - hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); - hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); - hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); - hd->type = cpu_to_le16(hd->type); - hd->nodeid = cpu_to_le32(our_nodeid); - hd->to_nodeid = 
cpu_to_le32(hd->to_nodeid); - hd->global_id = cpu_to_le32(ls->global_id); - hd->flags = cpu_to_le32(hd->flags); - hd->msgdata = cpu_to_le32(hd->msgdata); - hd->msgdata2 = cpu_to_le32(hd->msgdata2); - - _send_message(ls->cpg_handle, buf, len, type); -} - static struct member *find_memb(struct change *cg, int nodeid) { struct member *memb; @@ -424,7 +206,7 @@ static void free_ls(struct lockspace *ls) For 1: - node X fails - we see node X fail and X has non-zero start_time, - set check_fencing and record the fail time + set need_fencing and record the fail time - wait for X to be removed from all dlm cpg's (probably not necessary) - check that the fencing time is later than the recorded time above
@@ -442,8 +224,8 @@ static void free_ls(struct lockspace *ls) when we see a node not in this list, add entry for it with zero start_time record the time we get a good start message from the node, start_time clear start_time if the node leaves - if node fails with non-zero start_time, set check_fencing - when a node is fenced, clear start_time and clear check_fencing + if node fails with non-zero start_time, set need_fencing + when a node is fenced, clear start_time and clear need_fencing if a node remerges after this, no good start message, no new start_time set if a node fails with zero start_time, it doesn't need fencing if a node remerges before it's been fenced, no good start message, no new @@ -526,9 +308,7 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, }
if (cfgd_enable_fencing && node->start_time) { - node->request_fencing = 1; - node->check_fencing = 1; - node->fence_realtime = 0; + node->need_fencing = 1; node->fence_queries = 0; }
@@ -543,7 +323,9 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, node->lockspace_fail_time = now; node->lockspace_fail_seq = node->lockspace_rem_seq; node->lockspace_fail_reason = reason; /* for queries */ - node->fail_realtime = time(NULL); + + node->fail_monotime = now; + node->fail_walltime = time(NULL); }
static void node_history_start(struct lockspace *ls, int nodeid) @@ -601,9 +383,9 @@ static int check_ringid_done(struct lockspace *ls) static int check_fencing_done(struct lockspace *ls) { struct node *node; - uint64_t last_fenced_time; - int in_progress, wait_count = 0; - int rv; + uint64_t fence_monotime; + int wait_count = 0; + int rv, in_progress;
if (!cfgd_enable_fencing) { log_group(ls, "check_fencing disabled"); @@ -611,47 +393,35 @@ static int check_fencing_done(struct lockspace *ls) }
list_for_each_entry(node, &ls->node_history, list) { - if (!node->check_fencing) + if (!node->need_fencing) continue;
- /* check with fenced to see if the node has been - fenced since node->start_time */ - - rv = fence_node_time(node->nodeid, &last_fenced_time); + rv = fence_node_time(node->nodeid, &fence_monotime); if (rv < 0) { log_error("fenced_node_time error %d", rv); continue; }
- /* fenced gives us real time */ - - /* need >= not just > because in at least one case - we've seen fenced_time within the same second as - fail_time: with external fencing, e.g. fence_node */ - - if (last_fenced_time >= node->fail_realtime) { - log_group(ls, "check_fencing %d done " - "start %llu fail %llu last %llu", + if (fence_monotime >= node->fail_monotime) { + log_group(ls, "check_fencing %d done start %llu fail %llu fence %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); - node->check_fencing = 0; + (unsigned long long)node->fail_monotime, + (unsigned long long)fence_monotime); + + node->need_fencing = 0; node->start_time = 0; - node->fence_realtime = last_fenced_time; + continue; } else { - if (!node->fence_queries || - node->fence_realtime != last_fenced_time) { - log_group(ls, "check_fencing %d wait " - "start %llu fail %llu last %llu", + if (!node->fence_queries) { + log_group(ls, "check_fencing %d wait start %llu fail %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); + (unsigned long long)node->fail_monotime); node->fence_queries++; - node->fence_realtime = last_fenced_time; } wait_count++; + continue; } }
@@ -679,39 +449,6 @@ static int check_fencing_done(struct lockspace *ls) return 1; }
-static int need_fencing(struct lockspace *ls) -{ - struct node *node; - - list_for_each_entry(node, &ls->node_history, list) { - if (node->check_fencing) { - log_group(ls, "need_fencing %d", node->nodeid); - return 1; - } - } - return 0; -} - -/* we don't need to ask fenced to initiate fencing; it does - so itself when it sees a fence domain member fail. Without - fenced we'll probably need to ask another daemon to initiate - fencing, then check with it above, like we check libfenced. */ - -static void request_fencing(struct lockspace *ls) -{ - struct node *node; - int rv; - - list_for_each_entry(node, &ls->node_history, list) { - if (!node->request_fencing) - continue; - log_group(ls, "fence_request %d", node->nodeid); - rv = fence_request(node->nodeid); - if (!rv) - node->request_fencing = 0; - } -} - /* wait for local fs_controld to ack each failed node */
static int check_fs_done(struct lockspace *ls) @@ -839,43 +576,18 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq) /* we know that the cluster_quorate value here is consistent with the cpg events because the ringid's are in sync per the check_ringid_done */
-/* - enable_quorum = 0 - Never wait for quorum here. - - enable_quorum = 1 - Wait for quorum before calling request_fencing() in cases where - fencing is needed. Reason for using this would be if fencing - system doesn't wait for quorum before carrying out a fencing - request, and we want to avoid having partitioned inquorate nodes - fencing quorate nodes in another partition. - - enable_quorum = 2 - Always wait for quorum as a general condition here. This - means that lockspace join/leave operations would block waiting - for quorum. There's not any reason to do this per se, unless - that's the behavior the application using the dlm wants. -*/ - static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) return 0;
- if ((cfgd_enable_quorum > 1) && !cluster_quorate) { + if ((cfgd_enable_quorum_lockspace > 1) && !cluster_quorate) { log_group(ls, "wait for quorum"); return 0; }
- if ((cfgd_enable_quorum > 0) && need_fencing(ls) && !cluster_quorate) { - log_group(ls, "wait for quorum before fencing"); - return 0; - } - - request_fencing(ls); - if (!check_fencing_done(ls)) { - poll_fencing++; + poll_lockspaces++; return 0; }
@@ -1437,7 +1149,7 @@ static void apply_changes(struct lockspace *ls)
case CGST_WAIT_MESSAGES: if (wait_messages_done(ls)) { - our_protocol.dr_ver.flags |= PV_STATEFUL; + set_protocol_stateful(); start_kernel(ls); prepare_plocks(ls); cleanup_changes(ls); @@ -1453,9 +1165,7 @@ void process_lockspace_changes(void) { struct lockspace *ls, *safe;
- poll_ringid = 0; - poll_fencing = 0; - poll_quorum = 0; + poll_lockspaces = 0; poll_fs = 0;
list_for_each_entry_safe(ls, safe, &lockspaces, list) { @@ -1464,24 +1174,6 @@ void process_lockspace_changes(void) } }
-static const char *reason_str(int reason) -{ - switch (reason) { - case CPG_REASON_JOIN: - return "join"; - case CPG_REASON_LEAVE: - return "leave"; - case CPG_REASON_NODEDOWN: - return "nodedown"; - case CPG_REASON_NODEUP: - return "nodeup"; - case CPG_REASON_PROCDOWN: - return "procdown"; - default: - return "unknown"; - }; -} - static int add_change(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, @@ -1665,20 +1357,6 @@ static void confchg_cb(cpg_handle_t handle, #endif }
-static void dlm_header_in(struct dlm_header *hd) -{ - hd->version[0] = le16_to_cpu(hd->version[0]); - hd->version[1] = le16_to_cpu(hd->version[1]); - hd->version[2] = le16_to_cpu(hd->version[2]); - hd->type = le16_to_cpu(hd->type); - hd->nodeid = le32_to_cpu(hd->nodeid); - hd->to_nodeid = le32_to_cpu(hd->to_nodeid); - hd->global_id = le32_to_cpu(hd->global_id); - hd->flags = le32_to_cpu(hd->flags); - hd->msgdata = le32_to_cpu(hd->msgdata); - hd->msgdata2 = le32_to_cpu(hd->msgdata2); -} - /* after our join confchg, we want to ignore plock messages (see need_plocks checks below) until the point in time where the ckpt_node saves plock state (final start message received); at this time we want to shift from @@ -1693,6 +1371,7 @@ static void deliver_cb(cpg_handle_t handle, struct lockspace *ls; struct dlm_header *hd; int ignore_plock; + int rv;
ls = find_ls_handle(handle); if (!ls) { @@ -1700,7 +1379,7 @@ static void deliver_cb(cpg_handle_t handle, return; }
- if (len < sizeof(*hd)) { + if (len < sizeof(struct dlm_header)) { log_error("deliver_cb short message %zd", len); return; } @@ -1708,20 +1387,9 @@ static void deliver_cb(cpg_handle_t handle, hd = (struct dlm_header *)data; dlm_header_in(hd);
- if (hd->version[0] != our_protocol.daemon_run[0] || - hd->version[1] != our_protocol.daemon_run[1]) { - log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", - nodeid, hd->version[0], hd->version[1], - hd->version[2], our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2]); + rv = dlm_header_validate(hd, nodeid); + if (rv < 0) return; - } - - if (hd->nodeid != nodeid) { - log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); - return; - }
ignore_plock = 0;
@@ -1892,31 +1560,6 @@ static cpg_model_v1_data_t cpg_callbacks = { .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, };
-void update_flow_control_status(void) -{ - cpg_flow_control_state_t flow_control_state; - cs_error_t error; - - error = cpg_flow_control_state_get(cpg_handle_daemon, - &flow_control_state); - if (error != CS_OK) { - log_error("cpg_flow_control_state_get %d", error); - return; - } - - if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { - if (message_flow_control_on == 0) { - log_debug("flow control on"); - } - message_flow_control_on = 1; - } else { - if (message_flow_control_on) { - log_debug("flow control off"); - } - message_flow_control_on = 0; - } -} - static void process_cpg_lockspace(int ci) { struct lockspace *ls; @@ -1933,8 +1576,6 @@ static void process_cpg_lockspace(int ci) log_error("cpg_dispatch error %d", error); return; } - - update_flow_control_status(); }
/* received an "online" uevent from dlm-kernel */ @@ -1945,14 +1586,6 @@ int dlm_join_lockspace(struct lockspace *ls) cpg_handle_t h; struct cpg_name name; int i = 0, fd, ci, rv; - int unused; - - rv = fence_in_progress(&unused); - if (cfgd_enable_fencing && rv < 0) { - log_error("dlm_join_lockspace no fence domain"); - rv = -1; - goto fail_free; - }
error = cpg_model_initialize(&h, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_callbacks, NULL); @@ -2038,645 +1671,6 @@ int dlm_leave_lockspace(struct lockspace *ls) return 0; }
-static struct node_daemon *get_node_daemon(int nodeid) -{ - struct node_daemon *node; - - list_for_each_entry(node, &daemon_nodes, list) { - if (node->nodeid == nodeid) - return node; - } - return NULL; -} - -static void add_node_daemon(int nodeid) -{ - struct node_daemon *node; - - if (get_node_daemon(nodeid)) - return; - - node = malloc(sizeof(struct node_daemon)); - if (!node) { - log_error("add_node_daemon no mem"); - return; - } - memset(node, 0, sizeof(struct node_daemon)); - node->nodeid = nodeid; - list_add_tail(&node->list, &daemon_nodes); -} - -static void pv_in(struct protocol_version *pv) -{ - pv->major = le16_to_cpu(pv->major); - pv->minor = le16_to_cpu(pv->minor); - pv->patch = le16_to_cpu(pv->patch); - pv->flags = le16_to_cpu(pv->flags); -} - -static void pv_out(struct protocol_version *pv) -{ - pv->major = cpu_to_le16(pv->major); - pv->minor = cpu_to_le16(pv->minor); - pv->patch = cpu_to_le16(pv->patch); - pv->flags = cpu_to_le16(pv->flags); -} - -static void protocol_in(struct protocol *proto) -{ - pv_in(&proto->dm_ver); - pv_in(&proto->km_ver); - pv_in(&proto->dr_ver); - pv_in(&proto->kr_ver); -} - -static void protocol_out(struct protocol *proto) -{ - pv_out(&proto->dm_ver); - pv_out(&proto->km_ver); - pv_out(&proto->dr_ver); - pv_out(&proto->kr_ver); -} - -/* go through member list saved in last confchg, see if we have received a - proto message from each */ - -static int all_protocol_messages(void) -{ - struct node_daemon *node; - int i; - - if (!daemon_member_count) - return 0; - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("all_protocol_messages no node %d", - daemon_member[i].nodeid); - return 0; - } - - if (!node->proto.daemon_max[0]) - return 0; - } - return 1; -} - -static int pick_min_protocol(struct protocol *proto) -{ - uint16_t mind[4]; - uint16_t mink[4]; - struct node_daemon *node; - int i; - - memset(&mind, 0, sizeof(mind)); - memset(&mink, 0, 
sizeof(mink)); - - /* first choose the minimum major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("pick_min_protocol no node %d", - daemon_member[i].nodeid); - return -1; - } - - if (!mind[0] || node->proto.daemon_max[0] < mind[0]) - mind[0] = node->proto.daemon_max[0]; - - if (!mink[0] || node->proto.kernel_max[0] < mink[0]) - mink[0] = node->proto.kernel_max[0]; - } - - if (!mind[0] || !mink[0]) { - log_error("pick_min_protocol zero major number"); - return -1; - } - - /* second pick the minimum minor with the chosen major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0]) { - if (!mind[1] || node->proto.daemon_max[1] < mind[1]) - mind[1] = node->proto.daemon_max[1]; - } - - if (mink[0] == node->proto.kernel_max[0]) { - if (!mink[1] || node->proto.kernel_max[1] < mink[1]) - mink[1] = node->proto.kernel_max[1]; - } - } - - if (!mind[1] || !mink[1]) { - log_error("pick_min_protocol zero minor number"); - return -1; - } - - /* third pick the minimum patch with the chosen major.minor */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0] && - mind[1] == node->proto.daemon_max[1]) { - if (!mind[2] || node->proto.daemon_max[2] < mind[2]) - mind[2] = node->proto.daemon_max[2]; - } - - if (mink[0] == node->proto.kernel_max[0] && - mink[1] == node->proto.kernel_max[1]) { - if (!mink[2] || node->proto.kernel_max[2] < mink[2]) - mink[2] = node->proto.kernel_max[2]; - } - } - - if (!mind[2] || !mink[2]) { - log_error("pick_min_protocol zero patch number"); - return -1; - } - - memcpy(&proto->daemon_run, &mind, sizeof(mind)); - memcpy(&proto->kernel_run, &mink, sizeof(mink)); - return 0; -} - -static void receive_protocol(struct dlm_header *hd, int len) -{ - 
struct protocol *p; - struct node_daemon *node; - - p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); - protocol_in(p); - - if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { - log_error("receive_protocol invalid len %d from %d", - len, hd->nodeid); - return; - } - - /* zero is an invalid version value */ - - if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || - !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { - log_error("receive_protocol invalid max value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], - p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); - return; - } - - /* the run values will be zero until a version is set, after - which none of the run values can be zero */ - - if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || - !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { - log_error("receive_protocol invalid run value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], - p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); - return; - } - - /* save this node's proto so we can tell when we've got all, and - use it to select a minimum protocol from all */ - - node = get_node_daemon(hd->nodeid); - if (!node) { - log_error("receive_protocol no node %d", hd->nodeid); - return; - } - - if (!node->daemon_member) { - log_error("receive_protocol node %d not member", hd->nodeid); - return; - } - - log_debug("receive_protocol from %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - p->daemon_max[0], p->daemon_max[1], - p->daemon_max[2], p->daemon_max[3], - p->daemon_run[0], p->daemon_run[1], - p->daemon_run[2], p->daemon_run[3]); - log_debug("daemon node %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - node->proto.daemon_max[0], node->proto.daemon_max[1], - node->proto.daemon_max[2], node->proto.daemon_max[3], - node->proto.daemon_run[0], 
node->proto.daemon_run[1], - node->proto.daemon_run[2], node->proto.daemon_run[3]); - log_debug("daemon node %d join %llu left %llu local quorum %llu", - hd->nodeid, - (unsigned long long)node->daemon_add_time, - (unsigned long long)node->daemon_rem_time, - (unsigned long long)quorate_time); - - /* checking zero node->daemon_max[0] is a way to tell if we've received - an acceptable (non-stateful) proto message from the node since we - saw it join the daemon cpg */ - - if (hd->nodeid != our_nodeid && - !node->proto.daemon_max[0] && - (p->dr_ver.flags & PV_STATEFUL) && - (our_protocol.dr_ver.flags & PV_STATEFUL)) { - - log_debug("daemon node %d stateful merge", hd->nodeid); - - if (cluster_quorate && node->daemon_rem_time && - quorate_time < node->daemon_rem_time) { - log_debug("daemon node %d kill due to stateful merge", hd->nodeid); - if (!node->killed) - kick_node_from_cluster(hd->nodeid); - node->killed = 1; - } - - /* don't save p->proto into node->proto; we need to come - through here based on zero daemon_max[0] for other proto - messages like this one from the same node */ - - return; - } - - memcpy(&node->proto, p, sizeof(struct protocol)); - - /* if we have zero run values, and this msg has non-zero run values, - then adopt them as ours; otherwise save this proto message */ - - if (our_protocol.daemon_run[0]) - return; - - if (p->daemon_run[0]) { - our_protocol.daemon_run[0] = p->daemon_run[0]; - our_protocol.daemon_run[1] = p->daemon_run[1]; - our_protocol.daemon_run[2] = p->daemon_run[2]; - - our_protocol.kernel_run[0] = p->kernel_run[0]; - our_protocol.kernel_run[1] = p->kernel_run[1]; - our_protocol.kernel_run[2] = p->kernel_run[2]; - - log_debug("run protocol from nodeid %d", hd->nodeid); - } -} - -static void send_protocol(struct protocol *proto) -{ - struct dlm_header *hd; - struct protocol *pr; - char *buf; - int len; - - len = sizeof(struct dlm_header) + sizeof(struct protocol); - buf = malloc(len); - if (!buf) { - log_error("send_protocol no mem 
%d", len); - return; - } - memset(buf, 0, len); - - hd = (struct dlm_header *)buf; - pr = (struct protocol *)(buf + sizeof(*hd)); - - hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); - hd->nodeid = cpu_to_le32(our_nodeid); - - memcpy(pr, proto, sizeof(struct protocol)); - protocol_out(pr); - - _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); -} - -int set_protocol(void) -{ - struct protocol proto; - struct pollfd pollfd; - int sent_proposal = 0; - int rv; - - memset(&pollfd, 0, sizeof(pollfd)); - pollfd.fd = cpg_fd_daemon; - pollfd.events = POLLIN; - - while (1) { - if (our_protocol.daemon_run[0]) - break; - - if (!sent_proposal && all_protocol_messages()) { - /* propose a protocol; look through info from all - nodes and pick the min for both daemon and kernel, - and propose that */ - - sent_proposal = 1; - - /* copy our max values */ - memcpy(&proto, &our_protocol, sizeof(struct protocol)); - - rv = pick_min_protocol(&proto); - if (rv < 0) - return rv; - - log_debug("set_protocol member_count %d propose " - "daemon %u.%u.%u kernel %u.%u.%u", - daemon_member_count, - proto.daemon_run[0], proto.daemon_run[1], - proto.daemon_run[2], proto.kernel_run[0], - proto.kernel_run[1], proto.kernel_run[2]); - - send_protocol(&proto); - } - - /* only process messages/events from daemon cpg until protocol - is established */ - - rv = poll(&pollfd, 1, -1); - if (rv == -1 && errno == EINTR) { - if (daemon_quit) - return -1; - continue; - } - if (rv < 0) { - log_error("set_protocol poll errno %d", errno); - return -1; - } - - if (pollfd.revents & POLLIN) - process_cpg_daemon(0); - if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { - log_error("set_protocol poll revents %u", - pollfd.revents); - return -1; - } - } - - if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || - our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { - log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - 
our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2]); - return -1; - } - - if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || - our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { - log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - return -1; - } - - log_debug("daemon run %u.%u.%u max %u.%u.%u " - "kernel run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2], - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - - send_protocol(&our_protocol); - return 0; -} - -static void deliver_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - uint32_t nodeid, uint32_t pid, - void *data, size_t len) -{ - struct dlm_header *hd; - - if (len < sizeof(*hd)) { - log_error("deliver_cb short message %zd", len); - return; - } - - hd = (struct dlm_header *)data; - dlm_header_in(hd); - - switch (hd->type) { - case DLM_MSG_PROTOCOL: - receive_protocol(hd, len); - break; - default: - log_error("deliver_cb_daemon unknown msg type %d", hd->type); - } -} - -static int in_daemon_member_list(int nodeid) -{ - int i; - - for (i = 0; i < daemon_member_count; i++) { - if (daemon_member[i].nodeid == nodeid) - return 1; - } - return 0; -} - -static void confchg_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t 
joined_list_entries) -{ - struct node_daemon *node; - uint64_t now = monotime(); - int nodedown = 0, procdown = 0, leave = 0; - int i; - - log_config(group_name, member_list, member_list_entries, - left_list, left_list_entries, - joined_list, joined_list_entries); - - for (i = 0; i < left_list_entries; i++) { - if (left_list[i].reason == CPG_REASON_NODEDOWN) - nodedown++; - else if (left_list[i].reason == CPG_REASON_PROCDOWN) - procdown++; - else if (left_list[i].reason == CPG_REASON_LEAVE) - leave++; - } - - if (nodedown || procdown || leave) - log_debug("%s left nodedown %d procdown %d leave %d", - group_name->value, nodedown, procdown, leave); - - if (joined_list_entries) - send_protocol(&our_protocol); - - memset(&daemon_member, 0, sizeof(daemon_member)); - daemon_member_count = member_list_entries; - - for (i = 0; i < member_list_entries; i++) { - daemon_member[i] = member_list[i]; - /* add struct for nodes we've not seen before */ - add_node_daemon(member_list[i].nodeid); - } - - list_for_each_entry(node, &daemon_nodes, list) { - if (in_daemon_member_list(node->nodeid)) { - if (node->daemon_member) - continue; - - /* node joined daemon cpg */ - node->daemon_member = 1; - node->daemon_add_time = now; - } else { - if (!node->daemon_member) - continue; - - /* node left daemon cpg */ - node->daemon_member = 0; - node->killed = 0; - memset(&node->proto, 0, sizeof(struct protocol)); - node->daemon_rem_time = now; - } - } -} - -static void totem_cb_daemon(cpg_handle_t handle, - struct cpg_ring_id ring_id, - uint32_t member_list_entries, - const uint32_t *member_list) -{ - log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); -} - -static cpg_model_v1_data_t cpg_callbacks_daemon = { - .cpg_deliver_fn = deliver_cb_daemon, - .cpg_confchg_fn = confchg_cb_daemon, - .cpg_totem_confchg_fn = totem_cb_daemon, - .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, -}; - -void process_cpg_daemon(int ci) -{ - cs_error_t error; - - error = 
cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); - if (error != CS_OK) - log_error("daemon cpg_dispatch error %d", error); -} - -int setup_cpg_daemon(void) -{ - cs_error_t error; - struct cpg_name name; - int i = 0; - - INIT_LIST_HEAD(&daemon_nodes); - - /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible - with cluster4/RHEL7 */ - - memset(&our_protocol, 0, sizeof(our_protocol)); - - if (cfgd_enable_fscontrol) - our_protocol.daemon_max[0] = 2; - else - our_protocol.daemon_max[0] = 3; - - our_protocol.daemon_max[1] = 1; - our_protocol.daemon_max[2] = 1; - our_protocol.kernel_max[0] = 1; - our_protocol.kernel_max[1] = 1; - our_protocol.kernel_max[2] = 1; - - error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, - (cpg_model_data_t *)&cpg_callbacks_daemon, - NULL); - if (error != CS_OK) { - log_error("daemon cpg_initialize error %d", error); - return -1; - } - - cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_join %s ...", name.value); - retry: - error = cpg_join(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - log_error("daemon cpg_join error retrying"); - goto retry; - } - if (error != CS_OK) { - log_error("daemon cpg_join error %d", error); - goto fail; - } - - log_debug("setup_cpg_daemon %d", cpg_fd_daemon); - return cpg_fd_daemon; - - fail: - cpg_finalize(cpg_handle_daemon); - return -1; -} - -void close_cpg_daemon(void) -{ - struct lockspace *ls; - cs_error_t error; - struct cpg_name name; - int i = 0; - - if (!cpg_handle_daemon) - return; - if (cluster_down) - goto fin; - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_leave %s ...", name.value); - retry: - error = cpg_leave(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - 
log_error("daemon cpg_leave error retrying"); - goto retry; - } - if (error != CS_OK) - log_error("daemon cpg_leave error %d", error); - fin: - list_for_each_entry(ls, &lockspaces, list) { - if (ls->cpg_handle) - cpg_finalize(ls->cpg_handle); - } - cpg_finalize(cpg_handle_daemon); -} - -/* fs_controld has seen nodedown for nodeid; it's now ok for dlm to do - recovery for the failed node */ - int set_fs_notified(struct lockspace *ls, int nodeid) { struct node *node; @@ -2755,12 +1749,8 @@ int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace)
if (cg->state == CGST_WAIT_CONDITIONS) lockspace->cg_next.wait_condition = 5; - if (poll_ringid) - lockspace->cg_next.wait_condition = 1; else if (poll_fencing) lockspace->cg_next.wait_condition = 2; - else if (poll_quorum) - lockspace->cg_next.wait_condition = 3; else if (poll_fs) lockspace->cg_next.wait_condition = 4;
@@ -2795,7 +1785,7 @@ static int _set_node_info(struct lockspace *ls, struct change *cg, int nodeid, if (!n) goto out;
- if (n->check_fencing) + if (n->need_fencing) node->flags |= DLMC_NF_CHECK_FENCING; if (n->check_fs) node->flags |= DLMC_NF_CHECK_FS; diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c new file mode 100644 index 0000000..e5e9270 --- /dev/null +++ b/dlm_controld/daemon_cpg.c @@ -0,0 +1,1921 @@ +/* + * Copyright 2004-2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#include "dlm_daemon.h" + +/* protocol_version flags */ +#define PV_STATEFUL 0x0001 + +struct protocol_version { + uint16_t major; + uint16_t minor; + uint16_t patch; + uint16_t flags; +}; + +struct protocol { + union { + struct protocol_version dm_ver; + uint16_t daemon_max[4]; + }; + union { + struct protocol_version km_ver; + uint16_t kernel_max[4]; + }; + union { + struct protocol_version dr_ver; + uint16_t daemon_run[4]; + }; + union { + struct protocol_version kr_ver; + uint16_t kernel_run[4]; + }; +}; + +/* fence_result flags */ +#define FR_FIPU 0x00000001 +#define FR_CLEAR_STARTUP 0x00000002 +#define FR_CLEAR_FIPU 0x00000004 + +struct fence_result { + uint32_t version; + uint32_t flags; + uint32_t nodeid; + uint32_t result; + uint64_t fence_walltime; + char unused[1000]; +}; + +struct node_daemon { + struct list_head list; + int nodeid; + + uint64_t daemon_add_time; + uint64_t daemon_rem_time; + int daemon_member; + + int killed; + + struct protocol proto; + + struct fence_config fence_config; + + int fence_in_progress_unknown; + int left_reason; + int recover_setup; + int need_fence_clear; + int need_fencing; + int fence_pid; + int fence_pid_wait; + int fence_actor_done; + int fence_actors[MAX_NODES]; + uint64_t fail_walltime; + uint64_t fail_monotime; + uint64_t fence_request_time; + uint64_t fence_walltime; + uint64_t fence_monotime; +}; + +#define REASON_STARTUP_FENCING -1 
+ +static cpg_handle_t cpg_handle_daemon; +static int cpg_fd_daemon; +static struct protocol our_protocol; +static struct list_head daemon_nodes; +static struct list_head startup_nodes; +static struct cpg_address daemon_member[MAX_NODES]; +static struct cpg_address daemon_joined[MAX_NODES]; +static struct cpg_address daemon_remove[MAX_NODES]; +static int daemon_member_count; +static int daemon_joined_count; +static int daemon_remove_count; +static int daemon_ringid_wait; +static struct cpg_ring_id daemon_ringid; +static int daemon_clear_nodeid; +static int daemon_clear_pid; +static uint64_t last_join_monotime; +static uint32_t last_join_seq; +static uint32_t send_fipu_seq; +static int fence_in_progress_unknown = 1; + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + char m_buf[128]; + char j_buf[32]; + char l_buf[32]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + memset(j_buf, 0, sizeof(j_buf)); + memset(l_buf, 0, sizeof(l_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %d", + member_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(j_buf); + pos = 0; + for (i = 0; i < joined_list_entries; i++) { + ret = snprintf(j_buf + pos, len - pos, " %d", + joined_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(l_buf); + pos = 0; + for (i = 0; i < left_list_entries; i++) { + ret = snprintf(l_buf + pos, len - pos, " %d", + left_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + 
log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, + member_list_entries, joined_list_entries, left_list_entries, + m_buf, j_buf, l_buf); +} + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t *member_list, + size_t member_list_entries) +{ + char m_buf[128]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %u", + member_list[i]); + if (ret >= len - pos) + break; + pos += ret; + } + + log_debug("%s ring %u:%llu %zu memb%s", + name, ringid->nodeid, (unsigned long long)ringid->seq, + member_list_entries, m_buf); +} + +const char *reason_str(int reason) +{ + switch (reason) { + case CPG_REASON_JOIN: + return "join"; + case CPG_REASON_LEAVE: + return "leave"; + case CPG_REASON_NODEDOWN: + return "nodedown"; + case CPG_REASON_NODEUP: + return "nodeup"; + case CPG_REASON_PROCDOWN: + return "procdown"; + default: + return "unknown"; + }; +} + +const char *msg_name(int type) +{ + switch (type) { + case DLM_MSG_PROTOCOL: + return "protocol"; + case DLM_MSG_FENCE_RESULT: + return "fence_result"; + case DLM_MSG_FENCE_CLEAR: + return "fence_clear"; + + case DLM_MSG_START: + return "start"; + case DLM_MSG_PLOCK: + return "plock"; + case DLM_MSG_PLOCK_OWN: + return "plock_own"; + case DLM_MSG_PLOCK_DROP: + return "plock_drop"; + case DLM_MSG_PLOCK_SYNC_LOCK: + return "plock_sync_lock"; + case DLM_MSG_PLOCK_SYNC_WAITER: + return "plock_sync_waiter"; + case DLM_MSG_PLOCKS_DATA: + return "plocks_data"; + case DLM_MSG_PLOCKS_DONE: + return "plocks_done"; + case DLM_MSG_DEADLK_CYCLE_START: + return "deadlk_cycle_start"; + case DLM_MSG_DEADLK_CYCLE_END: + return "deadlk_cycle_end"; + case DLM_MSG_DEADLK_CHECKPOINT_READY: + return "deadlk_checkpoint_ready"; + case DLM_MSG_DEADLK_CANCEL_LOCK: + return "deadlk_cancel_lock"; + default: + return "unknown"; + } +} + +static int 
_send_message(cpg_handle_t h, void *buf, int len, int type) +{ + struct iovec iov; + cs_error_t error; + int retries = 0; + + iov.iov_base = buf; + iov.iov_len = len; + + retry: + error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); + if (error == CS_ERR_TRY_AGAIN) { + retries++; + usleep(1000); + if (!(retries % 100)) + log_error("cpg_mcast_joined retry %d %s", + retries, msg_name(type)); + goto retry; + } + if (error != CS_OK) { + log_error("cpg_mcast_joined error %d handle %llx %s", + error, (unsigned long long)h, msg_name(type)); + return -1; + } + + if (retries) + log_debug("cpg_mcast_joined retried %d %s", + retries, msg_name(type)); + + return 0; +} + +/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ + +void dlm_send_message(struct lockspace *ls, char *buf, int len) +{ + struct dlm_header *hd = (struct dlm_header *) buf; + int type = hd->type; + + hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); + hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); + hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); + hd->type = cpu_to_le16(hd->type); + hd->nodeid = cpu_to_le32(our_nodeid); + hd->to_nodeid = cpu_to_le32(hd->to_nodeid); + hd->global_id = cpu_to_le32(ls->global_id); + hd->flags = cpu_to_le32(hd->flags); + hd->msgdata = cpu_to_le32(hd->msgdata); + hd->msgdata2 = cpu_to_le32(hd->msgdata2); + + _send_message(ls->cpg_handle, buf, len, type); +} + +void dlm_header_in(struct dlm_header *hd) +{ + hd->version[0] = le16_to_cpu(hd->version[0]); + hd->version[1] = le16_to_cpu(hd->version[1]); + hd->version[2] = le16_to_cpu(hd->version[2]); + hd->type = le16_to_cpu(hd->type); + hd->nodeid = le32_to_cpu(hd->nodeid); + hd->to_nodeid = le32_to_cpu(hd->to_nodeid); + hd->global_id = le32_to_cpu(hd->global_id); + hd->flags = le32_to_cpu(hd->flags); + hd->msgdata = le32_to_cpu(hd->msgdata); + hd->msgdata2 = le32_to_cpu(hd->msgdata2); +} + +int dlm_header_validate(struct dlm_header *hd, int nodeid) +{ + if (hd->version[0] != 
our_protocol.daemon_run[0] || + hd->version[1] != our_protocol.daemon_run[1]) { + log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", + nodeid, hd->version[0], hd->version[1], + hd->version[2], our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2]); + return -1; + } + + if (hd->nodeid != nodeid) { + log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); + return -1; + } + + return 0; +} + +static struct node_daemon *get_node_daemon(int nodeid) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->nodeid == nodeid) + return node; + } + return NULL; +} + +static int nodes_need_fencing(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->need_fencing) + return 1; + } + return 0; +} + +static int all_daemon_members_fipu(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + if (!node->fence_in_progress_unknown) + return 0; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + node->fence_in_progress_unknown = 0; + } + + return 1; +} + +int fence_node_time(int nodeid, uint64_t *last_fenced) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (!node) + return -1; + + *last_fenced = node->fence_monotime; + return 0; +} + +int fence_in_progress(int *in_progress) +{ + if (fence_in_progress_unknown) { + *in_progress = 1; + } else if (!list_empty(&startup_nodes)) { + *in_progress = 2; + } else if (nodes_need_fencing()) { + *in_progress = 3; + } else { + *in_progress = 0; + } + return 0; +} + +void add_startup_node(int nodeid) +{ + struct node_daemon *node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_startup_node no mem"); + return; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &startup_nodes); +} + +static int 
clear_startup_node(int nodeid, int all) +{ + struct node_daemon *node, *safe; + int count = 0; + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (all || node->nodeid == nodeid) { + list_del(&node->list); + free(node); + count++; + } + } + return count; +} + +static struct node_daemon *add_node_daemon(int nodeid) +{ + struct node_daemon *node; + struct fence_config *fc; + + node = get_node_daemon(nodeid); + if (node) + return node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_node_daemon no mem"); + return NULL; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &daemon_nodes); + + /* TODO: allow the config to be reread */ + + fc = &node->fence_config; + fc->nodeid = nodeid; + + /* explicit command line arg has first priority */ + + if (optd_fence_all_agent) { + fc->dev[0] = &fence_all_device; + goto out; + } + + /* explicit config file setting has second priority */ + + fence_config_init(fc, (unsigned int)nodeid, (char *)CONF_FILE_PATH); + + /* no command line, no config file, use default, third priority */ + + if (!fc->dev[0] && fence_all_agent[0]) + fc->dev[0] = &fence_all_device; + out: + return node; +} + +/* A clean daemon member is a node that has joined the daemon cpg + from a "clean state", i.e. not a stateful merge. If would not + have joined the daemon cpg if it found uncontrolled dlm kernel + state (check_uncontrolled_lockspaces). We would not have + accepted and saved its protocol in node->proto.daemon if it + was a stateful merge. 
*/ + +static int is_clean_daemon_member(int nodeid) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (node && node->daemon_member && node->proto.daemon_max[0]) + return 1; + return 0; +} + +static int in_daemon_list(int nodeid, struct cpg_address *daemon_list, int count) +{ + int i; + + for (i = 0; i < count; i++) { + if (daemon_list[i].nodeid == nodeid) + return 1; + } + return 0; +} + +/* save in node->fence_actors[] any nodeid present when the node + failed which therefore saw it fail, knows it needs fencing, and + can request fencing for it if it becomes the low actor. A node + added in the same change with the removed node does not qualify. */ + +static int set_fence_actors(struct node_daemon *node, int all_memb) +{ + int i, nodeid, count = 0, low = 0; + + memset(node->fence_actors, 0, sizeof(node->fence_actors)); + + for (i = 0; i < daemon_member_count; i++) { + nodeid = daemon_member[i].nodeid; + + if (!all_memb && in_daemon_list(nodeid, daemon_joined, daemon_joined_count)) + continue; + + node->fence_actors[count++] = nodeid; + + if (!low || nodeid < low) + low = nodeid; + } + + log_debug("set_fence_actors for %d low %d count %d", + node->nodeid, low, count); + return low; +} + +static int get_fence_actor(struct node_daemon *node) +{ + int i, low, low_i; + + retry: + low = 0; + + for (i = 0; i < MAX_NODES; i++) { + if (!node->fence_actors[i]) + continue; + + if (!low || node->fence_actors[i] < low) { + low = node->fence_actors[i]; + low_i = i; + } + } + + if (low && !in_daemon_list(low, daemon_member, daemon_member_count)) { + log_debug("get_fence_actor for %d low actor %d is gone", + node->nodeid, low); + + node->fence_actors[low_i] = 0; + goto retry; + } + + return low; +} + +/* if an actor fails to fence, it will send that result, and others + will clear it from the actors, which will cause the next lowest + actor to try */ + +static void clear_fence_actor(int nodeid, int actor) +{ + struct node_daemon *node; + int i; + + node = 
get_node_daemon(nodeid); + if (!node) + return; + + for (i = 0; i < MAX_NODES; i++) { + if (node->fence_actors[i] == actor) { + node->fence_actors[i] = 0; + return; + } + } +} + +/* TODO: handle delayed cleanup of more than one pid */ + +static void fence_pid_cancel(int nodeid, int pid) +{ + struct node_daemon *node; + int rv, result; + + log_debug("fence_pid_cancel nodeid %d pid %d", nodeid, pid); + + kill(pid, SIGKILL); + usleep(500000); + + rv = fence_result(nodeid, pid, &result); + if (rv == -EAGAIN) { + /* Try again later */ + daemon_clear_nodeid = nodeid; + daemon_clear_pid = pid; + } else { + log_debug("fence_pid_cancel nodeid %d pid %d done %d", + nodeid, pid, result); + + daemon_clear_nodeid = 0; + daemon_clear_pid = 0; + + node = get_node_daemon(nodeid); + if (node && node->fence_pid == pid) { + node->fence_pid_wait = 0; + node->fence_pid = 0; + } + } +} + +/* + * fence_in_progress_unknown (fipu) + * + * If current daemon members are fencing someone, and a new node + * joins, that new node needs to wait for the previous members to + * finish any fencing they're doing before it can start a lockspace. + * + * The previous members may be fencing the last node that was using + * the lockspace the new node is going to use, so if it doesn't wait, + * it could start using a lockspace with an unfenced user. + * + * So, the daemon starts with fence_in_progress_unknown set to + * indicate that other nodes may be fencing someone, and it won't + * start any lockspaces until it is clear. + * + * A node starts with fence_in_progress_unknown set and won't + * start any lockspaces until it's clear. + * + * When using startup_fencing: + * + * . When all nodes start up together, all have fipu set, + * and will go through startup fencing, which will eventually + * result in all nodes either being clean daemon members or fenced, + * so everyone will clear fipu by seeing that. + * + * . The more common case is when a new node joins other previously + * running nodes. 
The new node needs to be told that the others + * have no outstanding fencing ops before it can clear fipu. + * A previous member does send_fence_clear(0) to a new node once + * all fencing is complete. The two flags in send_fence_clear are + * usually sent together but may sometimes may be in separate messages: + * send_fence_clear(0, CLEAR_STARTUP) to clear startup_nodes right away + * send_fence_clear(0, CLEAR_FIPU) to clear fipu once all fencing is done + * + * When not using startup_fencing: + * + * . When all nodes start up together, all have fipu set, and all + * will be waiting to receive_fence_clear from a previous node + * in order to clear it. The nodes need to detect this situation, + * and when they do, they will know that everyone is in startup, + * so there can be no pending fencing on a previous node, so all + * can clear fipu. To detect this case, when a node starts up + * with !startup_fence, it sends a special send_fence_clear(-ENODATA, FIPU) + * message about itself to indicate it has fipu set and needs it cleared. + * After sending this, it checks to see if all present nodes have sent + * this same message about themselves. If so, then this startup + * case has been detected, an all will clear fipu. + * + * . New nodes that join after this startup initialization will be + * handled the same as when startup_fencing is set (above). 
+ * + * + * startup_fencing + * --------------- + * + * case A + * all nodes start up, + * all have fipu set, + * all wait for startup_nodes to be empty, (joined or moved to need_fencing) + * all wait for no daemon_nodes to need_fencing, (joined or were fenced) + * all clear fipu + * + * later, + * + * case B + * new node starts, + * new node has fipu set, + * cur node sees need_fence_clear on new node + * cur node sees no pending fencing ops, + * cur node send_fence_clear(0) to new node, + * new node clears startup_nodes and fipu + * + * !startup_fencing + * ---------------- + * + * case C + * all nodes start up, + * all have fipu set, + * all send_fence_clear(-ENODATA,FIPU), + * all receive_fence_clear(-ENODATA,FIPU) from everyone, + * all_daemon_members_fipu() is 1, + * all clear fipu + * + * later same as case B above + */ + +/* + * TODO: limit to one agent running at once, in case both + * instances need to log into the same switch, for example. + */ + +static void daemon_fence_work(void) +{ + struct node_daemon *node, *safe; + int rv, nodeid, pid, need, low, actor, result; + uint32_t flags; + + if (daemon_ringid_wait) { + /* We've seen a nodedown confchg callback, but not the + corresponding ringid callback. 
*/ + log_debug("fence work wait for cpg ringid"); + return; + } + + if (cluster_ringid_seq != daemon_ringid.seq) { + /* wait for ringids to be in sync */ + log_debug("fence work wait for cluster ringid"); + return; + } + + /* poll_fencing++; */ + + if (cfgd_enable_quorum_fencing && !cluster_quorate) { + /* wait for quorum before doing any fencing, but if there + is none, send_fence_clear below can unblock new nodes */ + log_debug("fence work wait for quorum"); + goto out_fipu; + } + + /* + * startup fencing + */ + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (is_clean_daemon_member(node->nodeid)) { + log_debug("fence startup %d member skip", node->nodeid); + list_del(&node->list); + free(node); + continue; + } + + if (!cfgd_startup_fence) + continue; + + if (monotime() - last_join_monotime < cfgd_startup_fence) { + log_debug("fence startup %d delay %d from %llu", + node->nodeid, cfgd_startup_fence, + (unsigned long long)last_join_monotime); + poll_fencing++; + continue; + } + + /* clear this entry and create a daemon_nodes entry with + need_fencing and the fence loops below will handle it */ + + nodeid = node->nodeid; + list_del(&node->list); + free(node); + + node = add_node_daemon(nodeid); + if (!node) { + log_debug("fence startup %d add failed", nodeid); + continue; + } + if (node->need_fencing) { + /* don't think this should happen? 
*/ + log_error("fence startup %d already set", nodeid); + continue; + } + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = REASON_STARTUP_FENCING; + node->fail_monotime = cluster_joined_monotime - 1; + node->fail_walltime = cluster_joined_walltime - 1; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 1); + + log_debug("fence startup nodeid %d act %d", node->nodeid, low); + } + + /* + * request fencing + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (node->fence_pid_wait) + continue; + + if (is_clean_daemon_member(node->nodeid)) { + /* node rejoined cleanly, doesn't need fencing */ + log_debug("fence request %d member skip", node->nodeid); + node->need_fencing = 0; + continue; + } + + /* get_fence_actor picks the low nodeid that existed + when node failed and is still around. if the current + actor fails, get_fence_actor will not find it in the + members list, will clear it, and return the next actor */ + + actor = get_fence_actor(node); + + if (!actor) { + log_error("fence request %d no actor", node->nodeid); + continue; + } + + if (actor != our_nodeid) { + log_debug("fence request %d defer to %d", + node->nodeid, actor); + continue; + } + + log_debug("fence request %d", node->nodeid); + + rv = fence_request(node->nodeid, + node->fail_walltime, node->fail_monotime, + &node->fence_config, &pid); + if (rv < 0) { + send_fence_result(node->nodeid, rv, 0, time(NULL)); + continue; + } + + node->fence_pid_wait = 1; + node->fence_pid = pid; + node->fence_request_time = monotime(); + } + + /* + * check outstanding fence requests + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (!node->fence_pid_wait) { + /* + * another node is the actor, or we were actor, + * sent done msg and are waiting to recv it + */ + log_debug("fence wait %d for done", node->nodeid); + continue; + 
} + + if (is_clean_daemon_member(node->nodeid)) { + /* + * node has rejoined in clean state so we can + * abort outstanding fence op for it. all nodes + * will see and do this, so we don't need to send + * a fence result. + */ + log_debug("fence wait %d member skip", node->nodeid); + node->need_fencing = 0; + node->fence_walltime = time(NULL); + node->fence_monotime = monotime(); + fence_pid_cancel(node->nodeid, node->fence_pid); + continue; + } + + poll_fencing++; + + rv = fence_result(node->nodeid, node->fence_pid, &result); + if (rv == -EAGAIN) { + /* agent pid is still running */ + log_debug("fence wait %d pid %d running", + node->nodeid, node->fence_pid); + continue; + } + + if (rv < 0) { + /* shouldn't happen */ + log_error("fence wait %d pid %d error %d", + node->nodeid, node->fence_pid_wait, rv); + node->fence_pid_wait = 0; + continue; + } + + if (!result) { + /* agent exit 0, if there's another agent to run in + parallel, set it to run next, otherwise success */ + + log_debug("fence nodeid %d pid %d succeeded", + node->nodeid, node->fence_pid); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_parallel(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, 0, 0, time(NULL)); + } else { + /* agent exit 1, if there's another agent to run at + next priority, set it to run next, otherwise fail */ + + log_debug("fence nodeid %d pid %d failed %d", + node->nodeid, node->fence_pid, result); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_priority(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, result, 0, time(NULL)); + } + } + + /* + * clear fence_in_progress_unknown + */ + out_fipu: + need = nodes_need_fencing(); + + if (cfgd_startup_fence && fence_in_progress_unknown && !need && list_empty(&startup_nodes)) { + /* + * case A in comment above + * all nodes are starting and have fipu set, they all do + * startup fencing, and eventually see unknown nodes become + * 
members or get fenced, so all clear fipu for themselves. + */ + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 startup"); + } + + if (!fence_in_progress_unknown) { + /* + * case B in comment above + * some cur nodes have fipu clear, new nodes have fipu set. + * A current node needs to send_fence_clear to the new nodes + * once all fencing is done so they clear fipu. + */ + low = 0; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || node->need_fence_clear) + continue; + if (!low || node->nodeid < low) + low = node->nodeid; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || !node->need_fence_clear) + continue; + if (node->nodeid == our_nodeid) { + node->need_fence_clear = 0; + continue; + } + if (low != our_nodeid) + continue; + + flags = 0; + + if (node->need_fence_clear & FR_CLEAR_STARTUP) { + flags |= FR_CLEAR_STARTUP; + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + } + + if ((node->need_fence_clear & FR_CLEAR_FIPU) && !need) { + flags |= FR_CLEAR_FIPU; + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } + + if (!flags) + continue; + + send_fence_clear(node->nodeid, 0, flags, 0); + } + } + + if (!cfgd_startup_fence && fence_in_progress_unknown) { + /* + * case C in comment above + * all nodes are starting and have fipu set. All expect a + * previous node to send_fence_clear so they can clear fipu. + * But there are no previous nodes. They need to detect this + * condition. Each node does send_fence_clear(ENODATA,FIPU). + * When all have received this from all, condition is + * detected and all clear fipu. 
+ */ + if (all_daemon_members_fipu()) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 all_fipu"); + } else if (last_join_seq > send_fipu_seq) { + /* the seq numbers keep us from spamming this msg */ + send_fence_clear(our_nodeid, -ENODATA, FR_FIPU, 0); + log_debug("send_fence_clear %d fipu", our_nodeid); + send_fipu_seq = last_join_seq; + } + } + + /* + * clean up a zombie pid from an agent we killed + */ + + if (daemon_clear_pid) + fence_pid_cancel(daemon_clear_nodeid, daemon_clear_pid); +} + +void process_fencing_changes(void) +{ + poll_fencing = 0; + daemon_fence_work(); +} + +static void receive_fence_clear(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_clear invalid len %d from %d", + len, hd->nodeid); + return; + } + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_clear from %d no daemon node %d", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_clear from %d for %d result %d flags %x", + hd->nodeid, fr->nodeid, fr->result, fr->flags); + + /* + * A node sends this message about itself indicating that it's in + * startup with fipu set. The only time we care about node->fipu + * is when all nodes are fipu in startup. node->need_fence_clear + * and node->fipu are not related, they address different cases. 
+ */ + if ((fr->result == -ENODATA) && (fr->flags & FR_FIPU)) { + if (!fence_in_progress_unknown) + return; + + node->fence_in_progress_unknown = 1; + return; + } + + /* + * A previous member sends this to new members to tell them that + * they can clear startup_nodes and clear fipu. These two flags + * may come in separate messages if there is a pending fencing op + * when the new member joins (CLEAR_STARTUP will come right away, + * but CLEAR_FIPU will come once the fencing op is done.) + */ + if (!fr->result && (node->nodeid == our_nodeid)) { + if ((fr->flags & FR_CLEAR_STARTUP) && !list_empty(&startup_nodes)) { + count = clear_startup_node(0, 1); + log_debug("clear_startup_nodes %d", count); + } + + if ((fr->flags & FR_CLEAR_FIPU) && fence_in_progress_unknown) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 recv"); + } + } + + /* this node doesn't need these flags any more */ + if (!fr->result) { + if (fr->flags & FR_CLEAR_STARTUP) + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + if (fr->flags & FR_CLEAR_FIPU) + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } +} + +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_clear no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_CLEAR); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_CLEAR); +} + +static void receive_fence_result(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int 
count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_result invalid len %d from %d", + len, hd->nodeid); + return; + } + + count = clear_startup_node(fr->nodeid, 0); + if (count) { + log_debug("receive_fence_result from %d for %d clear startup", + hd->nodeid, fr->nodeid); + } + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_result from %d for %d no daemon node", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_result from %d for %d result %d walltime %llu", + hd->nodeid, fr->nodeid, fr->result, + (unsigned long long)fr->fence_walltime); + + if (!node->need_fencing) { + /* should never happen */ + log_error("receive_fence_result from %d for %d result %d no need_fencing", + hd->nodeid, fr->nodeid, fr->result); + return; + } + + if (fr->result == -ECANCELED) { + /* if an agent pid is running, kill it and clean up */ + if (node->fence_pid_wait && node->fence_pid) + fence_pid_cancel(node->nodeid, node->fence_pid); + fr->result = 0; /* force success below */ + } + + if (!fr->result) { + node->need_fencing = 0; + node->fence_walltime = fr->fence_walltime; + node->fence_monotime = monotime(); + node->fence_actor_done = hd->nodeid; + } else { + clear_fence_actor(fr->nodeid, hd->nodeid); + } +} + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_result no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + 
sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_RESULT); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_RESULT); +} + +void fence_ack_node(int nodeid) +{ + send_fence_result(nodeid, -ECANCELED, 0, time(NULL)); +} + +void set_protocol_stateful(void) +{ + our_protocol.dr_ver.flags |= PV_STATEFUL; +} + +static void pv_in(struct protocol_version *pv) +{ + pv->major = le16_to_cpu(pv->major); + pv->minor = le16_to_cpu(pv->minor); + pv->patch = le16_to_cpu(pv->patch); + pv->flags = le16_to_cpu(pv->flags); +} + +static void pv_out(struct protocol_version *pv) +{ + pv->major = cpu_to_le16(pv->major); + pv->minor = cpu_to_le16(pv->minor); + pv->patch = cpu_to_le16(pv->patch); + pv->flags = cpu_to_le16(pv->flags); +} + +static void protocol_in(struct protocol *proto) +{ + pv_in(&proto->dm_ver); + pv_in(&proto->km_ver); + pv_in(&proto->dr_ver); + pv_in(&proto->kr_ver); +} + +static void protocol_out(struct protocol *proto) +{ + pv_out(&proto->dm_ver); + pv_out(&proto->km_ver); + pv_out(&proto->dr_ver); + pv_out(&proto->kr_ver); +} + +/* go through member list saved in last confchg, see if we have received a + proto message from each */ + +static int all_protocol_messages(void) +{ + struct node_daemon *node; + int i; + + if (!daemon_member_count) + return 0; + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("all_protocol_messages no node %d", + daemon_member[i].nodeid); + return 0; + } + + if (!node->proto.daemon_max[0]) + return 0; + } + return 1; +} + +static int pick_min_protocol(struct protocol *proto) +{ + uint16_t mind[4]; + uint16_t mink[4]; + struct node_daemon *node; + int i; + + memset(&mind, 0, sizeof(mind)); + memset(&mink, 0, sizeof(mink)); + + /* first choose the minimum major 
*/ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("pick_min_protocol no node %d", + daemon_member[i].nodeid); + return -1; + } + + if (!mind[0] || node->proto.daemon_max[0] < mind[0]) + mind[0] = node->proto.daemon_max[0]; + + if (!mink[0] || node->proto.kernel_max[0] < mink[0]) + mink[0] = node->proto.kernel_max[0]; + } + + if (!mind[0] || !mink[0]) { + log_error("pick_min_protocol zero major number"); + return -1; + } + + /* second pick the minimum minor with the chosen major */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0]) { + if (!mind[1] || node->proto.daemon_max[1] < mind[1]) + mind[1] = node->proto.daemon_max[1]; + } + + if (mink[0] == node->proto.kernel_max[0]) { + if (!mink[1] || node->proto.kernel_max[1] < mink[1]) + mink[1] = node->proto.kernel_max[1]; + } + } + + if (!mind[1] || !mink[1]) { + log_error("pick_min_protocol zero minor number"); + return -1; + } + + /* third pick the minimum patch with the chosen major.minor */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0] && + mind[1] == node->proto.daemon_max[1]) { + if (!mind[2] || node->proto.daemon_max[2] < mind[2]) + mind[2] = node->proto.daemon_max[2]; + } + + if (mink[0] == node->proto.kernel_max[0] && + mink[1] == node->proto.kernel_max[1]) { + if (!mink[2] || node->proto.kernel_max[2] < mink[2]) + mink[2] = node->proto.kernel_max[2]; + } + } + + if (!mind[2] || !mink[2]) { + log_error("pick_min_protocol zero patch number"); + return -1; + } + + memcpy(&proto->daemon_run, &mind, sizeof(mind)); + memcpy(&proto->kernel_run, &mink, sizeof(mink)); + return 0; +} + +static void receive_protocol(struct dlm_header *hd, int len) +{ + struct protocol *p; + struct node_daemon *node; + int 
new = 0; + + p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); + protocol_in(p); + + if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { + log_error("receive_protocol invalid len %d from %d", + len, hd->nodeid); + return; + } + + /* zero is an invalid version value */ + + if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || + !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { + log_error("receive_protocol invalid max value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], + p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); + return; + } + + /* the run values will be zero until a version is set, after + which none of the run values can be zero */ + + if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || + !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { + log_error("receive_protocol invalid run value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], + p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); + return; + } + + /* save this node's proto so we can tell when we've got all, and + use it to select a minimum protocol from all */ + + node = get_node_daemon(hd->nodeid); + if (!node) { + log_error("receive_protocol no node %d", hd->nodeid); + return; + } + + if (!node->daemon_member) { + log_error("receive_protocol node %d not member", hd->nodeid); + return; + } + + log_debug("receive_protocol %d max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + p->daemon_max[0], p->daemon_max[1], + p->daemon_max[2], p->daemon_max[3], + p->daemon_run[0], p->daemon_run[1], + p->daemon_run[2], p->daemon_run[3]); + + if (memcmp(&node->proto, p, sizeof(struct protocol))) { + log_debug("daemon node %d prot max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + 
node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + new = 1; + } + + /* checking zero node->daemon_max[0] is a way to tell if we've received + an acceptable (non-stateful) proto message from the node since we + saw it join the daemon cpg */ + + if (hd->nodeid != our_nodeid && + !node->proto.daemon_max[0] && + (p->dr_ver.flags & PV_STATEFUL) && + (our_protocol.dr_ver.flags & PV_STATEFUL)) { + + log_debug("daemon node %d stateful merge", hd->nodeid); + log_debug("daemon node %d join %llu left %llu local quorum %llu", + hd->nodeid, + (unsigned long long)node->daemon_add_time, + (unsigned long long)node->daemon_rem_time, + (unsigned long long)cluster_quorate_monotime); + + if (cluster_quorate && node->daemon_rem_time && + cluster_quorate_monotime < node->daemon_rem_time) { + log_debug("daemon node %d kill due to stateful merge", hd->nodeid); + if (!node->killed) + kick_node_from_cluster(hd->nodeid); + node->killed = 1; + } + + /* don't save p->proto into node->proto; we need to come + through here based on zero daemon_max[0] for other proto + messages like this one from the same node */ + + return; + } + + if (new) { + memcpy(&node->proto, p, sizeof(struct protocol)); + + log_debug("daemon node %d save max %u.%u.%u.%x run %u.%u.%u.%x", + node->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + } + + /* if we have zero run values, and this msg has non-zero run values, + then adopt them as ours; otherwise save this proto message */ + + if (our_protocol.daemon_run[0]) + return; + + if (p->daemon_run[0]) { + our_protocol.daemon_run[0] = p->daemon_run[0]; + our_protocol.daemon_run[1] = p->daemon_run[1]; + our_protocol.daemon_run[2] = p->daemon_run[2]; + + our_protocol.kernel_run[0] = p->kernel_run[0]; + our_protocol.kernel_run[1] = 
p->kernel_run[1]; + our_protocol.kernel_run[2] = p->kernel_run[2]; + + log_debug("run protocol from nodeid %d", hd->nodeid); + } +} + +static void send_protocol(struct protocol *proto) +{ + struct dlm_header *hd; + struct protocol *pr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct protocol); + buf = malloc(len); + if (!buf) { + log_error("send_protocol no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + pr = (struct protocol *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); + hd->nodeid = cpu_to_le32(our_nodeid); + + memcpy(pr, proto, sizeof(struct protocol)); + protocol_out(pr); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); +} + +int set_protocol(void) +{ + struct protocol proto; + struct pollfd pollfd; + int sent_proposal = 0; + int rv; + + memset(&pollfd, 0, sizeof(pollfd)); + pollfd.fd = cpg_fd_daemon; + pollfd.events = POLLIN; + + while (1) { + if (our_protocol.daemon_run[0]) + break; + + if (!sent_proposal && all_protocol_messages()) { + /* propose a protocol; look through info from all + nodes and pick the min for both daemon and kernel, + and propose that */ + + sent_proposal = 1; + + /* copy our max values */ + memcpy(&proto, &our_protocol, sizeof(struct protocol)); + + rv = pick_min_protocol(&proto); + if (rv < 0) + return rv; + + log_debug("set_protocol member_count %d propose " + "daemon %u.%u.%u kernel %u.%u.%u", + daemon_member_count, + proto.daemon_run[0], proto.daemon_run[1], + proto.daemon_run[2], proto.kernel_run[0], + proto.kernel_run[1], proto.kernel_run[2]); + + send_protocol(&proto); + } + + /* only process messages/events from daemon cpg until protocol + is established */ + + rv = poll(&pollfd, 1, -1); + if (rv == -1 && errno == EINTR) { + if (daemon_quit) + return -1; + continue; + } + if (rv < 0) { + log_error("set_protocol poll errno %d", errno); + return -1; + } + + if (pollfd.revents & POLLIN) + process_cpg_daemon(0); + if 
(pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { + log_error("set_protocol poll revents %u", + pollfd.revents); + return -1; + } + } + + if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || + our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { + log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2]); + return -1; + } + + if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || + our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { + log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + return -1; + } + + log_debug("daemon run %u.%u.%u max %u.%u.%u " + "kernel run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2], + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + + send_protocol(&our_protocol); + return 0; +} + +static void deliver_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + uint32_t nodeid, uint32_t pid, + void *data, size_t len) +{ + struct dlm_header *hd; + + if (len < sizeof(*hd)) { + log_error("deliver_cb short message %zd", len); + return; + } + + hd = (struct dlm_header *)data; + dlm_header_in(hd); + + switch (hd->type) { + case DLM_MSG_PROTOCOL: + receive_protocol(hd, len); + break; + case DLM_MSG_FENCE_RESULT: + receive_fence_result(hd, len); + break; + case DLM_MSG_FENCE_CLEAR: + receive_fence_clear(hd, len); + break; + default: + 
log_error("deliver_cb_daemon unknown msg type %d", hd->type); + } + + daemon_fence_work(); +} + +static void confchg_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + struct node_daemon *node; + uint64_t now, now_wall; + int nodedown = 0, procdown = 0, leave = 0; + int we_joined = 0; + int i, reason, low; + + now = monotime(); + now_wall = time(NULL); + + log_config(group_name, member_list, member_list_entries, + left_list, left_list_entries, + joined_list, joined_list_entries); + + memset(&daemon_member, 0, sizeof(daemon_member)); + daemon_member_count = member_list_entries; + + for (i = 0; i < member_list_entries; i++) { + daemon_member[i] = member_list[i]; + /* add struct for nodes we've not seen before */ + add_node_daemon(member_list[i].nodeid); + } + + memset(&daemon_joined, 0, sizeof(daemon_joined)); + daemon_joined_count = joined_list_entries; + + for (i = 0; i < joined_list_entries; i++) { + daemon_joined[i] = joined_list[i]; + if (joined_list[i].nodeid == our_nodeid) + we_joined = 1; + } + + memset(&daemon_remove, 0, sizeof(daemon_remove)); + daemon_remove_count = left_list_entries; + + for (i = 0; i < left_list_entries; i++) { + daemon_remove[i] = left_list[i]; + + if (left_list[i].reason == CPG_REASON_NODEDOWN) + nodedown++; + else if (left_list[i].reason == CPG_REASON_PROCDOWN) + procdown++; + else if (left_list[i].reason == CPG_REASON_LEAVE) + leave++; + } + + if (nodedown || procdown || leave) + log_debug("%s left nodedown %d procdown %d leave %d", + group_name->value, nodedown, procdown, leave); + + if (nodedown) + daemon_ringid_wait = 1; + + if (joined_list_entries) + send_protocol(&our_protocol); + + list_for_each_entry(node, &daemon_nodes, list) { + if (in_daemon_list(node->nodeid, daemon_member, daemon_member_count)) 
{ + if (node->daemon_member) + continue; + + /* node joined daemon cpg */ + node->daemon_member = 1; + node->daemon_add_time = now; + + last_join_monotime = now; + last_join_seq++; + + /* a joining node shows prev members in joined list */ + if (!we_joined) + node->need_fence_clear = FR_CLEAR_STARTUP|FR_CLEAR_FIPU; + + if (node->need_fencing) { + /* need_fencing will be cleared if we accept a + valid proto from it */ + log_error("daemon new nodeid %d needs fencing", + node->nodeid); + } + + } else { + if (!node->daemon_member) + continue; + + /* node left daemon cpg */ + node->daemon_member = 0; + node->killed = 0; + memset(&node->proto, 0, sizeof(struct protocol)); + node->daemon_rem_time = now; + + /* tell loop below to look at this node */ + node->recover_setup = 1; + } + } + + /* set up recovery work for nodes that just failed */ + + /* TODO: limit to nodes with a valid proto? + * node_history_lockspace_fail() would only set + * need_fencing if node->start_time was non-zero. */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->recover_setup) + continue; + + node->recover_setup = 0; + reason = 0; + low = 0; + + if (!cfgd_enable_fencing) + continue; + + if (node->need_fencing) { + log_error("daemon remove nodeid %d already needs fencing", + node->nodeid); + continue; + } + + for (i = 0; i < left_list_entries; i++) { + if (left_list[i].nodeid != node->nodeid) + continue; + reason = left_list[i].reason; + break; + } + + if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) { + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = reason; + node->fail_monotime = now; + node->fail_walltime = now_wall; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 0); + } + + log_debug("daemon remove %d %s need_fencing %d low %d", + node->nodeid, reason_str(reason), node->need_fencing, low); + } + + daemon_fence_work(); +} + +static void 
totem_cb_daemon(cpg_handle_t handle, + struct cpg_ring_id ring_id, + uint32_t member_list_entries, + const uint32_t *member_list) +{ + daemon_ringid.nodeid = ring_id.nodeid; + daemon_ringid.seq = ring_id.seq; + daemon_ringid_wait = 0; + + log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); + + daemon_fence_work(); +} + +static cpg_model_v1_data_t cpg_callbacks_daemon = { + .cpg_deliver_fn = deliver_cb_daemon, + .cpg_confchg_fn = confchg_cb_daemon, + .cpg_totem_confchg_fn = totem_cb_daemon, + .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, +}; + +void process_cpg_daemon(int ci) +{ + cs_error_t error; + + error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); + if (error != CS_OK) + log_error("daemon cpg_dispatch error %d", error); +} + +int setup_cpg_daemon(void) +{ + cs_error_t error; + struct cpg_name name; + int i = 0; + + /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible + with cluster4/RHEL7 */ + + memset(&our_protocol, 0, sizeof(our_protocol)); + + if (cfgd_enable_fscontrol) + our_protocol.daemon_max[0] = 2; + else + our_protocol.daemon_max[0] = 3; + + our_protocol.daemon_max[1] = 1; + our_protocol.daemon_max[2] = 1; + our_protocol.kernel_max[0] = 1; + our_protocol.kernel_max[1] = 1; + our_protocol.kernel_max[2] = 1; + + error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, + (cpg_model_data_t *)&cpg_callbacks_daemon, + NULL); + if (error != CS_OK) { + log_error("daemon cpg_initialize error %d", error); + return -1; + } + + cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_join %s ...", name.value); + retry: + error = cpg_join(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_join error retrying"); + goto retry; + } + if (error != CS_OK) { + log_error("daemon cpg_join error %d", error); + goto fail; + } + + 
log_debug("setup_cpg_daemon %d", cpg_fd_daemon); + return cpg_fd_daemon; + + fail: + cpg_finalize(cpg_handle_daemon); + return -1; +} + +void close_cpg_daemon(void) +{ + struct lockspace *ls; + cs_error_t error; + struct cpg_name name; + int i = 0; + + if (!cpg_handle_daemon) + return; + if (cluster_down) + goto fin; + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_leave %s ...", name.value); + retry: + error = cpg_leave(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_leave error retrying"); + goto retry; + } + if (error != CS_OK) + log_error("daemon cpg_leave error %d", error); + fin: + list_for_each_entry(ls, &lockspaces, list) { + if (ls->cpg_handle) + cpg_finalize(ls->cpg_handle); + } + cpg_finalize(cpg_handle_daemon); +} + +void init_daemon(void) +{ + INIT_LIST_HEAD(&daemon_nodes); + INIT_LIST_HEAD(&startup_nodes); + +} diff --git a/dlm_controld/dlm_controld.h b/dlm_controld/dlm_controld.h index 638b0de..96b425c 100644 --- a/dlm_controld/dlm_controld.h +++ b/dlm_controld/dlm_controld.h @@ -29,6 +29,7 @@ #define DLMC_CMD_FS_NOTIFIED 9 #define DLMC_CMD_DEADLOCK_CHECK 10 #define DLMC_CMD_DUMP_LOG_PLOCK 11 +#define DLMC_CMD_FENCE_ACK 12
struct dlmc_header { unsigned int magic; diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index 5fca041..5e83db3 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -9,16 +9,18 @@ #ifndef __DLM_DAEMON_DOT_H__ #define __DLM_DAEMON_DOT_H__
-#include <sys/types.h> #include <asm/types.h> +#include <sys/types.h> #include <sys/uio.h> -#include <netinet/in.h> #include <sys/socket.h> #include <sys/un.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <sys/utsname.h> #include <sys/poll.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netinet/in.h> #include <netinet/in.h> #include <arpa/inet.h> #include <net/if.h> @@ -37,7 +39,6 @@ #include <syslog.h> #include <sched.h> #include <signal.h> -#include <sys/time.h> #include <dirent.h>
#include <corosync/cpg.h> @@ -45,6 +46,7 @@ #include <linux/dlmconstants.h> #include "libdlmcontrol.h" #include "dlm_controld.h" +#include "fence_config.h" #include "list.h" #include "rbtree.h" #include "linux_endian.h" @@ -60,9 +62,9 @@
/* TODO: get CONFDIR, LOGDIR, RUNDIR from build */
-#define RUNDIR "/var/run/cluster" -#define LOGDIR "/var/log/cluster" -#define CONFDIR "/etc" +#define RUNDIR "/var/run/dlm" +#define LOGDIR "/var/log/dlm" +#define CONFDIR "/etc/dlm"
#define RUN_FILE_NAME "dlm_controld.pid" #define LOG_FILE_NAME "dlm_controld.log" @@ -80,7 +82,8 @@
#define DEFAULT_DEBUG_LOGFILE 0 #define DEFAULT_ENABLE_FENCING 1 -#define DEFAULT_ENABLE_QUORUM 0 +#define DEFAULT_ENABLE_QUORUM_FENCING 1 +#define DEFAULT_ENABLE_QUORUM_LOCKSPACE 0 #define DEFAULT_ENABLE_FSCONTROL 0 #define DEFAULT_ENABLE_PLOCK 1 #define DEFAULT_PLOCK_DEBUG 0 @@ -89,6 +92,8 @@ #define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */ #define DEFAULT_DROP_RESOURCES_COUNT 10 #define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */ +#define DEFAULT_FENCE_ALL_AGENT "dlm_stonith" +#define DEFAULT_STARTUP_FENCE 30
/* DLM_LOCKSPACE_LEN: maximum lockspace name length, from linux/dlmconstants.h. Copied in libdlm.h so apps don't need to include the kernel header. @@ -118,9 +123,8 @@ EXTERN int daemon_debug_opt; EXTERN int daemon_quit; EXTERN int cluster_down; -EXTERN int poll_ringid; +EXTERN int poll_lockspaces; EXTERN int poll_fencing; -EXTERN int poll_quorum; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; @@ -128,20 +132,24 @@ EXTERN int plock_fd; EXTERN int plock_ci; EXTERN struct list_head lockspaces; EXTERN int cluster_quorate; -EXTERN uint64_t quorate_time; +EXTERN uint64_t cluster_quorate_monotime; +EXTERN uint64_t cluster_joined_monotime; +EXTERN uint64_t cluster_joined_walltime; EXTERN uint32_t cluster_ringid_seq; EXTERN char cluster_name[DLM_LOCKSPACE_LEN+1]; EXTERN int our_nodeid; EXTERN uint32_t control_minor; EXTERN uint32_t monitor_minor; EXTERN uint32_t plock_minor; +EXTERN struct fence_device fence_all_device;
EXTERN int optk_debug; EXTERN int optk_timewarn; EXTERN int optk_protocol; EXTERN int optd_debug_logfile; EXTERN int optd_enable_fencing; -EXTERN int optd_enable_quorum; +EXTERN int optd_enable_quorum_fencing; +EXTERN int optd_enable_quorum_lockspace; EXTERN int optd_enable_fscontrol; EXTERN int optd_enable_plock; EXTERN int optd_plock_debug; @@ -150,13 +158,16 @@ EXTERN int optd_plock_ownership; EXTERN int optd_drop_resources_time; EXTERN int optd_drop_resources_count; EXTERN int optd_drop_resources_age; +EXTERN int optd_startup_fence; +EXTERN int optd_fence_all_agent;
EXTERN int cfgk_debug; EXTERN int cfgk_timewarn; EXTERN int cfgk_protocol; EXTERN int cfgd_debug_logfile; EXTERN int cfgd_enable_fencing; -EXTERN int cfgd_enable_quorum; +EXTERN int cfgd_enable_quorum_fencing; +EXTERN int cfgd_enable_quorum_lockspace; EXTERN int cfgd_enable_fscontrol; EXTERN int cfgd_enable_plock; EXTERN int cfgd_plock_debug; @@ -165,6 +176,8 @@ EXTERN int cfgd_plock_ownership; EXTERN int cfgd_drop_resources_time; EXTERN int cfgd_drop_resources_count; EXTERN int cfgd_drop_resources_age; +EXTERN int cfgd_startup_fence; +EXTERN char fence_all_agent[PATH_MAX];
#define LOG_DUMP_SIZE DLMC_DUMP_SIZE
@@ -194,7 +207,9 @@ enum { DLM_MSG_DEADLK_CYCLE_START, DLM_MSG_DEADLK_CYCLE_END, DLM_MSG_DEADLK_CHECKPOINT_READY, - DLM_MSG_DEADLK_CANCEL_LOCK + DLM_MSG_DEADLK_CANCEL_LOCK, + DLM_MSG_FENCE_RESULT, + DLM_MSG_FENCE_CLEAR, };
/* dlm_header flags */ @@ -288,15 +303,10 @@ void setup_config(int update); int get_weight(int nodeid, char *lockspace);
/* cpg.c */ -int setup_cpg_daemon(void); -void close_cpg_daemon(void); -void process_cpg_daemon(int ci); -int set_protocol(void); void process_lockspace_changes(void); -void dlm_send_message(struct lockspace *ls, char *buf, int len); +void process_fencing_changes(void); int dlm_join_lockspace(struct lockspace *ls); int dlm_leave_lockspace(struct lockspace *ls); -const char *msg_name(int type); void update_flow_control_status(void); int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node); int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace); @@ -305,6 +315,36 @@ int set_lockspace_nodes(struct lockspace *ls, int option, int *node_count, struct dlmc_node **nodes_out); int set_fs_notified(struct lockspace *ls, int nodeid);
+/* daemon_cpg.c */ +void init_daemon(void); +void fence_ack_node(int nodeid); +void add_startup_node(int nodeid); +const char *reason_str(int reason); +const char *msg_name(int type); +void dlm_send_message(struct lockspace *ls, char *buf, int len); +void dlm_header_in(struct dlm_header *hd); +int dlm_header_validate(struct dlm_header *hd, int nodeid); +int fence_node_time(int nodeid, uint64_t *last_fenced); +int fence_in_progress(int *in_progress); +int setup_cpg_daemon(void); +void close_cpg_daemon(void); +void process_cpg_daemon(int ci); +void set_protocol_stateful(void); +int set_protocol(void); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries); + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t *member_list, + size_t member_list_entries); + /* deadlock.c */ void setup_deadlock(void); void send_cycle_start(struct lockspace *ls); @@ -346,11 +386,12 @@ int setup_cluster_cfg(void); void close_cluster_cfg(void); void process_cluster_cfg(int ci); void kick_node_from_cluster(int nodeid); +int setup_node_config(void);
/* fence.c */ -int fence_request(int nodeid); -int fence_node_time(int nodeid, uint64_t *last_fenced_time); -int fence_in_progress(int *count); +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out); +int fence_result(int nodeid, int pid, int *result);
/* netlink.c */ int setup_netlink(void); diff --git a/dlm_controld/fence.c b/dlm_controld/fence.c index 50b7d4d..91c5e67 100644 --- a/dlm_controld/fence.c +++ b/dlm_controld/fence.c @@ -7,28 +7,147 @@ */
#include "dlm_daemon.h" -#include <pacemaker/crm/stonith-ng.h>
-int fence_request(int nodeid) +static int run_agent(char *agent, char *args, int *pid_out) { - int rv; - rv = stonith_api_kick_helper(nodeid, 300, 1); - if (rv) { - log_error("stonith_api_kick_helper %d error %d", nodeid, rv); - return rv; + int pid, len; + int pw_fd = -1; /* parent write file descriptor */ + int cr_fd = -1; /* child read file descriptor */ + int pfd[2]; + + len = strlen(args); + + if (pipe(pfd)) + return -errno; + + cr_fd = pfd[0]; + pw_fd = pfd[1]; + + pid = fork(); + if (pid < 0) { + close(cr_fd); + close(pw_fd); + return -errno; } - return 0; + + if (pid) { + /* parent */ + int ret; + + do { + ret = write(pw_fd, args, len); + } while (ret < 0 && errno == EINTR); + + if (ret != len) + goto fail; + + close(cr_fd); + close(pw_fd); + + *pid_out = pid; + return 0; + } else { + /* child: must never return into the daemon's code path, + so exit on any setup failure instead of goto fail */ + int c_stdout, c_stderr; + + /* redirect agent stdout/stderr to /dev/null */ + close(1); + c_stdout = open("/dev/null", O_WRONLY); + if (c_stdout < 0) + exit(EXIT_FAILURE); + close(2); + c_stderr = open("/dev/null", O_WRONLY); + if (c_stderr < 0) + exit(EXIT_FAILURE); + + /* redirect agent stdin from parent */ + close(0); + if (dup(cr_fd) < 0) + exit(EXIT_FAILURE); + + close(cr_fd); + close(pw_fd); + + execlp(agent, agent, NULL); + exit(EXIT_FAILURE); + } + fail: + close(cr_fd); + close(pw_fd); + return -1; }
-int fence_node_time(int nodeid, uint64_t *last_fenced_time) +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out) { - *last_fenced_time = stonith_api_time_helper(nodeid, 0); + struct fence_device *dev; + char args[FENCE_CONFIG_ARGS_MAX]; + char extra[FENCE_CONFIG_NAME_MAX]; + int rv, pid = -1; + + memset(args, 0, sizeof(args)); + + memset(extra, 0, sizeof(extra)); + snprintf(extra, sizeof(extra)-1, "fail_time=%llu\n", (unsigned long long)fail_walltime); + + dev = fc->dev[fc->pos]; + if (!dev) + return -1; + + rv = fence_config_agent_args(fc, extra, args); + if (rv < 0) { + log_error("fence_request %d args error %d", nodeid, rv); + return rv; + } + + rv = run_agent(dev->agent, args, &pid); + if (rv < 0) { + log_error("fence_request %d agent %s pid %d run error %d", + nodeid, dev->agent, pid, rv); + return rv; + } + + log_debug("fence_request %d pos %d agent %s pid %d running", + nodeid, fc->pos, dev->agent, pid); + + *pid_out = pid; return 0; }
-int fence_in_progress(int *count) +/* + * if pid has exited, return 0 with exit code in result + * if pid is running, return -EAGAIN + * other error, return -EXXX + */ + +int fence_result(int nodeid, int pid, int *result) { - *count = 0; - return 0; + int status, rv; + + rv = waitpid(pid, &status, WNOHANG); + + if (rv < 0) { + /* shouldn't happen */ + log_error("agent pid %d nodeid %d errno %d", + pid, nodeid, errno); + return rv; + + } else if (!rv) { + /* pid still running */ + return -EAGAIN; + + } else if (WIFEXITED(status)) { + /* pid exited */ + + *result = WEXITSTATUS(status); + + log_error("agent pid %d nodeid %d result %d", + pid, nodeid, *result); + return 0; + + } else { + /* pid state changed but still running */ + return -EAGAIN; + } }
diff --git a/dlm_controld/fence_config.c b/dlm_controld/fence_config.c new file mode 100644 index 0000000..9f72e36 --- /dev/null +++ b/dlm_controld/fence_config.c @@ -0,0 +1,403 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include "fence_config.h" + +#if 0 + +Empty new line separates the config for each fence device. + +- + +fence_all fence_foo key=val ... + +Special fence config format that applies to all nodes, allows +no per node config parameters, and cannot be used with any +other fence device configuration. + +- + +device <dev_name> <agent> <dev_args> +connect <dev_name> node=<nodeid> <con_args> + +General fence config format, allowing per node config +parameters. + +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 + +Simple example of nodes connected to switch ports. +If fencing with the device fails, the next device +listed for the node, if any, will be tried. + +- + +device foo:1 fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo:1 node=1 port=1 +connect foo:1 node=2 port=2 +connect foo:1 node=3 port=3 + +device foo:2 fence_foo ipaddr=2.2.2.2 login=x password=y +connect foo:2 node=1 port=1 +connect foo:2 node=2 port=2 +connect foo:2 node=3 port=3 + +Associate two parallel path/power devices that must both +succeed for fencing to succeed. Devices have same base +name with :1 :2 suffix. + +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 +unfence foo + +Add unfence line to indicate nodes connected to the device +should be unfenced. 
+ +#endif + +#define MAX_LINE (FENCE_CONFIG_ARGS_MAX + (3 * FENCE_CONFIG_NAME_MAX)) + +static unsigned int con_args_nodeid(char *args) +{ + char *k; + unsigned int v; + int rv; + + k = strstr(args, "node="); + + rv = sscanf(k, "node=%u", &v); + if (rv != 1) + return 0; + return v; +} + +static int read_config_section(unsigned int nodeid, FILE *file, char *dev_line, + struct fence_device **dev_out, + struct fence_connect **con_out) +{ + struct fence_device *dev; + struct fence_connect *con; + char line[MAX_LINE]; + char unused[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char dev_name[FENCE_CONFIG_NAME_MAX]; + char con_name[FENCE_CONFIG_NAME_MAX]; + char dev_args[FENCE_CONFIG_ARGS_MAX]; + char con_args[FENCE_CONFIG_ARGS_MAX]; + int rv, unfence = 0; + + if (strlen(dev_line) > MAX_LINE) + return -1; + + memset(dev_name, 0, sizeof(dev_name)); + memset(agent, 0, sizeof(agent)); + memset(dev_args, 0, sizeof(dev_args)); + + rv = sscanf(dev_line, "%s %s %s %[^\n]s\n", unused, dev_name, agent, dev_args); + if (rv < 3) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '\n') + break; + if (line[0] == '#') + continue; + + if (!strncmp(line, "unfence", strlen("unfence"))) { + /* TODO: verify dev_name matches */ + unfence = 1; + continue; + } + + /* invalid config */ + if (strncmp(line, "connect", strlen("connect"))) + return -1; + + memset(con_name, 0, sizeof(con_name)); + memset(con_args, 0, sizeof(con_args)); + + rv = sscanf(line, "%s %s %[^\n]s", unused, con_name, con_args); + if (rv < 3) + return -1; + + /* invalid config */ + if (strncmp(dev_name, con_name, FENCE_CONFIG_NAME_MAX)) + return -1; + + /* skip connection for another node */ + if (con_args_nodeid(con_args) != nodeid) + continue; + + dev = malloc(sizeof(struct fence_device)); + if (!dev) + return -ENOMEM; + + con = malloc(sizeof(struct fence_connect)); + if (!con) { + free(dev); + return -ENOMEM; + } + + memset(dev, 0, sizeof(struct fence_device)); + memset(con, 0, 
sizeof(struct fence_connect)); + + strncpy(dev->name, dev_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->agent, agent, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->args, dev_args, FENCE_CONFIG_ARGS_MAX-1); + strncpy(con->name, con_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(con->args, con_args, FENCE_CONFIG_ARGS_MAX-1); + dev->unfence = unfence; + + *dev_out = dev; + *con_out = con; + return 0; + } + + return -1; +} + +void fence_config_free(struct fence_config *fc) +{ + struct fence_device *dev; + struct fence_connect *con; + int i; + + for (i = 0; i < FENCE_CONFIG_DEVS_MAX; i++) { + dev = fc->dev[i]; + con = fc->con[i]; + if (dev) + free(dev); + if (con) + free(con); + } + + memset(fc, 0, sizeof(struct fence_config)); +} + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path) +{ + char line[MAX_LINE]; + struct fence_device *dev; + struct fence_connect *con; + FILE *file; + int pos = 0; + int rv; + + fc->nodeid = nodeid; + + file = fopen(path, "r"); + if (!file) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!strncmp(line, "fence_all", strlen("fence_all"))) { + /* fence_all cannot be used with other fence devices */ + if (pos) { + rv = -EINVAL; + goto out; + } + + dev = malloc(sizeof(struct fence_device)); + if (!dev) { + rv = -ENOMEM; + goto out; + } + + rv = sscanf(line, "%s %s %[^\n]s\n", dev->name, dev->agent, dev->args); + if (rv < 2) { + rv = -EINVAL; + goto out; + } + + fc->dev[0] = dev; + fc->pos = 0; + rv = 0; + goto out; + } + + if (strncmp(line, "device", strlen("device"))) + continue; + + /* read connect and unfence lines following a device line */ + + rv = read_config_section(nodeid, file, line, &dev, &con); + if (rv < 0) + continue; + + fc->dev[pos] = dev; + fc->con[pos] = con; + pos++; + } + + if (pos) + rv = 0; + out: + fclose(file); + return rv; +} + +static int same_base_name(struct fence_device *a, + struct fence_device *b) +{ + int len, i; 
+ + len = strlen(a->name); + if (len > strlen(b->name)) + len = strlen(b->name); + + for (i = 0; i < len; i++) { + if (a->name[i] == ':' && b->name[i] == ':') + return 1; + if (a->name[i] == b->name[i]) + continue; + return 0; + } + return 0; +} + +/* + * if next dev is in parallel with last one, + * set d,c return 0, else -1 + * + * two consecutive devs with same basename are parallel + */ + +int fence_config_next_parallel(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + next = fc->dev[d+1]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) { + fc->pos = d+1; + return 0; + } + return -1; +} + +/* + * if there's a dev with the next priority, + * set d,c return 0, else -1 + * + * look for another dev with a non-matching basename + */ + +int fence_config_next_priority(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + int i; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + + for (i = d+1; i < FENCE_CONFIG_DEVS_MAX; i++) { + next = fc->dev[i]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) + continue; + + fc->pos = i; /* first dev with a different base name; d+1 may still be a skipped parallel dev */ + return 0; + } + return -1; +} + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args) +{ + struct fence_device *dev; + struct fence_connect *con; + char node[FENCE_CONFIG_NAME_MAX]; + char *p; + int n = 0; + int i, len; + + dev = fc->dev[fc->pos]; + con = fc->con[fc->pos]; + + memset(node, 0, sizeof(node)); + snprintf(node, FENCE_CONFIG_NAME_MAX-1, "node=%u\n", fc->nodeid); + len = strlen(node); + + if (dev) + len += strlen(dev->args) + 1; /* +1 for \n */ + if (con) + len += strlen(con->args) + 1; + if (extra) + len += strlen(extra) + 1; + + if (len > FENCE_CONFIG_ARGS_MAX - 1) + return -1; + + if (dev && dev->args[0]) { + p = dev->args; + + for (i = 0; i < strlen(dev->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else +
args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (con && con->args[0]) { + p = con->args; + + for (i = 0; i < strlen(con->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else + args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (!strstr(args, "node=")) + strcat(args, node); + if (extra) + strcat(args, extra); + + return 0; +} + diff --git a/dlm_controld/fence_config.h b/dlm_controld/fence_config.h new file mode 100644 index 0000000..47e7bda --- /dev/null +++ b/dlm_controld/fence_config.h @@ -0,0 +1,62 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#ifndef _FENCE_CONFIG_H_ +#define _FENCE_CONFIG_H_ + +#define FENCE_CONFIG_DEVS_MAX 4 /* max devs per node */ +#define FENCE_CONFIG_NAME_MAX 256 /* including terminating \0 */ +#define FENCE_CONFIG_ARGS_MAX 4096 /* including terminating \0 */ + +struct fence_device { + char name[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; + int unfence; +}; + +struct fence_connect { + char name[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; +}; + +/* describes fence config for one node */ + +struct fence_config { + struct fence_device *dev[FENCE_CONFIG_DEVS_MAX]; + struct fence_connect *con[FENCE_CONFIG_DEVS_MAX]; + unsigned int nodeid; + int pos; +}; + + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path); +void fence_config_free(struct fence_config *fc); + +/* + * Iterate through fence_config, sets pos to indicate next to try. 
+ * Based on two rules: + * + * - next_parallel is the next device with the same base name + * as the current device (base name is name preceding ":") + * + * - next_priority is the next device without the same base name + * as the current device + */ + +int fence_config_next_parallel(struct fence_config *fc); +int fence_config_next_priority(struct fence_config *fc); + +/* + * Combine dev->args and con->args, replacing ' ' with '\n'. + * Also add "node=nodeid" if "node=" does not already exist. + */ + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args); + +#endif diff --git a/dlm_controld/lib.c b/dlm_controld/lib.c index 09f84cf..228f037 100644 --- a/dlm_controld/lib.c +++ b/dlm_controld/lib.c @@ -425,3 +425,22 @@ int dlmc_deadlock_check(char *name) return rv; }
+int dlmc_fence_ack(char *name) +{ + struct dlmc_header h; + int fd, rv; + + init_header(&h, DLMC_CMD_FENCE_ACK, name, 0); + + fd = do_connect(DLMC_SOCK_PATH); + if (fd < 0) { + rv = fd; + goto out; + } + + rv = do_write(fd, &h, sizeof(h)); + close(fd); + out: + return rv; +} + diff --git a/dlm_controld/libdlmcontrol.h b/dlm_controld/libdlmcontrol.h index 609372d..4034372 100644 --- a/dlm_controld/libdlmcontrol.h +++ b/dlm_controld/libdlmcontrol.h @@ -93,6 +93,7 @@ int dlmc_fs_notified(int fd, char *name, int nodeid); int dlmc_fs_result(int fd, char *name, int *type, int *nodeid, int *result);
int dlmc_deadlock_check(char *name); +int dlmc_fence_ack(char *name);
#endif
diff --git a/dlm_controld/main.c b/dlm_controld/main.c index e0420b6..e623a6f 100644 --- a/dlm_controld/main.c +++ b/dlm_controld/main.c @@ -163,6 +163,10 @@ static void sigterm_handler(int sig) daemon_quit = 1; }
+static void sigchld_handler(int sig) +{ +} + static struct lockspace *create_ls(char *name) { struct lockspace *ls; @@ -649,6 +653,10 @@ static void process_connection(int ci) }
switch (h.command) { + case DLMC_CMD_FENCE_ACK: + fence_ack_node(atoi(h.name)); + break; + case DLMC_CMD_FS_REGISTER: if (cfgd_enable_fscontrol) { rv = fs_register_add(h.name); @@ -869,6 +877,8 @@ static void loop(void) void (*workfn) (int ci); void (*deadfn) (int ci);
+ setup_config(0); + rv = setup_queries(); if (rv < 0) goto out; @@ -884,13 +894,15 @@ static void loop(void) if (rv > 0) client_add(rv, process_cluster_cfg, cluster_dead);
+ rv = setup_node_config(); + if (rv < 0) + goto out; + rv = setup_cluster(); if (rv < 0) goto out; client_add(rv, process_cluster, cluster_dead);
- setup_config(0); - rv = check_uncontrolled_lockspaces(); if (rv < 0) goto out; @@ -945,8 +957,10 @@ static void loop(void) if (rv == -1 && errno == EINTR) { if (daemon_quit && list_empty(&lockspaces)) goto out; - log_error("shutdown ignored, active lockspaces"); - daemon_quit = 0; + if (daemon_quit) { + log_error("shutdown ignored, active lockspaces"); + daemon_quit = 0; + } continue; } if (rv < 0) { @@ -977,7 +991,12 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing || poll_fs) { + if (poll_fencing) { + process_fencing_changes(); + poll_timeout = 1000; + } + + if (poll_lockspaces || poll_fs) { process_lockspace_changes(); poll_timeout = 1000; } @@ -1097,7 +1116,7 @@ static void print_usage(void) printf(" -f <num> Enable (1) or disable (0) fencing recovery dependency\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FENCING); printf(" -q <num> Enable (1) or disable (0) quorum recovery dependency\n"); - printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM); + printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM_FENCING); printf(" -s <num> Enable (1) or disable (0) fs_controld recovery coordination\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FSCONTROL); #if 0 @@ -1121,7 +1140,7 @@ static void print_usage(void) printf(" -V Print program version information, then exit\n"); }
-#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:" +#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:e:d:"
static void read_arguments(int argc, char **argv) { @@ -1151,21 +1170,35 @@ static void read_arguments(int argc, char **argv) cfgk_protocol = atoi(optarg); break;
+ case 's': + optd_enable_fscontrol = 1; + cfgd_enable_fscontrol = atoi(optarg); + break; + + /* fencing options */ + case 'f': optd_enable_fencing = 1; cfgd_enable_fencing = atoi(optarg); break;
case 'q': - optd_enable_quorum = 1; - cfgd_enable_quorum = atoi(optarg); + optd_enable_quorum_fencing = 1; + cfgd_enable_quorum_fencing = atoi(optarg); break;
- case 's': - optd_enable_fscontrol = 1; - cfgd_enable_fscontrol = atoi(optarg); + case 'e': + optd_fence_all_agent = 1; + strcpy(fence_all_agent, optarg); break;
+ case 'd': + optd_startup_fence = 1; + cfgd_startup_fence = atoi(optarg); + break; + + /* plock options */ + + case 'p': optd_enable_plock = 1; cfgd_enable_plock = atoi(optarg); @@ -1256,15 +1289,15 @@ static void set_scheduler(void)
int main(int argc, char **argv) { - int fd; + struct sigaction act; + int fd, rv;
cfgk_debug = -1; cfgk_timewarn = -1; cfgk_protocol = PROTO_DETECT; cfgd_debug_logfile = DEFAULT_DEBUG_LOGFILE; - cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; - cfgd_enable_quorum = DEFAULT_ENABLE_QUORUM; - cfgd_enable_quorum = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_fscontrol = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_plock = DEFAULT_ENABLE_PLOCK; cfgd_plock_debug = DEFAULT_PLOCK_DEBUG; cfgd_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT; @@ -1273,8 +1306,20 @@ int main(int argc, char **argv) cfgd_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT; cfgd_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE;
+ cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; + cfgd_enable_quorum_lockspace= DEFAULT_ENABLE_QUORUM_LOCKSPACE; + cfgd_enable_quorum_fencing = DEFAULT_ENABLE_QUORUM_FENCING; + cfgd_startup_fence = DEFAULT_STARTUP_FENCE; + + strcpy(fence_all_agent, DEFAULT_FENCE_ALL_AGENT); + memset(&fence_all_device, 0, sizeof(struct fence_device)); + strcpy(fence_all_device.name, "fence_all"); + strcpy(fence_all_device.agent, fence_all_agent); + INIT_LIST_HEAD(&lockspaces); INIT_LIST_HEAD(&fs_register_list); + + init_daemon();
read_arguments(argc, argv);
@@ -1293,7 +1338,18 @@ int main(int argc, char **argv)
log_level(NULL, LOG_INFO, "dlm_controld %s started", RELEASE_VERSION);
- signal(SIGTERM, sigterm_handler); + memset(&act, 0, sizeof(act)); + act.sa_handler = sigterm_handler; + rv = sigaction(SIGTERM, &act, NULL); + if (rv < 0) + return -rv; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sigchld_handler; + act.sa_flags = SA_NOCLDSTOP; + rv = sigaction(SIGCHLD, &act, NULL); + if (rv < 0) + return -rv;
set_scheduler();
diff --git a/dlm_controld/member.c b/dlm_controld/member.c index 7af581c..841e5e3 100644 --- a/dlm_controld/member.c +++ b/dlm_controld/member.c @@ -9,6 +9,7 @@ #include "dlm_daemon.h" #include <corosync/corotypes.h> #include <corosync/cfg.h> +#include <corosync/cmap.h> #include <corosync/quorum.h>
static corosync_cfg_handle_t ch; @@ -111,8 +112,13 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate, int i, j, num_addrs; uint64_t now = monotime();
+ if (!cluster_joined_monotime) { + cluster_joined_monotime = now; + cluster_joined_walltime = time(NULL); + } + if (!cluster_quorate && quorate) - quorate_time = now; + cluster_quorate_monotime = now;
cluster_quorate = quorate; cluster_ringid_seq = (uint32_t)ring_seq; @@ -174,8 +180,10 @@ void process_cluster(int ci) cs_error_t err;
err = quorum_dispatch(qh, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster quorum_dispatch %d", err); cluster_dead(0); + } }
/* Force re-read of quorum nodes */ @@ -184,8 +192,10 @@ void update_cluster(void) cs_error_t err;
err = quorum_dispatch(qh, CS_DISPATCH_ONE); - if (err != CS_OK) + if (err != CS_OK) { + log_error("update_cluster quorum_dispatch %d", err); cluster_dead(0); + } }
int setup_cluster(void) @@ -272,8 +282,10 @@ void process_cluster_cfg(int ci) cs_error_t err;
err = corosync_cfg_dispatch(ch, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster_cfg cfg_dispatch %d", err); cluster_dead(0); + } }
int setup_cluster_cfg(void) @@ -319,3 +331,34 @@ void close_cluster_cfg(void) corosync_cfg_finalize(ch); }
+int setup_node_config(void) +{ + char key[CMAP_KEYNAME_MAXLEN+1]; + cmap_handle_t h; + cs_error_t err; + uint32_t nodeid; + int i; + + err = cmap_initialize(&h); + if (err != CS_OK) { + log_error("corosync cmap init error %d", err); + return -1; + } + + for (i = 0; i < MAX_NODES; i++) { + snprintf(key, CMAP_KEYNAME_MAXLEN, "nodelist.node.%d.nodeid", i); + + err = cmap_get_uint32(h, key, &nodeid); + if (err != CS_OK) + break; + + log_debug("node_config %d", nodeid); + + if (cfgd_enable_fencing && cfgd_startup_fence) + add_startup_node(nodeid); + } + + cmap_finalize(h); + return 0; +} + diff --git a/dlm_controld/plock.c b/dlm_controld/plock.c index b6dc0a4..ed98e9c 100644 --- a/dlm_controld/plock.c +++ b/dlm_controld/plock.c @@ -27,8 +27,6 @@ static struct timeval plock_rate_last;
static int plock_device_fd = -1;
-extern int message_flow_control_on; - #define RD_CONTINUE 0x00000001
struct resource_data { @@ -1480,14 +1478,6 @@ int limit_plocks(void) { struct timeval now;
- /* Don't send more messages while the cpg message queue is backed up */ - - if (message_flow_control_on) { - update_flow_control_status(); - if (message_flow_control_on) - return 1; - } - if (!cfgd_plock_rate_limit || !plock_read_count) return 0;
diff --git a/dlm_tool/main.c b/dlm_tool/main.c index c9a15db..c378f2b 100644 --- a/dlm_tool/main.c +++ b/dlm_tool/main.c @@ -38,6 +38,7 @@ #define OP_LOCKDUMP 8 #define OP_LOCKDEBUG 9 #define OP_LOG_PLOCK 10 +#define OP_FENCE_ACK 11
static char *prog_name; static char *lsname; @@ -182,7 +183,7 @@ static void print_usage(void) printf("\n"); printf("dlm_tool [options] [join | leave | lockdump | lockdebug |\n" " ls | dump | log_plock | plocks |\n" - " deadlock_check]\n"); + " fence_ack]\n"); printf("\n"); printf("Options:\n"); printf(" -n Show all node information in ls\n"); @@ -321,6 +322,11 @@ static void decode_arguments(int argc, char **argv) operation = OP_DEADLOCK_CHECK; opt_ind = optind + 1; break; + } else if (!strncmp(argv[optind], "fence_ack", 9) && + (strlen(argv[optind]) == 9)) { + operation = OP_FENCE_ACK; + opt_ind = optind + 1; + break; } else if (!strncmp(argv[optind], "dump", 4) && (strlen(argv[optind]) == 4)) { operation = OP_DUMP; @@ -1241,6 +1247,11 @@ static void do_deadlock_check(char *name) dlmc_deadlock_check(name); }
/* NOTE(review): dlm_tool gains a fence_ack operation; thin wrapper over the
 * libdlmcontrol call added in this same commit. */
+static void do_fence_ack(char *name)
+{
+	dlmc_fence_ack(name);
+}
+
 static void do_plocks(char *name)
 {
 	char buf[DLMC_DUMP_SIZE];
@@ -1241,6 +1247,11 @@ static void do_deadlock_check(char *name)
 	dlmc_deadlock_check(name);
 }
@@ -1335,6 +1346,10 @@ int main(int argc, char **argv)
 	case OP_LOCKDEBUG:
 		do_lockdebug(lsname);
 		break;
+
+	case OP_FENCE_ACK:
+		do_fence_ack(lsname);
+		break;
 	}
 	return 0;
 }
diff --git a/fence/Makefile b/fence/Makefile
new file mode 100644
index 0000000..c6ff2c0
--- /dev/null
+++ b/fence/Makefile
@@ -0,0 +1,56 @@
+DESTDIR=
+PREFIX=/usr
+BINDIR=$(PREFIX)/sbin
+#MANDIR=$(PREFIX)/share/man
+
+BIN_TARGET = dlm_stonith
+#MAN_TARGET = dlm_stonith.8
+
+BIN_SOURCE = stonith_helper.c
+
+BIN_CFLAGS += -D_GNU_SOURCE -g \
+	-Wall \
+	-Wformat \
+	-Wformat-security \
+	-Wmissing-prototypes \
+	-Wnested-externs \
+	-Wpointer-arith \
+	-Wextra -Wshadow \
+	-Wcast-align \
+	-Wwrite-strings \
+	-Waggregate-return \
+	-Wstrict-prototypes \
+	-Winline \
+	-Wredundant-decls \
+	-Wno-sign-compare \
+	-Wno-unused-parameter \
+	-Wp,-D_FORTIFY_SOURCE=2 \
+	-fexceptions \
+	-fasynchronous-unwind-tables \
+	-fdiagnostics-show-option \
+
+BIN_CFLAGS += -fPIE -DPIE
# NOTE(review): the xml2-config flags look inherited from the old
# dlm_controld Makefile; stonith_helper.c does not include libxml2 —
# confirm, then these two lines can likely be dropped.
+BIN_CFLAGS += `xml2-config --cflags`
+BIN_CFLAGS += -I../include
+
+BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie
+BIN_LDFLAGS += `xml2-config --libs`
+
+all: $(BIN_TARGET)
+
+$(BIN_TARGET): $(BIN_SOURCE)
+	$(CC) $(BIN_CFLAGS) $(BIN_LDFLAGS) $(BIN_SOURCE) -o $@ -L.
+
+clean:
+	rm -f *.o *.so *.so.* $(BIN_TARGET)
+
+
+INSTALL=$(shell which install)
+
+.PHONY: install
+install: all
+	$(INSTALL) -d $(DESTDIR)/$(BINDIR)
# FIX(review): MANDIR is commented out above, so this expanded to
# "$(DESTDIR)//man8" and created a stray directory at the filesystem
# root of DESTDIR; disabled to match the commented man-page install below.
+#	$(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8
+	$(INSTALL) -c -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR)
+#	$(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/
+
diff --git a/fence/stonith_helper.c b/fence/stonith_helper.c
new file mode 100644
index 0000000..df44219
--- /dev/null
+++ b/fence/stonith_helper.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v2 or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <pacemaker/crm/stonith-ng.h>
+
/* FIX(review): added <stdlib.h> above — atoi() and strtoull() were used
 * without a declaration (implicit declaration; an error in modern C). */
+int nodeid;
+uint64_t fail_time;
+
+#define MAX_ARG_LEN 1024
+
/* Parse inputs either from argv (-n nodeid, -t fail_time) or, when no
 * arguments are given, as "key=val" lines on stdin ("node", "fail_time").
 * Returns 0 on success, -1 when either value is missing or zero. */
+static int get_options(int argc, char *argv[])
+{
+	char arg[MAX_ARG_LEN];
+	char key[MAX_ARG_LEN];
+	char val[MAX_ARG_LEN];
+	int c;	/* FIX(review): was "char c"; getopt() returns int, and where
+		   char is unsigned (ARM, PowerPC) (char)-1 never compares
+		   equal to -1, so the option loop would not terminate
+		   correctly (CERT STR34-C). */
+	int rv;
+
+	if (argc > 1) {
+		while ((c = getopt(argc, argv, "n:t:")) != -1) {
+			switch (c) {
+			case 'n':
+				nodeid = atoi(optarg);
+				break;
+			case 't':
+				fail_time = strtoull(optarg, NULL, 0);
+				break;
+			}
+		}
+	} else {
+		while (fgets(arg, sizeof(arg), stdin)) {
+			rv = sscanf(arg, "%[^=]=%s\n", key, val);
+			if (rv != 2)
+				continue;
+
+			if (!strcmp(key, "node"))
+				nodeid = atoi(val);
+			else if (!strcmp(key, "fail_time"))
+				fail_time = strtoull(val, NULL, 0);
+		}
+	}
+
+	if (!nodeid) {
+		fprintf(stderr, "no node\n");
+		return -1;
+	}
+
+	if (!fail_time) {
+		fprintf(stderr, "no fail_time\n");
+		return -1;
+	}
+
+	return 0;
+}
+
/* Ask stonith whether the node was fenced at/after fail_time; if not,
 * request a kick and poll once a second until the reported fence time
 * reaches fail_time.  Exits 0 on confirmed fence, non-zero on error. */
+int main(int argc, char *argv[])
+{
+	uint64_t t;
+	int rv;
+
+	rv = get_options(argc, argv);
+	if (rv)
+		return rv;
+
+	t = stonith_api_time_helper(nodeid, 0);
+	if (t >= fail_time)
+		return 0;
+
+	rv = stonith_api_kick_helper(nodeid, 300, 1);
+	if (rv) {
+		fprintf(stderr, "kick_helper error %d nodeid %d\n", rv, nodeid);
+		openlog("stonith_helper", LOG_CONS | LOG_PID, LOG_DAEMON);
+		syslog(LOG_ERR, "kick_helper error %d nodeid %d\n", rv, nodeid);
+		return rv;
+	}
+
+	while (1) {
+		t = stonith_api_time_helper(nodeid, 0);
+		if (t >= fail_time)
+			return 0;
+		sleep(1);
+	}
+
+	return -1;
+}
+
cluster-commits@lists.fedorahosted.org