Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=d8d... Commit: d8d8a6de00f7d0f9ca1944913a26b805984851d5 Parent: eeb80cd73160856a16f73fc45d1ab48904aa55f0 Author: David Teigland teigland@redhat.com AuthorDate: Mon Feb 27 13:10:25 2012 -0600 Committer: David Teigland teigland@redhat.com CommitterDate: Tue Mar 20 11:24:17 2012 -0500
dlm_controld: fencing and config
New /etc/dlm/dlm.conf with same config options as before, but using "key = val" format. Fencing config can also be specified. Without an explicit fencing config, default "dlm_stonith" agent is run to let stonith handle fencing.
Signed-off-by: David Teigland teigland@redhat.com --- dlm_controld/Makefile | 5 +- dlm_controld/action.c | 4 +- dlm_controld/config.c | 123 ++- dlm_controld/cpg.c | 1082 +----------------------- dlm_controld/daemon_cpg.c | 1921 ++++++++++++++++++++++++++++++++++++++++++ dlm_controld/dlm_controld.h | 1 + dlm_controld/dlm_daemon.h | 85 ++- dlm_controld/fence.c | 145 +++- dlm_controld/fence_config.c | 403 +++++++++ dlm_controld/fence_config.h | 62 ++ dlm_controld/lib.c | 19 + dlm_controld/libdlmcontrol.h | 1 + dlm_controld/main.c | 90 ++- dlm_controld/member.c | 51 +- dlm_controld/plock.c | 10 - dlm_tool/main.c | 17 +- fence/Makefile | 56 ++ fence/stonith_helper.c | 96 +++ 18 files changed, 3007 insertions(+), 1164 deletions(-)
diff --git a/dlm_controld/Makefile b/dlm_controld/Makefile index a648ab1..b60b8df 100644 --- a/dlm_controld/Makefile +++ b/dlm_controld/Makefile @@ -19,7 +19,9 @@ LIB_TARGET = $(LIB_SO).$(LIB_MAJOR).$(LIB_MINOR)
BIN_SOURCE = action.c \ cpg.c \ + daemon_cpg.c \ crc.c \ + fence_config.c \ fence.c \ main.c \ plock.c \ @@ -51,13 +53,10 @@ BIN_CFLAGS += -D_GNU_SOURCE -g \ -fdiagnostics-show-option \
BIN_CFLAGS += -fPIE -DPIE -BIN_CFLAGS += `xml2-config --cflags` BIN_CFLAGS += -I../include -I../libdlm
BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie -BIN_LDFLAGS += `xml2-config --libs` BIN_LDFLAGS += -lpthread -lrt -lcpg -lcmap -lcfg -lquorum -#BIN_LDFLAGS += -lfenced
LIB_CFLAGS += $(BIN_CFLAGS) LIB_LDFLAGS += -Wl,-z,relro -pie diff --git a/dlm_controld/action.c b/dlm_controld/action.c index d4bf11d..6cb34f2 100644 --- a/dlm_controld/action.c +++ b/dlm_controld/action.c @@ -34,10 +34,8 @@ static int detect_protocol(void) }
rv = cmap_get_string(handle, "totem.rrp_mode", &str); - if (rv != CS_OK) { - log_error("cmap_get_string totem.rrp_mode error %d", rv); + if (rv != CS_OK) goto out; - }
log_debug("cmap totem.rrp_mode = '%s'", str);
diff --git a/dlm_controld/config.c b/dlm_controld/config.c index 3bf02c0..f668b86 100644 --- a/dlm_controld/config.c +++ b/dlm_controld/config.c @@ -7,7 +7,6 @@ */
#include "dlm_daemon.h" -#include <libxml/tree.h>
/* TODO: <dlm> @@ -37,66 +36,100 @@ static void proto_val(char *str, int *val) } }
-static void set_val(xmlNode *root, const char *name, int *opt, int *val) +static void set_val(char *line, int *val_out) { - xmlChar *str; + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv;
- str = xmlGetProp(root, BAD_CAST name); - if (str && !(*opt)) { - *val = atoi((char *)str); - log_debug("config %s = %d", name, *val); - } + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + *val_out = atoi(val); + + log_debug("config %s=%d", key, *val_out); +} + +static void get_val(char *line, char *val_out) +{ + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv; + + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + strcpy(val_out, val); }
void setup_config(int update) { - xmlDoc *doc; - xmlNode *root; - xmlChar *str; + FILE *file; + char line[PATH_MAX]; + char str[PATH_MAX];
if (!path_exists(CONF_FILE_PATH)) return;
- doc = xmlParseFile(CONF_FILE_PATH); - if (!doc) { - log_error("xml parse error %d %s", errno, CONF_FILE_PATH); + file = fopen(CONF_FILE_PATH, "r"); + if (!file) return; - }
- root = xmlDocGetRootElement(doc); - if (!root) { - log_error("xml root error %d %s", errno, CONF_FILE_PATH); - xmlFreeDoc(doc); - return; - } + while (fgets(line, PATH_MAX, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!optk_debug && !strncmp(line, "log_debug", strlen("log_debug"))) + set_val(line, &cfgk_debug); + + else if (!optk_timewarn && !strncmp(line, "timewarn", strlen("timewarn")) && !update) + set_val(line, &cfgk_timewarn); + + else if (!optd_enable_fencing && !strncmp(line, "enable_fencing", strlen("enable_fencing")) && !update) + set_val(line, &cfgd_enable_fencing); + + else if (!optd_enable_quorum_fencing && !strncmp(line, "enable_quorum_fencing", strlen("enable_quorum_fencing")) && !update) + set_val(line, &cfgd_enable_quorum_fencing); + + else if (!optd_enable_quorum_lockspace && !strncmp(line, "enable_quorum_lockspace", strlen("enable_quorum_lockspace")) && !update) + set_val(line, &cfgd_enable_quorum_lockspace); + + else if (!optd_enable_fscontrol && !strncmp(line, "enable_fscontrol", strlen("enable_fscontrol")) && !update) + set_val(line, &cfgd_enable_fscontrol); + + else if (!optd_enable_plock && !strncmp(line, "enable_plock", strlen("enable_plock")) && !update) + set_val(line, &cfgd_enable_plock); + + else if (!optd_plock_ownership && !strncmp(line, "plock_ownership", strlen("plock_ownership")) && !update) + set_val(line, &cfgd_plock_ownership); + + else if (!optd_plock_debug && !strncmp(line, "plock_debug", strlen("plock_debug"))) + set_val(line, &cfgd_plock_debug); + + else if (!optd_plock_rate_limit && !strncmp(line, "plock_rate_limit", strlen("plock_rate_limit"))) + set_val(line, &cfgd_plock_rate_limit); + + else if (!optd_drop_resources_time && !strncmp(line, "drop_resources_time", strlen("drop_resources_time"))) + set_val(line, &cfgd_drop_resources_time);
- if (update) - goto do_update; + else if (!optd_drop_resources_count && !strncmp(line, "drop_resources_count", strlen("drop_resources_count"))) + set_val(line, &cfgd_drop_resources_count);
- /* These config values are set from dlm.conf only if they haven't - already been set on the command line. */ + else if (!optd_drop_resources_age && !strncmp(line, "drop_resources_age", strlen("drop_resources_age"))) + set_val(line, &cfgd_drop_resources_age);
- str = xmlGetProp(root, BAD_CAST "protocol"); - if (str && !optk_protocol) { - proto_val((char *)str, &cfgk_protocol); - log_debug("config protocol = %d", cfgk_protocol); + else if (!optk_protocol && !strncmp(line, "protocol", strlen("protocol")) && !update) { + memset(str, 0, sizeof(str)); + get_val(line, str); + proto_val(str, &cfgk_protocol); + log_debug("config protocol = %d", cfgk_protocol); + } }
- set_val(root, "log_debug", &optk_debug, &cfgk_debug); - set_val(root, "timewarn", &optk_timewarn, &cfgk_timewarn); - set_val(root, "enable_fencing", &optd_enable_fencing, &cfgd_enable_fencing); - set_val(root, "enable_quorum", &optd_enable_quorum, &cfgd_enable_quorum); - set_val(root, "enable_fscontrol", &optd_enable_fscontrol, &cfgd_enable_fscontrol); - set_val(root, "enable_plock", &optd_enable_plock, &cfgd_enable_plock); - set_val(root, "plock_ownership", &optd_plock_ownership, &cfgd_plock_ownership); - do_update: - /* The following can be changed while running */ - set_val(root, "plock_debug", &optd_plock_debug, &cfgd_plock_debug); - set_val(root, "plock_rate_limit", &optd_plock_rate_limit, &cfgd_plock_rate_limit); - set_val(root, "drop_resources_time", &optd_drop_resources_time, &cfgd_drop_resources_time); - set_val(root, "drop_resources_count", &optd_drop_resources_count, &cfgd_drop_resources_count); - set_val(root, "drop_resources_age", &optd_drop_resources_age, &cfgd_drop_resources_age); - - xmlFreeDoc(doc); + fclose(file); }
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c index 1360493..1fb5372 100644 --- a/dlm_controld/cpg.c +++ b/dlm_controld/cpg.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2011 Red Hat, Inc. + * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -8,49 +8,6 @@
#include "dlm_daemon.h"
-#define PV_STATEFUL 0x0001 - -struct protocol_version { - uint16_t major; - uint16_t minor; - uint16_t patch; - uint16_t flags; -}; - -struct protocol { - union { - struct protocol_version dm_ver; - uint16_t daemon_max[4]; - }; - union { - struct protocol_version km_ver; - uint16_t kernel_max[4]; - }; - union { - struct protocol_version dr_ver; - uint16_t daemon_run[4]; - }; - union { - struct protocol_version kr_ver; - uint16_t kernel_run[4]; - }; -}; - -/* per dlm_controld cpg: daemon_nodes */ - -struct node_daemon { - struct list_head list; - int nodeid; - - uint64_t daemon_add_time; - uint64_t daemon_rem_time; - int daemon_member; - - int killed; - - struct protocol proto; -}; - /* per lockspace cpg: ls->node_history */
struct node { @@ -71,11 +28,10 @@ struct node { int check_fs; int fs_notified;
- int request_fencing; - int check_fencing; - uint64_t fail_realtime; - uint64_t fence_realtime; + int need_fencing; uint32_t fence_queries; /* for debug */ + uint64_t fail_walltime; + uint64_t fail_monotime; };
/* per lockspace confchg: ls->changes */ @@ -127,93 +83,6 @@ struct id_info { int nodeid; };
-int message_flow_control_on; -static cpg_handle_t cpg_handle_daemon; -static int cpg_fd_daemon; -static struct protocol our_protocol; -static struct list_head daemon_nodes; -static struct cpg_address daemon_member[MAX_NODES]; -static int daemon_member_count; - -static void log_config(const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t joined_list_entries) -{ - char m_buf[128]; - char j_buf[32]; - char l_buf[32]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - memset(j_buf, 0, sizeof(j_buf)); - memset(l_buf, 0, sizeof(l_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %d", - member_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(j_buf); - pos = 0; - for (i = 0; i < joined_list_entries; i++) { - ret = snprintf(j_buf + pos, len - pos, " %d", - joined_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(l_buf); - pos = 0; - for (i = 0; i < left_list_entries; i++) { - ret = snprintf(l_buf + pos, len - pos, " %d", - left_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, - member_list_entries, joined_list_entries, left_list_entries, - m_buf, j_buf, l_buf); -} - -static void log_ringid(const char *name, - struct cpg_ring_id *ringid, - const uint32_t *member_list, - size_t member_list_entries) -{ - char m_buf[128]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %u", - member_list[i]); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s ring %u:%llu %zu memb%s", - name, 
ringid->nodeid, (unsigned long long)ringid->seq, - member_list_entries, m_buf); -} - static void ls_info_in(struct ls_info *li) { li->ls_info_size = le32_to_cpu(li->ls_info_size); @@ -243,93 +112,6 @@ static void ids_in(struct ls_info *li, struct id_info *ids) } }
-const char *msg_name(int type) -{ - switch (type) { - case DLM_MSG_PROTOCOL: - return "protocol"; - case DLM_MSG_START: - return "start"; - case DLM_MSG_PLOCK: - return "plock"; - case DLM_MSG_PLOCK_OWN: - return "plock_own"; - case DLM_MSG_PLOCK_DROP: - return "plock_drop"; - case DLM_MSG_PLOCK_SYNC_LOCK: - return "plock_sync_lock"; - case DLM_MSG_PLOCK_SYNC_WAITER: - return "plock_sync_waiter"; - case DLM_MSG_PLOCKS_DATA: - return "plocks_data"; - case DLM_MSG_PLOCKS_DONE: - return "plocks_done"; - case DLM_MSG_DEADLK_CYCLE_START: - return "deadlk_cycle_start"; - case DLM_MSG_DEADLK_CYCLE_END: - return "deadlk_cycle_end"; - case DLM_MSG_DEADLK_CHECKPOINT_READY: - return "deadlk_checkpoint_ready"; - case DLM_MSG_DEADLK_CANCEL_LOCK: - return "deadlk_cancel_lock"; - default: - return "unknown"; - } -} - -static int _send_message(cpg_handle_t h, void *buf, int len, int type) -{ - struct iovec iov; - cs_error_t error; - int retries = 0; - - iov.iov_base = buf; - iov.iov_len = len; - - retry: - error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); - if (error == CS_ERR_TRY_AGAIN) { - retries++; - usleep(1000); - if (!(retries % 100)) - log_error("cpg_mcast_joined retry %d %s", - retries, msg_name(type)); - goto retry; - } - if (error != CS_OK) { - log_error("cpg_mcast_joined error %d handle %llx %s", - error, (unsigned long long)h, msg_name(type)); - return -1; - } - - if (retries) - log_debug("cpg_mcast_joined retried %d %s", - retries, msg_name(type)); - - return 0; -} - -/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ - -void dlm_send_message(struct lockspace *ls, char *buf, int len) -{ - struct dlm_header *hd = (struct dlm_header *) buf; - int type = hd->type; - - hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); - hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); - hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); - hd->type = cpu_to_le16(hd->type); - hd->nodeid = cpu_to_le32(our_nodeid); - hd->to_nodeid = 
cpu_to_le32(hd->to_nodeid); - hd->global_id = cpu_to_le32(ls->global_id); - hd->flags = cpu_to_le32(hd->flags); - hd->msgdata = cpu_to_le32(hd->msgdata); - hd->msgdata2 = cpu_to_le32(hd->msgdata2); - - _send_message(ls->cpg_handle, buf, len, type); -} - static struct member *find_memb(struct change *cg, int nodeid) { struct member *memb; @@ -424,7 +206,7 @@ static void free_ls(struct lockspace *ls) For 1: - node X fails - we see node X fail and X has non-zero start_time, - set check_fencing and record the fail time + set need_fencing and record the fail time - wait for X to be removed from all dlm cpg's (probably not necessary) - check that the fencing time is later than the recorded time above
@@ -442,8 +224,8 @@ static void free_ls(struct lockspace *ls) when we see a node not in this list, add entry for it with zero start_time record the time we get a good start message from the node, start_time clear start_time if the node leaves - if node fails with non-zero start_time, set check_fencing - when a node is fenced, clear start_time and clear check_fencing + if node fails with non-zero start_time, set need_fencing + when a node is fenced, clear start_time and clear need_fencing if a node remerges after this, no good start message, no new start_time set if a node fails with zero start_time, it doesn't need fencing if a node remerges before it's been fenced, no good start message, no new @@ -526,9 +308,7 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, }
if (cfgd_enable_fencing && node->start_time) { - node->request_fencing = 1; - node->check_fencing = 1; - node->fence_realtime = 0; + node->need_fencing = 1; node->fence_queries = 0; }
@@ -543,7 +323,9 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, node->lockspace_fail_time = now; node->lockspace_fail_seq = node->lockspace_rem_seq; node->lockspace_fail_reason = reason; /* for queries */ - node->fail_realtime = time(NULL); + + node->fail_monotime = now; + node->fail_walltime = time(NULL); }
static void node_history_start(struct lockspace *ls, int nodeid) @@ -601,9 +383,9 @@ static int check_ringid_done(struct lockspace *ls) static int check_fencing_done(struct lockspace *ls) { struct node *node; - uint64_t last_fenced_time; - int in_progress, wait_count = 0; - int rv; + uint64_t fence_monotime; + int wait_count = 0; + int rv, in_progress;
if (!cfgd_enable_fencing) { log_group(ls, "check_fencing disabled"); @@ -611,47 +393,35 @@ static int check_fencing_done(struct lockspace *ls) }
list_for_each_entry(node, &ls->node_history, list) { - if (!node->check_fencing) + if (!node->need_fencing) continue;
- /* check with fenced to see if the node has been - fenced since node->start_time */ - - rv = fence_node_time(node->nodeid, &last_fenced_time); + rv = fence_node_time(node->nodeid, &fence_monotime); if (rv < 0) { log_error("fenced_node_time error %d", rv); continue; }
- /* fenced gives us real time */ - - /* need >= not just > because in at least one case - we've seen fenced_time within the same second as - fail_time: with external fencing, e.g. fence_node */ - - if (last_fenced_time >= node->fail_realtime) { - log_group(ls, "check_fencing %d done " - "start %llu fail %llu last %llu", + if (fence_monotime >= node->fail_monotime) { + log_group(ls, "check_fencing %d done start %llu fail %llu fence %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); - node->check_fencing = 0; + (unsigned long long)node->fail_monotime, + (unsigned long long)fence_monotime); + + node->need_fencing = 0; node->start_time = 0; - node->fence_realtime = last_fenced_time; + continue; } else { - if (!node->fence_queries || - node->fence_realtime != last_fenced_time) { - log_group(ls, "check_fencing %d wait " - "start %llu fail %llu last %llu", + if (!node->fence_queries) { + log_group(ls, "check_fencing %d wait start %llu fail %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); + (unsigned long long)node->fail_monotime); node->fence_queries++; - node->fence_realtime = last_fenced_time; } wait_count++; + continue; } }
@@ -679,39 +449,6 @@ static int check_fencing_done(struct lockspace *ls) return 1; }
-static int need_fencing(struct lockspace *ls) -{ - struct node *node; - - list_for_each_entry(node, &ls->node_history, list) { - if (node->check_fencing) { - log_group(ls, "need_fencing %d", node->nodeid); - return 1; - } - } - return 0; -} - -/* we don't need to ask fenced to initiate fencing; it does - so itself when it sees a fence domain member fail. Without - fenced we'll probably need to ask another daemon to initiate - fencing, then check with it above, like we check libfenced. */ - -static void request_fencing(struct lockspace *ls) -{ - struct node *node; - int rv; - - list_for_each_entry(node, &ls->node_history, list) { - if (!node->request_fencing) - continue; - log_group(ls, "fence_request %d", node->nodeid); - rv = fence_request(node->nodeid); - if (!rv) - node->request_fencing = 0; - } -} - /* wait for local fs_controld to ack each failed node */
static int check_fs_done(struct lockspace *ls) @@ -839,43 +576,18 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq) /* we know that the cluster_quorate value here is consistent with the cpg events because the ringid's are in sync per the check_ringid_done */
-/* - enable_quorum = 0 - Never wait for quorum here. - - enable_quorum = 1 - Wait for quorum before calling request_fencing() in cases where - fencing is needed. Reason for using this would be if fencing - system doesn't wait for quorum before carrying out a fencing - request, and we want to avoid having partitioned inquorate nodes - fencing quorate nodes in another partition. - - enable_quorum = 2 - Always wait for quorum as a general condition here. This - means that lockspace join/leave operations would block waiting - for quorum. There's not any reason to do this per se, unless - that's the behavior the application using the dlm wants. -*/ - static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) return 0;
- if ((cfgd_enable_quorum > 1) && !cluster_quorate) { + if ((cfgd_enable_quorum_lockspace > 1) && !cluster_quorate) { log_group(ls, "wait for quorum"); return 0; }
- if ((cfgd_enable_quorum > 0) && need_fencing(ls) && !cluster_quorate) { - log_group(ls, "wait for quorum before fencing"); - return 0; - } - - request_fencing(ls); - if (!check_fencing_done(ls)) { - poll_fencing++; + poll_lockspaces++; return 0; }
@@ -1437,7 +1149,7 @@ static void apply_changes(struct lockspace *ls)
case CGST_WAIT_MESSAGES: if (wait_messages_done(ls)) { - our_protocol.dr_ver.flags |= PV_STATEFUL; + set_protocol_stateful(); start_kernel(ls); prepare_plocks(ls); cleanup_changes(ls); @@ -1453,9 +1165,7 @@ void process_lockspace_changes(void) { struct lockspace *ls, *safe;
- poll_ringid = 0; - poll_fencing = 0; - poll_quorum = 0; + poll_lockspaces = 0; poll_fs = 0;
list_for_each_entry_safe(ls, safe, &lockspaces, list) { @@ -1464,24 +1174,6 @@ void process_lockspace_changes(void) } }
-static const char *reason_str(int reason) -{ - switch (reason) { - case CPG_REASON_JOIN: - return "join"; - case CPG_REASON_LEAVE: - return "leave"; - case CPG_REASON_NODEDOWN: - return "nodedown"; - case CPG_REASON_NODEUP: - return "nodeup"; - case CPG_REASON_PROCDOWN: - return "procdown"; - default: - return "unknown"; - }; -} - static int add_change(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, @@ -1665,20 +1357,6 @@ static void confchg_cb(cpg_handle_t handle, #endif }
-static void dlm_header_in(struct dlm_header *hd) -{ - hd->version[0] = le16_to_cpu(hd->version[0]); - hd->version[1] = le16_to_cpu(hd->version[1]); - hd->version[2] = le16_to_cpu(hd->version[2]); - hd->type = le16_to_cpu(hd->type); - hd->nodeid = le32_to_cpu(hd->nodeid); - hd->to_nodeid = le32_to_cpu(hd->to_nodeid); - hd->global_id = le32_to_cpu(hd->global_id); - hd->flags = le32_to_cpu(hd->flags); - hd->msgdata = le32_to_cpu(hd->msgdata); - hd->msgdata2 = le32_to_cpu(hd->msgdata2); -} - /* after our join confchg, we want to ignore plock messages (see need_plocks checks below) until the point in time where the ckpt_node saves plock state (final start message received); at this time we want to shift from @@ -1693,6 +1371,7 @@ static void deliver_cb(cpg_handle_t handle, struct lockspace *ls; struct dlm_header *hd; int ignore_plock; + int rv;
ls = find_ls_handle(handle); if (!ls) { @@ -1700,7 +1379,7 @@ static void deliver_cb(cpg_handle_t handle, return; }
- if (len < sizeof(*hd)) { + if (len < sizeof(struct dlm_header)) { log_error("deliver_cb short message %zd", len); return; } @@ -1708,20 +1387,9 @@ static void deliver_cb(cpg_handle_t handle, hd = (struct dlm_header *)data; dlm_header_in(hd);
- if (hd->version[0] != our_protocol.daemon_run[0] || - hd->version[1] != our_protocol.daemon_run[1]) { - log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", - nodeid, hd->version[0], hd->version[1], - hd->version[2], our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2]); + rv = dlm_header_validate(hd, nodeid); + if (rv < 0) return; - } - - if (hd->nodeid != nodeid) { - log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); - return; - }
ignore_plock = 0;
@@ -1892,31 +1560,6 @@ static cpg_model_v1_data_t cpg_callbacks = { .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, };
-void update_flow_control_status(void) -{ - cpg_flow_control_state_t flow_control_state; - cs_error_t error; - - error = cpg_flow_control_state_get(cpg_handle_daemon, - &flow_control_state); - if (error != CS_OK) { - log_error("cpg_flow_control_state_get %d", error); - return; - } - - if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { - if (message_flow_control_on == 0) { - log_debug("flow control on"); - } - message_flow_control_on = 1; - } else { - if (message_flow_control_on) { - log_debug("flow control off"); - } - message_flow_control_on = 0; - } -} - static void process_cpg_lockspace(int ci) { struct lockspace *ls; @@ -1933,8 +1576,6 @@ static void process_cpg_lockspace(int ci) log_error("cpg_dispatch error %d", error); return; } - - update_flow_control_status(); }
/* received an "online" uevent from dlm-kernel */ @@ -1945,14 +1586,6 @@ int dlm_join_lockspace(struct lockspace *ls) cpg_handle_t h; struct cpg_name name; int i = 0, fd, ci, rv; - int unused; - - rv = fence_in_progress(&unused); - if (cfgd_enable_fencing && rv < 0) { - log_error("dlm_join_lockspace no fence domain"); - rv = -1; - goto fail_free; - }
error = cpg_model_initialize(&h, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_callbacks, NULL); @@ -2038,645 +1671,6 @@ int dlm_leave_lockspace(struct lockspace *ls) return 0; }
-static struct node_daemon *get_node_daemon(int nodeid) -{ - struct node_daemon *node; - - list_for_each_entry(node, &daemon_nodes, list) { - if (node->nodeid == nodeid) - return node; - } - return NULL; -} - -static void add_node_daemon(int nodeid) -{ - struct node_daemon *node; - - if (get_node_daemon(nodeid)) - return; - - node = malloc(sizeof(struct node_daemon)); - if (!node) { - log_error("add_node_daemon no mem"); - return; - } - memset(node, 0, sizeof(struct node_daemon)); - node->nodeid = nodeid; - list_add_tail(&node->list, &daemon_nodes); -} - -static void pv_in(struct protocol_version *pv) -{ - pv->major = le16_to_cpu(pv->major); - pv->minor = le16_to_cpu(pv->minor); - pv->patch = le16_to_cpu(pv->patch); - pv->flags = le16_to_cpu(pv->flags); -} - -static void pv_out(struct protocol_version *pv) -{ - pv->major = cpu_to_le16(pv->major); - pv->minor = cpu_to_le16(pv->minor); - pv->patch = cpu_to_le16(pv->patch); - pv->flags = cpu_to_le16(pv->flags); -} - -static void protocol_in(struct protocol *proto) -{ - pv_in(&proto->dm_ver); - pv_in(&proto->km_ver); - pv_in(&proto->dr_ver); - pv_in(&proto->kr_ver); -} - -static void protocol_out(struct protocol *proto) -{ - pv_out(&proto->dm_ver); - pv_out(&proto->km_ver); - pv_out(&proto->dr_ver); - pv_out(&proto->kr_ver); -} - -/* go through member list saved in last confchg, see if we have received a - proto message from each */ - -static int all_protocol_messages(void) -{ - struct node_daemon *node; - int i; - - if (!daemon_member_count) - return 0; - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("all_protocol_messages no node %d", - daemon_member[i].nodeid); - return 0; - } - - if (!node->proto.daemon_max[0]) - return 0; - } - return 1; -} - -static int pick_min_protocol(struct protocol *proto) -{ - uint16_t mind[4]; - uint16_t mink[4]; - struct node_daemon *node; - int i; - - memset(&mind, 0, sizeof(mind)); - memset(&mink, 0, 
sizeof(mink)); - - /* first choose the minimum major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("pick_min_protocol no node %d", - daemon_member[i].nodeid); - return -1; - } - - if (!mind[0] || node->proto.daemon_max[0] < mind[0]) - mind[0] = node->proto.daemon_max[0]; - - if (!mink[0] || node->proto.kernel_max[0] < mink[0]) - mink[0] = node->proto.kernel_max[0]; - } - - if (!mind[0] || !mink[0]) { - log_error("pick_min_protocol zero major number"); - return -1; - } - - /* second pick the minimum minor with the chosen major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0]) { - if (!mind[1] || node->proto.daemon_max[1] < mind[1]) - mind[1] = node->proto.daemon_max[1]; - } - - if (mink[0] == node->proto.kernel_max[0]) { - if (!mink[1] || node->proto.kernel_max[1] < mink[1]) - mink[1] = node->proto.kernel_max[1]; - } - } - - if (!mind[1] || !mink[1]) { - log_error("pick_min_protocol zero minor number"); - return -1; - } - - /* third pick the minimum patch with the chosen major.minor */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0] && - mind[1] == node->proto.daemon_max[1]) { - if (!mind[2] || node->proto.daemon_max[2] < mind[2]) - mind[2] = node->proto.daemon_max[2]; - } - - if (mink[0] == node->proto.kernel_max[0] && - mink[1] == node->proto.kernel_max[1]) { - if (!mink[2] || node->proto.kernel_max[2] < mink[2]) - mink[2] = node->proto.kernel_max[2]; - } - } - - if (!mind[2] || !mink[2]) { - log_error("pick_min_protocol zero patch number"); - return -1; - } - - memcpy(&proto->daemon_run, &mind, sizeof(mind)); - memcpy(&proto->kernel_run, &mink, sizeof(mink)); - return 0; -} - -static void receive_protocol(struct dlm_header *hd, int len) -{ - 
struct protocol *p; - struct node_daemon *node; - - p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); - protocol_in(p); - - if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { - log_error("receive_protocol invalid len %d from %d", - len, hd->nodeid); - return; - } - - /* zero is an invalid version value */ - - if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || - !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { - log_error("receive_protocol invalid max value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], - p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); - return; - } - - /* the run values will be zero until a version is set, after - which none of the run values can be zero */ - - if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || - !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { - log_error("receive_protocol invalid run value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], - p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); - return; - } - - /* save this node's proto so we can tell when we've got all, and - use it to select a minimum protocol from all */ - - node = get_node_daemon(hd->nodeid); - if (!node) { - log_error("receive_protocol no node %d", hd->nodeid); - return; - } - - if (!node->daemon_member) { - log_error("receive_protocol node %d not member", hd->nodeid); - return; - } - - log_debug("receive_protocol from %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - p->daemon_max[0], p->daemon_max[1], - p->daemon_max[2], p->daemon_max[3], - p->daemon_run[0], p->daemon_run[1], - p->daemon_run[2], p->daemon_run[3]); - log_debug("daemon node %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - node->proto.daemon_max[0], node->proto.daemon_max[1], - node->proto.daemon_max[2], node->proto.daemon_max[3], - node->proto.daemon_run[0], 
node->proto.daemon_run[1], - node->proto.daemon_run[2], node->proto.daemon_run[3]); - log_debug("daemon node %d join %llu left %llu local quorum %llu", - hd->nodeid, - (unsigned long long)node->daemon_add_time, - (unsigned long long)node->daemon_rem_time, - (unsigned long long)quorate_time); - - /* checking zero node->daemon_max[0] is a way to tell if we've received - an acceptable (non-stateful) proto message from the node since we - saw it join the daemon cpg */ - - if (hd->nodeid != our_nodeid && - !node->proto.daemon_max[0] && - (p->dr_ver.flags & PV_STATEFUL) && - (our_protocol.dr_ver.flags & PV_STATEFUL)) { - - log_debug("daemon node %d stateful merge", hd->nodeid); - - if (cluster_quorate && node->daemon_rem_time && - quorate_time < node->daemon_rem_time) { - log_debug("daemon node %d kill due to stateful merge", hd->nodeid); - if (!node->killed) - kick_node_from_cluster(hd->nodeid); - node->killed = 1; - } - - /* don't save p->proto into node->proto; we need to come - through here based on zero daemon_max[0] for other proto - messages like this one from the same node */ - - return; - } - - memcpy(&node->proto, p, sizeof(struct protocol)); - - /* if we have zero run values, and this msg has non-zero run values, - then adopt them as ours; otherwise save this proto message */ - - if (our_protocol.daemon_run[0]) - return; - - if (p->daemon_run[0]) { - our_protocol.daemon_run[0] = p->daemon_run[0]; - our_protocol.daemon_run[1] = p->daemon_run[1]; - our_protocol.daemon_run[2] = p->daemon_run[2]; - - our_protocol.kernel_run[0] = p->kernel_run[0]; - our_protocol.kernel_run[1] = p->kernel_run[1]; - our_protocol.kernel_run[2] = p->kernel_run[2]; - - log_debug("run protocol from nodeid %d", hd->nodeid); - } -} - -static void send_protocol(struct protocol *proto) -{ - struct dlm_header *hd; - struct protocol *pr; - char *buf; - int len; - - len = sizeof(struct dlm_header) + sizeof(struct protocol); - buf = malloc(len); - if (!buf) { - log_error("send_protocol no mem 
%d", len); - return; - } - memset(buf, 0, len); - - hd = (struct dlm_header *)buf; - pr = (struct protocol *)(buf + sizeof(*hd)); - - hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); - hd->nodeid = cpu_to_le32(our_nodeid); - - memcpy(pr, proto, sizeof(struct protocol)); - protocol_out(pr); - - _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); -} - -int set_protocol(void) -{ - struct protocol proto; - struct pollfd pollfd; - int sent_proposal = 0; - int rv; - - memset(&pollfd, 0, sizeof(pollfd)); - pollfd.fd = cpg_fd_daemon; - pollfd.events = POLLIN; - - while (1) { - if (our_protocol.daemon_run[0]) - break; - - if (!sent_proposal && all_protocol_messages()) { - /* propose a protocol; look through info from all - nodes and pick the min for both daemon and kernel, - and propose that */ - - sent_proposal = 1; - - /* copy our max values */ - memcpy(&proto, &our_protocol, sizeof(struct protocol)); - - rv = pick_min_protocol(&proto); - if (rv < 0) - return rv; - - log_debug("set_protocol member_count %d propose " - "daemon %u.%u.%u kernel %u.%u.%u", - daemon_member_count, - proto.daemon_run[0], proto.daemon_run[1], - proto.daemon_run[2], proto.kernel_run[0], - proto.kernel_run[1], proto.kernel_run[2]); - - send_protocol(&proto); - } - - /* only process messages/events from daemon cpg until protocol - is established */ - - rv = poll(&pollfd, 1, -1); - if (rv == -1 && errno == EINTR) { - if (daemon_quit) - return -1; - continue; - } - if (rv < 0) { - log_error("set_protocol poll errno %d", errno); - return -1; - } - - if (pollfd.revents & POLLIN) - process_cpg_daemon(0); - if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { - log_error("set_protocol poll revents %u", - pollfd.revents); - return -1; - } - } - - if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || - our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { - log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - 
our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2]); - return -1; - } - - if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || - our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { - log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - return -1; - } - - log_debug("daemon run %u.%u.%u max %u.%u.%u " - "kernel run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2], - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - - send_protocol(&our_protocol); - return 0; -} - -static void deliver_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - uint32_t nodeid, uint32_t pid, - void *data, size_t len) -{ - struct dlm_header *hd; - - if (len < sizeof(*hd)) { - log_error("deliver_cb short message %zd", len); - return; - } - - hd = (struct dlm_header *)data; - dlm_header_in(hd); - - switch (hd->type) { - case DLM_MSG_PROTOCOL: - receive_protocol(hd, len); - break; - default: - log_error("deliver_cb_daemon unknown msg type %d", hd->type); - } -} - -static int in_daemon_member_list(int nodeid) -{ - int i; - - for (i = 0; i < daemon_member_count; i++) { - if (daemon_member[i].nodeid == nodeid) - return 1; - } - return 0; -} - -static void confchg_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t 
joined_list_entries) -{ - struct node_daemon *node; - uint64_t now = monotime(); - int nodedown = 0, procdown = 0, leave = 0; - int i; - - log_config(group_name, member_list, member_list_entries, - left_list, left_list_entries, - joined_list, joined_list_entries); - - for (i = 0; i < left_list_entries; i++) { - if (left_list[i].reason == CPG_REASON_NODEDOWN) - nodedown++; - else if (left_list[i].reason == CPG_REASON_PROCDOWN) - procdown++; - else if (left_list[i].reason == CPG_REASON_LEAVE) - leave++; - } - - if (nodedown || procdown || leave) - log_debug("%s left nodedown %d procdown %d leave %d", - group_name->value, nodedown, procdown, leave); - - if (joined_list_entries) - send_protocol(&our_protocol); - - memset(&daemon_member, 0, sizeof(daemon_member)); - daemon_member_count = member_list_entries; - - for (i = 0; i < member_list_entries; i++) { - daemon_member[i] = member_list[i]; - /* add struct for nodes we've not seen before */ - add_node_daemon(member_list[i].nodeid); - } - - list_for_each_entry(node, &daemon_nodes, list) { - if (in_daemon_member_list(node->nodeid)) { - if (node->daemon_member) - continue; - - /* node joined daemon cpg */ - node->daemon_member = 1; - node->daemon_add_time = now; - } else { - if (!node->daemon_member) - continue; - - /* node left daemon cpg */ - node->daemon_member = 0; - node->killed = 0; - memset(&node->proto, 0, sizeof(struct protocol)); - node->daemon_rem_time = now; - } - } -} - -static void totem_cb_daemon(cpg_handle_t handle, - struct cpg_ring_id ring_id, - uint32_t member_list_entries, - const uint32_t *member_list) -{ - log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); -} - -static cpg_model_v1_data_t cpg_callbacks_daemon = { - .cpg_deliver_fn = deliver_cb_daemon, - .cpg_confchg_fn = confchg_cb_daemon, - .cpg_totem_confchg_fn = totem_cb_daemon, - .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, -}; - -void process_cpg_daemon(int ci) -{ - cs_error_t error; - - error = 
cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); - if (error != CS_OK) - log_error("daemon cpg_dispatch error %d", error); -} - -int setup_cpg_daemon(void) -{ - cs_error_t error; - struct cpg_name name; - int i = 0; - - INIT_LIST_HEAD(&daemon_nodes); - - /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible - with cluster4/RHEL7 */ - - memset(&our_protocol, 0, sizeof(our_protocol)); - - if (cfgd_enable_fscontrol) - our_protocol.daemon_max[0] = 2; - else - our_protocol.daemon_max[0] = 3; - - our_protocol.daemon_max[1] = 1; - our_protocol.daemon_max[2] = 1; - our_protocol.kernel_max[0] = 1; - our_protocol.kernel_max[1] = 1; - our_protocol.kernel_max[2] = 1; - - error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, - (cpg_model_data_t *)&cpg_callbacks_daemon, - NULL); - if (error != CS_OK) { - log_error("daemon cpg_initialize error %d", error); - return -1; - } - - cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_join %s ...", name.value); - retry: - error = cpg_join(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - log_error("daemon cpg_join error retrying"); - goto retry; - } - if (error != CS_OK) { - log_error("daemon cpg_join error %d", error); - goto fail; - } - - log_debug("setup_cpg_daemon %d", cpg_fd_daemon); - return cpg_fd_daemon; - - fail: - cpg_finalize(cpg_handle_daemon); - return -1; -} - -void close_cpg_daemon(void) -{ - struct lockspace *ls; - cs_error_t error; - struct cpg_name name; - int i = 0; - - if (!cpg_handle_daemon) - return; - if (cluster_down) - goto fin; - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_leave %s ...", name.value); - retry: - error = cpg_leave(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - 
log_error("daemon cpg_leave error retrying"); - goto retry; - } - if (error != CS_OK) - log_error("daemon cpg_leave error %d", error); - fin: - list_for_each_entry(ls, &lockspaces, list) { - if (ls->cpg_handle) - cpg_finalize(ls->cpg_handle); - } - cpg_finalize(cpg_handle_daemon); -} - -/* fs_controld has seen nodedown for nodeid; it's now ok for dlm to do - recovery for the failed node */ - int set_fs_notified(struct lockspace *ls, int nodeid) { struct node *node; @@ -2755,12 +1749,8 @@ int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace)
if (cg->state == CGST_WAIT_CONDITIONS) lockspace->cg_next.wait_condition = 5; - if (poll_ringid) - lockspace->cg_next.wait_condition = 1; else if (poll_fencing) lockspace->cg_next.wait_condition = 2; - else if (poll_quorum) - lockspace->cg_next.wait_condition = 3; else if (poll_fs) lockspace->cg_next.wait_condition = 4;
@@ -2795,7 +1785,7 @@ static int _set_node_info(struct lockspace *ls, struct change *cg, int nodeid, if (!n) goto out;
- if (n->check_fencing) + if (n->need_fencing) node->flags |= DLMC_NF_CHECK_FENCING; if (n->check_fs) node->flags |= DLMC_NF_CHECK_FS; diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c new file mode 100644 index 0000000..e5e9270 --- /dev/null +++ b/dlm_controld/daemon_cpg.c @@ -0,0 +1,1921 @@ +/* + * Copyright 2004-2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#include "dlm_daemon.h" + +/* protocol_version flags */ +#define PV_STATEFUL 0x0001 + +struct protocol_version { + uint16_t major; + uint16_t minor; + uint16_t patch; + uint16_t flags; +}; + +struct protocol { + union { + struct protocol_version dm_ver; + uint16_t daemon_max[4]; + }; + union { + struct protocol_version km_ver; + uint16_t kernel_max[4]; + }; + union { + struct protocol_version dr_ver; + uint16_t daemon_run[4]; + }; + union { + struct protocol_version kr_ver; + uint16_t kernel_run[4]; + }; +}; + +/* fence_result flags */ +#define FR_FIPU 0x00000001 +#define FR_CLEAR_STARTUP 0x00000002 +#define FR_CLEAR_FIPU 0x00000004 + +struct fence_result { + uint32_t version; + uint32_t flags; + uint32_t nodeid; + uint32_t result; + uint64_t fence_walltime; + char unused[1000]; +}; + +struct node_daemon { + struct list_head list; + int nodeid; + + uint64_t daemon_add_time; + uint64_t daemon_rem_time; + int daemon_member; + + int killed; + + struct protocol proto; + + struct fence_config fence_config; + + int fence_in_progress_unknown; + int left_reason; + int recover_setup; + int need_fence_clear; + int need_fencing; + int fence_pid; + int fence_pid_wait; + int fence_actor_done; + int fence_actors[MAX_NODES]; + uint64_t fail_walltime; + uint64_t fail_monotime; + uint64_t fence_request_time; + uint64_t fence_walltime; + uint64_t fence_monotime; +}; + +#define REASON_STARTUP_FENCING -1 
+ +static cpg_handle_t cpg_handle_daemon; +static int cpg_fd_daemon; +static struct protocol our_protocol; +static struct list_head daemon_nodes; +static struct list_head startup_nodes; +static struct cpg_address daemon_member[MAX_NODES]; +static struct cpg_address daemon_joined[MAX_NODES]; +static struct cpg_address daemon_remove[MAX_NODES]; +static int daemon_member_count; +static int daemon_joined_count; +static int daemon_remove_count; +static int daemon_ringid_wait; +static struct cpg_ring_id daemon_ringid; +static int daemon_clear_nodeid; +static int daemon_clear_pid; +static uint64_t last_join_monotime; +static uint32_t last_join_seq; +static uint32_t send_fipu_seq; +static int fence_in_progress_unknown = 1; + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + char m_buf[128]; + char j_buf[32]; + char l_buf[32]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + memset(j_buf, 0, sizeof(j_buf)); + memset(l_buf, 0, sizeof(l_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %d", + member_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(j_buf); + pos = 0; + for (i = 0; i < joined_list_entries; i++) { + ret = snprintf(j_buf + pos, len - pos, " %d", + joined_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(l_buf); + pos = 0; + for (i = 0; i < left_list_entries; i++) { + ret = snprintf(l_buf + pos, len - pos, " %d", + left_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + 
log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, + member_list_entries, joined_list_entries, left_list_entries, + m_buf, j_buf, l_buf); +} + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t *member_list, + size_t member_list_entries) +{ + char m_buf[128]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %u", + member_list[i]); + if (ret >= len - pos) + break; + pos += ret; + } + + log_debug("%s ring %u:%llu %zu memb%s", + name, ringid->nodeid, (unsigned long long)ringid->seq, + member_list_entries, m_buf); +} + +const char *reason_str(int reason) +{ + switch (reason) { + case CPG_REASON_JOIN: + return "join"; + case CPG_REASON_LEAVE: + return "leave"; + case CPG_REASON_NODEDOWN: + return "nodedown"; + case CPG_REASON_NODEUP: + return "nodeup"; + case CPG_REASON_PROCDOWN: + return "procdown"; + default: + return "unknown"; + }; +} + +const char *msg_name(int type) +{ + switch (type) { + case DLM_MSG_PROTOCOL: + return "protocol"; + case DLM_MSG_FENCE_RESULT: + return "fence_result"; + case DLM_MSG_FENCE_CLEAR: + return "fence_clear"; + + case DLM_MSG_START: + return "start"; + case DLM_MSG_PLOCK: + return "plock"; + case DLM_MSG_PLOCK_OWN: + return "plock_own"; + case DLM_MSG_PLOCK_DROP: + return "plock_drop"; + case DLM_MSG_PLOCK_SYNC_LOCK: + return "plock_sync_lock"; + case DLM_MSG_PLOCK_SYNC_WAITER: + return "plock_sync_waiter"; + case DLM_MSG_PLOCKS_DATA: + return "plocks_data"; + case DLM_MSG_PLOCKS_DONE: + return "plocks_done"; + case DLM_MSG_DEADLK_CYCLE_START: + return "deadlk_cycle_start"; + case DLM_MSG_DEADLK_CYCLE_END: + return "deadlk_cycle_end"; + case DLM_MSG_DEADLK_CHECKPOINT_READY: + return "deadlk_checkpoint_ready"; + case DLM_MSG_DEADLK_CANCEL_LOCK: + return "deadlk_cancel_lock"; + default: + return "unknown"; + } +} + +static int 
_send_message(cpg_handle_t h, void *buf, int len, int type) +{ + struct iovec iov; + cs_error_t error; + int retries = 0; + + iov.iov_base = buf; + iov.iov_len = len; + + retry: + error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); + if (error == CS_ERR_TRY_AGAIN) { + retries++; + usleep(1000); + if (!(retries % 100)) + log_error("cpg_mcast_joined retry %d %s", + retries, msg_name(type)); + goto retry; + } + if (error != CS_OK) { + log_error("cpg_mcast_joined error %d handle %llx %s", + error, (unsigned long long)h, msg_name(type)); + return -1; + } + + if (retries) + log_debug("cpg_mcast_joined retried %d %s", + retries, msg_name(type)); + + return 0; +} + +/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ + +void dlm_send_message(struct lockspace *ls, char *buf, int len) +{ + struct dlm_header *hd = (struct dlm_header *) buf; + int type = hd->type; + + hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); + hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); + hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); + hd->type = cpu_to_le16(hd->type); + hd->nodeid = cpu_to_le32(our_nodeid); + hd->to_nodeid = cpu_to_le32(hd->to_nodeid); + hd->global_id = cpu_to_le32(ls->global_id); + hd->flags = cpu_to_le32(hd->flags); + hd->msgdata = cpu_to_le32(hd->msgdata); + hd->msgdata2 = cpu_to_le32(hd->msgdata2); + + _send_message(ls->cpg_handle, buf, len, type); +} + +void dlm_header_in(struct dlm_header *hd) +{ + hd->version[0] = le16_to_cpu(hd->version[0]); + hd->version[1] = le16_to_cpu(hd->version[1]); + hd->version[2] = le16_to_cpu(hd->version[2]); + hd->type = le16_to_cpu(hd->type); + hd->nodeid = le32_to_cpu(hd->nodeid); + hd->to_nodeid = le32_to_cpu(hd->to_nodeid); + hd->global_id = le32_to_cpu(hd->global_id); + hd->flags = le32_to_cpu(hd->flags); + hd->msgdata = le32_to_cpu(hd->msgdata); + hd->msgdata2 = le32_to_cpu(hd->msgdata2); +} + +int dlm_header_validate(struct dlm_header *hd, int nodeid) +{ + if (hd->version[0] != 
our_protocol.daemon_run[0] || + hd->version[1] != our_protocol.daemon_run[1]) { + log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", + nodeid, hd->version[0], hd->version[1], + hd->version[2], our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2]); + return -1; + } + + if (hd->nodeid != nodeid) { + log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); + return -1; + } + + return 0; +} + +static struct node_daemon *get_node_daemon(int nodeid) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->nodeid == nodeid) + return node; + } + return NULL; +} + +static int nodes_need_fencing(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->need_fencing) + return 1; + } + return 0; +} + +static int all_daemon_members_fipu(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + if (!node->fence_in_progress_unknown) + return 0; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + node->fence_in_progress_unknown = 0; + } + + return 1; +} + +int fence_node_time(int nodeid, uint64_t *last_fenced) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (!node) + return -1; + + *last_fenced = node->fence_monotime; + return 0; +} + +int fence_in_progress(int *in_progress) +{ + if (fence_in_progress_unknown) { + *in_progress = 1; + } else if (!list_empty(&startup_nodes)) { + *in_progress = 2; + } else if (nodes_need_fencing()) { + *in_progress = 3; + } else { + *in_progress = 0; + } + return 0; +} + +void add_startup_node(int nodeid) +{ + struct node_daemon *node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_startup_node no mem"); + return; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &startup_nodes); +} + +static int 
clear_startup_node(int nodeid, int all) +{ + struct node_daemon *node, *safe; + int count = 0; + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (all || node->nodeid == nodeid) { + list_del(&node->list); + free(node); + count++; + } + } + return count; +} + +static struct node_daemon *add_node_daemon(int nodeid) +{ + struct node_daemon *node; + struct fence_config *fc; + + node = get_node_daemon(nodeid); + if (node) + return node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_node_daemon no mem"); + return NULL; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &daemon_nodes); + + /* TODO: allow the config to be reread */ + + fc = &node->fence_config; + fc->nodeid = nodeid; + + /* explicit command line arg has first priority */ + + if (optd_fence_all_agent) { + fc->dev[0] = &fence_all_device; + goto out; + } + + /* explicit config file setting has second priority */ + + fence_config_init(fc, (unsigned int)nodeid, (char *)CONF_FILE_PATH); + + /* no command line, no config file, use default, third priority */ + + if (!fc->dev[0] && fence_all_agent[0]) + fc->dev[0] = &fence_all_device; + out: + return node; +} + +/* A clean daemon member is a node that has joined the daemon cpg + from a "clean state", i.e. not a stateful merge. If would not + have joined the daemon cpg if it found uncontrolled dlm kernel + state (check_uncontrolled_lockspaces). We would not have + accepted and saved its protocol in node->proto.daemon if it + was a stateful merge. 
*/ + +static int is_clean_daemon_member(int nodeid) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (node && node->daemon_member && node->proto.daemon_max[0]) + return 1; + return 0; +} + +static int in_daemon_list(int nodeid, struct cpg_address *daemon_list, int count) +{ + int i; + + for (i = 0; i < count; i++) { + if (daemon_list[i].nodeid == nodeid) + return 1; + } + return 0; +} + +/* save in node->fence_actors[] any nodeid present when the node + failed which therefore saw it fail, knows it needs fencing, and + can request fencing for it if it becomes the low actor. A node + added in the same change with the removed node does not qualify. */ + +static int set_fence_actors(struct node_daemon *node, int all_memb) +{ + int i, nodeid, count = 0, low = 0; + + memset(node->fence_actors, 0, sizeof(node->fence_actors)); + + for (i = 0; i < daemon_member_count; i++) { + nodeid = daemon_member[i].nodeid; + + if (!all_memb && in_daemon_list(nodeid, daemon_joined, daemon_joined_count)) + continue; + + node->fence_actors[count++] = nodeid; + + if (!low || nodeid < low) + low = nodeid; + } + + log_debug("set_fence_actors for %d low %d count %d", + node->nodeid, low, count); + return low; +} + +static int get_fence_actor(struct node_daemon *node) +{ + int i, low, low_i; + + retry: + low = 0; + + for (i = 0; i < MAX_NODES; i++) { + if (!node->fence_actors[i]) + continue; + + if (!low || node->fence_actors[i] < low) { + low = node->fence_actors[i]; + low_i = i; + } + } + + if (low && !in_daemon_list(low, daemon_member, daemon_member_count)) { + log_debug("get_fence_actor for %d low actor %d is gone", + node->nodeid, low); + + node->fence_actors[low_i] = 0; + goto retry; + } + + return low; +} + +/* if an actor fails to fence, it will send that result, and others + will clear it from the actors, which will cause the next lowest + actor to try */ + +static void clear_fence_actor(int nodeid, int actor) +{ + struct node_daemon *node; + int i; + + node = 
get_node_daemon(nodeid); + if (!node) + return; + + for (i = 0; i < MAX_NODES; i++) { + if (node->fence_actors[i] == actor) { + node->fence_actors[i] = 0; + return; + } + } +} + +/* TODO: handle delayed cleanup of more than one pid */ + +static void fence_pid_cancel(int nodeid, int pid) +{ + struct node_daemon *node; + int rv, result; + + log_debug("fence_pid_cancel nodeid %d pid %d", nodeid, pid); + + kill(pid, SIGKILL); + usleep(500000); + + rv = fence_result(nodeid, pid, &result); + if (rv == -EAGAIN) { + /* Try again later */ + daemon_clear_nodeid = nodeid; + daemon_clear_pid = pid; + } else { + log_debug("fence_pid_cancel nodeid %d pid %d done %d", + nodeid, pid, result); + + daemon_clear_nodeid = 0; + daemon_clear_pid = 0; + + node = get_node_daemon(nodeid); + if (node && node->fence_pid == pid) { + node->fence_pid_wait = 0; + node->fence_pid = 0; + } + } +} + +/* + * fence_in_progress_unknown (fipu) + * + * If current daemon members are fencing someone, and a new node + * joins, that new node needs to wait for the previous members to + * finish any fencing they're doing before it can start a lockspace. + * + * The previous members may be fencing the last node that was using + * the lockspace the new node is going to use, so if it doesn't wait, + * it could start using a lockspace with an unfenced user. + * + * So, the daemon starts with fence_in_progress_unknown set to + * indicate that other nodes may be fencing someone, and it won't + * start any lockspaces until it is clear. + * + * A node starts with fence_in_progress_unknown set and won't + * start any lockspaces until it's clear. + * + * When using startup_fencing: + * + * . When all nodes start up together, all have fipu set, + * and will go through startup fencing, which will eventually + * result in all nodes either being clean daemon members or fenced, + * so everyone will clear fipu by seeing that. + * + * . The more common case is when a new node joins other previously + * running nodes. 
The new node needs to be told that the others + * have no outstanding fencing ops before it can clear fipu. + * A previous member does send_fence_clear(0) to a new node once + * all fencing is complete. The two flags in send_fence_clear are + * usually sent together but may sometimes may be in separate messages: + * send_fence_clear(0, CLEAR_STARTUP) to clear startup_nodes right away + * send_fence_clear(0, CLEAR_FIPU) to clear fipu once all fencing is done + * + * When not using startup_fencing: + * + * . When all nodes start up together, all have fipu set, and all + * will be waiting to receive_fence_clear from a previous node + * in order to clear it. The nodes need to detect this situation, + * and when they do, they will know that everyone is in startup, + * so there can be no pending fencing on a previous node, so all + * can clear fipu. To detect this case, when a node starts up + * with !startup_fence, it sends a special send_fence_clear(-ENODATA, FIPU) + * message about itself to indicate it has fipu set and needs it cleared. + * After sending this, it checks to see if all present nodes have sent + * this same message about themselves. If so, then this startup + * case has been detected, an all will clear fipu. + * + * . New nodes that join after this startup initialization will be + * handled the same as when startup_fencing is set (above). 
+ * + * + * startup_fencing + * --------------- + * + * case A + * all nodes start up, + * all have fipu set, + * all wait for startup_nodes to be empty, (joined or moved to need_fencing) + * all wait for no daemon_nodes to need_fencing, (joined or were fenced) + * all clear fipu + * + * later, + * + * case B + * new node starts, + * new node has fipu set, + * cur node sees need_fence_clear on new node + * cur node sees no pending fencing ops, + * cur node send_fence_clear(0) to new node, + * new node clears startup_nodes and fipu + * + * !startup_fencing + * ---------------- + * + * case C + * all nodes start up, + * all have fipu set, + * all send_fence_clear(-ENODATA,FIPU), + * all receive_fence_clear(-ENODATA,FIPU) from everyone, + * all_daemon_members_fipu() is 1, + * all clear fipu + * + * later same as case B above + */ + +/* + * TODO: limit to one agent running at once, in case both + * instances need to log into the same switch, for example. + */ + +static void daemon_fence_work(void) +{ + struct node_daemon *node, *safe; + int rv, nodeid, pid, need, low, actor, result; + uint32_t flags; + + if (daemon_ringid_wait) { + /* We've seen a nodedown confchg callback, but not the + corresponding ringid callback. 
*/ + log_debug("fence work wait for cpg ringid"); + return; + } + + if (cluster_ringid_seq != daemon_ringid.seq) { + /* wait for ringids to be in sync */ + log_debug("fence work wait for cluster ringid"); + return; + } + + /* poll_fencing++; */ + + if (cfgd_enable_quorum_fencing && !cluster_quorate) { + /* wait for quorum before doing any fencing, but if there + is none, send_fence_clear below can unblock new nodes */ + log_debug("fence work wait for quorum"); + goto out_fipu; + } + + /* + * startup fencing + */ + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (is_clean_daemon_member(node->nodeid)) { + log_debug("fence startup %d member skip", node->nodeid); + list_del(&node->list); + free(node); + continue; + } + + if (!cfgd_startup_fence) + continue; + + if (monotime() - last_join_monotime < cfgd_startup_fence) { + log_debug("fence startup %d delay %d from %llu", + node->nodeid, cfgd_startup_fence, + (unsigned long long)last_join_monotime); + poll_fencing++; + continue; + } + + /* clear this entry and create a daemon_nodes entry with + need_fencing and the fence loops below will handle it */ + + nodeid = node->nodeid; + list_del(&node->list); + free(node); + + node = add_node_daemon(nodeid); + if (!node) { + log_debug("fence startup %d add failed", nodeid); + continue; + } + if (node->need_fencing) { + /* don't think this should happen? 
*/ + log_error("fence startup %d already set", nodeid); + continue; + } + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = REASON_STARTUP_FENCING; + node->fail_monotime = cluster_joined_monotime - 1; + node->fail_walltime = cluster_joined_walltime - 1; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 1); + + log_debug("fence startup nodeid %d act %d", node->nodeid, low); + } + + /* + * request fencing + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (node->fence_pid_wait) + continue; + + if (is_clean_daemon_member(node->nodeid)) { + /* node rejoined cleanly, doesn't need fencing */ + log_debug("fence request %d member skip", node->nodeid); + node->need_fencing = 0; + continue; + } + + /* get_fence_actor picks the low nodeid that existed + when node failed and is still around. if the current + actor fails, get_fence_actor will not find it in the + members list, will clear it, and return the next actor */ + + actor = get_fence_actor(node); + + if (!actor) { + log_error("fence request %d no actor", node->nodeid); + continue; + } + + if (actor != our_nodeid) { + log_debug("fence request %d defer to %d", + node->nodeid, actor); + continue; + } + + log_debug("fence request %d", node->nodeid); + + rv = fence_request(node->nodeid, + node->fail_walltime, node->fail_monotime, + &node->fence_config, &pid); + if (rv < 0) { + send_fence_result(node->nodeid, rv, 0, time(NULL)); + continue; + } + + node->fence_pid_wait = 1; + node->fence_pid = pid; + node->fence_request_time = monotime(); + } + + /* + * check outstanding fence requests + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (!node->fence_pid_wait) { + /* + * another node is the actor, or we were actor, + * sent done msg and are waiting to recv it + */ + log_debug("fence wait %d for done", node->nodeid); + continue; + 
} + + if (is_clean_daemon_member(node->nodeid)) { + /* + * node has rejoined in clean state so we can + * abort outstanding fence op for it. all nodes + * will see and do this, so we don't need to send + * a fence result. + */ + log_debug("fence wait %d member skip", node->nodeid); + node->need_fencing = 0; + node->fence_walltime = time(NULL); + node->fence_monotime = monotime(); + fence_pid_cancel(node->nodeid, node->fence_pid); + continue; + } + + poll_fencing++; + + rv = fence_result(node->nodeid, node->fence_pid, &result); + if (rv == -EAGAIN) { + /* agent pid is still running */ + log_debug("fence wait %d pid %d running", + node->nodeid, node->fence_pid); + continue; + } + + if (rv < 0) { + /* shouldn't happen */ + log_error("fence wait %d pid %d error %d", + node->nodeid, node->fence_pid_wait, rv); + node->fence_pid_wait = 0; + continue; + } + + if (!result) { + /* agent exit 0, if there's another agent to run in + parallel, set it to run next, otherwise success */ + + log_debug("fence nodeid %d pid %d succeeded", + node->nodeid, node->fence_pid); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_parallel(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, 0, 0, time(NULL)); + } else { + /* agent exit 1, if there's another agent to run at + next priority, set it to run next, otherwise fail */ + + log_debug("fence nodeid %d pid %d failed %d", + node->nodeid, node->fence_pid, result); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_priority(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, result, 0, time(NULL)); + } + } + + /* + * clear fence_in_progress_unknown + */ + out_fipu: + need = nodes_need_fencing(); + + if (cfgd_startup_fence && fence_in_progress_unknown && !need && list_empty(&startup_nodes)) { + /* + * case A in comment above + * all nodes are starting and have fipu set, they all do + * startup fencing, and eventually see unknown nodes become + * 
members or get fenced, so all clear fipu for themselves. + */ + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 startup"); + } + + if (!fence_in_progress_unknown) { + /* + * case B in comment above + * some cur nodes have fipu clear, new nodes have fipu set. + * A current node needs to send_fence_clear to the new nodes + * once all fencing is done so they clear fipu. + */ + low = 0; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || node->need_fence_clear) + continue; + if (!low || node->nodeid < low) + low = node->nodeid; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || !node->need_fence_clear) + continue; + if (node->nodeid == our_nodeid) { + node->need_fence_clear = 0; + continue; + } + if (low != our_nodeid) + continue; + + flags = 0; + + if (node->need_fence_clear & FR_CLEAR_STARTUP) { + flags |= FR_CLEAR_STARTUP; + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + } + + if ((node->need_fence_clear & FR_CLEAR_FIPU) && !need) { + flags |= FR_CLEAR_FIPU; + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } + + if (!flags) + continue; + + send_fence_clear(node->nodeid, 0, flags, 0); + } + } + + if (!cfgd_startup_fence && fence_in_progress_unknown) { + /* + * case C in comment above + * all nodes are starting and have fipu set. All expect a + * previous node to send_fence_clear so they can clear fipu. + * But there are no previous nodes. They need to detect this + * condition. Each node does send_fence_clear(ENODATA,FIPU). + * When all have received this from all, condition is + * detected and all clear fipu. 
+ */ + if (all_daemon_members_fipu()) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 all_fipu"); + } else if (last_join_seq > send_fipu_seq) { + /* the seq numbers keep us from spamming this msg */ + send_fence_clear(our_nodeid, -ENODATA, FR_FIPU, 0); + log_debug("send_fence_clear %d fipu", our_nodeid); + send_fipu_seq = last_join_seq; + } + } + + /* + * clean up a zombie pid from an agent we killed + */ + + if (daemon_clear_pid) + fence_pid_cancel(daemon_clear_nodeid, daemon_clear_pid); +} + +void process_fencing_changes(void) +{ + poll_fencing = 0; + daemon_fence_work(); +} + +static void receive_fence_clear(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_clear invalid len %d from %d", + len, hd->nodeid); + return; + } + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_clear from %d no daemon node %d", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_clear from %d for %d result %d flags %x", + hd->nodeid, fr->nodeid, fr->result, fr->flags); + + /* + * A node sends this message about itself indicating that it's in + * startup with fipu set. The only time we care about node->fipu + * is when all nodes are fipu in startup. node->need_fence_clear + * and node->fipu are not related, they address different cases. 
+ */ + if ((fr->result == -ENODATA) && (fr->flags & FR_FIPU)) { + if (!fence_in_progress_unknown) + return; + + node->fence_in_progress_unknown = 1; + return; + } + + /* + * A previous member sends this to new members to tell them that + * they can clear startup_nodes and clear fipu. These two flags + * may come in separate messages if there is a pending fencing op + * when the new member joins (CLEAR_STARTUP will come right away, + * but CLEAR_FIPU will come once the fencing op is done.) + */ + if (!fr->result && (node->nodeid == our_nodeid)) { + if ((fr->flags & FR_CLEAR_STARTUP) && !list_empty(&startup_nodes)) { + count = clear_startup_node(0, 1); + log_debug("clear_startup_nodes %d", count); + } + + if ((fr->flags & FR_CLEAR_FIPU) && fence_in_progress_unknown) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 recv"); + } + } + + /* this node doesn't need these flags any more */ + if (!fr->result) { + if (fr->flags & FR_CLEAR_STARTUP) + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + if (fr->flags & FR_CLEAR_FIPU) + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } +} + +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_clear no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_CLEAR); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_CLEAR); +} + +static void receive_fence_result(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int 
count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_result invalid len %d from %d", + len, hd->nodeid); + return; + } + + count = clear_startup_node(fr->nodeid, 0); + if (count) { + log_debug("receive_fence_result from %d for %d clear startup", + hd->nodeid, fr->nodeid); + } + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_result from %d for %d no daemon node", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_result from %d for %d result %d walltime %llu", + hd->nodeid, fr->nodeid, fr->result, + (unsigned long long)fr->fence_walltime); + + if (!node->need_fencing) { + /* should never happen */ + log_error("receive_fence_result from %d for %d result %d no need_fencing", + hd->nodeid, fr->nodeid, fr->result); + return; + } + + if (fr->result == -ECANCELED) { + /* if an agent pid is running, kill it and clean up */ + if (node->fence_pid_wait && node->fence_pid) + fence_pid_cancel(node->nodeid, node->fence_pid); + fr->result = 0; /* force success below */ + } + + if (!fr->result) { + node->need_fencing = 0; + node->fence_walltime = fr->fence_walltime; + node->fence_monotime = monotime(); + node->fence_actor_done = hd->nodeid; + } else { + clear_fence_actor(fr->nodeid, hd->nodeid); + } +} + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_result no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + 
sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_RESULT); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_RESULT); +} + +void fence_ack_node(int nodeid) +{ + send_fence_result(nodeid, -ECANCELED, 0, time(NULL)); +} + +void set_protocol_stateful(void) +{ + our_protocol.dr_ver.flags |= PV_STATEFUL; +} + +static void pv_in(struct protocol_version *pv) +{ + pv->major = le16_to_cpu(pv->major); + pv->minor = le16_to_cpu(pv->minor); + pv->patch = le16_to_cpu(pv->patch); + pv->flags = le16_to_cpu(pv->flags); +} + +static void pv_out(struct protocol_version *pv) +{ + pv->major = cpu_to_le16(pv->major); + pv->minor = cpu_to_le16(pv->minor); + pv->patch = cpu_to_le16(pv->patch); + pv->flags = cpu_to_le16(pv->flags); +} + +static void protocol_in(struct protocol *proto) +{ + pv_in(&proto->dm_ver); + pv_in(&proto->km_ver); + pv_in(&proto->dr_ver); + pv_in(&proto->kr_ver); +} + +static void protocol_out(struct protocol *proto) +{ + pv_out(&proto->dm_ver); + pv_out(&proto->km_ver); + pv_out(&proto->dr_ver); + pv_out(&proto->kr_ver); +} + +/* go through member list saved in last confchg, see if we have received a + proto message from each */ + +static int all_protocol_messages(void) +{ + struct node_daemon *node; + int i; + + if (!daemon_member_count) + return 0; + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("all_protocol_messages no node %d", + daemon_member[i].nodeid); + return 0; + } + + if (!node->proto.daemon_max[0]) + return 0; + } + return 1; +} + +static int pick_min_protocol(struct protocol *proto) +{ + uint16_t mind[4]; + uint16_t mink[4]; + struct node_daemon *node; + int i; + + memset(&mind, 0, sizeof(mind)); + memset(&mink, 0, sizeof(mink)); + + /* first choose the minimum major 
*/ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("pick_min_protocol no node %d", + daemon_member[i].nodeid); + return -1; + } + + if (!mind[0] || node->proto.daemon_max[0] < mind[0]) + mind[0] = node->proto.daemon_max[0]; + + if (!mink[0] || node->proto.kernel_max[0] < mink[0]) + mink[0] = node->proto.kernel_max[0]; + } + + if (!mind[0] || !mink[0]) { + log_error("pick_min_protocol zero major number"); + return -1; + } + + /* second pick the minimum minor with the chosen major */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0]) { + if (!mind[1] || node->proto.daemon_max[1] < mind[1]) + mind[1] = node->proto.daemon_max[1]; + } + + if (mink[0] == node->proto.kernel_max[0]) { + if (!mink[1] || node->proto.kernel_max[1] < mink[1]) + mink[1] = node->proto.kernel_max[1]; + } + } + + if (!mind[1] || !mink[1]) { + log_error("pick_min_protocol zero minor number"); + return -1; + } + + /* third pick the minimum patch with the chosen major.minor */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0] && + mind[1] == node->proto.daemon_max[1]) { + if (!mind[2] || node->proto.daemon_max[2] < mind[2]) + mind[2] = node->proto.daemon_max[2]; + } + + if (mink[0] == node->proto.kernel_max[0] && + mink[1] == node->proto.kernel_max[1]) { + if (!mink[2] || node->proto.kernel_max[2] < mink[2]) + mink[2] = node->proto.kernel_max[2]; + } + } + + if (!mind[2] || !mink[2]) { + log_error("pick_min_protocol zero patch number"); + return -1; + } + + memcpy(&proto->daemon_run, &mind, sizeof(mind)); + memcpy(&proto->kernel_run, &mink, sizeof(mink)); + return 0; +} + +static void receive_protocol(struct dlm_header *hd, int len) +{ + struct protocol *p; + struct node_daemon *node; + int 
new = 0; + + p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); + protocol_in(p); + + if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { + log_error("receive_protocol invalid len %d from %d", + len, hd->nodeid); + return; + } + + /* zero is an invalid version value */ + + if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || + !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { + log_error("receive_protocol invalid max value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], + p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); + return; + } + + /* the run values will be zero until a version is set, after + which none of the run values can be zero */ + + if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || + !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { + log_error("receive_protocol invalid run value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], + p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); + return; + } + + /* save this node's proto so we can tell when we've got all, and + use it to select a minimum protocol from all */ + + node = get_node_daemon(hd->nodeid); + if (!node) { + log_error("receive_protocol no node %d", hd->nodeid); + return; + } + + if (!node->daemon_member) { + log_error("receive_protocol node %d not member", hd->nodeid); + return; + } + + log_debug("receive_protocol %d max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + p->daemon_max[0], p->daemon_max[1], + p->daemon_max[2], p->daemon_max[3], + p->daemon_run[0], p->daemon_run[1], + p->daemon_run[2], p->daemon_run[3]); + + if (memcmp(&node->proto, p, sizeof(struct protocol))) { + log_debug("daemon node %d prot max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + 
node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + new = 1; + } + + /* checking zero node->daemon_max[0] is a way to tell if we've received + an acceptable (non-stateful) proto message from the node since we + saw it join the daemon cpg */ + + if (hd->nodeid != our_nodeid && + !node->proto.daemon_max[0] && + (p->dr_ver.flags & PV_STATEFUL) && + (our_protocol.dr_ver.flags & PV_STATEFUL)) { + + log_debug("daemon node %d stateful merge", hd->nodeid); + log_debug("daemon node %d join %llu left %llu local quorum %llu", + hd->nodeid, + (unsigned long long)node->daemon_add_time, + (unsigned long long)node->daemon_rem_time, + (unsigned long long)cluster_quorate_monotime); + + if (cluster_quorate && node->daemon_rem_time && + cluster_quorate_monotime < node->daemon_rem_time) { + log_debug("daemon node %d kill due to stateful merge", hd->nodeid); + if (!node->killed) + kick_node_from_cluster(hd->nodeid); + node->killed = 1; + } + + /* don't save p->proto into node->proto; we need to come + through here based on zero daemon_max[0] for other proto + messages like this one from the same node */ + + return; + } + + if (new) { + memcpy(&node->proto, p, sizeof(struct protocol)); + + log_debug("daemon node %d save max %u.%u.%u.%x run %u.%u.%u.%x", + node->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + } + + /* if we have zero run values, and this msg has non-zero run values, + then adopt them as ours; otherwise save this proto message */ + + if (our_protocol.daemon_run[0]) + return; + + if (p->daemon_run[0]) { + our_protocol.daemon_run[0] = p->daemon_run[0]; + our_protocol.daemon_run[1] = p->daemon_run[1]; + our_protocol.daemon_run[2] = p->daemon_run[2]; + + our_protocol.kernel_run[0] = p->kernel_run[0]; + our_protocol.kernel_run[1] = 
p->kernel_run[1]; + our_protocol.kernel_run[2] = p->kernel_run[2]; + + log_debug("run protocol from nodeid %d", hd->nodeid); + } +} + +static void send_protocol(struct protocol *proto) +{ + struct dlm_header *hd; + struct protocol *pr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct protocol); + buf = malloc(len); + if (!buf) { + log_error("send_protocol no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + pr = (struct protocol *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); + hd->nodeid = cpu_to_le32(our_nodeid); + + memcpy(pr, proto, sizeof(struct protocol)); + protocol_out(pr); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); +} + +int set_protocol(void) +{ + struct protocol proto; + struct pollfd pollfd; + int sent_proposal = 0; + int rv; + + memset(&pollfd, 0, sizeof(pollfd)); + pollfd.fd = cpg_fd_daemon; + pollfd.events = POLLIN; + + while (1) { + if (our_protocol.daemon_run[0]) + break; + + if (!sent_proposal && all_protocol_messages()) { + /* propose a protocol; look through info from all + nodes and pick the min for both daemon and kernel, + and propose that */ + + sent_proposal = 1; + + /* copy our max values */ + memcpy(&proto, &our_protocol, sizeof(struct protocol)); + + rv = pick_min_protocol(&proto); + if (rv < 0) + return rv; + + log_debug("set_protocol member_count %d propose " + "daemon %u.%u.%u kernel %u.%u.%u", + daemon_member_count, + proto.daemon_run[0], proto.daemon_run[1], + proto.daemon_run[2], proto.kernel_run[0], + proto.kernel_run[1], proto.kernel_run[2]); + + send_protocol(&proto); + } + + /* only process messages/events from daemon cpg until protocol + is established */ + + rv = poll(&pollfd, 1, -1); + if (rv == -1 && errno == EINTR) { + if (daemon_quit) + return -1; + continue; + } + if (rv < 0) { + log_error("set_protocol poll errno %d", errno); + return -1; + } + + if (pollfd.revents & POLLIN) + process_cpg_daemon(0); + if 
(pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { + log_error("set_protocol poll revents %u", + pollfd.revents); + return -1; + } + } + + if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || + our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { + log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2]); + return -1; + } + + if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || + our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { + log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + return -1; + } + + log_debug("daemon run %u.%u.%u max %u.%u.%u " + "kernel run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2], + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + + send_protocol(&our_protocol); + return 0; +} + +static void deliver_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + uint32_t nodeid, uint32_t pid, + void *data, size_t len) +{ + struct dlm_header *hd; + + if (len < sizeof(*hd)) { + log_error("deliver_cb short message %zd", len); + return; + } + + hd = (struct dlm_header *)data; + dlm_header_in(hd); + + switch (hd->type) { + case DLM_MSG_PROTOCOL: + receive_protocol(hd, len); + break; + case DLM_MSG_FENCE_RESULT: + receive_fence_result(hd, len); + break; + case DLM_MSG_FENCE_CLEAR: + receive_fence_clear(hd, len); + break; + default: + 
log_error("deliver_cb_daemon unknown msg type %d", hd->type); + } + + daemon_fence_work(); +} + +static void confchg_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + struct node_daemon *node; + uint64_t now, now_wall; + int nodedown = 0, procdown = 0, leave = 0; + int we_joined = 0; + int i, reason, low; + + now = monotime(); + now_wall = time(NULL); + + log_config(group_name, member_list, member_list_entries, + left_list, left_list_entries, + joined_list, joined_list_entries); + + memset(&daemon_member, 0, sizeof(daemon_member)); + daemon_member_count = member_list_entries; + + for (i = 0; i < member_list_entries; i++) { + daemon_member[i] = member_list[i]; + /* add struct for nodes we've not seen before */ + add_node_daemon(member_list[i].nodeid); + } + + memset(&daemon_joined, 0, sizeof(daemon_joined)); + daemon_joined_count = joined_list_entries; + + for (i = 0; i < joined_list_entries; i++) { + daemon_joined[i] = joined_list[i]; + if (joined_list[i].nodeid == our_nodeid) + we_joined = 1; + } + + memset(&daemon_remove, 0, sizeof(daemon_remove)); + daemon_remove_count = left_list_entries; + + for (i = 0; i < left_list_entries; i++) { + daemon_remove[i] = left_list[i]; + + if (left_list[i].reason == CPG_REASON_NODEDOWN) + nodedown++; + else if (left_list[i].reason == CPG_REASON_PROCDOWN) + procdown++; + else if (left_list[i].reason == CPG_REASON_LEAVE) + leave++; + } + + if (nodedown || procdown || leave) + log_debug("%s left nodedown %d procdown %d leave %d", + group_name->value, nodedown, procdown, leave); + + if (nodedown) + daemon_ringid_wait = 1; + + if (joined_list_entries) + send_protocol(&our_protocol); + + list_for_each_entry(node, &daemon_nodes, list) { + if (in_daemon_list(node->nodeid, daemon_member, daemon_member_count)) 
{ + if (node->daemon_member) + continue; + + /* node joined daemon cpg */ + node->daemon_member = 1; + node->daemon_add_time = now; + + last_join_monotime = now; + last_join_seq++; + + /* a joining node shows prev members in joined list */ + if (!we_joined) + node->need_fence_clear = FR_CLEAR_STARTUP|FR_CLEAR_FIPU; + + if (node->need_fencing) { + /* need_fencing will be cleared if we accept a + valid proto from it */ + log_error("daemon new nodeid %d needs fencing", + node->nodeid); + } + + } else { + if (!node->daemon_member) + continue; + + /* node left daemon cpg */ + node->daemon_member = 0; + node->killed = 0; + memset(&node->proto, 0, sizeof(struct protocol)); + node->daemon_rem_time = now; + + /* tell loop below to look at this node */ + node->recover_setup = 1; + } + } + + /* set up recovery work for nodes that just failed */ + + /* TODO: limit to nodes with a valid proto? + * node_history_lockspace_fail() would only set + * need_fencing if node->start_time was non-zero. */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->recover_setup) + continue; + + node->recover_setup = 0; + reason = 0; + low = 0; + + if (!cfgd_enable_fencing) + continue; + + if (node->need_fencing) { + log_error("daemon remove nodeid %d already needs fencing", + node->nodeid); + continue; + } + + for (i = 0; i < left_list_entries; i++) { + if (left_list[i].nodeid != node->nodeid) + continue; + reason = left_list[i].reason; + break; + } + + if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) { + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = reason; + node->fail_monotime = now; + node->fail_walltime = now_wall; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 0); + } + + log_debug("daemon remove %d %s need_fencing %d low %d", + node->nodeid, reason_str(reason), node->need_fencing, low); + } + + daemon_fence_work(); +} + +static void 
totem_cb_daemon(cpg_handle_t handle, + struct cpg_ring_id ring_id, + uint32_t member_list_entries, + const uint32_t *member_list) +{ + daemon_ringid.nodeid = ring_id.nodeid; + daemon_ringid.seq = ring_id.seq; + daemon_ringid_wait = 0; + + log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); + + daemon_fence_work(); +} + +static cpg_model_v1_data_t cpg_callbacks_daemon = { + .cpg_deliver_fn = deliver_cb_daemon, + .cpg_confchg_fn = confchg_cb_daemon, + .cpg_totem_confchg_fn = totem_cb_daemon, + .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, +}; + +void process_cpg_daemon(int ci) +{ + cs_error_t error; + + error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); + if (error != CS_OK) + log_error("daemon cpg_dispatch error %d", error); +} + +int setup_cpg_daemon(void) +{ + cs_error_t error; + struct cpg_name name; + int i = 0; + + /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible + with cluster4/RHEL7 */ + + memset(&our_protocol, 0, sizeof(our_protocol)); + + if (cfgd_enable_fscontrol) + our_protocol.daemon_max[0] = 2; + else + our_protocol.daemon_max[0] = 3; + + our_protocol.daemon_max[1] = 1; + our_protocol.daemon_max[2] = 1; + our_protocol.kernel_max[0] = 1; + our_protocol.kernel_max[1] = 1; + our_protocol.kernel_max[2] = 1; + + error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, + (cpg_model_data_t *)&cpg_callbacks_daemon, + NULL); + if (error != CS_OK) { + log_error("daemon cpg_initialize error %d", error); + return -1; + } + + cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_join %s ...", name.value); + retry: + error = cpg_join(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_join error retrying"); + goto retry; + } + if (error != CS_OK) { + log_error("daemon cpg_join error %d", error); + goto fail; + } + + 
log_debug("setup_cpg_daemon %d", cpg_fd_daemon); + return cpg_fd_daemon; + + fail: + cpg_finalize(cpg_handle_daemon); + return -1; +} + +void close_cpg_daemon(void) +{ + struct lockspace *ls; + cs_error_t error; + struct cpg_name name; + int i = 0; + + if (!cpg_handle_daemon) + return; + if (cluster_down) + goto fin; + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_leave %s ...", name.value); + retry: + error = cpg_leave(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_leave error retrying"); + goto retry; + } + if (error != CS_OK) + log_error("daemon cpg_leave error %d", error); + fin: + list_for_each_entry(ls, &lockspaces, list) { + if (ls->cpg_handle) + cpg_finalize(ls->cpg_handle); + } + cpg_finalize(cpg_handle_daemon); +} + +void init_daemon(void) +{ + INIT_LIST_HEAD(&daemon_nodes); + INIT_LIST_HEAD(&startup_nodes); + +} diff --git a/dlm_controld/dlm_controld.h b/dlm_controld/dlm_controld.h index 638b0de..96b425c 100644 --- a/dlm_controld/dlm_controld.h +++ b/dlm_controld/dlm_controld.h @@ -29,6 +29,7 @@ #define DLMC_CMD_FS_NOTIFIED 9 #define DLMC_CMD_DEADLOCK_CHECK 10 #define DLMC_CMD_DUMP_LOG_PLOCK 11 +#define DLMC_CMD_FENCE_ACK 12
struct dlmc_header { unsigned int magic; diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index 5fca041..5e83db3 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -9,16 +9,18 @@ #ifndef __DLM_DAEMON_DOT_H__ #define __DLM_DAEMON_DOT_H__
-#include <sys/types.h> #include <asm/types.h> +#include <sys/types.h> #include <sys/uio.h> -#include <netinet/in.h> #include <sys/socket.h> #include <sys/un.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <sys/utsname.h> #include <sys/poll.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netinet/in.h> #include <netinet/in.h> #include <arpa/inet.h> #include <net/if.h> @@ -37,7 +39,6 @@ #include <syslog.h> #include <sched.h> #include <signal.h> -#include <sys/time.h> #include <dirent.h>
#include <corosync/cpg.h> @@ -45,6 +46,7 @@ #include <linux/dlmconstants.h> #include "libdlmcontrol.h" #include "dlm_controld.h" +#include "fence_config.h" #include "list.h" #include "rbtree.h" #include "linux_endian.h" @@ -60,9 +62,9 @@
/* TODO: get CONFDIR, LOGDIR, RUNDIR from build */
-#define RUNDIR "/var/run/cluster" -#define LOGDIR "/var/log/cluster" -#define CONFDIR "/etc" +#define RUNDIR "/var/run/dlm" +#define LOGDIR "/var/log/dlm" +#define CONFDIR "/etc/dlm"
#define RUN_FILE_NAME "dlm_controld.pid" #define LOG_FILE_NAME "dlm_controld.log" @@ -80,7 +82,8 @@
#define DEFAULT_DEBUG_LOGFILE 0 #define DEFAULT_ENABLE_FENCING 1 -#define DEFAULT_ENABLE_QUORUM 0 +#define DEFAULT_ENABLE_QUORUM_FENCING 1 +#define DEFAULT_ENABLE_QUORUM_LOCKSPACE 0 #define DEFAULT_ENABLE_FSCONTROL 0 #define DEFAULT_ENABLE_PLOCK 1 #define DEFAULT_PLOCK_DEBUG 0 @@ -89,6 +92,8 @@ #define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */ #define DEFAULT_DROP_RESOURCES_COUNT 10 #define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */ +#define DEFAULT_FENCE_ALL_AGENT "dlm_stonith" +#define DEFAULT_STARTUP_FENCE 30
/* DLM_LOCKSPACE_LEN: maximum lockspace name length, from linux/dlmconstants.h. Copied in libdlm.h so apps don't need to include the kernel header. @@ -118,9 +123,8 @@ EXTERN int daemon_debug_opt; EXTERN int daemon_quit; EXTERN int cluster_down; -EXTERN int poll_ringid; +EXTERN int poll_lockspaces; EXTERN int poll_fencing; -EXTERN int poll_quorum; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; @@ -128,20 +132,24 @@ EXTERN int plock_fd; EXTERN int plock_ci; EXTERN struct list_head lockspaces; EXTERN int cluster_quorate; -EXTERN uint64_t quorate_time; +EXTERN uint64_t cluster_quorate_monotime; +EXTERN uint64_t cluster_joined_monotime; +EXTERN uint64_t cluster_joined_walltime; EXTERN uint32_t cluster_ringid_seq; EXTERN char cluster_name[DLM_LOCKSPACE_LEN+1]; EXTERN int our_nodeid; EXTERN uint32_t control_minor; EXTERN uint32_t monitor_minor; EXTERN uint32_t plock_minor; +EXTERN struct fence_device fence_all_device;
EXTERN int optk_debug; EXTERN int optk_timewarn; EXTERN int optk_protocol; EXTERN int optd_debug_logfile; EXTERN int optd_enable_fencing; -EXTERN int optd_enable_quorum; +EXTERN int optd_enable_quorum_fencing; +EXTERN int optd_enable_quorum_lockspace; EXTERN int optd_enable_fscontrol; EXTERN int optd_enable_plock; EXTERN int optd_plock_debug; @@ -150,13 +158,16 @@ EXTERN int optd_plock_ownership; EXTERN int optd_drop_resources_time; EXTERN int optd_drop_resources_count; EXTERN int optd_drop_resources_age; +EXTERN int optd_startup_fence; +EXTERN int optd_fence_all_agent;
EXTERN int cfgk_debug; EXTERN int cfgk_timewarn; EXTERN int cfgk_protocol; EXTERN int cfgd_debug_logfile; EXTERN int cfgd_enable_fencing; -EXTERN int cfgd_enable_quorum; +EXTERN int cfgd_enable_quorum_fencing; +EXTERN int cfgd_enable_quorum_lockspace; EXTERN int cfgd_enable_fscontrol; EXTERN int cfgd_enable_plock; EXTERN int cfgd_plock_debug; @@ -165,6 +176,8 @@ EXTERN int cfgd_plock_ownership; EXTERN int cfgd_drop_resources_time; EXTERN int cfgd_drop_resources_count; EXTERN int cfgd_drop_resources_age; +EXTERN int cfgd_startup_fence; +EXTERN char fence_all_agent[PATH_MAX];
#define LOG_DUMP_SIZE DLMC_DUMP_SIZE
@@ -194,7 +207,9 @@ enum { DLM_MSG_DEADLK_CYCLE_START, DLM_MSG_DEADLK_CYCLE_END, DLM_MSG_DEADLK_CHECKPOINT_READY, - DLM_MSG_DEADLK_CANCEL_LOCK + DLM_MSG_DEADLK_CANCEL_LOCK, + DLM_MSG_FENCE_RESULT, + DLM_MSG_FENCE_CLEAR, };
/* dlm_header flags */ @@ -288,15 +303,10 @@ void setup_config(int update); int get_weight(int nodeid, char *lockspace);
/* cpg.c */ -int setup_cpg_daemon(void); -void close_cpg_daemon(void); -void process_cpg_daemon(int ci); -int set_protocol(void); void process_lockspace_changes(void); -void dlm_send_message(struct lockspace *ls, char *buf, int len); +void process_fencing_changes(void); int dlm_join_lockspace(struct lockspace *ls); int dlm_leave_lockspace(struct lockspace *ls); -const char *msg_name(int type); void update_flow_control_status(void); int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node); int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace); @@ -305,6 +315,36 @@ int set_lockspace_nodes(struct lockspace *ls, int option, int *node_count, struct dlmc_node **nodes_out); int set_fs_notified(struct lockspace *ls, int nodeid);
+/* daemon_cpg.c */ +void init_daemon(void); +void fence_ack_node(int nodeid); +void add_startup_node(int nodeid); +const char *reason_str(int reason); +const char *msg_name(int type); +void dlm_send_message(struct lockspace *ls, char *buf, int len); +void dlm_header_in(struct dlm_header *hd); +int dlm_header_validate(struct dlm_header *hd, int nodeid); +int fence_node_time(int nodeid, uint64_t *last_fenced); +int fence_in_progress(int *in_progress); +int setup_cpg_daemon(void); +void close_cpg_daemon(void); +void process_cpg_daemon(int ci); +void set_protocol_stateful(void); +int set_protocol(void); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries); + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t *member_list, + size_t member_list_entries); + /* deadlock.c */ void setup_deadlock(void); void send_cycle_start(struct lockspace *ls); @@ -346,11 +386,12 @@ int setup_cluster_cfg(void); void close_cluster_cfg(void); void process_cluster_cfg(int ci); void kick_node_from_cluster(int nodeid); +int setup_node_config(void);
/* fence.c */ -int fence_request(int nodeid); -int fence_node_time(int nodeid, uint64_t *last_fenced_time); -int fence_in_progress(int *count); +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out); +int fence_result(int nodeid, int pid, int *result);
/* netlink.c */ int setup_netlink(void); diff --git a/dlm_controld/fence.c b/dlm_controld/fence.c index 50b7d4d..91c5e67 100644 --- a/dlm_controld/fence.c +++ b/dlm_controld/fence.c @@ -7,28 +7,147 @@ */
#include "dlm_daemon.h" -#include <pacemaker/crm/stonith-ng.h>
-int fence_request(int nodeid) +static int run_agent(char *agent, char *args, int *pid_out) { - int rv; - rv = stonith_api_kick_helper(nodeid, 300, 1); - if (rv) { - log_error("stonith_api_kick_helper %d error %d", nodeid, rv); - return rv; + int pid, len; + int pw_fd = -1; /* parent write file descriptor */ + int cr_fd = -1; /* child read file descriptor */ + int pfd[2]; + + len = strlen(args); + + if (pipe(pfd)) + return -errno; + + cr_fd = pfd[0]; + pw_fd = pfd[1]; + + pid = fork(); + if (pid < 0) { + close(cr_fd); + close(pw_fd); + return -errno; } - return 0; + + if (pid) { + /* parent */ + int ret; + + do { + ret = write(pw_fd, args, len); + } while (ret < 0 && errno == EINTR); + + if (ret != len) + goto fail; + + close(cr_fd); + close(pw_fd); + + *pid_out = pid; + return 0; + } else { + /* child: must never return into the daemon's code path, + so exit on any setup failure instead of goto fail */ + int c_stdout, c_stderr; + + /* redirect agent stdout/stderr to /dev/null */ + close(1); + c_stdout = open("/dev/null", O_WRONLY); + if (c_stdout < 0) + exit(EXIT_FAILURE); + close(2); + c_stderr = open("/dev/null", O_WRONLY); + if (c_stderr < 0) + exit(EXIT_FAILURE); + + /* redirect agent stdin from parent */ + close(0); + if (dup(cr_fd) < 0) + exit(EXIT_FAILURE); + + close(cr_fd); + close(pw_fd); + + execlp(agent, agent, NULL); + exit(EXIT_FAILURE); + } + fail: + close(cr_fd); + close(pw_fd); + return -1; }
-int fence_node_time(int nodeid, uint64_t *last_fenced_time) +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out) { - *last_fenced_time = stonith_api_time_helper(nodeid, 0); + struct fence_device *dev; + char args[FENCE_CONFIG_ARGS_MAX]; + char extra[FENCE_CONFIG_NAME_MAX]; + int rv, pid = -1; + + memset(args, 0, sizeof(args)); + + memset(extra, 0, sizeof(extra)); + snprintf(extra, sizeof(extra)-1, "fail_time=%llu\n", (unsigned long long)fail_walltime); + + dev = fc->dev[fc->pos]; + if (!dev) + return -1; + + rv = fence_config_agent_args(fc, extra, args); + if (rv < 0) { + log_error("fence_request %d args error %d", nodeid, rv); + return rv; + } + + rv = run_agent(dev->agent, args, &pid); + if (rv < 0) { + log_error("fence_request %d agent %s pid %d run error %d", + nodeid, dev->agent, pid, rv); + return rv; + } + + log_debug("fence_request %d pos %d agent %s pid %d running", + nodeid, fc->pos, dev->agent, pid); + + *pid_out = pid; return 0; }
-int fence_in_progress(int *count) +/* + * if pid has exited, return 0 with exit code in result + * if pid is running, return -EAGAIN + * other error, return -EXXX + */ + +int fence_result(int nodeid, int pid, int *result) { - *count = 0; - return 0; + int status, rv; + + rv = waitpid(pid, &status, WNOHANG); + + if (rv < 0) { + /* shouldn't happen */ + log_error("agent pid %d nodeid %d errno %d", + pid, nodeid, errno); + return rv; + + } else if (!rv) { + /* pid still running */ + return -EAGAIN; + + } else if (WIFEXITED(status)) { + /* pid exited */ + + *result = WEXITSTATUS(status); + + log_error("agent pid %d nodeid %d result %d", + pid, nodeid, *result); + return 0; + + } else { + /* pid state changed but still running */ + return -EAGAIN; + } }
diff --git a/dlm_controld/fence_config.c b/dlm_controld/fence_config.c new file mode 100644 index 0000000..9f72e36 --- /dev/null +++ b/dlm_controld/fence_config.c @@ -0,0 +1,403 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include "fence_config.h" + +#if 0 + +Empty new line separates the config for each fence device. + +- + +fence_all fence_foo key=val ... + +Special fence config format that applies to all nodes, allows +no per node config parameters, and cannot be used with any +other fence device configuration. + +- + +device <dev_name> <agent> <dev_args> +connect <dev_name> node=<nodeid> <con_args> + +General fence config format, allowing per node config +parameters. + +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 + +Simple example of nodes connected to switch ports. +If fencing with the device fails, the next device +listed for the node, if any, will be tried. + +- + +device foo:1 fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo:1 node=1 port=1 +connect foo:1 node=2 port=2 +connect foo:1 node=3 port=3 + +device foo:2 fence_foo ipaddr=2.2.2.2 login=x password=y +connect foo:2 node=1 port=1 +connect foo:2 node=2 port=2 +connect foo:2 node=3 port=3 + +Associate two parallel path/power devices that must both +succeed for fencing to succeed. Devices have same base +name with :1 :2 suffix. + +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 +unfence foo + +Add unfence line to indicate nodes connected to the device +should be unfenced. 
+ +#endif + +#define MAX_LINE (FENCE_CONFIG_ARGS_MAX + (3 * FENCE_CONFIG_NAME_MAX)) + +static unsigned int con_args_nodeid(char *args) +{ + char *k; + unsigned int v; + int rv; + + k = strstr(args, "node="); + + rv = sscanf(k, "node=%u", &v); + if (rv != 1) + return 0; + return v; +} + +static int read_config_section(unsigned int nodeid, FILE *file, char *dev_line, + struct fence_device **dev_out, + struct fence_connect **con_out) +{ + struct fence_device *dev; + struct fence_connect *con; + char line[MAX_LINE]; + char unused[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char dev_name[FENCE_CONFIG_NAME_MAX]; + char con_name[FENCE_CONFIG_NAME_MAX]; + char dev_args[FENCE_CONFIG_ARGS_MAX]; + char con_args[FENCE_CONFIG_ARGS_MAX]; + int rv, unfence = 0; + + if (strlen(dev_line) > MAX_LINE) + return -1; + + memset(dev_name, 0, sizeof(dev_name)); + memset(agent, 0, sizeof(agent)); + memset(dev_args, 0, sizeof(dev_args)); + + rv = sscanf(dev_line, "%s %s %s %[^\n]s\n", unused, dev_name, agent, dev_args); + if (rv < 3) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '\n') + break; + if (line[0] == '#') + continue; + + if (!strncmp(line, "unfence", strlen("unfence"))) { + /* TODO: verify dev_name matches */ + unfence = 1; + continue; + } + + /* invalid config */ + if (strncmp(line, "connect", strlen("connect"))) + return -1; + + memset(con_name, 0, sizeof(con_name)); + memset(con_args, 0, sizeof(con_args)); + + rv = sscanf(line, "%s %s %[^\n]s", unused, con_name, con_args); + if (rv < 3) + return -1; + + /* invalid config */ + if (strncmp(dev_name, con_name, FENCE_CONFIG_NAME_MAX)) + return -1; + + /* skip connection for another node */ + if (con_args_nodeid(con_args) != nodeid) + continue; + + dev = malloc(sizeof(struct fence_device)); + if (!dev) + return -ENOMEM; + + con = malloc(sizeof(struct fence_connect)); + if (!con) { + free(dev); + return -ENOMEM; + } + + memset(dev, 0, sizeof(struct fence_device)); + memset(con, 0, 
sizeof(struct fence_connect)); + + strncpy(dev->name, dev_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->agent, agent, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->args, dev_args, FENCE_CONFIG_ARGS_MAX-1); + strncpy(con->name, con_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(con->args, con_args, FENCE_CONFIG_ARGS_MAX-1); + dev->unfence = unfence; + + *dev_out = dev; + *con_out = con; + return 0; + } + + return -1; +} + +void fence_config_free(struct fence_config *fc) +{ + struct fence_device *dev; + struct fence_connect *con; + int i; + + for (i = 0; i < FENCE_CONFIG_DEVS_MAX; i++) { + dev = fc->dev[i]; + con = fc->con[i]; + if (dev) + free(dev); + if (con) + free(con); + } + + memset(fc, 0, sizeof(struct fence_config)); +} + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path) +{ + char line[MAX_LINE]; + struct fence_device *dev; + struct fence_connect *con; + FILE *file; + int pos = 0; + int rv; + + fc->nodeid = nodeid; + + file = fopen(path, "r"); + if (!file) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!strncmp(line, "fence_all", strlen("fence_all"))) { + /* fence_all cannot be used with other fence devices */ + if (pos) { + rv = -EINVAL; + goto out; + } + + dev = malloc(sizeof(struct fence_device)); + if (!dev) { + rv = -ENOMEM; + goto out; + } + + rv = sscanf(line, "%s %s %[^\n]s\n", dev->name, dev->agent, dev->args); + if (rv < 2) { + rv = -EINVAL; + goto out; + } + + fc->dev[0] = dev; + fc->pos = 0; + rv = 0; + goto out; + } + + if (strncmp(line, "device", strlen("device"))) + continue; + + /* read connect and unfence lines following a device line */ + + rv = read_config_section(nodeid, file, line, &dev, &con); + if (rv < 0) + continue; + + fc->dev[pos] = dev; + fc->con[pos] = con; + pos++; + } + + if (pos) + rv = 0; + out: + fclose(file); + return rv; +} + +static int same_base_name(struct fence_device *a, + struct fence_device *b) +{ + int len, i; 
+ + len = strlen(a->name); + if (len > strlen(b->name)) + len = strlen(b->name); + + for (i = 0; i < len; i++) { + if (a->name[i] == ':' && b->name[i] == ':') + return 1; + if (a->name[i] == b->name[i]) + continue; + return 0; + } + return 0; +} + +/* + * if next dev is in parallel with last one, + * set d,c return 0, else -1 + * + * two consecutive devs with same basename are parallel + */ + +int fence_config_next_parallel(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + next = fc->dev[d+1]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) { + fc->pos = d+1; + return 0; + } + return -1; +} + +/* + * if there's a dev with the next priority, + * set d,c return 0, else -1 + * + * look for another dev with a non-matching basename + */ + +int fence_config_next_priority(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + int i; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + + for (i = d+1; i < FENCE_CONFIG_DEVS_MAX; i++) { + next = fc->dev[i]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) + continue; + + fc->pos = i; /* first dev with a different base name; d+1 may still be a skipped parallel dev */ + return 0; + } + return -1; +} + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args) +{ + struct fence_device *dev; + struct fence_connect *con; + char node[FENCE_CONFIG_NAME_MAX]; + char *p; + int n = 0; + int i, len; + + dev = fc->dev[fc->pos]; + con = fc->con[fc->pos]; + + memset(node, 0, sizeof(node)); + snprintf(node, FENCE_CONFIG_NAME_MAX-1, "node=%u\n", fc->nodeid); + len = strlen(node); + + if (dev) + len += strlen(dev->args) + 1; /* +1 for \n */ + if (con) + len += strlen(con->args) + 1; + if (extra) + len += strlen(extra) + 1; + + if (len > FENCE_CONFIG_ARGS_MAX - 1) + return -1; + + if (dev && dev->args[0]) { + p = dev->args; + + for (i = 0; i < strlen(dev->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else +
args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (con && con->args[0]) { + p = con->args; + + for (i = 0; i < strlen(con->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else + args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (!strstr(args, "node=")) + strcat(args, node); + if (extra) + strcat(args, extra); + + return 0; +} + diff --git a/dlm_controld/fence_config.h b/dlm_controld/fence_config.h new file mode 100644 index 0000000..47e7bda --- /dev/null +++ b/dlm_controld/fence_config.h @@ -0,0 +1,62 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#ifndef _FENCE_CONFIG_H_ +#define _FENCE_CONFIG_H_ + +#define FENCE_CONFIG_DEVS_MAX 4 /* max devs per node */ +#define FENCE_CONFIG_NAME_MAX 256 /* including terminating \0 */ +#define FENCE_CONFIG_ARGS_MAX 4096 /* including terminating \0 */ + +struct fence_device { + char name[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; + int unfence; +}; + +struct fence_connect { + char name[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; +}; + +/* describes fence config for one node */ + +struct fence_config { + struct fence_device *dev[FENCE_CONFIG_DEVS_MAX]; + struct fence_connect *con[FENCE_CONFIG_DEVS_MAX]; + unsigned int nodeid; + int pos; +}; + + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path); +void fence_config_free(struct fence_config *fc); + +/* + * Iterate through fence_config, sets pos to indicate next to try. 
+ * Based on two rules: + * + * - next_parallel is the next device with the same base name + * as the current device (base name is name preceding ":") + * + * - next_priority is the next device without the same base name + * as the current device + */ + +int fence_config_next_parallel(struct fence_config *fc); +int fence_config_next_priority(struct fence_config *fc); + +/* + * Combine dev->args and con->args, replacing ' ' with '\n'. + * Also add "node=nodeid" if "node=" does not already exist. + */ + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args); + +#endif diff --git a/dlm_controld/lib.c b/dlm_controld/lib.c index 09f84cf..228f037 100644 --- a/dlm_controld/lib.c +++ b/dlm_controld/lib.c @@ -425,3 +425,22 @@ int dlmc_deadlock_check(char *name) return rv; }
+int dlmc_fence_ack(char *name) +{ + struct dlmc_header h; + int fd, rv; + + init_header(&h, DLMC_CMD_FENCE_ACK, name, 0); + + fd = do_connect(DLMC_SOCK_PATH); + if (fd < 0) { + rv = fd; + goto out; + } + + rv = do_write(fd, &h, sizeof(h)); + close(fd); + out: + return rv; +} + diff --git a/dlm_controld/libdlmcontrol.h b/dlm_controld/libdlmcontrol.h index 609372d..4034372 100644 --- a/dlm_controld/libdlmcontrol.h +++ b/dlm_controld/libdlmcontrol.h @@ -93,6 +93,7 @@ int dlmc_fs_notified(int fd, char *name, int nodeid); int dlmc_fs_result(int fd, char *name, int *type, int *nodeid, int *result);
int dlmc_deadlock_check(char *name); +int dlmc_fence_ack(char *name);
#endif
diff --git a/dlm_controld/main.c b/dlm_controld/main.c index e0420b6..e623a6f 100644 --- a/dlm_controld/main.c +++ b/dlm_controld/main.c @@ -163,6 +163,10 @@ static void sigterm_handler(int sig) daemon_quit = 1; }
+static void sigchld_handler(int sig) +{ +} + static struct lockspace *create_ls(char *name) { struct lockspace *ls; @@ -649,6 +653,10 @@ static void process_connection(int ci) }
switch (h.command) { + case DLMC_CMD_FENCE_ACK: + fence_ack_node(atoi(h.name)); + break; + case DLMC_CMD_FS_REGISTER: if (cfgd_enable_fscontrol) { rv = fs_register_add(h.name); @@ -869,6 +877,8 @@ static void loop(void) void (*workfn) (int ci); void (*deadfn) (int ci);
+ setup_config(0); + rv = setup_queries(); if (rv < 0) goto out; @@ -884,13 +894,15 @@ static void loop(void) if (rv > 0) client_add(rv, process_cluster_cfg, cluster_dead);
+ rv = setup_node_config(); + if (rv < 0) + goto out; + rv = setup_cluster(); if (rv < 0) goto out; client_add(rv, process_cluster, cluster_dead);
- setup_config(0); - rv = check_uncontrolled_lockspaces(); if (rv < 0) goto out; @@ -945,8 +957,10 @@ static void loop(void) if (rv == -1 && errno == EINTR) { if (daemon_quit && list_empty(&lockspaces)) goto out; - log_error("shutdown ignored, active lockspaces"); - daemon_quit = 0; + if (daemon_quit) { + log_error("shutdown ignored, active lockspaces"); + daemon_quit = 0; + } continue; } if (rv < 0) { @@ -977,7 +991,12 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing || poll_fs) { + if (poll_fencing) { + process_fencing_changes(); + poll_timeout = 1000; + } + + if (poll_lockspaces || poll_fs) { process_lockspace_changes(); poll_timeout = 1000; } @@ -1097,7 +1116,7 @@ static void print_usage(void) printf(" -f <num> Enable (1) or disable (0) fencing recovery dependency\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FENCING); printf(" -q <num> Enable (1) or disable (0) quorum recovery dependency\n"); - printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM); + printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM_FENCING); printf(" -s <num> Enable (1) or disable (0) fs_controld recovery coordination\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FSCONTROL); #if 0 @@ -1121,7 +1140,7 @@ static void print_usage(void) printf(" -V Print program version information, then exit\n"); }
-#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:" +#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:e:d:"
static void read_arguments(int argc, char **argv) { @@ -1151,21 +1170,35 @@ static void read_arguments(int argc, char **argv) cfgk_protocol = atoi(optarg); break;
+ case 's': + optd_enable_fscontrol = 1; + cfgd_enable_fscontrol = atoi(optarg); + break; + + /* fencing options */ + case 'f': optd_enable_fencing = 1; cfgd_enable_fencing = atoi(optarg); break;
case 'q': - optd_enable_quorum = 1; - cfgd_enable_quorum = atoi(optarg); + optd_enable_quorum_fencing = 1; + cfgd_enable_quorum_fencing = atoi(optarg); break;
- case 's': - optd_enable_fscontrol = 1; - cfgd_enable_fscontrol = atoi(optarg); + case 'e': + optd_fence_all_agent = 1; + strcpy(fence_all_agent, optarg); break;
+ case 'd': + optd_startup_fence = 1; + cfgd_startup_fence = atoi(optarg); + break; + + /* plock options */ + + case 'p': optd_enable_plock = 1; cfgd_enable_plock = atoi(optarg); @@ -1256,15 +1289,15 @@ static void set_scheduler(void)
int main(int argc, char **argv) { - int fd; + struct sigaction act; + int fd, rv;
cfgk_debug = -1; cfgk_timewarn = -1; cfgk_protocol = PROTO_DETECT; cfgd_debug_logfile = DEFAULT_DEBUG_LOGFILE; - cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; - cfgd_enable_quorum = DEFAULT_ENABLE_QUORUM; - cfgd_enable_quorum = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_fscontrol = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_plock = DEFAULT_ENABLE_PLOCK; cfgd_plock_debug = DEFAULT_PLOCK_DEBUG; cfgd_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT; @@ -1273,8 +1306,20 @@ int main(int argc, char **argv) cfgd_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT; cfgd_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE;
+ cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; + cfgd_enable_quorum_lockspace= DEFAULT_ENABLE_QUORUM_LOCKSPACE; + cfgd_enable_quorum_fencing = DEFAULT_ENABLE_QUORUM_FENCING; + cfgd_startup_fence = DEFAULT_STARTUP_FENCE; + + strcpy(fence_all_agent, DEFAULT_FENCE_ALL_AGENT); + memset(&fence_all_device, 0, sizeof(struct fence_device)); + strcpy(fence_all_device.name, "fence_all"); + strcpy(fence_all_device.agent, fence_all_agent); + INIT_LIST_HEAD(&lockspaces); INIT_LIST_HEAD(&fs_register_list); + + init_daemon();
read_arguments(argc, argv);
@@ -1293,7 +1338,18 @@ int main(int argc, char **argv)
log_level(NULL, LOG_INFO, "dlm_controld %s started", RELEASE_VERSION);
- signal(SIGTERM, sigterm_handler); + memset(&act, 0, sizeof(act)); + act.sa_handler = sigterm_handler; + rv = sigaction(SIGTERM, &act, NULL); + if (rv < 0) + return -rv; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sigchld_handler; + act.sa_flags = SA_NOCLDSTOP; + rv = sigaction(SIGCHLD, &act, NULL); + if (rv < 0) + return -rv;
set_scheduler();
diff --git a/dlm_controld/member.c b/dlm_controld/member.c index 7af581c..841e5e3 100644 --- a/dlm_controld/member.c +++ b/dlm_controld/member.c @@ -9,6 +9,7 @@ #include "dlm_daemon.h" #include <corosync/corotypes.h> #include <corosync/cfg.h> +#include <corosync/cmap.h> #include <corosync/quorum.h>
static corosync_cfg_handle_t ch; @@ -111,8 +112,13 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate, int i, j, num_addrs; uint64_t now = monotime();
+ if (!cluster_joined_monotime) { + cluster_joined_monotime = now; + cluster_joined_walltime = time(NULL); + } + if (!cluster_quorate && quorate) - quorate_time = now; + cluster_quorate_monotime = now;
cluster_quorate = quorate; cluster_ringid_seq = (uint32_t)ring_seq; @@ -174,8 +180,10 @@ void process_cluster(int ci) cs_error_t err;
err = quorum_dispatch(qh, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster quorum_dispatch %d", err); cluster_dead(0); + } }
/* Force re-read of quorum nodes */ @@ -184,8 +192,10 @@ void update_cluster(void) cs_error_t err;
err = quorum_dispatch(qh, CS_DISPATCH_ONE); - if (err != CS_OK) + if (err != CS_OK) { + log_error("update_cluster quorum_dispatch %d", err); cluster_dead(0); + } }
int setup_cluster(void) @@ -272,8 +282,10 @@ void process_cluster_cfg(int ci) cs_error_t err;
err = corosync_cfg_dispatch(ch, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster_cfg cfg_dispatch %d", err); cluster_dead(0); + } }
int setup_cluster_cfg(void) @@ -319,3 +331,34 @@ void close_cluster_cfg(void) corosync_cfg_finalize(ch); }
+int setup_node_config(void) +{ + char key[CMAP_KEYNAME_MAXLEN+1]; + cmap_handle_t h; + cs_error_t err; + uint32_t nodeid; + int i; + + err = cmap_initialize(&h); + if (err != CS_OK) { + log_error("corosync cmap init error %d", err); + return -1; + } + + for (i = 0; i < MAX_NODES; i++) { + snprintf(key, CMAP_KEYNAME_MAXLEN, "nodelist.node.%d.nodeid", i); + + err = cmap_get_uint32(h, key, &nodeid); + if (err != CS_OK) + break; + + log_debug("node_config %d", nodeid); + + if (cfgd_enable_fencing && cfgd_startup_fence) + add_startup_node(nodeid); + } + + cmap_finalize(h); + return 0; +} + diff --git a/dlm_controld/plock.c b/dlm_controld/plock.c index b6dc0a4..ed98e9c 100644 --- a/dlm_controld/plock.c +++ b/dlm_controld/plock.c @@ -27,8 +27,6 @@ static struct timeval plock_rate_last;
static int plock_device_fd = -1;
-extern int message_flow_control_on; - #define RD_CONTINUE 0x00000001
struct resource_data { @@ -1480,14 +1478,6 @@ int limit_plocks(void) { struct timeval now;
- /* Don't send more messages while the cpg message queue is backed up */ - - if (message_flow_control_on) { - update_flow_control_status(); - if (message_flow_control_on) - return 1; - } - if (!cfgd_plock_rate_limit || !plock_read_count) return 0;
diff --git a/dlm_tool/main.c b/dlm_tool/main.c index c9a15db..c378f2b 100644 --- a/dlm_tool/main.c +++ b/dlm_tool/main.c @@ -38,6 +38,7 @@ #define OP_LOCKDUMP 8 #define OP_LOCKDEBUG 9 #define OP_LOG_PLOCK 10 +#define OP_FENCE_ACK 11
static char *prog_name; static char *lsname; @@ -182,7 +183,7 @@ static void print_usage(void) printf("\n"); printf("dlm_tool [options] [join | leave | lockdump | lockdebug |\n" " ls | dump | log_plock | plocks |\n" - " deadlock_check]\n"); + " fence_ack]\n"); printf("\n"); printf("Options:\n"); printf(" -n Show all node information in ls\n"); @@ -321,6 +322,11 @@ static void decode_arguments(int argc, char **argv) operation = OP_DEADLOCK_CHECK; opt_ind = optind + 1; break; + } else if (!strncmp(argv[optind], "fence_ack", 9) && + (strlen(argv[optind]) == 9)) { + operation = OP_FENCE_ACK; + opt_ind = optind + 1; + break; } else if (!strncmp(argv[optind], "dump", 4) && (strlen(argv[optind]) == 4)) { operation = OP_DUMP; @@ -1241,6 +1247,11 @@ static void do_deadlock_check(char *name) dlmc_deadlock_check(name); }
/* NOTE(review): dlm_tool gains a fence_ack operation; thin wrapper over the
 * libdlmcontrol call added in this same commit. */
+static void do_fence_ack(char *name)
+{
+	dlmc_fence_ack(name);
+}
+
 static void do_plocks(char *name)
 {
 	char buf[DLMC_DUMP_SIZE];
@@ -1241,6 +1247,11 @@ static void do_deadlock_check(char *name)
 	dlmc_deadlock_check(name);
 }
@@ -1335,6 +1346,10 @@ int main(int argc, char **argv)
 	case OP_LOCKDEBUG:
 		do_lockdebug(lsname);
 		break;
+
+	case OP_FENCE_ACK:
+		do_fence_ack(lsname);
+		break;
 	}
 	return 0;
 }
diff --git a/fence/Makefile b/fence/Makefile
new file mode 100644
index 0000000..c6ff2c0
--- /dev/null
+++ b/fence/Makefile
@@ -0,0 +1,56 @@
+DESTDIR=
+PREFIX=/usr
+BINDIR=$(PREFIX)/sbin
+#MANDIR=$(PREFIX)/share/man
+
+BIN_TARGET = dlm_stonith
+#MAN_TARGET = dlm_stonith.8
+
+BIN_SOURCE = stonith_helper.c
+
+BIN_CFLAGS += -D_GNU_SOURCE -g \
+	-Wall \
+	-Wformat \
+	-Wformat-security \
+	-Wmissing-prototypes \
+	-Wnested-externs \
+	-Wpointer-arith \
+	-Wextra -Wshadow \
+	-Wcast-align \
+	-Wwrite-strings \
+	-Waggregate-return \
+	-Wstrict-prototypes \
+	-Winline \
+	-Wredundant-decls \
+	-Wno-sign-compare \
+	-Wno-unused-parameter \
+	-Wp,-D_FORTIFY_SOURCE=2 \
+	-fexceptions \
+	-fasynchronous-unwind-tables \
+	-fdiagnostics-show-option \
+
+BIN_CFLAGS += -fPIE -DPIE
# NOTE(review): the xml2-config flags look inherited from the old
# dlm_controld Makefile; stonith_helper.c does not include libxml2 —
# confirm, then these two lines can likely be dropped.
+BIN_CFLAGS += `xml2-config --cflags`
+BIN_CFLAGS += -I../include
+
+BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie
+BIN_LDFLAGS += `xml2-config --libs`
+
+all: $(BIN_TARGET)
+
+$(BIN_TARGET): $(BIN_SOURCE)
+	$(CC) $(BIN_CFLAGS) $(BIN_LDFLAGS) $(BIN_SOURCE) -o $@ -L.
+
+clean:
+	rm -f *.o *.so *.so.* $(BIN_TARGET)
+
+
+INSTALL=$(shell which install)
+
+.PHONY: install
+install: all
+	$(INSTALL) -d $(DESTDIR)/$(BINDIR)
# FIX(review): MANDIR is commented out above, so this expanded to
# "$(DESTDIR)//man8" and created a stray directory at the filesystem
# root of DESTDIR; disabled to match the commented man-page install below.
+#	$(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8
+	$(INSTALL) -c -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR)
+#	$(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/
+
diff --git a/fence/stonith_helper.c b/fence/stonith_helper.c
new file mode 100644
index 0000000..df44219
--- /dev/null
+++ b/fence/stonith_helper.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v2 or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <pacemaker/crm/stonith-ng.h>
+
/* FIX(review): added <stdlib.h> above — atoi() and strtoull() were used
 * without a declaration (implicit declaration; an error in modern C). */
+int nodeid;
+uint64_t fail_time;
+
+#define MAX_ARG_LEN 1024
+
/* Parse inputs either from argv (-n nodeid, -t fail_time) or, when no
 * arguments are given, as "key=val" lines on stdin ("node", "fail_time").
 * Returns 0 on success, -1 when either value is missing or zero. */
+static int get_options(int argc, char *argv[])
+{
+	char arg[MAX_ARG_LEN];
+	char key[MAX_ARG_LEN];
+	char val[MAX_ARG_LEN];
+	int c;	/* FIX(review): was "char c"; getopt() returns int, and where
+		   char is unsigned (ARM, PowerPC) (char)-1 never compares
+		   equal to -1, so the option loop would not terminate
+		   correctly (CERT STR34-C). */
+	int rv;
+
+	if (argc > 1) {
+		while ((c = getopt(argc, argv, "n:t:")) != -1) {
+			switch (c) {
+			case 'n':
+				nodeid = atoi(optarg);
+				break;
+			case 't':
+				fail_time = strtoull(optarg, NULL, 0);
+				break;
+			}
+		}
+	} else {
+		while (fgets(arg, sizeof(arg), stdin)) {
+			rv = sscanf(arg, "%[^=]=%s\n", key, val);
+			if (rv != 2)
+				continue;
+
+			if (!strcmp(key, "node"))
+				nodeid = atoi(val);
+			else if (!strcmp(key, "fail_time"))
+				fail_time = strtoull(val, NULL, 0);
+		}
+	}
+
+	if (!nodeid) {
+		fprintf(stderr, "no node\n");
+		return -1;
+	}
+
+	if (!fail_time) {
+		fprintf(stderr, "no fail_time\n");
+		return -1;
+	}
+
+	return 0;
+}
+
/* Ask stonith whether the node was fenced at/after fail_time; if not,
 * request a kick and poll once a second until the reported fence time
 * reaches fail_time.  Exits 0 on confirmed fence, non-zero on error. */
+int main(int argc, char *argv[])
+{
+	uint64_t t;
+	int rv;
+
+	rv = get_options(argc, argv);
+	if (rv)
+		return rv;
+
+	t = stonith_api_time_helper(nodeid, 0);
+	if (t >= fail_time)
+		return 0;
+
+	rv = stonith_api_kick_helper(nodeid, 300, 1);
+	if (rv) {
+		fprintf(stderr, "kick_helper error %d nodeid %d\n", rv, nodeid);
+		openlog("stonith_helper", LOG_CONS | LOG_PID, LOG_DAEMON);
+		syslog(LOG_ERR, "kick_helper error %d nodeid %d\n", rv, nodeid);
+		return rv;
+	}
+
+	while (1) {
+		t = stonith_api_time_helper(nodeid, 0);
+		if (t >= fail_time)
+			return 0;
+		sleep(1);
+	}
+
+	return -1;
+}
+
cluster-commits@lists.fedorahosted.org