[dlm/f19] Fixes related to parallel startup
David Teigland
teigland at fedoraproject.org
Tue Jun 25 19:39:06 UTC 2013
commit fe7437c45bad647aeb11af497a668aacf108e78f
Author: David Teigland <teigland at redhat.com>
Date: Tue Jun 25 14:38:44 2013 -0500
Fixes related to parallel startup
0001-man-fix-dlm.conf-man-page.patch | 48 +++++++++
...d-daemon_fence_work-should-wait-for-confc.patch | 33 ++++++
...d-exclude-fencing-work-during-set_protoco.patch | 104 ++++++++++++++++++++
...dlm_controld-unify-fence-delay-variations.patch | 99 +++++++++++++++++++
dlm.spec | 19 +++-
5 files changed, 299 insertions(+), 4 deletions(-)
---
diff --git a/0001-man-fix-dlm.conf-man-page.patch b/0001-man-fix-dlm.conf-man-page.patch
new file mode 100644
index 0000000..a859a01
--- /dev/null
+++ b/0001-man-fix-dlm.conf-man-page.patch
@@ -0,0 +1,48 @@
+From 41f4121768ab4948898945007f49168acfac6c9f Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland at redhat.com>
+Date: Fri, 17 May 2013 11:06:12 -0500
+Subject: [PATCH 1/4] man: fix dlm.conf man page
+
+Signed-off-by: David Teigland <teigland at redhat.com>
+---
+ dlm_controld/dlm.conf.5 | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/dlm_controld/dlm.conf.5 b/dlm_controld/dlm.conf.5
+index 95f74b9..793435a 100644
+--- a/dlm_controld/dlm.conf.5
++++ b/dlm_controld/dlm.conf.5
+@@ -349,15 +349,15 @@ Example of nodeid 1 as master of all resources:
+
+ lockspace foo nodir=1
+ .br
+-master node=1
++master foo node=1
+
+ Example of nodeid's 1 and 2 as masters of all resources:
+
+ lockspace foo nodir=1
+ .br
+-master node=1
++master foo node=1
+ .br
+-master node=2
++master foo node=2
+
+ Lock management will be partitioned among the available masters. There
+ can be any number of masters defined. The designated master nodes will
+@@ -376,9 +376,9 @@ can also be assigned to master nodes, e.g.
+
+ lockspace foo nodir=1
+ .br
+-master node=1 weight=2
++master foo node=1 weight=2
+ .br
+-master node=2 weight=1
++master foo node=2 weight=1
+
+ In which case node 1 will master 2/3 of the total resources and node 2
+ will master the other 1/3.
+--
+1.8.1.rc1.5.g7e0651a
+
diff --git a/0002-dlm_controld-daemon_fence_work-should-wait-for-confc.patch b/0002-dlm_controld-daemon_fence_work-should-wait-for-confc.patch
new file mode 100644
index 0000000..98d9500
--- /dev/null
+++ b/0002-dlm_controld-daemon_fence_work-should-wait-for-confc.patch
@@ -0,0 +1,33 @@
+From 5bdbe083ed23abc955309ea23fd6f008852b05b8 Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland at redhat.com>
+Date: Tue, 25 Jun 2013 11:04:49 -0500
+Subject: [PATCH 2/4] dlm_controld: daemon_fence_work should wait for confchg
+
+If daemon_last_join_monotime has not yet been initialized
+by the first confchg, then daemon_fence_work() should
+wait for that to happen.
+
+Signed-off-by: David Teigland <teigland at redhat.com>
+---
+ dlm_controld/daemon_cpg.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
+index 8c4cff2..0758560 100644
+--- a/dlm_controld/daemon_cpg.c
++++ b/dlm_controld/daemon_cpg.c
+@@ -865,6 +865,11 @@ static void daemon_fence_work(void)
+ if (!opt(enable_startup_fencing_ind))
+ continue;
+
++ if (!daemon_last_join_monotime) {
++ log_debug("fence startup %d wait for confchg", node->nodeid);
++ continue;
++ }
++
+ if (monotime() - daemon_last_join_monotime < opt(post_join_delay_ind)) {
+ log_debug("fence startup %d delay %d from %llu",
+ node->nodeid, opt(post_join_delay_ind),
+--
+1.8.1.rc1.5.g7e0651a
+
diff --git a/0003-dlm_controld-exclude-fencing-work-during-set_protoco.patch b/0003-dlm_controld-exclude-fencing-work-during-set_protoco.patch
new file mode 100644
index 0000000..9f77ec6
--- /dev/null
+++ b/0003-dlm_controld-exclude-fencing-work-during-set_protoco.patch
@@ -0,0 +1,104 @@
+From f367c91ac4f4f8012456f4916c0c99b00d8569c4 Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland at redhat.com>
+Date: Tue, 25 Jun 2013 11:02:22 -0500
+Subject: [PATCH 3/4] dlm_controld: exclude fencing work during set_protocol
+
+During set_protocol we only want to process protocol
+messages, and only process fencing status later during
+main daemon processing.
+
+Signed-off-by: David Teigland <teigland at redhat.com>
+---
+ dlm_controld/daemon_cpg.c | 26 ++++++++++++++++++++++++--
+ dlm_controld/dlm_daemon.h | 1 +
+ dlm_controld/main.c | 4 ++++
+ 3 files changed, 29 insertions(+), 2 deletions(-)
+
+diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
+index 0758560..0db48f5 100644
+--- a/dlm_controld/daemon_cpg.c
++++ b/dlm_controld/daemon_cpg.c
+@@ -766,6 +766,9 @@ static void daemon_fence_work(void)
+ int retry = 0;
+ uint32_t flags;
+
++ if (!daemon_fence_allow)
++ return;
++
+ if (daemon_ringid_wait) {
+ /* We've seen a nodedown confchg callback, but not the
+ corresponding ringid callback. */
+@@ -1811,6 +1814,7 @@ int set_protocol(void)
+ {
+ struct protocol proto;
+ struct pollfd pollfd;
++ cs_error_t error;
+ int sent_proposal = 0;
+ int rv;
+
+@@ -1860,8 +1864,17 @@ int set_protocol(void)
+ return -1;
+ }
+
+- if (pollfd.revents & POLLIN)
+- process_cpg_daemon(0);
++ if (pollfd.revents & POLLIN) {
++ /*
++ * don't use process_cpg_daemon() because we only want to
++ * dispatch one thing at a time, since we only want to
++ * handle protocol related things here.
++ */
++
++ error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ONE);
++ if (error != CS_OK)
++ log_error("daemon cpg_dispatch one error %d", error);
++ }
+ if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ log_error("set_protocol poll revents %u",
+ pollfd.revents);
+@@ -1927,6 +1940,15 @@ static void deliver_cb_daemon(cpg_handle_t handle,
+ hd = (struct dlm_header *)data;
+ dlm_header_in(hd);
+
++ if (!daemon_fence_allow && hd->type != DLM_MSG_PROTOCOL) {
++ /* don't think this will happen; if it does we may
++ need to verify that it's correct to ignore these
++ messages instead of saving them to process after
++ allow is set */
++ log_debug("deliver_cb_daemon ignore non proto msg %d", hd->type);
++ return;
++ }
++
+ switch (hd->type) {
+ case DLM_MSG_PROTOCOL:
+ receive_protocol(hd, len);
+diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
+index 11a4777..dbe22ba 100644
+--- a/dlm_controld/dlm_daemon.h
++++ b/dlm_controld/dlm_daemon.h
+@@ -162,6 +162,7 @@ EXTERN int daemon_quit;
+ EXTERN int cluster_down;
+ EXTERN int poll_lockspaces;
+ EXTERN unsigned int retry_fencing;
++EXTERN int daemon_fence_allow;
+ EXTERN int poll_fs;
+ EXTERN int poll_ignore_plock;
+ EXTERN int poll_drop_plock;
+diff --git a/dlm_controld/main.c b/dlm_controld/main.c
+index 8fb16ef..287b82d 100644
+--- a/dlm_controld/main.c
++++ b/dlm_controld/main.c
+@@ -1021,6 +1021,10 @@ static void loop(void)
+ sd_notify(0, "READY=1");
+ #endif
+
++ /* We want to wait for our protocol to be set before
++ we start to process fencing. */
++ daemon_fence_allow = 1;
++
+ for (;;) {
+ rv = poll(pollfd, client_maxi + 1, poll_timeout);
+ if (rv == -1 && errno == EINTR) {
+--
+1.8.1.rc1.5.g7e0651a
+
diff --git a/0004-dlm_controld-unify-fence-delay-variations.patch b/0004-dlm_controld-unify-fence-delay-variations.patch
new file mode 100644
index 0000000..2a99b13
--- /dev/null
+++ b/0004-dlm_controld-unify-fence-delay-variations.patch
@@ -0,0 +1,99 @@
+From 2548250de3991f1f0aca297bbd072b525a132841 Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland at redhat.com>
+Date: Tue, 25 Jun 2013 13:18:47 -0500
+Subject: [PATCH 4/4] dlm_controld: unify fence delay variations
+
+The fence delay period begins after a node joins the
+cluster or joins the daemon cpg. Apply this delay
+to both startup and normal fencing.
+
+Signed-off-by: David Teigland <teigland at redhat.com>
+---
+ dlm_controld/daemon_cpg.c | 15 +++++++--------
+ dlm_controld/dlm_daemon.h | 2 +-
+ dlm_controld/member.c | 2 +-
+ 3 files changed, 9 insertions(+), 10 deletions(-)
+
+diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
+index 0db48f5..d88cd46 100644
+--- a/dlm_controld/daemon_cpg.c
++++ b/dlm_controld/daemon_cpg.c
+@@ -107,7 +107,6 @@ static int daemon_remove_count;
+ static int daemon_ringid_wait;
+ static struct cpg_ring_id daemon_ringid;
+ static int daemon_fence_pid;
+-static uint64_t daemon_last_join_monotime;
+ static uint32_t last_join_seq;
+ static uint32_t send_fipu_seq;
+ static int wait_clear_fipu;
+@@ -868,15 +867,15 @@ static void daemon_fence_work(void)
+ if (!opt(enable_startup_fencing_ind))
+ continue;
+
+- if (!daemon_last_join_monotime) {
+- log_debug("fence startup %d wait for confchg", node->nodeid);
++ if (!fence_delay_begin) {
++ log_debug("fence startup %d wait for initial delay", node->nodeid);
+ continue;
+ }
+
+- if (monotime() - daemon_last_join_monotime < opt(post_join_delay_ind)) {
++ if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) {
+ log_debug("fence startup %d delay %d from %llu",
+ node->nodeid, opt(post_join_delay_ind),
+- (unsigned long long)daemon_last_join_monotime);
++ (unsigned long long)fence_delay_begin);
+ retry = 1;
+ continue;
+ }
+@@ -959,10 +958,10 @@ static void daemon_fence_work(void)
+ time between it joining the cluster (giving cluster quorum)
+ and joining the daemon cpg, which allows it to bypass fencing */
+
+- if (monotime() - cluster_last_join_monotime < opt(post_join_delay_ind)) {
++ if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) {
+ log_debug("fence request %d delay %d from %llu",
+ node->nodeid, opt(post_join_delay_ind),
+- (unsigned long long)cluster_last_join_monotime);
++ (unsigned long long)fence_delay_begin);
+ node->delay_fencing = 1;
+ retry = 1;
+ continue;
+@@ -2042,7 +2041,7 @@ static void confchg_cb_daemon(cpg_handle_t handle,
+ node->daemon_member = 1;
+ node->daemon_add_time = now;
+
+- daemon_last_join_monotime = now;
++ fence_delay_begin = now;
+ last_join_seq++;
+
+ /* a joining node shows prev members in joined list */
+diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
+index dbe22ba..62508ea 100644
+--- a/dlm_controld/dlm_daemon.h
++++ b/dlm_controld/dlm_daemon.h
+@@ -171,7 +171,7 @@ EXTERN int plock_ci;
+ EXTERN struct list_head lockspaces;
+ EXTERN int cluster_quorate;
+ EXTERN int cluster_two_node;
+-EXTERN uint64_t cluster_last_join_monotime;
++EXTERN uint64_t fence_delay_begin;
+ EXTERN uint64_t cluster_quorate_monotime;
+ EXTERN uint64_t cluster_joined_monotime;
+ EXTERN uint64_t cluster_joined_walltime;
+diff --git a/dlm_controld/member.c b/dlm_controld/member.c
+index fca3248..d4031ee 100644
+--- a/dlm_controld/member.c
++++ b/dlm_controld/member.c
+@@ -151,7 +151,7 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate,
+ quorum_nodes[i], cluster_ringid_seq);
+ add_cluster_node(quorum_nodes[i], now);
+
+- cluster_last_join_monotime = now;
++ fence_delay_begin = now;
+
+ err = corosync_cfg_get_node_addrs(ch, quorum_nodes[i],
+ MAX_NODE_ADDRESSES,
+--
+1.8.1.rc1.5.g7e0651a
+
diff --git a/dlm.spec b/dlm.spec
index 2d47827..1276841 100644
--- a/dlm.spec
+++ b/dlm.spec
@@ -1,6 +1,6 @@
Name: dlm
Version: 4.0.1
-Release: 1%{?dist}
+Release: 2%{?dist}
License: GPLv2 and GPLv2+ and LGPLv2+
# For a breakdown of the licensing, see README.license
Group: System Environment/Kernel
@@ -12,9 +12,13 @@ BuildRequires: pacemaker-libs-devel >= 1.1.7
BuildRequires: libxml2-devel
BuildRequires: systemd-units
BuildRequires: systemd-devel
-Source0: http://people.redhat.com/teigland/%{name}-%{version}.tar.gz
+Source0: http://git.fedorahosted.org/cgit/dlm.git/snapshot/%{name}-%{version}.tar.gz
+
+Patch0: 0001-man-fix-dlm.conf-man-page.patch
+Patch1: 0002-dlm_controld-daemon_fence_work-should-wait-for-confc.patch
+Patch2: 0003-dlm_controld-exclude-fencing-work-during-set_protoco.patch
+Patch3: 0004-dlm_controld-unify-fence-delay-variations.patch
-#Patch0: 0001-foo.patch
%if 0%{?rhel}
ExclusiveArch: i686 x86_64
@@ -33,7 +37,11 @@ The kernel dlm requires a user daemon to control membership.
%prep
%setup -q
-#%patch0 -p1 -b .0001-foo.patch
+
+%patch0 -p1 -b .0001-man-fix-dlm.conf-man-page.patch
+%patch1 -p1 -b .0002-dlm_controld-daemon_fence_work-should-wait-for-confc.patch
+%patch2 -p1 -b .0003-dlm_controld-exclude-fencing-work-during-set_protoco.patch
+%patch3 -p1 -b .0004-dlm_controld-unify-fence-delay-variations.patch
%build
# upstream does not require configure
@@ -105,6 +113,9 @@ developing applications that use %{name}.
%{_libdir}/pkgconfig/*.pc
%changelog
+* Tue Jun 25 2013 David Teigland <teigland at redhat.com> - 4.0.1-2
+- Fixes related to parallel startup
+
* Wed Mar 06 2013 David Teigland <teigland at redhat.com> - 4.0.1-1
- New usptream release, fencing fixes
More information about the scm-commits
mailing list