src/cmd.c
by David Teigland
src/cmd.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
New commits:
commit 06c6ffce24f71bfa9eb58342949a0bb62752efd4
Author: David Teigland <teigland(a)redhat.com>
Date: Mon Aug 27 11:05:35 2012 -0500
sanlock: fix status of lockspaces in add and rem
The sanlock client status command was reporting
lockspaces in the rem list twice, and missing the
add list, instead of once over both lists.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/cmd.c b/src/cmd.c
index c293275..7b7119f 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -1644,7 +1644,7 @@ static void cmd_status(int fd, struct sm_header *h_recv, int client_maxi)
pthread_mutex_lock(&spaces_mutex);
list_for_each_entry(sp, &spaces, list)
send_state_lockspace(fd, sp, "spaces");
- list_for_each_entry(sp, &spaces_rem, list)
+ list_for_each_entry(sp, &spaces_add, list)
send_state_lockspace(fd, sp, "spaces_add");
list_for_each_entry(sp, &spaces_rem, list)
send_state_lockspace(fd, sp, "spaces_rem");
11 years, 1 month
[sosreport][PATCHv3] sos/plugins: add plugin for sanlock
by Federico Simoncelli
Signed-off-by: Federico Simoncelli <fsimonce(a)redhat.com>
---
AUTHORS | 1 +
sos/plugins/sanlock.py | 31 +++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+), 0 deletions(-)
create mode 100755 sos/plugins/sanlock.py
diff --git a/AUTHORS b/AUTHORS
index 1973fc9..860fd7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -17,3 +17,4 @@ Sadique Puthen <sputhenp(a)redhat.com>
Shijoe George <spanjikk(a)redhat.com>
Steve Conklin <sconklin(a)redhat.com>
Tomas Smetana <tsmetana(a)redhat.com>
+Federico Simoncelli <fsimonce(a)redhat.com>
diff --git a/sos/plugins/sanlock.py b/sos/plugins/sanlock.py
new file mode 100755
index 0000000..fcbb2d4
--- /dev/null
+++ b/sos/plugins/sanlock.py
@@ -0,0 +1,31 @@
+### This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sos.plugintools
+
+class sanlock(sos.plugintools.PluginBase):
+ """sanlock-related information
+ """
+ # Set the package name and config file associated with this plugin,
+ # then return whatever the base class checkenabled() decides.
+ def checkenabled(self):
+ self.packages = [ "sanlock" ]
+ self.files = [ "/etc/sysconfig/sanlock" ]
+ return sos.plugintools.PluginBase.checkenabled(self)
+
+ # Register the sysconfig file, the sanlock log files (glob pattern) and
+ # three "sanlock client" commands (status and host_status with -D, plus
+ # log_dump) with the plugin base class.
+ # NOTE(review): addCopySpec/collectExtOutput behavior is inferred from
+ # their names; confirm against sos.plugintools.
+ def setup(self):
+ self.addCopySpec("/etc/sysconfig/sanlock")
+ self.addCopySpec("/var/log/sanlock.log*")
+ self.collectExtOutput("sanlock client status -D")
+ self.collectExtOutput("sanlock client host_status -D")
+ self.collectExtOutput("sanlock client log_dump")
+ return
--
1.7.1
11 years, 1 month
2 commits - src/client_cmd.c src/resource.c
by David Teigland
src/client_cmd.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
src/resource.c | 29 ++++++++++++++++-----
2 files changed, 94 insertions(+), 8 deletions(-)
New commits:
commit dba7985aebfa961779feca28d39d4407f7f8073a
Author: David Teigland <teigland(a)redhat.com>
Date: Thu Aug 23 10:46:09 2012 -0500
sanlock: show host_status for all lockspaces
when none are specified on the command line,
i.e. "sanlock client host_status"
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/client_cmd.c b/src/client_cmd.c
index 25f8cc6..ee96ae2 100644
--- a/src/client_cmd.c
+++ b/src/client_cmd.c
@@ -416,7 +416,7 @@ int sanlock_status(int debug, char sort_arg)
return rv;
}
-int sanlock_host_status(int debug, char *lockspace_name)
+static int lockspace_host_status(int debug, char *lockspace_name)
{
struct sm_header h;
struct sanlk_state st;
@@ -474,6 +474,77 @@ int sanlock_host_status(int debug, char *lockspace_name)
return rv;
}
+/*
+ * Print host status for one lockspace, or for every lockspace reported by
+ * the daemon when lockspace_name is NULL or empty.
+ *
+ * With no name given, an SM_CMD_STATUS request is sent, the names of the
+ * SANLK_STATE_LOCKSPACE records in the reply are collected into sort_bufs,
+ * and lockspace_host_status() is then run once per collected name.
+ *
+ * Returns the result of lockspace_host_status() when a name is given.
+ * Otherwise returns the negative send_command() result, -errno if receiving
+ * the reply header fails, -1 on a short header, or 0 once the reply has
+ * been processed.
+ */
+int sanlock_host_status(int debug, char *lockspace_name)
+{
+	struct sm_header h;
+	struct sanlk_state state;
+	char maxstr[SANLK_STATE_MAXSTR];
+	char maxbin[SANLK_STATE_MAXSTR];
+	struct sanlk_state *st;
+	char *str, *bin, *name;
+	struct sanlk_lockspace *ls;
+	int fd, rv, i;
+
+	if (lockspace_name && lockspace_name[0])
+		return lockspace_host_status(debug, lockspace_name);
+
+	fd = send_command(SM_CMD_STATUS, SANLK_STATE_LOCKSPACE);
+	if (fd < 0)
+		return fd;
+
+	rv = recv(fd, &h, sizeof(h), MSG_WAITALL);
+	if (rv < 0) {
+		rv = -errno;
+		close(fd);
+		return rv;
+	}
+	if (rv != sizeof(h)) {
+		close(fd);
+		return -1;
+	}
+
+	st = &state;
+	str = maxstr;
+	bin = maxbin;
+
+	while (1) {
+		memset(&state, 0, sizeof(state));
+		memset(maxstr, 0, sizeof(maxstr));
+		memset(maxbin, 0, sizeof(maxbin));
+
+		/* a zero or short read ends the stream of state records */
+		rv = recv(fd, st, sizeof(struct sanlk_state), MSG_WAITALL);
+		if (!rv)
+			break;
+		if (rv != sizeof(struct sanlk_state))
+			break;
+
+		if (st->str_len) {
+			/*
+			 * str_len comes off the socket; never let it drive a
+			 * read larger than the maxstr stack buffer.
+			 */
+			if (st->str_len > sizeof(maxstr))
+				break;
+
+			rv = recv(fd, str, st->str_len, MSG_WAITALL);
+			if (rv != st->str_len)
+				break;
+		}
+
+		/* NOTE(review): presumably reads the record's binary payload
+		   into bin; recv_bin() is defined outside this hunk - confirm
+		   it bounds the read to the size of maxbin. */
+		recv_bin(fd, st, bin);
+
+		if (st->type != SANLK_STATE_LOCKSPACE)
+			continue;
+
+		ls = (struct sanlk_lockspace *)bin;
+
+		/* stop collecting on allocation failure rather than storing
+		   a NULL that would later be printed and passed on */
+		name = strdup(ls->name);
+		if (!name)
+			break;
+
+		/* NOTE(review): sort_count is not checked against the
+		   capacity of sort_bufs, which is declared outside this
+		   hunk - confirm the array cannot be overrun. */
+		sort_bufs[sort_count++] = name;
+	}
+
+	close(fd);
+
+	for (i = 0; i < sort_count; i++) {
+		printf("lockspace %s\n", sort_bufs[i]);
+		lockspace_host_status(debug, sort_bufs[i]);
+		free(sort_bufs[i]);
+	}
+
+	return 0;
+}
+
int sanlock_log_dump(int max_size)
{
struct sm_header h;
commit eb6abcf10550ef544d427d6709b0894acb622845
Author: David Teigland <teigland(a)redhat.com>
Date: Thu Aug 23 09:53:42 2012 -0500
daemon: use helper for examine request kill
This kill() call was missed when moving kill to
the helper in 5c51530b5bccb8cfadc8a8591558b9ab99b95cb3
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/resource.c b/src/resource.c
index 4ad4a05..7208d92 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -31,6 +31,7 @@
#include "resource.h"
#include "task.h"
#include "mode_block.h"
+#include "helper.h"
/* from cmd.c */
void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id);
@@ -812,9 +813,10 @@ static int examine_token(struct task *task, struct token *token,
static void do_req_kill_pid(struct token *tt, int pid)
{
+ struct helper_msg hm;
struct resource *r;
uint32_t flags;
- int found = 0;
+ int rv, found = 0;
pthread_mutex_lock(&resource_mutex);
r = find_resource(tt, &resources_held);
@@ -833,16 +835,29 @@ static void do_req_kill_pid(struct token *tt, int pid)
log_debug("do_req_kill_pid %d flags %x %.48s:%.48s",
pid, flags, tt->r.lockspace_name, tt->r.name);
- /* TODO: share code with kill_pids() to gradually
- * escalate from killscript, SIGTERM, SIGKILL */
+ if (helper_kill_fd == -1) {
+ log_error("do_req_kill_pid %d no helper fd", pid);
+ return;
+ }
+
+ /* TODO: handle kill via runpath? or select signal? escalate? */
- kill(pid, SIGTERM);
+ memset(&hm, 0, sizeof(hm));
+ hm.type = HELPER_MSG_KILLPID;
+ hm.pid = pid;
+ hm.sig = SIGKILL;
if (flags & R_RESTRICT_SIGKILL)
- return;
+ hm.sig = SIGTERM;
+
+ retry:
+ rv = write(helper_kill_fd, &hm, sizeof(hm));
+ if (rv == -1 && errno == EINTR)
+ goto retry;
- sleep(1);
- kill(pid, SIGKILL);
+ if (rv == -1)
+ log_error("do_req_kill_pid %d helper write error %d",
+ pid, errno);
}
int set_resource_examine(char *space_name, char *res_name)
11 years, 1 month
[sosreport][PATCHv2] sos/plugins: add plugin for sanlock
by Federico Simoncelli
Signed-off-by: Federico Simoncelli <fsimonce(a)redhat.com>
---
AUTHORS | 1 +
sos/plugins/sanlock.py | 31 +++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+), 0 deletions(-)
create mode 100755 sos/plugins/sanlock.py
diff --git a/AUTHORS b/AUTHORS
index 1973fc9..860fd7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -17,3 +17,4 @@ Sadique Puthen <sputhenp(a)redhat.com>
Shijoe George <spanjikk(a)redhat.com>
Steve Conklin <sconklin(a)redhat.com>
Tomas Smetana <tsmetana(a)redhat.com>
+Federico Simoncelli <fsimonce(a)redhat.com>
diff --git a/sos/plugins/sanlock.py b/sos/plugins/sanlock.py
new file mode 100755
index 0000000..432f7ee
--- /dev/null
+++ b/sos/plugins/sanlock.py
@@ -0,0 +1,31 @@
+### This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sos.plugintools
+
+class sanlock(sos.plugintools.PluginBase):
+ """sanlock-related information
+ """
+ # Set the package name and config file associated with this plugin,
+ # then return whatever the base class checkenabled() decides.
+ def checkenabled(self):
+ self.packages = [ "sanlock" ]
+ self.files = [ "/etc/sysconfig/sanlock" ]
+ return sos.plugintools.PluginBase.checkenabled(self)
+
+ # Register the sysconfig file, the sanlock log files (glob pattern) and
+ # three "sanlock client" commands (status -D, host_status, log_dump)
+ # with the plugin base class.
+ # NOTE(review): addCopySpec/collectExtOutput behavior is inferred from
+ # their names; confirm against sos.plugintools.
+ def setup(self):
+ self.addCopySpec("/etc/sysconfig/sanlock")
+ self.addCopySpec("/var/log/sanlock.log*")
+ self.collectExtOutput("sanlock client status -D")
+ self.collectExtOutput("sanlock client host_status")
+ self.collectExtOutput("sanlock client log_dump")
+ return
--
1.7.1
11 years, 1 month
[sosreport][PATCH] sos/plugins: add plugin for sanlock
by Federico Simoncelli
Signed-off-by: Federico Simoncelli <fsimonce(a)redhat.com>
---
AUTHORS | 1 +
sos/plugins/sanlock.py | 29 +++++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 0 deletions(-)
create mode 100755 sos/plugins/sanlock.py
diff --git a/AUTHORS b/AUTHORS
index 1973fc9..860fd7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -17,3 +17,4 @@ Sadique Puthen <sputhenp(a)redhat.com>
Shijoe George <spanjikk(a)redhat.com>
Steve Conklin <sconklin(a)redhat.com>
Tomas Smetana <tsmetana(a)redhat.com>
+Federico Simoncelli <fsimonce(a)redhat.com>
diff --git a/sos/plugins/sanlock.py b/sos/plugins/sanlock.py
new file mode 100755
index 0000000..075c847
--- /dev/null
+++ b/sos/plugins/sanlock.py
@@ -0,0 +1,29 @@
+### This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sos.plugintools
+
+class sanlock(sos.plugintools.PluginBase):
+ """sanlock-related information
+ """
+ # Set the package name and config file associated with this plugin,
+ # then return whatever the base class checkenabled() decides.
+ def checkenabled(self):
+ self.packages = [ "sanlock" ]
+ self.files = [ "/etc/sysconfig/sanlock" ]
+ return sos.plugintools.PluginBase.checkenabled(self)
+
+ # Register the sysconfig file, the sanlock log files (glob pattern) and
+ # the "sanlock client status" command with the plugin base class.
+ # NOTE(review): addCopySpec/collectExtOutput behavior is inferred from
+ # their names; confirm against sos.plugintools.
+ def setup(self):
+ self.addCopySpec("/etc/sysconfig/sanlock")
+ self.addCopySpec("/var/log/sanlock.log*")
+ self.collectExtOutput("sanlock client status")
+ return
--
1.7.1
11 years, 1 month
init.d/sanlock init.d/wdmd
by David Teigland
init.d/sanlock | 15 ++++++++-------
init.d/wdmd | 49 ++++++++++++++++++++++++++++++++++++-------------
2 files changed, 44 insertions(+), 20 deletions(-)
New commits:
commit 7d2698c0da0272312a0b05c5304e32ff66170391
Author: Federico Simoncelli <fsimonce(a)redhat.com>
Date: Mon Aug 6 12:28:52 2012 -0400
init: use checkpid when stopping the services
When the pid file wasn't removed (eg: forced reboot, etc...) the
services were printing confusing warnings during restart/condrestart.
Signed-off-by: Federico Simoncelli <fsimonce(a)redhat.com>
diff --git a/init.d/sanlock b/init.d/sanlock
index bd8dccb..83b35e8 100644
--- a/init.d/sanlock
+++ b/init.d/sanlock
@@ -48,7 +48,9 @@ start() {
}
stop() {
- echo -n $"Sending stop signal $prog: "
+ PID=$(pidofproc -p $runfile $prog)
+
+ echo -n $"Sending stop signal $prog ($PID): "
killproc -p $runfile $prog -TERM
retval=$?
echo
@@ -57,9 +59,10 @@ stop() {
return $retval
fi
- echo -n $"Waiting for $prog to stop:"
+ echo -n $"Waiting for $prog ($PID) to stop:"
+
timeout=10
- while [ -e $runfile ]; do
+ while checkpid $PID; do
sleep 1
timeout=$((timeout - 1))
if [ "$timeout" -le 0 ]; then
@@ -74,9 +77,8 @@ stop() {
}
restart() {
- stop && start
- retval=$?
- return $retval
+ rh_status_q && stop
+ start
}
reload() {
@@ -122,4 +124,3 @@ case "$1" in
exit 2
esac
exit $?
-
diff --git a/init.d/wdmd b/init.d/wdmd
index 19fc3ae..af45561 100644
--- a/init.d/wdmd
+++ b/init.d/wdmd
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
#
# wdmd - watchdog multiplexing daemon
#
@@ -31,20 +31,24 @@ WDMDOPTS="-G $WDMDGROUP"
[ -f /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog
-start() {
- [ -x $exec ] || exit 5
-
- if [ ! -d /var/run/$prog ]; then
- mkdir -p /var/run/$prog
- [ -x /sbin/restorecon ] && restorecon /var/run/$prog
- fi
-
+watchdog_check() {
if [ ! -c /dev/watchdog ]; then
echo -n $"Loading the softdog kernel module: "
modprobe softdog && udevadm settle
[ -c /dev/watchdog ] && success || failure
echo
fi
+}
+
+start() {
+ watchdog_check
+
+ [ -x $exec ] || exit 5
+
+ if [ ! -d /var/run/$prog ]; then
+ install -d -g $WDMDGROUP -m 775 /var/run/$prog
+ [ -x /sbin/restorecon ] && restorecon /var/run/$prog
+ fi
echo -n $"Starting $prog: "
daemon $prog $WDMDOPTS
@@ -55,16 +59,36 @@ start() {
}
stop() {
- echo -n $"Stopping $prog: "
+ PID=$(pidofproc -p $runfile $prog)
+
+ echo -n $"Sending stop signal $prog ($PID): "
killproc -p $runfile $prog -TERM
retval=$?
echo
- [ $retval -eq 0 ] && rm -f $lockfile
+
+ if [ $retval -ne 0 ]; then
+ return $retval
+ fi
+
+ echo -n $"Waiting for $prog ($PID) to stop:"
+
+ timeout=10
+ while checkpid $PID; do
+ sleep 1
+ timeout=$((timeout - 1))
+ if [ "$timeout" -le 0 ]; then
+ failure; echo
+ return 1
+ fi
+ done
+
+ success; echo
+ rm -f $lockfile
return $retval
}
restart() {
- stop
+ rh_status_q && stop
start
}
@@ -111,4 +135,3 @@ case "$1" in
exit 2
esac
exit $?
-
11 years, 1 month
src/paxos_lease.c
by David Teigland
src/paxos_lease.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
New commits:
commit 73184e5785857e7c759e219efd361c0fceec090e
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Aug 15 12:20:33 2012 -0500
sanlock: fix paxos acquire host_id check
The host_id check should be skipped in the case where
the local host_id was the previous paxos lease owner,
but in a previous lockspace generation in which the
paxos lease was not cleanly released. Checking our
own host_id does nothing and is a waste of time in
this case.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index a52c085..68529f3 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1156,6 +1156,22 @@ int paxos_lease_acquire(struct task *task,
}
/*
+ * We were the last host to hold this lease, but in a previous
+ * lockspace generation in which we didn't cleanly release the
+ * paxos lease.
+ */
+
+ if (cur_leader.owner_id == token->host_id &&
+ cur_leader.owner_generation < token->host_generation) {
+ log_token(token, "paxos_acquire past owner id %llu gen %llu %llu",
+ (unsigned long long)token->host_id,
+ (unsigned long long)token->host_generation,
+ (unsigned long long)cur_leader.owner_generation);
+ copy_cur_leader = 1;
+ goto run;
+ }
+
+ /*
* Check if current owner is alive based on its host_id renewals.
* If the current owner has been dead long enough we can assume that
* its watchdog has triggered and we can go for the paxos lease.
11 years, 1 month
Changes to 'io_timeout5'
by David Teigland
New branch 'io_timeout5' available with the following commits:
commit 05445ab440bd84139803e7454eefc8463f825e27
Author: David Teigland <teigland(a)redhat.com>
Date: Mon Jul 23 08:58:57 2012 -0500
sanlock: adjustable io timeouts
New sanlock_add_lockspace_timeout() api to allow
the timeout to be specified per lockspace.
Also correctly handle nodes that may be using
different timeouts in the same lockspace.
Signed-off-by: David Teigland <teigland(a)redhat.com>
11 years, 1 month
3 commits - src/cmd.c src/main.c src/sanlock_internal.h src/timeouts.h tests/clientn tests/sanlk_client.c wdmd/main.c
by David Teigland
src/cmd.c | 4 +-
src/main.c | 65 ++++++++++++++++++++++++++---------------
src/sanlock_internal.h | 2 -
src/timeouts.h | 7 ----
tests/clientn | 77 ++++++++++++++++++++++++++++++++++++++++++++++++-
tests/sanlk_client.c | 4 ++
wdmd/main.c | 40 +++++++++++++++++++------
7 files changed, 155 insertions(+), 44 deletions(-)
New commits:
commit e1548ab53b1f12c0825cb614238a630069a93c1b
Author: David Teigland <teigland(a)redhat.com>
Date: Mon Aug 13 16:49:18 2012 -0500
clientn: add tests
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/tests/clientn b/tests/clientn
index e47c7d9..85b4181 100755
--- a/tests/clientn
+++ b/tests/clientn
@@ -29,23 +29,85 @@ elif [ "$cmd" == "start" ]; then
./sanlk_client test r$i $dev $off $killpath &
done
-elif [ "$cmd" == "error" ]; then
+elif [ "$cmd" == "delay" ]; then
+
+ sec=$3
+
+ pid=`cat /var/run/sanlock/sanlock.pid`
+
+ echo sync with daemon renewals
+ kill -s SIGSTOP $pid
+ sleep 20
+ kill -s SIGCONT $pid
+ sleep 1
+
+ echo sigstop sanlock pid $pid
+ kill -s SIGSTOP $pid
+
+ echo sleep $sec
+ sleep $sec
+
+ echo sigcont sanlock pid $pid
+ kill -s SIGCONT $pid
+
+elif [ "$cmd" == "iodelay" ]; then
+
+ sec=$4
+ pid=`cat /var/run/sanlock/sanlock.pid`
+
+ echo sync with daemon renewals
+ kill -s SIGSTOP $pid
+ sleep 20
+ kill -s SIGCONT $pid
+ sleep 2
+
+ echo save linear
rm -f /tmp/client-state.txt
+ rm -f /tmp/client-linear.txt
+ rm -f /tmp/client-error.txt
+ dmsetup table $dev > /tmp/client-linear.txt
+ sed "s/linear/error/" /tmp/client-linear.txt > /tmp/client-error.txt
+ echo load error
+ dmsetup suspend $dev
+ dmsetup load $dev /tmp/client-error.txt
+ dmsetup resume $dev
+
+ echo sleep $sec
+ sleep $sec
+
+ echo load linear
+ dmsetup suspend $dev
+ dmsetup load $dev /tmp/client-linear.txt
+ dmsetup resume $dev
+
+elif [ "$cmd" == "error" ]; then
+
+ echo save linear
+ rm -f /tmp/client-state.txt
rm -f /tmp/client-linear.txt
rm -f /tmp/client-error.txt
dmsetup table $dev > /tmp/client-linear.txt
sed "s/linear/error/" /tmp/client-linear.txt > /tmp/client-error.txt
+ echo load error
dmsetup suspend $dev
dmsetup load $dev /tmp/client-error.txt
dmsetup resume $dev
+elif [ "$cmd" == "linear" ]; then
+
+ echo load linear
+ dmsetup suspend $dev
+ dmsetup load $dev /tmp/client-linear.txt
+ dmsetup resume $dev
+
elif [ "$cmd" == "resume" ]; then
hostid=$4
+ echo load linear
dmsetup suspend $dev
dmsetup load $dev /tmp/client-linear.txt
dmsetup resume $dev
@@ -74,6 +136,19 @@ else
echo " sanlock client add_lockspace -s test:HOSTID:DEV:0"
echo " starts N ./sanlk_client processes"
echo ""
+ echo "clientn N delay SEC"
+ echo " sigstop sanlock daemon"
+ echo " sleep SEC"
+ echo " sigcont sanlock daemon"
+ echo ""
+ echo "clientn N iodelay DEV SEC"
+ echo " block i/o to DEV"
+ echo " sleep SEC"
+ echo " unblock i/o to DEV"
+ echo ""
+ echo "clientn N linear DEV"
+ echo " unblock i/o to DEV"
+ echo ""
echo "clientn N error DEV"
echo " blocks i/o to DEV"
echo " causes KILLPATH to run"
commit aa0092c7d0244c0461bc2a13f62c0250ba7e43a8
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Aug 10 15:44:15 2012 -0500
sanlock: base kill sig on last renewal
Tracking progression through the grace time by
counting one retry per second doesn't work in
the case where sanlock doesn't run every second
(e.g. sigstop or delayed scheduling). This will
cause sanlock to attempt to use killpath even
when there's nearly no time left before reset.
So, the transition to sigkill should depend on
the current time from the last lease renewal.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/cmd.c b/src/cmd.c
index bc7d7da..c293275 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -1359,7 +1359,7 @@ static int print_state_daemon(char *str)
"id_renewal=%d "
"id_renewal_fail=%d "
"id_renewal_warn=%d "
- "kill_count_grace=%d "
+ "kill_grace_seconds=%d "
"helper_pid=%d "
"helper_kill_fd=%d "
"helper_full_count=%u "
@@ -1371,7 +1371,7 @@ static int print_state_daemon(char *str)
main_task.id_renewal_seconds,
main_task.id_renewal_fail_seconds,
main_task.id_renewal_warn_seconds,
- kill_count_grace,
+ kill_grace_seconds,
helper_pid,
helper_kill_fd,
helper_full_count,
diff --git a/src/main.c b/src/main.c
index e5f0885..3ecc1f5 100644
--- a/src/main.c
+++ b/src/main.c
@@ -124,6 +124,11 @@ static void send_helper_kill(struct space *sp, struct client *cl, int sig)
if ((cl->flags & CL_RUNPATH_SENT) && (sig == SIGRUNPATH))
return;
+ if (helper_kill_fd == -1) {
+ log_error("send_helper_kill pid %d no fd", cl->pid);
+ return;
+ }
+
memset(&hm, 0, sizeof(hm));
if (sig == SIGRUNPATH) {
@@ -535,9 +540,9 @@ static int client_using_space(struct client *cl, struct space *sp)
static void kill_pids(struct space *sp)
{
struct client *cl;
- uint64_t now;
+ uint64_t now, last_success;
int ci, fd, pid, sig;
- int do_kill;
+ int do_kill, in_grace;
/*
* all remaining pids using sp are stuck, we've made max attempts to
@@ -546,6 +551,17 @@ static void kill_pids(struct space *sp)
if (sp->killing_pids > 1)
return;
+ /*
+ * If we happen to renew our lease after we've started killing pids,
+ * the period we allow for graceful shutdown will be extended. This
+ * is an incidental effect, although it may be nice. The previous
+ * behavior would still be ok, where we only ever allow up to
+ * kill_grace_seconds for graceful shutdown before moving to sigkill.
+ */
+ pthread_mutex_lock(&sp->mutex);
+ last_success = sp->lease_status.renewal_last_success;
+ pthread_mutex_unlock(&sp->mutex);
+
now = monotime();
for (ci = 0; ci <= client_maxi; ci++) {
@@ -578,32 +594,33 @@ static void kill_pids(struct space *sp)
fd = cl->fd;
pid = cl->pid;
-
/*
- * from zero to kill_count_grace seconds, we try killing
- * the pid with either killpath or sigterm. killpath if
- * it's configured and and we've seen a helper status recently.
- * (sigkill will be used in place of sigterm if restricted.)
- *
- * after kill_count_grace seconds, we'll try killing the
- * pid with sigkill. (sigterm will be used in place of
- * sigkill if restricted.)
+ * the transition from using killpath/sigterm to sigkill
+ * is when now >=
+ * last successful lease renewal +
+ * id_renewal_fail_seconds +
+ * kill_grace_seconds
*/
- if (cl->killpath[0] &&
- (helper_kill_fd != -1) &&
- (kill_count_grace > 0) &&
- (cl->kill_count <= kill_count_grace) &&
- (now - helper_last_status < (HELPER_STATUS_INTERVAL * 2)))
+ in_grace = now < (last_success + main_task.id_renewal_fail_seconds + kill_grace_seconds);
+
+ if ((kill_grace_seconds > 0) && in_grace && cl->killpath[0]) {
sig = SIGRUNPATH;
- else if (cl->restrict & SANLK_RESTRICT_SIGKILL)
+ } else if (in_grace) {
sig = SIGTERM;
- else if (cl->restrict & SANLK_RESTRICT_SIGTERM)
+ } else {
sig = SIGKILL;
- else if ((kill_count_grace > 0) &&
- (cl->kill_count <= kill_count_grace))
+ }
+
+ /*
+ * sigterm will be used in place of sigkill if restricted
+ * sigkill will be used in place of sigterm if restricted
+ */
+
+ if ((sig == SIGKILL) && (cl->restrict & SANLK_RESTRICT_SIGKILL))
sig = SIGTERM;
- else
+
+ if ((sig == SIGTERM) && (cl->restrict & SANLK_RESTRICT_SIGTERM))
sig = SIGKILL;
do_kill = 1;
@@ -1966,7 +1983,7 @@ static int read_command_line(int argc, char *argv[])
if (com.type == COM_DAEMON) {
sec = atoi(optionarg);
if (sec <= 60 && sec >= 0)
- kill_count_grace = sec;
+ kill_grace_seconds = sec;
} else {
com.local_host_generation = atoll(optionarg);
}
@@ -2343,8 +2360,8 @@ int main(int argc, char *argv[])
/* initialize global EXTERN variables */
- kill_count_max = 60;
- kill_count_grace = DEFAULT_GRACE_SEC;
+ kill_count_max = 100;
+ kill_grace_seconds = DEFAULT_GRACE_SEC;
helper_ci = -1;
helper_pid = -1;
helper_kill_fd = -1;
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 9950ebd..9a30763 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -316,7 +316,7 @@ EXTERN int external_shutdown;
EXTERN char our_host_name_global[SANLK_NAME_LEN+1];
EXTERN int kill_count_max;
-EXTERN int kill_count_grace;
+EXTERN int kill_grace_seconds;
EXTERN int helper_ci;
EXTERN int helper_pid;
EXTERN int helper_kill_fd;
diff --git a/src/timeouts.h b/src/timeouts.h
index f62bb6f..80b9fc9 100644
--- a/src/timeouts.h
+++ b/src/timeouts.h
@@ -226,7 +226,7 @@
*
* Working backward from the earlier watchdog firing at T170, leaving 10 seconds
* for SIGKILL to succeed, we need to begin SIGKILL at T160. This means we
- * have from T120 to T160 to allow graceful kill to complete. So, kill_count_grace
+ * have from T120 to T160 to allow graceful kill to complete. So, kill_grace_seconds
* should be set to 40 by default (T120 to T160).
*
* T40: last successful disk renewal
@@ -234,11 +234,6 @@
* T160 - T169: SIGKILL once per second (10 sec)
* T170 - T179: watchdog fires sometime (SIGKILL continues)
* T180: other hosts acquire our leases
- *
- * The interval between each kill count/attempt is approx 1 sec,
- * so kill_count/kill_count_grace/kill_count_max serve as both
- * the number/count of attempts and the number of seconds spent
- * using that kind of termination.
*/
commit 871d47a6b0d9b1e07600b773c2aa3c34d99e2af1
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Aug 10 10:41:47 2012 -0500
wdmd: pet after reopen and use 1 sec interval after failure
We need to do a keepalive ioctl after reopening the device.
Also, delaying the final check until the last second can
sometimes not leave enough time to reactivate the
watchdog, so check every second after a test failure.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/tests/sanlk_client.c b/tests/sanlk_client.c
index f63356d..a0efc61 100644
--- a/tests/sanlk_client.c
+++ b/tests/sanlk_client.c
@@ -60,12 +60,16 @@ int main(int argc, char *argv[])
return -1;
}
+ if (!strcmp(path, "none"))
+ goto acquire;
+
rv = sanlock_killpath(sock, SANLK_KILLPATH_PID, path, args);
if (rv < 0) {
fprintf(stderr, "killpath error %d\n", rv);
return -1;
}
+ acquire:
rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL);
if (rv < 0) {
fprintf(stderr, "acquire error %d\n", rv);
diff --git a/wdmd/main.c b/wdmd/main.c
index e289f44..2e41e91 100644
--- a/wdmd/main.c
+++ b/wdmd/main.c
@@ -44,6 +44,7 @@
#define RELEASE_VERSION "2.4"
#define DEFAULT_TEST_INTERVAL 10
+#define RECOVER_TEST_INTERVAL 1
#define DEFAULT_FIRE_TIMEOUT 60
#define DEFAULT_HIGH_PRIORITY 1
@@ -57,6 +58,7 @@ static int daemon_quit;
static int daemon_debug;
static int socket_gid;
static time_t last_keepalive;
+static time_t last_closeunclean;
static char lockfile_path[PATH_MAX];
static int dev_fd = -1;
static int shm_fd;
@@ -70,7 +72,8 @@ struct script_status {
is not very sophisticated, but it's simple. If we wait up to 2 seconds
for each script to exit, and have 5 scripts, that's up to 10 seconds we
spend in test_scripts, and it's simplest if the max time in test_scripts
- does not excede the test_interval (10). */
+ does not excede the test_interval (10). FIXME: this is not entirely
+ true since the test_interval was changed to 1 after a failure. */
#define SCRIPT_WAIT_SECONDS 2
#define MAX_SCRIPTS 4
@@ -387,6 +390,7 @@ static int setup_clients(void)
static int test_clients(void)
{
uint64_t t;
+ time_t last_ping;
int fail_count = 0;
int i;
@@ -398,14 +402,20 @@ static int test_clients(void)
if (!client[i].expire)
continue;
+ if (last_keepalive > last_closeunclean)
+ last_ping = last_keepalive;
+ else
+ last_ping = last_closeunclean;
+
if (t >= client[i].expire) {
- log_error("test failed ci %d pid %d now %llu keepalive %llu renewal %llu expire %llu %s",
- i, client[i].pid,
+ log_error("test failed rem %d now %llu ping %llu close %llu renewal %llu expire %llu client %d %s",
+ DEFAULT_FIRE_TIMEOUT - (int)(t - last_ping),
(unsigned long long)t,
(unsigned long long)last_keepalive,
+ (unsigned long long)last_closeunclean,
(unsigned long long)client[i].renewal,
(unsigned long long)client[i].expire,
- client[i].name);
+ client[i].pid, client[i].name);
fail_count++;
continue;
}
@@ -431,12 +441,13 @@ static int test_clients(void)
*/
if (t >= client[i].expire - DEFAULT_TEST_INTERVAL) {
- log_error("test warning pid %d now %llu keepalive %llu renewal %llu expire %llu",
- client[i].pid,
+ log_error("test warning now %llu ping %llu close %llu renewal %llu expire %llu client %d %s",
(unsigned long long)t,
(unsigned long long)last_keepalive,
+ (unsigned long long)last_closeunclean,
(unsigned long long)client[i].renewal,
- (unsigned long long)client[i].expire);
+ (unsigned long long)client[i].expire,
+ client[i].pid, client[i].name);
fail_count++;
continue;
}
@@ -718,6 +729,8 @@ static void close_watchdog_unclean(void)
log_error("/dev/watchdog closed unclean");
close(dev_fd);
dev_fd = -1;
+
+ last_closeunclean = monotime();
}
static void close_watchdog(void)
@@ -913,23 +926,30 @@ static int test_loop(void)
if (!fail_count) {
if (dev_fd == -1) {
- log_error("/dev/watchdog reopen");
open_dev();
+ pet_watchdog();
+ log_error("/dev/watchdog reopen");
} else {
pet_watchdog();
}
+
+ test_interval = DEFAULT_TEST_INTERVAL;
} else {
/* If we can patch the kernel so that close
does not generate a ping, then we can skip
this close, and just not pet the device in
this case. Also see test_client above. */
close_watchdog_unclean();
+
+ test_interval = RECOVER_TEST_INTERVAL;
}
}
sleep_seconds = test_time + test_interval - monotime();
- poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 1;
- log_debug("sleep_seconds %d", sleep_seconds);
+ poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 500;
+
+ log_debug("test_interval %d sleep_seconds %d poll_timeout %d",
+ test_interval, sleep_seconds, poll_timeout);
}
return 0;
11 years, 1 month