src/direct_lib.c | 11 +++
src/host_id.c | 15 -----
src/main.c | 143 +++++++++++++++++++++++++++++++++++++------------
src/sanlock.8 | 7 --
src/sanlock_internal.h | 1
src/sanlock_resource.h | 1
src/token_manager.c | 17 +++--
7 files changed, 130 insertions(+), 65 deletions(-)
New commits:
commit dd9dfa9c9d544ff12684ab94b5e30fc165206052
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Sep 7 16:49:13 2011 -0500
libsanlock: fix function stubs
would like to get rid of this ugliness
diff --git a/src/direct_lib.c b/src/direct_lib.c
index e2c7d7a..7d7f9a9 100644
--- a/src/direct_lib.c
+++ b/src/direct_lib.c
@@ -31,9 +31,16 @@ void log_level(int space_id GNUC_UNUSED, int token_id GNUC_UNUSED,
{
}
-int host_id_disk_info(char *name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED);
+int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED);
-int host_id_disk_info(char *name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED)
+int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED)
+{
+ return -1;
+}
+
+int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out);
+
+int host_info(char *space_name GNUC_UNUSED, uint64_t host_id GNUC_UNUSED, struct host_status *hs_out GNUC_UNUSED)
{
return -1;
}
commit 4d1ed9632189bc2752ad6a684145ca38971ab959
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Sep 7 15:54:53 2011 -0500
sanlock: remove BLOCK_WD force mode
don't include this option until we're ready to use it,
in case the usage specifics end up being different
diff --git a/src/host_id.c b/src/host_id.c
index 1b82eeb..86df60b 100644
--- a/src/host_id.c
+++ b/src/host_id.c
@@ -163,17 +163,6 @@ int lockspace_disk(char *space_name, struct sync_disk *disk)
return rv;
}
-void block_watchdog_updates(char *space_name)
-{
- struct space *sp;
-
- pthread_mutex_lock(&spaces_mutex);
- sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL);
- if (sp)
- sp->block_watchdog_updates = 1;
- pthread_mutex_unlock(&spaces_mutex);
-}
-
#if 0
static void clear_bit(int host_id, char *bitmap)
{
@@ -563,9 +552,7 @@ static void *lockspace_thread(void *arg_in)
* pet the watchdog
*/
- if (delta_result == SANLK_OK &&
- !sp->thread_stop &&
- !sp->block_watchdog_updates)
+ if (delta_result == SANLK_OK && !sp->thread_stop)
update_watchdog_file(sp, last_success);
pthread_mutex_unlock(&sp->mutex);
diff --git a/src/sanlock.8 b/src/sanlock.8
index 9a33990..e3196e9 100644
--- a/src/sanlock.8
+++ b/src/sanlock.8
@@ -477,11 +477,6 @@ take:
process has exited, the resource lease will be released, and can then be
acquired by anyone.
-\fB2\fP (BLOCK_WD): stop updating the watchdog (/dev/watchdog keepalive
-via wdmd_test_live) for the lockspace, which will lead to /dev/watchdog
-firing and reseting the host. The resource lease can be acquired after
-the timeout for a failed host.
-
.SH SEE ALSO
.BR wdmd (8)
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 70c696d..7a26749 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -124,7 +124,6 @@ struct space {
int space_dead;
int killing_pids;
int external_remove;
- int block_watchdog_updates;
int thread_stop;
int wd_fd;
pthread_t thread;
diff --git a/src/sanlock_resource.h b/src/sanlock_resource.h
index 4998c16..770ebfc 100644
--- a/src/sanlock_resource.h
+++ b/src/sanlock_resource.h
@@ -28,7 +28,6 @@
/* request flags */
#define SANLK_REQ_KILL_PID 0x00000001
-#define SANLK_REQ_BLOCK_WD 0x00000002
int sanlock_register(void);
diff --git a/src/token_manager.c b/src/token_manager.c
index a15e47a..a701cc3 100644
--- a/src/token_manager.c
+++ b/src/token_manager.c
@@ -462,10 +462,8 @@ static void *resource_thread(void *arg GNUC_UNUSED)
log_error("req_kill_pid %d %.48s:%.48s", pid,
tt->r.lockspace_name, tt->r.name);
kill(pid, SIGKILL);
-
- } else if (req.force_mode == SANLK_REQ_BLOCK_WD) {
- log_error("req_block_wd %.48s", tt->r.lockspace_name);
- block_watchdog_updates(tt->r.lockspace_name);
+ } else {
+ log_error("req force_mode unknown %u", req.force_mode);
}
}
}
commit 0a356d94812872ebc7d8c39f79cac7cc88997ab1
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Sep 7 12:12:34 2011 -0500
sanlock: restructure lockspace checks
shifts some code around, should be no functional change.
not checking sp after the list_move is better.
diff --git a/src/main.c b/src/main.c
index 51c1fe7..c249ace 100644
--- a/src/main.c
+++ b/src/main.c
@@ -424,10 +424,6 @@ static int client_using_space(struct client *cl, struct space *sp)
log_spoke(sp, token, "client_using_space pid %d", cl->pid);
token->space_dead = sp->space_dead;
rv = 1;
-
- /* we could break here after finding one if we didn't care
- * about setting token->space_dead which isn't really
- * necessary; it just avoids trying to release the token */
}
return rv;
}
@@ -586,9 +582,42 @@ static int main_loop(void)
last_check = now;
check_interval = STANDARD_CHECK_INTERVAL;
+ /*
+ * check the condition of each lockspace,
+ * if pids are being killed, have pids all exited?
+ * is its host_id being renewed?, if not kill pids
+ */
+
pthread_mutex_lock(&spaces_mutex);
list_for_each_entry_safe(sp, safe, &spaces, list) {
- check_all = 0;
+
+ if (sp->killing_pids && all_pids_dead(sp)) {
+ /*
+ * move sp to spaces_rem so main_loop
+ * will no longer see it.
+ */
+ log_space(sp, "set thread_stop");
+ pthread_mutex_lock(&sp->mutex);
+ sp->thread_stop = 1;
+ unlink_watchdog_file(sp);
+ pthread_mutex_unlock(&sp->mutex);
+ list_move(&sp->list, &spaces_rem);
+ continue;
+ }
+
+ if (sp->killing_pids) {
+ /*
+ * continue to kill the pids with increasing
+ * levels of severity until they all exit
+ */
+ kill_pids(sp);
+ check_interval = RECOVERY_CHECK_INTERVAL;
+ continue;
+ }
+
+ /*
+ * check host_id lease renewal
+ */
if (sp->align_size > check_buf_len) {
if (check_buf)
@@ -599,37 +628,21 @@ static int main_loop(void)
if (check_buf)
memset(check_buf, 0, check_buf_len);
- if (sp->killing_pids) {
- if (all_pids_dead(sp)) {
- log_space(sp, "set thread_stop");
- pthread_mutex_lock(&sp->mutex);
- sp->thread_stop = 1;
- unlink_watchdog_file(sp);
- pthread_mutex_unlock(&sp->mutex);
- list_move(&sp->list, &spaces_rem);
- } else {
- kill_pids(sp);
- }
- } else {
- rv = check_our_lease(&main_task, sp,
- &check_all, check_buf);
-
- if (rv || external_shutdown || sp->external_remove) {
- log_space(sp, "set killing_pids check %d "
- "shutdown %d remove %d",
- rv, external_shutdown,
- sp->external_remove);
- sp->space_dead = 1;
- sp->killing_pids = 1;
- kill_pids(sp);
- }
- }
+ check_all = 0;
- if (!sp->killing_pids && check_all)
- check_other_leases(&main_task, sp, check_buf);
+ rv = check_our_lease(&main_task, sp, &check_all, check_buf);
- if (sp->killing_pids)
+ if (rv || external_shutdown || sp->external_remove) {
+ log_space(sp, "set killing_pids check %d shutdown %d remove %d",
+ rv, external_shutdown, sp->external_remove);
+ sp->space_dead = 1;
+ sp->killing_pids = 1;
+ kill_pids(sp);
check_interval = RECOVERY_CHECK_INTERVAL;
+
+ } else if (check_all) {
+ check_other_leases(&main_task, sp, check_buf);
+ }
}
empty = list_empty(&spaces);
pthread_mutex_unlock(&spaces_mutex);
@@ -1516,6 +1529,51 @@ static void cmd_add_lockspace(struct cmd_args *ca)
client_resume(ca->ci_in);
}
+/*
+ * TODO: rem_lockspace works like a renewal failure would, and abandons
+ * resource leases (tokens) without releasing them. Unlike the renewal
+ * failure case, rem_lockspace most likely releases the host_id.
+ *
+ * What might be nice is an option where rem_lockspace would try to
+ * release resource leases before releasing the lockspace host_id.
+ * (We don't really want to be releasing tokens after we've released
+ * our host_id for the token's lockspace.)
+ *
+ * - kill all pids (by looking at struct resource pid?)
+ * - wait for all pids to exit
+ * o have us or other thread release their tokens/resources
+ * o wait for tokens/resources to be released, although the release
+ * may fail or time out, we don't want to wait too long
+ * - set sp->external_remove
+ * - main_loop sets sp->thread_stop (should find no pids)
+ * - main_loop unlinks watchdog
+ * - lockspace_thread releases host_id
+ *
+ * The aim is that we kill pids and wait for resources to be released
+ * before main_loop gets involved and before the lockspace_thread is
+ * told to stop.
+ *
+ * A messier alternative is to add another condition to the current
+ * main_loop checks:
+ *
+ * if (sp->killing_pids && all_pids_dead(sp) && all_tokens_released(sp)) {
+ * sp->thread_stop = 1;
+ * unlink_watchdog_file(sp);
+ * list_move(spaces_rem);
+ * }
+ *
+ * all_tokens_released would just return 1 in case we're not doing
+ * the releases
+ *
+ * release_token_async would need to learn to put the resources onto
+ * dispose list in this case
+ *
+ * consider using the resources/dispose_resources list for all_pids_dead
+ * and kill_pids? instead of the clients[].tokens[] loops? actually,
+ * could we remove tokens and cl->tokens altogether and just use the
+ * resources list?
+ */
+
static void cmd_rem_lockspace(struct cmd_args *ca)
{
struct sanlk_lockspace lockspace;
diff --git a/src/sanlock.8 b/src/sanlock.8
index f004159..9a33990 100644
--- a/src/sanlock.8
+++ b/src/sanlock.8
@@ -278,7 +278,7 @@ This will allow resources to be acquired in the lockspace.
Tell the sanlock daemon to release the specified host_id in the lockspace.
Any processes holding resource leases in this lockspace will be killed,
-and the leases released.
+and the resource leases will not be released.
.BR "sanlock client command -r" " RESOURCE " \
\fB-c\fP " " \fIpath\fP " " \fIargs\fP
commit 65de7ce8f6f67bcc54803d56e499dc611668aad0
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Sep 7 11:16:29 2011 -0500
sanlock: use flags in struct resource
diff --git a/src/main.c b/src/main.c
index f30960b..51c1fe7 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1952,6 +1952,25 @@ static int print_token_state(struct token *t, char *str)
* 14. [repeat 7-13 for each client]
*/
+/*
+ * TODO:
+ * increase max transfer size
+ * send spaces/spaces_add/spaces_rem instead of just spaces
+ * use resources/dispose_resources to send resource state instead of cl->tokens
+ *
+ * . daemon
+ * . clients
+ * . lockspaces from spaces/spaces_add/spaces_rem
+ * . resources from resources/dispose_resources
+ *
+ * in print function,
+ * for each client pid, go through resources and print any for that pid, clear the res
+ * print any remaining detached resources (orphan, dispose)
+ *
+ * sanlock client host_status <lockspace_name>
+ * send sp->host_status[] for the named lockspace
+ */
+
static void cmd_status(int fd, struct sm_header *h_recv)
{
struct sm_header h;
diff --git a/src/token_manager.c b/src/token_manager.c
index 4d447a6..a15e47a 100644
--- a/src/token_manager.c
+++ b/src/token_manager.c
@@ -39,11 +39,13 @@ static pthread_t resource_pt;
static int resource_thread_stop;
static int resource_examine;
+#define R_EXAMINE 0x00000001
+
struct resource {
struct list_head list;
struct token *token;
int pid;
- int examine;
+ uint32_t flags;
uint64_t lver;
struct sanlk_resource r;
};
@@ -59,7 +61,7 @@ int set_resource_examine(char *space_name, char *res_name)
continue;
if (res_name && strncmp(r->r.name, res_name, NAME_ID_SIZE))
continue;
- r->examine = 1;
+ r->flags |= R_EXAMINE;
resource_examine = 1;
count++;
}
@@ -75,7 +77,7 @@ static struct resource *find_resource_examine(void)
struct resource *r;
list_for_each_entry(r, &resources, list) {
- if (r->examine)
+ if (r->flags & R_EXAMINE)
return r;
}
return NULL;
@@ -421,7 +423,7 @@ static void *resource_thread(void *arg GNUC_UNUSED)
pthread_mutex_unlock(&resource_mutex);
continue;
}
- r->examine = 0;
+ r->flags &= ~R_EXAMINE;
/* we can't safely access r->token here, and
r may be freed after we release mutex, so copy
@@ -482,6 +484,7 @@ void release_token_async(struct token *token)
r = find_resource(token, &resources);
if (r) {
/* assert r->token == token ? */
+
if (token->space_dead || (token->acquire_result != SANLK_OK)) {
_del_resource(r);
free(token);