src/direct_lib.c | 11 +++ src/host_id.c | 15 ----- src/main.c | 143 +++++++++++++++++++++++++++++++++++++------------ src/sanlock.8 | 7 -- src/sanlock_internal.h | 1 src/sanlock_resource.h | 1 src/token_manager.c | 17 +++-- 7 files changed, 130 insertions(+), 65 deletions(-)
New commits: commit dd9dfa9c9d544ff12684ab94b5e30fc165206052 Author: David Teigland teigland@redhat.com Date: Wed Sep 7 16:49:13 2011 -0500
libsanlock: fix function stubs
would like to get rid of this ugliness
diff --git a/src/direct_lib.c b/src/direct_lib.c index e2c7d7a..7d7f9a9 100644 --- a/src/direct_lib.c +++ b/src/direct_lib.c @@ -31,9 +31,16 @@ void log_level(int space_id GNUC_UNUSED, int token_id GNUC_UNUSED, { }
-int host_id_disk_info(char *name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED); +int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED);
-int host_id_disk_info(char *name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED) +int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED) +{ + return -1; +} + +int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out); + +int host_info(char *space_name GNUC_UNUSED, uint64_t host_id GNUC_UNUSED, struct host_status *hs_out GNUC_UNUSED) { return -1; }
commit 4d1ed9632189bc2752ad6a684145ca38971ab959 Author: David Teigland teigland@redhat.com Date: Wed Sep 7 15:54:53 2011 -0500
sanlock: remove BLOCK_WD force mode
don't include this option until we're ready to use it, in case the usage specifics end up being different
diff --git a/src/host_id.c b/src/host_id.c index 1b82eeb..86df60b 100644 --- a/src/host_id.c +++ b/src/host_id.c @@ -163,17 +163,6 @@ int lockspace_disk(char *space_name, struct sync_disk *disk) return rv; }
-void block_watchdog_updates(char *space_name) -{ - struct space *sp; - - pthread_mutex_lock(&spaces_mutex); - sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL); - if (sp) - sp->block_watchdog_updates = 1; - pthread_mutex_unlock(&spaces_mutex); -} - #if 0 static void clear_bit(int host_id, char *bitmap) { @@ -563,9 +552,7 @@ static void *lockspace_thread(void *arg_in) * pet the watchdog */
- if (delta_result == SANLK_OK && - !sp->thread_stop && - !sp->block_watchdog_updates) + if (delta_result == SANLK_OK && !sp->thread_stop) update_watchdog_file(sp, last_success);
pthread_mutex_unlock(&sp->mutex); diff --git a/src/sanlock.8 b/src/sanlock.8 index 9a33990..e3196e9 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -477,11 +477,6 @@ take: process has exited, the resource lease will be released, and can then be acquired by anyone.
-\fB2\fP (BLOCK_WD): stop updating the watchdog (/dev/watchdog keepalive -via wdmd_test_live) for the lockspace, which will lead to /dev/watchdog -firing and reseting the host. The resource lease can be acquired after -the timeout for a failed host. - .SH SEE ALSO .BR wdmd (8)
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 70c696d..7a26749 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -124,7 +124,6 @@ struct space { int space_dead; int killing_pids; int external_remove; - int block_watchdog_updates; int thread_stop; int wd_fd; pthread_t thread; diff --git a/src/sanlock_resource.h b/src/sanlock_resource.h index 4998c16..770ebfc 100644 --- a/src/sanlock_resource.h +++ b/src/sanlock_resource.h @@ -28,7 +28,6 @@
/* request flags */ #define SANLK_REQ_KILL_PID 0x00000001 -#define SANLK_REQ_BLOCK_WD 0x00000002
int sanlock_register(void);
diff --git a/src/token_manager.c b/src/token_manager.c index a15e47a..a701cc3 100644 --- a/src/token_manager.c +++ b/src/token_manager.c @@ -462,10 +462,8 @@ static void *resource_thread(void *arg GNUC_UNUSED) log_error("req_kill_pid %d %.48s:%.48s", pid, tt->r.lockspace_name, tt->r.name); kill(pid, SIGKILL); - - } else if (req.force_mode == SANLK_REQ_BLOCK_WD) { - log_error("req_block_wd %.48s", tt->r.lockspace_name); - block_watchdog_updates(tt->r.lockspace_name); + } else { + log_error("req force_mode unknown %u", req.force_mode); } } }
commit 0a356d94812872ebc7d8c39f79cac7cc88997ab1 Author: David Teigland teigland@redhat.com Date: Wed Sep 7 12:12:34 2011 -0500
sanlock: restructure lockspace checks
Shifts some code around; there should be no functional change. Not checking sp after the list_move is better.
diff --git a/src/main.c b/src/main.c index 51c1fe7..c249ace 100644 --- a/src/main.c +++ b/src/main.c @@ -424,10 +424,6 @@ static int client_using_space(struct client *cl, struct space *sp) log_spoke(sp, token, "client_using_space pid %d", cl->pid); token->space_dead = sp->space_dead; rv = 1; - - /* we could break here after finding one if we didn't care - * about setting token->space_dead which isn't really - * necessary; it just avoids trying to release the token */ } return rv; } @@ -586,9 +582,42 @@ static int main_loop(void) last_check = now; check_interval = STANDARD_CHECK_INTERVAL;
+ /* + * check the condition of each lockspace, + * if pids are being killed, have pids all exited? + * is its host_id being renewed?, if not kill pids + */ + pthread_mutex_lock(&spaces_mutex); list_for_each_entry_safe(sp, safe, &spaces, list) { - check_all = 0; + + if (sp->killing_pids && all_pids_dead(sp)) { + /* + * move sp to spaces_rem so main_loop + * will no longer see it. + */ + log_space(sp, "set thread_stop"); + pthread_mutex_lock(&sp->mutex); + sp->thread_stop = 1; + unlink_watchdog_file(sp); + pthread_mutex_unlock(&sp->mutex); + list_move(&sp->list, &spaces_rem); + continue; + } + + if (sp->killing_pids) { + /* + * continue to kill the pids with increasing + * levels of severity until they all exit + */ + kill_pids(sp); + check_interval = RECOVERY_CHECK_INTERVAL; + continue; + } + + /* + * check host_id lease renewal + */
if (sp->align_size > check_buf_len) { if (check_buf) @@ -599,37 +628,21 @@ static int main_loop(void) if (check_buf) memset(check_buf, 0, check_buf_len);
- if (sp->killing_pids) { - if (all_pids_dead(sp)) { - log_space(sp, "set thread_stop"); - pthread_mutex_lock(&sp->mutex); - sp->thread_stop = 1; - unlink_watchdog_file(sp); - pthread_mutex_unlock(&sp->mutex); - list_move(&sp->list, &spaces_rem); - } else { - kill_pids(sp); - } - } else { - rv = check_our_lease(&main_task, sp, - &check_all, check_buf); - - if (rv || external_shutdown || sp->external_remove) { - log_space(sp, "set killing_pids check %d " - "shutdown %d remove %d", - rv, external_shutdown, - sp->external_remove); - sp->space_dead = 1; - sp->killing_pids = 1; - kill_pids(sp); - } - } + check_all = 0;
- if (!sp->killing_pids && check_all) - check_other_leases(&main_task, sp, check_buf); + rv = check_our_lease(&main_task, sp, &check_all, check_buf);
- if (sp->killing_pids) + if (rv || external_shutdown || sp->external_remove) { + log_space(sp, "set killing_pids check %d shutdown %d remove %d", + rv, external_shutdown, sp->external_remove); + sp->space_dead = 1; + sp->killing_pids = 1; + kill_pids(sp); check_interval = RECOVERY_CHECK_INTERVAL; + + } else if (check_all) { + check_other_leases(&main_task, sp, check_buf); + } } empty = list_empty(&spaces); pthread_mutex_unlock(&spaces_mutex); @@ -1516,6 +1529,51 @@ static void cmd_add_lockspace(struct cmd_args *ca) client_resume(ca->ci_in); }
+/* + * TODO: rem_lockspace works like a renewal failure would, and abandons + * resource leases (tokens) without releasing them. Unlike the renewal + * failure case, rem_lockspace most likely releases the host_id. + * + * What might be nice is an option where rem_lockspace would try to + * release resource leases before releasing the lockspace host_id. + * (We don't really want to be releasing tokens after we've released + * our host_id for the token's lockspace.) + * + * - kill all pids (by looking at struct resource pid?) + * - wait for all pids to exit + * o have us or other thread release their tokens/resources + * o wait for tokens/resources to be released, although the release + * may fail or time out, we don't want to wait too long + * - set sp->external_remove + * - main_loop sets sp->thread_stop (should find no pids) + * - main_loop unlinks watchdog + * - lockspace_thread releases host_id + * + * The aim is that we kill pids and wait for resources to be released + * before main_loop gets involved and before the lockspace_thread is + * told to stop. + * + * An alternative messy is to add another condition to the current + * main_loop checks: + * + * if (sp->killing_pids && all_pids_dead(sp) && all_tokens_released(sp)) { + * sp->thread_stop = 1; + * unlink_watchdog_file(sp); + * list_move(spaces_rem); + * } + * + * all_tokens_released would just return 1 in case we're not doing + * the releases + * + * release_token_async would need to learn to put the resources onto + * dispose list in this case + * + * consider using the resources/dispose_resources list for all_pids_dead + * and kill_pids? instead of the clients[].tokens[] loops? actually, + * could we remove tokens and cl->tokens altogether and just use the + * resources list? 
+ */ + static void cmd_rem_lockspace(struct cmd_args *ca) { struct sanlk_lockspace lockspace; diff --git a/src/sanlock.8 b/src/sanlock.8 index f004159..9a33990 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -278,7 +278,7 @@ This will allow resources to be acquired in the lockspace.
Tell the sanlock daemon to release the specified host_id in the lockspace. Any processes holding resource leases in this lockspace will be killed, -and the leases released. +and the resource leases not released.
.BR "sanlock client command -r" " RESOURCE " \ \fB-c\fP " " \fIpath\fP " " \fIargs\fP
commit 65de7ce8f6f67bcc54803d56e499dc611668aad0 Author: David Teigland teigland@redhat.com Date: Wed Sep 7 11:16:29 2011 -0500
sanlock: use flags in struct resource
diff --git a/src/main.c b/src/main.c index f30960b..51c1fe7 100644 --- a/src/main.c +++ b/src/main.c @@ -1952,6 +1952,25 @@ static int print_token_state(struct token *t, char *str) * 14. [repeat 7-13 for each client] */
+/* + * TODO: + * increase max transfer size + * send spaces/spaces_add/spaces_rem instead of just spaces + * use resources/dispose_resources to send resource state instead of cl->tokens + * + * . daemon + * . clients + * . lockspaces from spaces/spaces_add/spaces_rem + * . resources from resources/dispose_resources + * + * in print function, + * for each client pid, go through resources and print any for that pid, clear the res + * print any remaining detached resources (orphan, dispose) + * + * sanlock client host_status <lockspace_name> + * send sp->host_status[] for the named lockspace + */ + static void cmd_status(int fd, struct sm_header *h_recv) { struct sm_header h; diff --git a/src/token_manager.c b/src/token_manager.c index 4d447a6..a15e47a 100644 --- a/src/token_manager.c +++ b/src/token_manager.c @@ -39,11 +39,13 @@ static pthread_t resource_pt; static int resource_thread_stop; static int resource_examine;
+#define R_EXAMINE 0x00000001 + struct resource { struct list_head list; struct token *token; int pid; - int examine; + uint32_t flags; uint64_t lver; struct sanlk_resource r; }; @@ -59,7 +61,7 @@ int set_resource_examine(char *space_name, char *res_name) continue; if (res_name && strncmp(r->r.name, res_name, NAME_ID_SIZE)) continue; - r->examine = 1; + r->flags |= R_EXAMINE; resource_examine = 1; count++; } @@ -75,7 +77,7 @@ static struct resource *find_resource_examine(void) struct resource *r;
list_for_each_entry(r, &resources, list) { - if (r->examine) + if (r->flags & R_EXAMINE) return r; } return NULL; @@ -421,7 +423,7 @@ static void *resource_thread(void *arg GNUC_UNUSED) pthread_mutex_unlock(&resource_mutex); continue; } - r->examine = 0; + r->flags &= ~R_EXAMINE;
/* we can't safely access r->token here, and r may be freed after we release mutex, so copy @@ -482,6 +484,7 @@ void release_token_async(struct token *token) r = find_resource(token, &resources); if (r) { /* assert r->token == token ? */ + if (token->space_dead || (token->acquire_result != SANLK_OK)) { _del_resource(r); free(token);
sanlock-devel@lists.fedorahosted.org