src/main.c | 70 +++-----
src/sanlock_internal.h | 6
src/token_manager.c | 387 ++++++++++++++++++++++++++++++++++---------------
src/token_manager.h | 7
tests/devcount.c | 7
5 files changed, 310 insertions(+), 167 deletions(-)
New commits:
commit 6b26b667fdaca59e91c31522b03f88bce42aad18
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Mar 9 17:22:23 2011 -0600
sanlock: setowner improvements
Allow setowner to be used on either migrate destination (migrate
succeeded) or migrate source (migrate failed). Also improve the
organization and structure of the migration code a bit to make it
cleaner.
diff --git a/src/main.c b/src/main.c
index f0e549c..f90b6d9 100644
--- a/src/main.c
+++ b/src/main.c
@@ -192,7 +192,7 @@ static void client_pid_dead(int ci)
int delay_release = 0;
int i, pid;
- log_debug("client_pid_dead ci %d pid %d", ci, cl->pid);
+ log_debug("client_pid_dead ci %d fd %d pid %d", ci, cl->fd, cl->pid);
/* cmd_acquire_thread may still be waiting for the tokens
to be acquired. if it is, tell it to release them when
@@ -427,9 +427,6 @@ static int main_loop(void)
return 0;
}
-/* FIXME: allow setowner on the source to "cancel" a migration by clearing
- next_owner. This means allowing CMD_SETOWNER when cmd_active == CMD_MIGRATE */
-
static int set_cmd_active(int ci_target, int cmd)
{
struct client *cl = &client[ci_target];
@@ -448,6 +445,12 @@ static int set_cmd_active(int ci_target, int cmd)
if (cl->need_setowner && cmd == SM_CMD_SETOWNER)
cl->need_setowner = 0;
+ if (cl->cmd_active == SM_CMD_MIGRATE && cmd == SM_CMD_SETOWNER) {
+ cl->cmd_active = SM_CMD_SETOWNER;
+ pthread_mutex_unlock(&cl->mutex);
+ return 0;
+ }
+
cmd_active = cl->cmd_active;
if (!cmd) {
@@ -636,7 +639,7 @@ static void *cmd_acquire_thread(void *args_in)
int fd, rv, i, j, disks_len, num_disks, empty_slots, opened;
int alloc_count = 0, add_count = 0, open_count = 0, acquire_count = 0;
int pos = 0, need_setowner = 0, pid_dead = 0;
- int new_tokens_count, migrate_result;
+ int new_tokens_count;
cl = &client[ca->ci_target];
fd = client[ca->ci_in].fd;
@@ -873,28 +876,19 @@ static void *cmd_acquire_thread(void *args_in)
for (i = 0; i < new_tokens_count; i++) {
token = new_tokens[i];
- if (opt.flags & SANLK_FLG_INCOMING) {
- rv = check_incoming_state(token, opt_str, &migrate_result);
- if (rv < 0) {
- log_errot(token, "cmd_acquire incoming state %d", rv);
- goto fail_release;
- }
-
- /* source set_next_owner_other() wasn't called or failed */
- if (migrate_result != DP_OK)
- rv = set_next_owner_self(token);
+ if (opt.flags & SANLK_FLG_REACQUIRE)
+ reacquire_lver = token->prev_lver;
+ if (opt.flags & SANLK_FLG_INCOMING) {
+ rv = incoming_token(token, opt_str);
} else {
- if (opt.flags & SANLK_FLG_REACQUIRE)
- reacquire_lver = token->prev_lver;
-
rv = acquire_token(token, reacquire_lver, new_num_hosts);
}
save_resource_leader(token);
if (rv < 0) {
- log_errot(token, "cmd_acquire lease %d flags %x",
+ log_errot(token, "cmd_acquire %d flags %x",
rv, opt.flags);
goto fail_release;
}
@@ -1131,23 +1125,7 @@ static void *cmd_migrate_thread(void *args_in)
if (!token)
continue;
- /* the migrating flag causes the source to avoid freeing the lease
- * if the pid exits before the dest has written itself as next_owner.
- * i.e. we can't rely on paxos_lease_release seeing next_owner_id
- * non-zero because the cmd_migrate can be called on the source,
- * followed by the source pid exiting before the dest gets to write
- * next_owner_id to itself */
-
- token->migrating = 1;
-
- if (target_host_id) {
- rv = set_next_owner_other(token, target_host_id);
- token->migrate_result = rv;
- } else {
- /* acquire-incoming on the destination will call
- set_next_owner_self() */
- token->migrate_result = 0;
- }
+ migrate_token(token, target_host_id);
ret = snprintf(reply_str + pos, reply_len - pos,
"lockspace_name=%s "
@@ -1178,6 +1156,8 @@ static void *cmd_migrate_thread(void *args_in)
reply_str[reply_len-1] = '\0';
reply:
+ /* no set_cmd_active(0), only setowner is allowed after this */
+
log_debug("cmd_migrate done result %d", result);
memcpy(&h, &ca->header, sizeof(struct sm_header));
@@ -1199,10 +1179,6 @@ static void *cmd_migrate_thread(void *args_in)
return NULL;
}
-/* become the full owner of leases that were migrated to us;
- go through each of the pid's tokens, set next_owner_id for each,
- then reply to client with result */
-
static void *cmd_setowner_thread(void *args_in)
{
struct cmd_args *ca = args_in;
@@ -1389,9 +1365,12 @@ static int print_token_state(struct token *t, char *str)
snprintf(str, SANLK_STATE_MAXSTR-1,
"token_id=%u "
+ "migrating=%d "
+ "incoming=%d "
"acquire_result=%d "
- "migrate_result=%d "
"release_result=%d "
+ "migrate_result=%d "
+ "incoming_result=%d "
"setowner_result=%d "
"leader.lver=%llu "
"leader.timestamp=%llu "
@@ -1399,9 +1378,12 @@ static int print_token_state(struct token *t, char *str)
"leader.owner_generation=%llu "
"leader.next_owner_id=%llu",
t->token_id,
+ t->migrating,
+ t->incoming,
t->acquire_result,
- t->migrate_result,
t->release_result,
+ t->migrate_result,
+ t->incoming_result,
t->setowner_result,
(unsigned long long)t->leader.lver,
(unsigned long long)t->leader.timestamp,
@@ -1699,8 +1681,10 @@ static void process_cmd_daemon(int ci, struct sm_header *h_recv)
switch (h_recv->cmd) {
case SM_CMD_REGISTER:
rv = get_peer_pid(fd, &pid);
- if (rv < 0)
+ if (rv < 0) {
+ log_error("cmd_register ci %d fd %d get pid failed", ci, fd);
break;
+ }
log_debug("cmd_register ci %d fd %d pid %d", ci, fd, pid);
client[ci].pid = pid;
client[ci].deadfn = client_pid_dead;
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 22dd21e..493954f 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -85,11 +85,13 @@ struct token {
struct sync_disk *disks;
/* internal */
- int migrating;
int token_id; /* used to refer to this token instance in log messages */
+ int migrating;
+ int incoming;
int acquire_result;
- int migrate_result;
int release_result;
+ int migrate_result;
+ int incoming_result;
int setowner_result;
uint64_t prev_lver; /* just used to pass a value between functions */
struct leader_record leader; /* copy of last leader_record we wrote */
diff --git a/src/token_manager.c b/src/token_manager.c
index aa537df..bc86b07 100644
--- a/src/token_manager.c
+++ b/src/token_manager.c
@@ -212,45 +212,6 @@ int acquire_token(struct token *token, uint64_t reacquire_lver,
return rv; /* DP_OK */
}
-/* return < 0 on error, 1 on success */
-
-int setowner_token(struct token *token)
-{
- struct leader_record leader_ret;
- int rv;
-
- rv = paxos_lease_leader_read(token, &leader_ret);
- if (rv < 0)
- return rv;
-
- if (memcmp(&token->leader, &leader_ret, sizeof(struct leader_record))) {
- log_errot(token, "setowner leader_read mismatch");
- return -1;
- }
-
- /* we want the dblocks to reflect a full, proper ownership, so we
- do the full acquire rather than just writing a new leader_record */
-
- rv = paxos_lease_acquire(token, 1, &leader_ret, 0, 0);
-
- token->setowner_result = rv;
-
- /* we set acquire_result here for at least one reason: because release
- will not release the token if acquire_result is not 1 */
-
- token->acquire_result = rv;
-
- log_token(token, "setowner rv %d lver %llu at %llu", rv,
- (unsigned long long)token->leader.lver,
- (unsigned long long)token->leader.timestamp);
-
- if (rv < 0)
- return rv;
-
- memcpy(&token->leader, &leader_ret, sizeof(struct leader_record));
- return rv; /* DP_OK */
-}
-
/*
* migration creates special cases for release. if either the source or
* the dest calls release_token and read leader shows next_owner_id is not
@@ -263,17 +224,19 @@ int setowner_token(struct token *token)
*
* (A second mechanism is also needed to prevent release of migrating leases,
* the token->migrating flag. This is because we need to block releases
- * on the source effective immediately, before next_owner may be written.)
+ * on the source effective immediately, before next_owner may be written.
+ * The token->incoming flag is the same, although is probably unnecessary
+ * given the paxos_lease_release checking of next_owner.)
*
* setowner on the destination, in the case of migration success, moves a
* disk lease from being in limbo (both owner and next_owner set), to having
* just an owner (the dest). After this the owner (dest) can release it.
*
- * TODO: setowner on the source, in the case of migration failure, moves the
+ * setowner on the source, in the case of migration failure, moves the
* disk lease from being in limbo with both owner and next_owner, to having
- * just an owner (the source). This setowner call will need to ignore the
- * fact that the leader block doesn't match its latest copy since it may
- * have been the dest that wrote next_owner at the start of migration.
+ * just an owner (the source). This setowner needs to ignore the fact that
+ * the leader block doesn't match its latest copy since it may have been the
+ * dest that wrote next_owner at the start of migration.
*
* We don't have to worry that next_owner is ever running the vm; setowner
* on dest is required to complete successfully (making dest the owner and
@@ -283,9 +246,7 @@ int setowner_token(struct token *token)
* the source/owner continues running and renewing its host_id, then no
* other host will be able to take ownership of the lease, because they will
* see that the owner is alive. The source/owner will be able to acquire
- * the lease, though. So, the source/owner needs to either
- * 1. call setowner to clear next_owner_id, then call release to free it (TODO above)
- * 2. call acquire to acquire the lease, then call release to free it
+ * the lease, though.
*
* If migration fails, the source/owner does not free the paxos lease, and
* the source/owner does not continue running or renewing its host_id, then
@@ -304,9 +265,6 @@ int setowner_token(struct token *token)
* - if migration fails because the dest host fails,
* setowner on the source will clear next_owner, and allow source to
* continue running and holding the lease, or release the lease.
- * setowner on source would have to ignore the fact that the leader
- * will have been changed since it last read or wrote it (by the dest
- * writing itself as the next_owner_id)
*
* - if migration fails because the source host fails,
* the paxos lease will be left on disk with owner and next_owner set,
@@ -341,7 +299,12 @@ int release_token(struct token *token)
int rv;
if (token->migrating) {
- log_errot(token, "release skip migrating");
+ log_token(token, "release skip migrating");
+ return DP_ERROR;
+ }
+
+ if (token->incoming) {
+ log_token(token, "release skip incoming");
return DP_ERROR;
}
@@ -358,71 +321,9 @@ int release_token(struct token *token)
return rv; /* DP_OK */
}
-/* return < 0 on error, 1 on success */
-
-int set_next_owner_other(struct token *token, uint64_t target_host_id)
-{
- struct leader_record leader;
- int rv;
-
- rv = paxos_lease_leader_read(token, &leader);
- if (rv < 0)
- return rv;
-
- if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) {
- log_errot(token, "set_next_owner_other leader changed before migrate");
- return DP_BAD_LEADER;
- }
-
- if (leader.num_hosts < target_host_id) {
- log_errot(token, "set_next_owner_other num_hosts %llu "
- "target_host_id %llu",
- (unsigned long long)leader.num_hosts,
- (unsigned long long)target_host_id);
- return DP_BAD_NUMHOSTS;
- }
-
- leader.next_owner_id = target_host_id;
-
- rv = paxos_lease_leader_write(token, &leader);
- if (rv < 0)
- return rv;
-
- memcpy(&token->leader, &leader, sizeof(struct leader_record));
- return rv; /* DP_OK */
-}
-
-/* return < 0 on error, 1 on success */
-
-int set_next_owner_self(struct token *token)
-{
- struct leader_record leader;
- int rv;
-
- rv = paxos_lease_leader_read(token, &leader);
- if (rv < 0)
- return rv;
-
- if (leader.num_hosts < token->host_id) {
- log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu",
- (unsigned long long)leader.num_hosts,
- (unsigned long long)token->host_id);
- return DP_BAD_NUMHOSTS;
- }
-
- leader.next_owner_id = token->host_id;
-
- rv = paxos_lease_leader_write(token, &leader);
- if (rv < 0)
- return rv;
-
- memcpy(&token->leader, &leader, sizeof(struct leader_record));
- return rv; /* DP_OK */
-}
-
/*
* migration destination verifies the migrate state sent from source,
- * which needs to be consisent with the source having successfully written
+ * which needs to be consistent with the source having successfully written
* the next_owner itself, or having not tried or tried and failed.
*
* If we can't read the leader, return an error, and the migration
@@ -434,7 +335,8 @@ int set_next_owner_self(struct token *token)
int parse_incoming_state(struct token *token, char *str, int *migrate_result,
struct leader_record *leader);
-int check_incoming_state(struct token *token, char *opt_str, int *migrate_result_out)
+static int check_incoming_state(struct token *token, char *opt_str,
+ int *migrate_result_out)
{
struct leader_record leader_ret;
struct leader_record leader_src;
@@ -500,6 +402,261 @@ int check_incoming_state(struct token *token, char *opt_str, int
*migrate_result
return DP_OK;
}
+/*
+ * Migration
+ *
+ * sanlock_migrate() called on source
+ * sanlock_acquire() called on destination with INCOMING flag
+ *
+ * migrate_token() called on source, output state str
+ * incoming_token() called on dest, input state str from source
+ *
+ * if migrate_token() sets next_owner in set_next_owner_other(),
+ * then incoming_token() will not set next_owner in set_next_owner_self()
+ *
+ * if migrate_token() does not set next_owner in set_next_owner_other(),
+ * then incoming_token() will set next_owner in set_net_owner_self()
+ *
+ * after migrate_token() on source and incoming_token() on destination
+ * both succeed, vm migration happens
+ *
+ * if vm migration succeeds
+ * sanlock_setowner() / setowner_token() is called on the destination,
+ * leases become owned by dest owner=D, next_owner cleared,
+ * token->incoming cleared, the leases will be freed when the dest vm pid exits
+ *
+ * if vm migration fails
+ * sanlock_setowner() / setowner_token() is called on the source,
+ * leases remain owned by source, next_owner cleared,
+ * token->migrating cleared, the leases will be freed when the source vm pid exits
+ *
+ * (the migrating flag on source and incoming flag on dest cause the host to
+ * skip freeing the lease if the pid exits before the next_owner has been
+ * written to the lease leader block. release_token() also looks at the
+ * next_owner field in the leader block and will not free the lease if it is
+ * set.)
+ *
+ * TODO: what if migration fails, and source pid exits prior to setowner_token
+ * being called on the source? The lease is still owned by the source host,
+ * but no pid on the source is holding/using it. A new vm/pid on the source
+ * host can probably acquire it, but not a vm/pid on a different host.
+ *
+ * TODO: if setowner_token() fails on the destination after a successful
+ * migration, will returning an error cause libvirt to then call
+ * setowner_token() on the source?
+ */
+
+static int setowner_token_migrating(struct token *token)
+{
+ struct leader_record leader;
+ int rv;
+
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ /* the owner should still be us since migration failed */
+
+ if (token->leader.owner_id != leader.owner_id) {
+ log_errot(token, "setowner migrating bad owner %llu %llu",
+ (unsigned long long)token->leader.owner_id,
+ (unsigned long long)leader.owner_id);
+ return DP_ERROR;
+ }
+
+ /* leader block we just read may not match our last saved copy
+ because next_owner_id may have been set by the destination */
+
+ /* next_owner_id was set either by us in set_next_owner_other
+ or by the destination in set_next_owner_self, so it should
+ not be zero */
+
+ if (leader.next_owner_id == 0) {
+ log_errot(token, "setowner migrating zero next_owner");
+ return DP_ERROR;
+ }
+
+ leader.next_owner_id = 0;
+
+ rv = paxos_lease_leader_write(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
+ return rv; /* DP_OK */
+}
+
+static int setowner_token_incoming(struct token *token)
+{
+ struct leader_record leader;
+ int rv;
+
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ if (token->leader.next_owner_id != leader.next_owner_id) {
+ log_errot(token, "setowner incoming bad next_owner %llu %llu",
+ (unsigned long long)token->leader.next_owner_id,
+ (unsigned long long)leader.next_owner_id);
+ return DP_ERROR;
+ }
+
+ /* should match what we last saved in incoming_token() */
+
+ if (memcmp(&token->leader, &leader, sizeof(struct leader_record))) {
+ log_errot(token, "setowner incoming leader_read mismatch");
+ return -1;
+ }
+
+ /* we want the dblocks to reflect a full, proper ownership, so we
+ do the full acquire rather than just writing a new leader_record */
+
+ rv = paxos_lease_acquire(token, 1, &leader, 0, 0);
+ if (rv < 0)
+ return rv;
+
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
+ return rv; /* DP_OK */
+}
+
+/* we set acquire_result for setowner_incoming for at least one reason,
+ because release will not free the token if acquire_result is not 1 */
+
+/* return < 0 on error, 1 on success */
+
+int setowner_token(struct token *token)
+{
+ int rv;
+
+ log_token(token, "setowner incoming %d migrating %d",
+ token->migrating, token->incoming);
+
+ if (token->migrating) {
+ rv = setowner_token_migrating(token);
+
+ if (rv == DP_OK)
+ token->migrating = 0;
+
+ token->setowner_result = rv;
+
+ } else if (token->incoming) {
+ rv = setowner_token_incoming(token);
+
+ if (rv == DP_OK)
+ token->incoming = 0;
+
+ token->setowner_result = rv;
+ token->acquire_result = rv;
+ } else {
+ log_errot(token, "setowner ignoring");
+ }
+
+ return rv;
+}
+
+static int set_next_owner_other(struct token *token, uint64_t target_host_id)
+{
+ struct leader_record leader;
+ int rv;
+
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) {
+ log_errot(token, "set_next_owner_other leader changed before migrate");
+ return DP_BAD_LEADER;
+ }
+
+ if (leader.num_hosts < target_host_id) {
+ log_errot(token, "set_next_owner_other num_hosts %llu "
+ "target_host_id %llu",
+ (unsigned long long)leader.num_hosts,
+ (unsigned long long)target_host_id);
+ return DP_BAD_NUMHOSTS;
+ }
+
+ leader.next_owner_id = target_host_id;
+
+ rv = paxos_lease_leader_write(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
+ return rv; /* DP_OK */
+}
+
+int migrate_token(struct token *token, uint64_t target_host_id)
+{
+ int rv = 0;
+
+ log_token(token, "migrate %llu", (unsigned long long)target_host_id);
+
+ if (target_host_id) {
+ /* migrate_token() on the source calls set_next_owner_other() */
+ rv = set_next_owner_other(token, target_host_id);
+ token->migrate_result = rv;
+ } else {
+ /* incoming_token() on the dest will call set_next_owner_self() */
+ token->migrate_result = 0;
+ }
+
+ token->migrating = 1;
+
+ return rv;
+}
+
+static int set_next_owner_self(struct token *token)
+{
+ struct leader_record leader;
+ int rv;
+
+ /* would could skip this read by reusing the read from check_incoming_state */
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ if (leader.num_hosts < token->host_id) {
+ log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu",
+ (unsigned long long)leader.num_hosts,
+ (unsigned long long)token->host_id);
+ return DP_BAD_NUMHOSTS;
+ }
+
+ leader.next_owner_id = token->host_id;
+
+ rv = paxos_lease_leader_write(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
+ return rv; /* DP_OK */
+}
+
+int incoming_token(struct token *token, char *opt_str)
+{
+ int migrate_result;
+ int rv;
+
+ log_token(token, "incoming");
+
+ rv = check_incoming_state(token, opt_str, &migrate_result);
+ if (rv < 0) {
+ log_errot(token, "incoming state error %d", rv);
+ goto out;
+ }
+
+ if (migrate_result != DP_OK)
+ rv = set_next_owner_self(token);
+ out:
+ token->incoming_result = rv;
+
+ token->incoming = 1;
+
+ return rv;
+}
+
int create_token(int num_disks, struct token **token_out)
{
struct token *token;
diff --git a/src/token_manager.h b/src/token_manager.h
index 34b3c5b..1bbc9ed 100644
--- a/src/token_manager.h
+++ b/src/token_manager.h
@@ -13,11 +13,8 @@ int acquire_token(struct token *token, uint64_t reacquire_lver,
int new_num_hosts);
int release_token(struct token *token);
int setowner_token(struct token *token);
-
-int check_incoming_state(struct token *token, char *opt_str,
- int *migrate_result_out);
-int set_next_owner_other(struct token *token, uint64_t target_host_id);
-int set_next_owner_self(struct token *token);
+int migrate_token(struct token *token, uint64_t target_host_id);
+int incoming_token(struct token *token, char *opt_str);
int create_token(int num_disks, struct token **token_out);
void free_token(struct token *token);
diff --git a/tests/devcount.c b/tests/devcount.c
index b8e8aa3..b83a49a 100644
--- a/tests/devcount.c
+++ b/tests/devcount.c
@@ -399,7 +399,7 @@ static int do_lock(int argc, char *argv[])
sock = sanlock_register();
if (sock < 0) {
printf("%d sanlock_register error %d\n",
- child_pid, rv);
+ child_pid, sock);
exit(-1);
}
@@ -476,6 +476,7 @@ static void write_migrate(char *state, int offset)
goto fail;
}
+ close(fd);
return;
fail:
@@ -572,6 +573,7 @@ static int wait_migrate_incoming(char *state_out)
goto fail;
}
+ close(fd);
return 0;
fail:
@@ -639,6 +641,7 @@ static void wait_migrate_stopped(char *state_in)
goto fail;
}
+ close(fd);
return;
fail:
@@ -735,7 +738,7 @@ static int do_migrate(int argc, char *argv[])
sock = sanlock_register();
if (sock < 0) {
printf("%d sanlock_register error %d\n",
- child_pid, rv);
+ child_pid, sock);
exit(-1);
}