src/direct.c | 6
src/leader.h | 2
src/main.c | 131 ++++-------
src/paxos_lease.c | 75 ++----
src/paxos_lease.h | 1
src/sanlock_internal.h | 1
src/token_manager.c | 236 ++++++++++++++++----
src/token_manager.h | 8
tests/devcount.c | 558 +++++++++++++++++++++++++++++++++++++++++++------
9 files changed, 784 insertions(+), 234 deletions(-)
New commits:
commit 90f87d6cfef9e7533ac5ec51451e6e1640e33e57
Author: David Teigland <teigland(a)redhat.com>
Date: Tue Mar 8 16:45:08 2011 -0600
devcount: migrate test
Added a test that exercises sanlock_migrate by migrating the lease
protecting a standard devcount test from one host to the next in
turn, using a separate disk block to coordinate and pass the migration
state string between hosts.
Use two disks, one for sanlock leases and the other for the devcount
program to do its counting on.
host1: devcount init /dev/vg/leases /dev/vg/count
host1: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 1 3
host2: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 2 3
host3: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 3 3
host1 will acquire lease, start devcount rw, migrate lease to host2
host2 will acquire lease, start devcount rw, migrate lease to host3
host3 will acquire lease, start devcount rw, migrate lease to host1
etc
If sanlock ever allows devcount rw to run on two hosts at the same
time, it will immediately detect the collision, print an error and stop.
For an example, manually start two overlapping devcounts:
host1: devcount rw /dev/vg/count 5 300 1
host2: devcount rw /dev/vg/count 5 300 2
diff --git a/tests/devcount.c b/tests/devcount.c
index c29be3f..b8e8aa3 100644
--- a/tests/devcount.c
+++ b/tests/devcount.c
@@ -13,6 +13,7 @@
#include <errno.h>
#include <limits.h>
#include <time.h>
+#include <signal.h>
#include "sanlock.h"
#include "sanlock_admin.h"
@@ -26,10 +27,7 @@ int count_offset;
int lock_offset;
int our_hostid;
-
-int seconds;
-int verify;
-int quiet;
+int max_hostid;
struct entry {
uint32_t turn;
@@ -52,14 +50,15 @@ static int rand_int(int a, int b)
/* 64 byte entry: can fit up to 8 nodes in a 512 byte block */
-void print_entries(char *buf)
+void print_entries(int pid, char *buf)
{
struct entry *e = (struct entry *)buf;
int i;
for (i = 0; i < (512 / sizeof(struct entry)); i++) {
- printf("index %d turn %u time %llu %u:%llu:%llu "
+ printf("%d index %d turn %u time %llu %u:%llu:%llu "
"last %u %llu %u:%llu:%llu\n",
+ pid,
i,
e->turn,
(unsigned long long)e->time,
@@ -75,10 +74,11 @@ void print_entries(char *buf)
}
}
-void print_our_we(struct entry *our_we)
+void print_our_we(int pid, struct entry *our_we)
{
- printf("w index %d turn %u time %llu %u:%llu:%llu "
+ printf("%d w index %d turn %u time %llu %u:%llu:%llu "
"last %u %llu %u:%llu:%llu\n",
+ pid,
our_hostid - 1,
our_we->turn,
(unsigned long long)our_we->time,
@@ -94,9 +94,10 @@ void print_our_we(struct entry *our_we)
#define COUNT_ARGS 6
#define LOCK_ARGS 8
+#define MIGRATE_ARGS 9
-/*
- * devcount count <count_disk> <rsec> <wsec> <hostid>
+/*
+ * devcount rw|wr <count_disk> <sec1> <sec2> <hostid>
*/
static int do_count(int argc, char *argv[])
@@ -107,18 +108,27 @@ static int do_count(int argc, char *argv[])
time_t start;
uint32_t our_pid = getpid();
uint32_t max_turn;
+ int sec1, sec2;
int read_seconds, write_seconds;
if (argc < COUNT_ARGS)
return -1;
strcpy(count_path, argv[2]);
- read_seconds = atoi(argv[3]);
- write_seconds = atoi(argv[4]);
+ sec1 = atoi(argv[3]);
+ sec2 = atoi(argv[4]);
our_hostid = atoi(argv[5]);
- printf("%d count count_disk %s rsec %d wsec %d our_hostid %d\n",
- our_pid, count_path, read_seconds, write_seconds, our_hostid);
+ if (!strcmp(argv[1], "rw")) {
+ read_seconds = sec1;
+ write_seconds = sec2;
+ } else {
+ write_seconds = sec1;
+ read_seconds = sec2;
+ }
+
+ printf("%d %s count_disk %s sec1 %d sec2 %d our_hostid %d\n",
+ our_pid, argv[1], count_path, sec1, sec2, our_hostid);
fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0);
if (fd < 0) {
@@ -154,11 +164,6 @@ static int do_count(int argc, char *argv[])
goto fail;
}
- /*
- * Quickly read, then reread for a few seconds to see if
- * the previous writer does another write.
- */
-
lseek(fd, count_offset, SEEK_SET);
rv = read(fd, rbuf, 512);
@@ -167,30 +172,36 @@ static int do_count(int argc, char *argv[])
goto fail;
}
- /* print_entries(rbuf); */
+ /* print_entries(our_pid, rbuf); */
- for (i = 0; i < read_seconds; i++) {
- sleep(1);
+ /*
+ * reading for "rw"
+ */
- lseek(fd, count_offset, SEEK_SET);
+ if (!strcmp(argv[1], "rw")) {
+ for (i = 0; i < read_seconds; i++) {
+ sleep(1);
- rv = read(fd, vbuf, 512);
- if (rv != 512) {
- perror("read failed");
- goto fail;
- }
+ lseek(fd, count_offset, SEEK_SET);
- if (memcmp(rbuf, vbuf, 512)) {
- printf("rbuf:\n");
- print_entries(rbuf);
- printf("vbuf:\n");
- print_entries(vbuf);
- goto fail;
+ rv = read(fd, vbuf, 512);
+ if (rv != 512) {
+ perror("read failed");
+ goto fail;
+ }
+
+ if (memcmp(rbuf, vbuf, 512)) {
+ printf("%d rbuf:\n", our_pid);
+ print_entries(our_pid, rbuf);
+ printf("%d vbuf:\n", our_pid);
+ print_entries(our_pid, vbuf);
+ goto fail;
+ }
}
}
/*
- * Now start writing
+ * writing
*/
re = (struct entry *)rbuf;
@@ -209,14 +220,14 @@ static int do_count(int argc, char *argv[])
}
if (max_turn != max_re->turn) {
- printf("max_turn %d max_re->turn %d\n", max_turn,
- max_re->turn);
+ printf("%d max_turn %d max_re->turn %d\n", our_pid,
+ max_turn, max_re->turn);
goto fail;
}
/*
- printf("max index %d turn %d count %llu\n", max_i, max_turn,
- (unsigned long long)max_re->count);
+ printf("%d max index %d turn %d count %llu\n", our_pid,
+ max_i, max_turn, (unsigned long long)max_re->count);
*/
memcpy(wbuf, rbuf, 512);
@@ -245,7 +256,7 @@ static int do_count(int argc, char *argv[])
}
printf("%d first write\n", our_pid);
- print_our_we(our_we);
+ print_our_we(our_pid, our_we);
start = time(NULL);
@@ -266,7 +277,7 @@ static int do_count(int argc, char *argv[])
}
printf("%d last write\n", our_pid);
- print_our_we(our_we);
+ print_our_we(our_pid, our_we);
if (turn_file) {
fprintf(turn_file, "turn %03u start %llu end %llu host %u pid %u\n",
@@ -278,6 +289,34 @@ static int do_count(int argc, char *argv[])
fclose(turn_file);
}
+ /*
+ * reading for "wr"
+ */
+
+ if (!strcmp(argv[1], "wr")) {
+ memcpy(rbuf, wbuf, 512);
+
+ for (i = 0; i < read_seconds; i++) {
+ sleep(1);
+
+ lseek(fd, count_offset, SEEK_SET);
+
+ rv = read(fd, vbuf, 512);
+ if (rv != 512) {
+ perror("read failed");
+ goto fail;
+ }
+
+ if (memcmp(rbuf, vbuf, 512)) {
+ printf("%d rbuf:\n", our_pid);
+ print_entries(our_pid, rbuf);
+ printf("%d vbuf:\n", our_pid);
+ print_entries(our_pid, vbuf);
+ goto fail;
+ }
+ }
+ }
+
return 0;
fail:
printf("sleeping...\n");
@@ -286,9 +325,9 @@ static int do_count(int argc, char *argv[])
}
/*
- * devcount lock <lock_disk> count <count_disk> <rsec> <wsec> <hostid>
+ * devcount lock <lock_disk> rw <count_disk> <sec1> <sec2> <hostid>
* sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0
- * devcount count <count_disk> <rsec> <wsec> <hostid>
+ * devcount rw <count_disk> <sec1> <sec2> <hostid>
*/
static int do_lock(int argc, char *argv[])
@@ -297,11 +336,13 @@ static int do_lock(int argc, char *argv[])
struct sanlk_lockspace lockspace;
struct sanlk_resource *res;
int i, j, pid, rv, sock, len, status;
- uint32_t our_pid = getpid();
+ uint32_t parent_pid = getpid();
if (argc < LOCK_ARGS)
return -1;
+ count_offset = 0;
+
strcpy(lock_path, argv[2]);
strcpy(count_path, argv[4]);
our_hostid = atoi(argv[7]);
@@ -318,7 +359,7 @@ static int do_lock(int argc, char *argv[])
res->disks[0].offset = 1024000;
printf("%d lock_disk %s count_disk %s our_hostid %d\n",
- our_pid, lock_path, count_path, our_hostid);
+ parent_pid, lock_path, count_path, our_hostid);
memset(&lockspace, 0, sizeof(lockspace));
strcpy(lockspace.name, "devcount");
@@ -328,16 +369,16 @@ static int do_lock(int argc, char *argv[])
rv = sanlock_add_lockspace(&lockspace, 0);
if (rv < 0) {
- printf("sanlock_add_lockspace error %d\n", rv);
+ printf("%d sanlock_add_lockspace error %d\n", parent_pid, rv);
exit(EXIT_FAILURE);
}
- printf("sanlock_add_lockspace done\n");
+ printf("%d sanlock_add_lockspace done\n", parent_pid);
/*
* argv[0] = devcount
* argv[1] = lock
* argv[2] = <lock_disk>
- * argv[3] = count
+ * argv[3] = rw
* start copying at argv[3]
*/
@@ -351,18 +392,27 @@ static int do_lock(int argc, char *argv[])
while (1) {
pid = fork();
if (!pid) {
- int our_pid = getpid();
+ int child_pid = getpid();
printf("\n");
sock = sanlock_register();
+ if (sock < 0) {
+ printf("%d sanlock_register error %d\n",
+ child_pid, rv);
+ exit(-1);
+ }
+
rv = sanlock_acquire(sock, -1, 1, &res, NULL);
if (rv < 0) {
printf("%d sanlock_acquire error %d\n",
- our_pid, rv);
+ child_pid, rv);
+ /* all hosts are trying to acquire so we
+ expect this to acquire only sometimes;
+ TODO: exit with an error for some rv's */
exit(0);
}
- printf("%d sanlock_acquire done\n", our_pid);
+ printf("%d sanlock_acquire done\n", child_pid);
execv(av[0], av);
perror("execv devcount problem");
@@ -370,14 +420,390 @@ static int do_lock(int argc, char *argv[])
}
waitpid(pid, &status, 0);
+
+ /* TODO: goto fail if exit status is an error */
+
sleep(rand_int(0, 1));
}
+
+ fail:
+ printf("test failed...\n");
+ sleep(1000000);
+}
+
+/* counting block: count_path offset 0
+ * incoming block: count_path offset 4K
+ * stopped block: count_path offset 8K */
+
+/*
+ * Write a migration state string into one 512-byte block of count_path.
+ *
+ * state:  NUL-terminated state string, at most 512 bytes long
+ * offset: byte offset of the destination block within count_path
+ *
+ * The block is zero-filled before the string is copied in.  On any
+ * failure a message is printed and the process sleeps so that the test
+ * visibly stalls.  On success the descriptor and the aligned buffer are
+ * released; do_migrate() calls this twice per migration round, so
+ * keeping them would eventually exhaust the descriptor table.
+ */
+static void write_migrate(char *state, int offset)
+{
+	void *mem = NULL;
+	char *wbuf;
+	size_t len = strlen(state);
+	int fd, rv;
+
+	if (len > 512) {
+		printf("state string too long\n");
+		goto fail;
+	}
+
+	fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0);
+	if (fd < 0) {
+		perror("open failed");
+		goto fail;
+	}
+
+	rv = ioctl(fd, BLKFLSBUF);
+	if (rv) {
+		perror("BLKFLSBUF failed");
+		goto fail;
+	}
+
+	/* page-aligned buffer for the O_DIRECT write */
+	rv = posix_memalign(&mem, getpagesize(), 512);
+	if (rv) {
+		/* posix_memalign returns the error instead of setting errno */
+		errno = rv;
+		perror("posix_memalign failed");
+		goto fail;
+	}
+	wbuf = mem;
+
+	memset(wbuf, 0, 512);
+	memcpy(wbuf, state, len);
+
+	lseek(fd, offset, SEEK_SET);
+
+	rv = write(fd, wbuf, 512);
+	if (rv != 512) {
+		perror("write failed");
+		goto fail;
+	}
+
+	free(wbuf);
+	close(fd);
+	return;
+
+ fail:
+	printf("write_migrate %d failed %s\n", offset, state);
+	sleep(10000000);
+}
+
+/* Publish the state string in the incoming block (count_path offset 4K). */
+static void write_migrate_incoming(char *state)
+{
+	const int incoming_offset = 4096;
+
+	write_migrate(state, incoming_offset);
+}
+
+/* Publish the state string in the stopped block (count_path offset 8K). */
+static void write_migrate_stopped(char *state)
+{
+	const int stopped_offset = 2 * 4096;
+
+	write_migrate(state, stopped_offset);
+}
+
+/* read incoming block until it's set and our_hostid is next */
+
+/*
+ * Poll the incoming block (count_path offset 4K) until it is our turn.
+ *
+ * state_out: buffer of at least 512 bytes that receives the state string
+ *            found in the block (left untouched in the init case)
+ *
+ * Returns 1 for the init case (block still zeroed and our_hostid is 1).
+ * Returns 0 once the block holds a "leader.owner_id=N" whose successor,
+ * (N % max_hostid) + 1, is our_hostid; the state is copied to state_out
+ * and the block is overwritten with "empty".
+ *
+ * On failure a message is printed and the process sleeps; -1 is returned
+ * only if that sleep ever ends.
+ */
+static int wait_migrate_incoming(char *state_out)
+{
+	static const char owner_key[] = "leader.owner_id=";
+	void *rmem = NULL, *wmem = NULL;
+	char *rbuf, *wbuf;
+	char *owner_id;
+	int fd, rv, val;
+	int offset = 4096;
+	int init = 0;
+
+	fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0);
+	if (fd < 0) {
+		perror("open failed");
+		goto fail;
+	}
+
+	rv = ioctl(fd, BLKFLSBUF);
+	if (rv) {
+		perror("BLKFLSBUF failed");
+		goto fail;
+	}
+
+	rv = posix_memalign(&rmem, getpagesize(), 512);
+	if (rv) {
+		/* posix_memalign returns the error instead of setting errno */
+		errno = rv;
+		perror("posix_memalign failed");
+		goto fail;
+	}
+	rbuf = rmem;
+
+	rv = posix_memalign(&wmem, getpagesize(), 512);
+	if (rv) {
+		errno = rv;
+		perror("posix_memalign failed");
+		goto fail;
+	}
+	wbuf = wmem;
+
+	for (;;) {
+		lseek(fd, offset, SEEK_SET);
+
+		rv = read(fd, rbuf, 512);
+		if (rv != 512) {
+			perror("read failed");
+			goto fail;
+		}
+		rbuf[511] = '\0';
+
+		/* init case to get things going */
+		if (!rbuf[0] && our_hostid == 1) {
+			init = 1;
+			break;
+		}
+
+		owner_id = strstr(rbuf, owner_key);
+		if (owner_id) {
+			/* guard the modulo below against a zero divisor */
+			if (max_hostid < 1) {
+				printf("bad max_hostid %d\n", max_hostid);
+				goto fail;
+			}
+
+			/* the value starts right after the '=' of the key */
+			val = atoi(owner_id + strlen(owner_key));
+			if ((val % max_hostid) + 1 == our_hostid)
+				break;
+		}
+
+		/* poll once a second, like wait_migrate_stopped() */
+		sleep(1);
+	}
+
+	if (!init) {
+		strcpy(state_out, rbuf);
+
+		/* consume the state so it is not picked up twice */
+		memset(wbuf, 0, 512);
+		strcpy(wbuf, "empty");
+
+		lseek(fd, offset, SEEK_SET);
+
+		rv = write(fd, wbuf, 512);
+		if (rv != 512) {
+			perror("write failed");
+			goto fail;
+		}
+	}
+
+	/* called once per migration round, so do not leak these */
+	free(rbuf);
+	free(wbuf);
+	close(fd);
+
+	return init;
+
+ fail:
+	printf("wait_migrate_incoming failed %s\n", state_out);
+	sleep(10000000);
+	return -1;
+}
+
+/* read stopped block until it matches state_in */
+
+/*
+ * Poll the stopped block (count_path offset 8K) once a second until its
+ * contents equal state_in, then zero the block.
+ *
+ * state_in: NUL-terminated state string to wait for
+ *
+ * On failure a message is printed and the process sleeps.  On success the
+ * descriptor and both aligned buffers are released before returning.
+ */
+static void wait_migrate_stopped(char *state_in)
+{
+	void *rmem = NULL, *wmem = NULL;
+	char *rbuf, *wbuf;
+	int fd, rv;
+	int offset = 4096 * 2;
+
+	fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0);
+	if (fd < 0) {
+		perror("open failed");
+		goto fail;
+	}
+
+	rv = ioctl(fd, BLKFLSBUF);
+	if (rv) {
+		perror("BLKFLSBUF failed");
+		goto fail;
+	}
+
+	rv = posix_memalign(&rmem, getpagesize(), 512);
+	if (rv) {
+		/* posix_memalign returns the error instead of setting errno */
+		errno = rv;
+		perror("posix_memalign failed");
+		goto fail;
+	}
+	rbuf = rmem;
+
+	rv = posix_memalign(&wmem, getpagesize(), 512);
+	if (rv) {
+		errno = rv;
+		perror("posix_memalign failed");
+		goto fail;
+	}
+	wbuf = wmem;
+
+	for (;;) {
+		lseek(fd, offset, SEEK_SET);
+
+		rv = read(fd, rbuf, 512);
+		if (rv != 512) {
+			perror("read failed");
+			goto fail;
+		}
+		rbuf[511] = '\0';
+
+		if (!strcmp(rbuf, state_in))
+			break;
+
+		sleep(1);
+	}
+
+	/* clear the block now that the expected state has been seen */
+	memset(wbuf, 0, 512);
+
+	lseek(fd, offset, SEEK_SET);
+
+	rv = write(fd, wbuf, 512);
+	if (rv != 512) {
+		perror("write failed");
+		goto fail;
+	}
+
+	free(rbuf);
+	free(wbuf);
+	close(fd);
+	return;
+
+ fail:
+	printf("wait_migrate_stopped failed %s\n", state_in);
+	sleep(10000000);
+}
+
+#define MAX_MIGRATE_STATE 512 /* keep in one block for simplicity */
+
+/*
+ * devcount migrate <lock_disk> rw <count_disk> <sec1> <sec2> <hostid> <max_hostid>
+ *
+ * Adds the "devcount" lockspace, then loops: wait for our turn in the
+ * incoming block, fork a child that acquires the lease (as an incoming
+ * migration unless this is the init round) and execs "devcount rw", let
+ * it run for 10 seconds, then migrate the lease away and hand the state
+ * string to the next host through the incoming and stopped blocks.
+ *
+ * Returns -1 for bad arguments; otherwise the loop is only left on
+ * failure, in which case the process sleeps so the test visibly stalls.
+ */
+static int do_migrate(int argc, char *argv[])
+{
+	char incoming[MAX_MIGRATE_STATE];
+	char *av[MIGRATE_ARGS+1];
+	char *state;
+	struct sanlk_lockspace lockspace;
+	struct sanlk_resource *res;
+	struct sanlk_options *opt;
+	int i, j, pid, rv, sock, len, status, init;
+	uint32_t parent_pid = getpid();
+
+	if (argc < MIGRATE_ARGS)
+		return -1;
+
+	count_offset = 0;
+
+	strcpy(lock_path, argv[2]);
+	strcpy(count_path, argv[4]);
+	our_hostid = atoi(argv[7]);
+	max_hostid = atoi(argv[8]);
+
+	/* max_hostid is used as a modulus in wait_migrate_incoming() */
+	if (max_hostid < 1)
+		return -1;
+
+	len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk);
+	res = malloc(len);
+	if (!res) {
+		perror("malloc failed");
+		exit(EXIT_FAILURE);
+	}
+	memset(res, 0, len);
+	strcpy(res->lockspace_name, "devcount");
+	snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path);
+	res->name[SANLK_NAME_LEN-1] = '\0';
+	res->num_disks = 1;
+	strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN);
+	res->disks[0].path[SANLK_PATH_LEN-1] = '\0';
+	res->disks[0].offset = 1024000;
+
+	len = sizeof(struct sanlk_options) + MAX_MIGRATE_STATE;
+	opt = malloc(len);
+	if (!opt) {
+		perror("malloc failed");
+		exit(EXIT_FAILURE);
+	}
+	memset(opt, 0, len);
+
+	printf("%d lock_disk %s count_disk %s our_hostid %d max_hostid %d\n",
+	       parent_pid, lock_path, count_path, our_hostid, max_hostid);
+
+	memset(&lockspace, 0, sizeof(lockspace));
+	strcpy(lockspace.name, "devcount");
+	strcpy(lockspace.host_id_disk.path, lock_path);
+	lockspace.host_id_disk.offset = lock_offset;
+	lockspace.host_id = our_hostid;
+
+	rv = sanlock_add_lockspace(&lockspace, 0);
+	if (rv < 0) {
+		printf("%d sanlock_add_lockspace error %d\n", parent_pid, rv);
+		exit(EXIT_FAILURE);
+	}
+	printf("%d sanlock_add_lockspace done\n", parent_pid);
+
+	/*
+	 * argv[0] = devcount
+	 * argv[1] = migrate
+	 * argv[2] = <lock_disk>
+	 * argv[3] = rw
+	 * start copying at argv[3]
+	 */
+
+	j = 0;
+	/* clear the whole array so the exec vector is NULL-terminated */
+	memset(av, 0, sizeof(av));
+
+	av[j++] = strdup(argv[0]);
+	for (i = 3; i < MIGRATE_ARGS; i++)
+		av[j++] = strdup(argv[i]);
+
+	memset(incoming, 0, sizeof(incoming));
+
+	while (1) {
+		init = wait_migrate_incoming(incoming);
+
+		pid = fork();
+		if (pid < 0) {
+			/* never let pid -1 reach sanlock_migrate() or kill() */
+			perror("fork failed");
+			goto fail;
+		}
+
+		if (!pid) {
+			int child_pid = getpid();
+
+			printf("\n");
+
+			if (!init) {
+				opt->flags = SANLK_FLG_INCOMING;
+				opt->len = strlen(incoming);
+				strncpy(opt->str, incoming, MAX_MIGRATE_STATE);
+			}
+
+			sock = sanlock_register();
+			if (sock < 0) {
+				printf("%d sanlock_register error %d\n",
+				       child_pid, sock);
+				exit(-1);
+			}
+
+			rv = sanlock_acquire(sock, -1, 1, &res, opt);
+			if (rv < 0) {
+				printf("%d sanlock_acquire error %d in %s\n",
+				       child_pid, rv, opt->str);
+				/* only one host should be trying to acquire
+				   so this should always succeed */
+				exit(-1);
+			}
+			printf("%d sanlock_acquire done\n", child_pid);
+
+			/* the init round has no source host to wait for */
+			if (!init) {
+				wait_migrate_stopped(incoming);
+
+				rv = sanlock_setowner(sock, -1);
+				if (rv < 0) {
+					printf("%d sanlock_setowner error %d\n",
+					       child_pid, rv);
+					exit(-1);
+				}
+				printf("%d sanlock_setowner done\n", child_pid);
+			}
+
+			execv(av[0], av);
+			perror("execv devcount problem");
+			exit(EXIT_FAILURE);
+		}
+
+		/* let the child run for 10 seconds before stopping it;
+		   if the child exits before the 10 seconds, the sanlock_migrate
+		   call should return an error */
+
+		sleep(10);
+
+		/* so the !state check below never reads an unset pointer */
+		state = NULL;
+
+		rv = sanlock_migrate(-1, pid, 0, &state);
+		if (rv < 0 || !state) {
+			printf("%d sanlock_migrate error %d\n", parent_pid, rv);
+			goto fail;
+		}
+
+		write_migrate_incoming(state);
+
+		kill(pid, SIGSTOP);
+
+		write_migrate_stopped(state);
+
+		kill(pid, SIGKILL);
+
+		waitpid(pid, &status, 0);
+
+		free(state);
+
+		/* TODO: goto fail if exit status is an error */
+	}
+
+ fail:
+	printf("test failed...\n");
+	sleep(10000000);
+	return -1;
+}
/*
* devcount init <lock_disk> <count_disk>
* sanlock direct init -n 8 -s devcount:0:<lock_disk>:0
* sanlock direct init -n 8 -r devcount:resource<count_disk>:<lock_disk>:1024000
+ * dd if=/dev/zero of=<count_disk> bs=512 count=24
*/
int do_init(int argc, char *argv[])
@@ -414,6 +840,16 @@ int do_init(int argc, char *argv[])
printf("%s\n", command);
system(command);
+
+ memset(command, 0, sizeof(command));
+
+ snprintf(command, sizeof(command),
+ "dd if=/dev/zero of=%s bs=512 count=24",
+ count_path);
+
+ printf("%s\n", command);
+
+ system(command);
}
int main(int argc, char *argv[])
@@ -426,12 +862,15 @@ int main(int argc, char *argv[])
if (!strcmp(argv[1], "init"))
rv = do_init(argc, argv);
- else if (!strcmp(argv[1], "count"))
+ else if (!strcmp(argv[1], "rw") || !strcmp(argv[1], "wr"))
rv = do_count(argc, argv);
else if (!strcmp(argv[1], "lock"))
rv = do_lock(argc, argv);
+ else if (!strcmp(argv[1], "migrate"))
+ rv = do_migrate(argc, argv);
+
if (!rv)
return 0;
@@ -447,14 +886,19 @@ int main(int argc, char *argv[])
printf("devcount init <lock_disk> <count_disk>\n");
printf(" sanlock direct init -n 8 -s devcount:0:<lock_disk>:0\n");
printf(" sanlock direct init -n 8 -r devcount:resource<count_disk>:<lock_disk>:1024000\n");
+ printf(" dd if=/dev/zero of=<count_disk> bs=512 count=24\n");
+ printf("\n");
+ printf("devcount migrate <lock_disk> rw <count_disk> <sec1> <sec2> <hostid> <max_hostid>\n");
+ printf(" sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0\n");
+ printf(" loop around fork, sanlock_acquire, exec devcount rw\n");
printf("\n");
- printf("devcount lock <lock_disk> count <count_disk> <rsec> <wsec> <hostid>\n");
+ printf("devcount lock <lock_disk> rw <count_disk> <sec1> <sec2> <hostid>\n");
printf(" sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0\n");
- printf(" loop around fork, sanlock_acquire, exec devcount count\n");
+ printf(" loop around fork, sanlock_acquire, exec devcount rw\n");
printf("\n");
- printf("devcount count <count_disk> <rsec> <wsec> <hostid>\n");
- printf(" read disk count for rsec seconds, looking for any writes\n");
- printf(" write disk count for wsec seconds, (wsec 0 indefinite)\n");
+ printf("devcount rw <count_disk> <sec1> <sec2> <hostid>\n");
+ printf(" rw: read count for sec1, looking for writes, then write for sec2\n");
+ printf(" wr: write count for sec1, then read for sec2, looking for writes\n");
printf("\n");
return -1;
}
commit 3eb2a167a531bbd49f30e1a24eec2e5b28c1cb6b
Author: David Teigland <teigland(a)redhat.com>
Date: Tue Mar 8 16:35:29 2011 -0600
sanlock: migration working
Numerous changes and fixes to the migration code paths
to make them work. This also adds comments and TODOs related
to the incomplete handling of migration failure cases.
diff --git a/src/direct.c b/src/direct.c
index 3b39ea3..3bdfbf8 100644
--- a/src/direct.c
+++ b/src/direct.c
@@ -116,9 +116,11 @@ static int do_paxos_action(void)
return -1;
}
- rv = paxos_lease_migrate(token, &leader_read, &leader_ret, com.target_host_id);
+ leader_read.next_owner_id = com.target_host_id;
+
+ rv = paxos_lease_leader_write(token, &leader_read);
if (rv < 0) {
- log_tool("cannot migrate lease on %s",
+ log_tool("cannot write lease on %s",
token->resource_name);
return -1;
}
diff --git a/src/leader.h b/src/leader.h
index c76da4a..46c0b5a 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -40,6 +40,8 @@ enum {
DP_BAD_SECTORSIZE = -26,
DP_REACQUIRE_LVER = -27,
DP_BAD_LOCKSPACE = -28,
+ DP_LEADER_MIGRATE = -29,
+ DP_OTHER_OWNER = -30,
};
/* does not include terminating null byte */
diff --git a/src/main.c b/src/main.c
index 6402f19..f0e549c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -427,6 +427,9 @@ static int main_loop(void)
return 0;
}
+/* FIXME: allow setowner on the source to "cancel" a migration by clearing
+ next_owner. This means allowing CMD_SETOWNER when cmd_active == CMD_MIGRATE */
+
static int set_cmd_active(int ci_target, int cmd)
{
struct client *cl = &client[ci_target];
@@ -442,6 +445,9 @@ static int set_cmd_active(int ci_target, int cmd)
return -EBUSY;
}
+ if (cl->need_setowner && cmd == SM_CMD_SETOWNER)
+ cl->need_setowner = 0;
+
cmd_active = cl->cmd_active;
if (!cmd) {
@@ -552,9 +558,8 @@ static int parse_key_val(char *str, const char *key_arg, char *val_arg, int len)
* "lockspace_name=.... resource_name=.... "
*/
-static int parse_migrate_state(struct token *token, char *str,
- int *migrate_result,
- struct leader_record *leader)
+int parse_incoming_state(struct token *token, char *str, int *migrate_result,
+ struct leader_record *leader)
{
char state[SANLK_STATE_MAXSTR];
char name[128];
@@ -571,7 +576,7 @@ static int parse_migrate_state(struct token *token, char *str,
if (!begin)
return -1;
- end = strstr(begin, "lockspace_name=");
+ end = strstr(begin+strlen(name), "lockspace_name=");
if (!end)
end = str + strlen(str) + 1;
@@ -587,27 +592,27 @@ static int parse_migrate_state(struct token *token, char *str,
rv = parse_key_val(state, "migrate_result", val_str, sizeof(val_str));
if (rv < 0)
- return rv;
+ return -2;
*migrate_result = atoi(val_str);
rv = parse_key_val(state, "leader.lver", val_str, sizeof(val_str));
if (rv < 0)
- return rv;
+ return -3;
leader->lver = strtoull(val_str, NULL, 0);
rv = parse_key_val(state, "leader.timestamp", val_str, sizeof(val_str));
if (rv < 0)
- return rv;
+ return -4;
leader->timestamp = strtoull(val_str, NULL, 0);
rv = parse_key_val(state, "leader.owner_id", val_str, sizeof(val_str));
if (rv < 0)
- return rv;
+ return -5;
leader->owner_id = strtoull(val_str, NULL, 0);
rv = parse_key_val(state, "leader.next_owner_id", val_str, sizeof(val_str));
if (rv < 0)
- return rv;
+ return -6;
leader->next_owner_id = strtoull(val_str, NULL, 0);
return 0;
@@ -623,7 +628,6 @@ static void *cmd_acquire_thread(void *args_in)
struct token *new_tokens[SANLK_MAX_RESOURCES];
struct sanlk_resource res;
struct sanlk_options opt;
- struct leader_record leader;
char *opt_str;
char num_hosts_str[16];
uint64_t reacquire_lver = 0;
@@ -870,23 +874,22 @@ static void *cmd_acquire_thread(void *args_in)
token = new_tokens[i];
if (opt.flags & SANLK_FLG_INCOMING) {
- migrate_result = 0;
- memset(&leader, 0, sizeof(leader));
- rv = parse_migrate_state(token, opt_str, &migrate_result, &leader);
- if (rv < 0 || !migrate_result) {
- log_errot(token, "cmd_acquire migrate state "
- "bad %d len %zd", migrate_result,
- strlen(opt_str));
+ rv = check_incoming_state(token, opt_str, &migrate_result);
+ if (rv < 0) {
+ log_errot(token, "cmd_acquire incoming state %d", rv);
goto fail_release;
}
- } else if (opt.flags & SANLK_FLG_REACQUIRE) {
- reacquire_lver = token->prev_lver;
- }
- if (opt.flags & SANLK_FLG_INCOMING)
- rv = receive_token(token, migrate_result, &leader);
- else
+ /* source set_next_owner_other() wasn't called or failed */
+ if (migrate_result != DP_OK)
+ rv = set_next_owner_self(token);
+
+ } else {
+ if (opt.flags & SANLK_FLG_REACQUIRE)
+ reacquire_lver = token->prev_lver;
+
rv = acquire_token(token, reacquire_lver, new_num_hosts);
+ }
save_resource_leader(token);
@@ -1109,11 +1112,6 @@ static void *cmd_migrate_thread(void *args_in)
goto reply;
}
- if (!target_host_id) {
- result = -EINVAL;
- goto reply;
- }
-
for (i = 0; i < SANLK_MAX_RESOURCES; i++) {
if (cl->tokens[i])
total++;
@@ -1123,7 +1121,6 @@ static void *cmd_migrate_thread(void *args_in)
reply_str = malloc(reply_len);
if (!reply_str) {
result = -ENOMEM;
- total = 0;
goto reply;
}
memset(reply_str, 0, reply_len);
@@ -1134,11 +1131,23 @@ static void *cmd_migrate_thread(void *args_in)
if (!token)
continue;
- /* if migrate_token() fails it is not fatal, we can still
- procede with the migration; receive_token() will attempt
- to set next_owner_id */
+ /* the migrating flag causes the source to avoid freeing the lease
+ * if the pid exits before the dest has written itself as next_owner.
+ * i.e. we can't rely on paxos_lease_release seeing next_owner_id
+ * non-zero because the cmd_migrate can be called on the source,
+ * followed by the source pid exiting before the dest gets to write
+ * next_owner_id to itself */
+
+ token->migrating = 1;
- migrate_token(token, target_host_id);
+ if (target_host_id) {
+ rv = set_next_owner_other(token, target_host_id);
+ token->migrate_result = rv;
+ } else {
+ /* acquire-incoming on the destination will call
+ set_next_owner_self() */
+ token->migrate_result = 0;
+ }
ret = snprintf(reply_str + pos, reply_len - pos,
"lockspace_name=%s "
@@ -1169,24 +1178,21 @@ static void *cmd_migrate_thread(void *args_in)
reply_str[reply_len-1] = '\0';
reply:
- /* TODO: for success I don't think we want to clear cmd_active
- here. We probably want to wait until the migrate is done
- and then do set_cmd_active(0)? */
-
- if (result < 0)
- set_cmd_active(ca->ci_target, 0);
-
log_debug("cmd_migrate done result %d", result);
memcpy(&h, &ca->header, sizeof(struct sm_header));
- h.length = sizeof(h) + strlen(reply_str)+1;
- h.data = result;
- send(fd, &h, sizeof(h), MSG_NOSIGNAL);
- if (total)
- send(fd, reply_str, strlen(reply_str)+1, MSG_NOSIGNAL);
- if (reply_str)
+ if (reply_str) {
+ h.length = sizeof(h) + strlen(reply_str)+1;
+ h.data = result;
+ send(fd, &h, sizeof(h), MSG_NOSIGNAL);
+ send(fd, reply_str, strlen(reply_str)+1, MSG_NOSIGNAL);
free(reply_str);
+ } else {
+ h.length = sizeof(h);
+ h.data = result;
+ send(fd, &h, sizeof(h), MSG_NOSIGNAL);
+ }
client_back(ca->ci_in, fd);
free(ca);
@@ -1202,10 +1208,9 @@ static void *cmd_setowner_thread(void *args_in)
struct cmd_args *ca = args_in;
struct sm_header h;
struct token *token;
- struct token *tokens_reply;
struct client *cl;
- int result = 0, total = 0, total2 = 0;
- int fd, rv, i, tokens_len;
+ int result = 0;
+ int fd, rv, i;
cl = &client[ca->ci_target];
fd = client[ca->ci_in].fd;
@@ -1214,20 +1219,6 @@ static void *cmd_setowner_thread(void *args_in)
ca->ci_in, ca->ci_target, cl->pid);
for (i = 0; i < SANLK_MAX_RESOURCES; i++) {
- if (cl->tokens[i])
- total++;
- }
-
- tokens_len = total * sizeof(struct token);
- tokens_reply = malloc(tokens_len);
- if (!tokens_reply) {
- result = -ENOMEM;
- total = 0;
- goto reply;
- }
- memset(tokens_reply, 0, tokens_len);
-
- for (i = 0; i < SANLK_MAX_RESOURCES; i++) {
token = cl->tokens[i];
if (!token)
continue;
@@ -1235,26 +1226,16 @@ static void *cmd_setowner_thread(void *args_in)
rv = setowner_token(token);
if (rv < 0)
result = -1;
-
- if (total2 == total) {
- log_error("cmd_setowner total %d changed", total);
- continue;
- }
-
- memcpy(&tokens_reply[total2++], token, sizeof(struct token));
}
- reply:
set_cmd_active(ca->ci_target, 0);
- log_debug("cmd_setowner done %d", total);
+ log_debug("cmd_setowner done");
memcpy(&h, &ca->header, sizeof(struct sm_header));
- h.length = sizeof(h) + tokens_len;
+ h.length = sizeof(h);
h.data = result;
send(fd, &h, sizeof(h), MSG_NOSIGNAL);
- if (total)
- send(fd, tokens_reply, tokens_len, MSG_NOSIGNAL);
client_back(ca->ci_in, fd);
free(ca);
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index ee1fb75..f693f62 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -551,8 +551,6 @@ int paxos_lease_leader_read(struct token *token, struct leader_record *leader_re
goto fail;
}
- log_token(token, "leader_read d %u reps %u", d, leader_reps[d]);
-
log_token(token, "leader_read owner %llu lver %llu hosts %llu "
"time %llu res %s",
(unsigned long long)prev_leader.owner_id,
@@ -597,7 +595,7 @@ static int write_new_leader(struct token *token, struct leader_record *nl)
* ref: obtain()
*/
-int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED,
+int paxos_lease_acquire(struct token *token, int force,
struct leader_record *leader_ret,
uint64_t reacquire_lver,
int new_num_hosts)
@@ -610,19 +608,25 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED,
uint64_t last_timestamp = 0;
int error;
- log_token(token, "paxos_acquire begin");
+ log_token(token, "paxos_acquire begin force %d", force);
error = paxos_lease_leader_read(token, &prev_leader);
if (error < 0)
goto out;
+ if (force)
+ goto run;
+
if (prev_leader.timestamp == LEASE_FREE) {
log_token(token, "paxos_acquire lease free");
goto run;
}
- if (prev_leader.owner_id == token->host_id) {
- log_token(token, "paxos_acquire already owner");
+ if (prev_leader.owner_id == token->host_id &&
+ prev_leader.owner_generation == token->host_generation) {
+ log_token(token, "paxos_acquire already owner id %llu gen %llu",
+ (unsigned long long)token->host_id,
+ (unsigned long long)token->host_generation);
goto run;
}
@@ -795,51 +799,18 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED,
return error;
}
-int paxos_lease_migrate(struct token *token,
- struct leader_record *leader_last,
- struct leader_record *leader_ret,
- uint64_t target_host_id)
+int paxos_lease_leader_write(struct token *token,
+ struct leader_record *leader_new)
{
- struct leader_record new_leader;
- int rv, d;
int error;
- log_token(token, "paxos_migrate begin");
+ log_token(token, "paxos_lease_leader_write begin");
- for (d = 0; d < token->num_disks; d++) {
- memset(&new_leader, 0, sizeof(struct leader_record));
+ leader_new->checksum = leader_checksum(leader_new);
- rv = read_leader(&token->disks[d], &new_leader);
- if (rv < 0)
- continue;
+ error = write_new_leader(token, leader_new);
- if (memcmp(&new_leader, leader_last,
- sizeof(struct leader_record))) {
- log_errot(token, "paxos_migrate leader changed before migrate");
- error = DP_BAD_LEADER;
- goto out;
- }
- }
-
- if (new_leader.num_hosts < target_host_id) {
- log_errot(token, "paxos_migrate num_hosts %llu target_host_id %llu",
- (unsigned long long)new_leader.num_hosts,
- (unsigned long long)target_host_id);
- error = DP_BAD_NUMHOSTS;
- goto out;
- }
-
- new_leader.next_owner_id = target_host_id;
- new_leader.timestamp = time(NULL);
- new_leader.checksum = leader_checksum(&new_leader);
-
- error = write_new_leader(token, &new_leader);
- if (error < 0)
- goto out;
-
- memcpy(leader_ret, &new_leader, sizeof(struct leader_record));
- out:
- log_token(token, "paxos_migrate done %d", error);
+ log_token(token, "paxos_lease_leader_write done %d", error);
return error;
}
@@ -896,11 +867,23 @@ int paxos_lease_release(struct token *token,
if (memcmp(&new_leader, leader_last,
sizeof(struct leader_record))) {
- log_errot(token, "leader changed before release");
+ log_errot(token, "release error leader changed");
return DP_BAD_LEADER;
}
}
+ if (new_leader.owner_id != token->host_id) {
+ log_errot(token, "release error other owner_id %llu",
+ (unsigned long long)new_leader.owner_id);
+ return DP_OTHER_OWNER;
+ }
+
+ if (new_leader.next_owner_id) {
+ log_errot(token, "release error next_owner_id %llu",
+ (unsigned long long)new_leader.next_owner_id);
+ return DP_LEADER_MIGRATE;
+ }
+
new_leader.timestamp = LEASE_FREE;
new_leader.checksum = leader_checksum(&new_leader);
diff --git a/src/paxos_lease.h b/src/paxos_lease.h
index bf8e600..8981ed7 100644
--- a/src/paxos_lease.h
+++ b/src/paxos_lease.h
@@ -12,6 +12,7 @@
uint32_t leader_checksum(struct leader_record *lr);
int majority_disks(struct token *token, int num);
int paxos_lease_leader_read(struct token *token, struct leader_record *leader_ret);
+int paxos_lease_leader_write(struct token *token, struct leader_record *leader_new);
int paxos_lease_acquire(struct token *token, int force,
struct leader_record *leader_ret,
uint64_t reacquire_lver,
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 3f1c1f6..22dd21e 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -85,6 +85,7 @@ struct token {
struct sync_disk *disks;
/* internal */
+ int migrating;
int token_id; /* used to refer to this token instance in log messages */
int acquire_result;
int migrate_result;
diff --git a/src/token_manager.c b/src/token_manager.c
index 5d09ef0..aa537df 100644
--- a/src/token_manager.c
+++ b/src/token_manager.c
@@ -231,10 +231,15 @@ int setowner_token(struct token *token)
/* we want the dblocks to reflect a full, proper ownership, so we
do the full acquire rather than just writing a new leader_record */
- rv = paxos_lease_acquire(token, 0, &leader_ret, 0, 0);
+ rv = paxos_lease_acquire(token, 1, &leader_ret, 0, 0);
token->setowner_result = rv;
+ /* we set acquire_result here for at least one reason: because release
+ will not release the token if acquire_result is not 1 */
+
+ token->acquire_result = rv;
+
log_token(token, "setowner rv %d lver %llu at %llu", rv,
(unsigned long long)token->leader.lver,
(unsigned long long)token->leader.timestamp);
@@ -246,6 +251,88 @@ int setowner_token(struct token *token)
return rv; /* DP_OK */
}
+/*
+ * migration creates special cases for release. if either the source or
+ * the dest calls release_token and read leader shows next_owner_id is not
+ * zero, it means migration is in progress, and they should not free the
+ * lease.
+ *
+ * In other words, paxos release should only be done (lease freed) on a
+ * "fully owned", clean lease, i.e. next_owner_id is zero, and current
+ * leader matches our last leader.
+ *
+ * (A second mechanism is also needed to prevent release of migrating leases,
+ * the token->migrating flag. This is because we need to block releases
+ * on the source effective immediately, before next_owner may be written.)
+ *
+ * setowner on the destination, in the case of migration success, moves a
+ * disk lease from being in limbo (both owner and next_owner set), to having
+ * just an owner (the dest). After this the owner (dest) can release it.
+ *
+ * TODO: setowner on the source, in the case of migration failure, moves the
+ * disk lease from being in limbo with both owner and next_owner, to having
+ * just an owner (the source). This setowner call will need to ignore the
+ * fact that the leader block doesn't match its latest copy since it may
+ * have been the dest that wrote next_owner at the start of migration.
+ *
+ * We don't have to worry that next_owner is ever running the vm; setowner
+ * on dest is required to complete successfully (making dest the owner and
+ * clearing next_owner) before vm is resumed on the dest.
+ *
+ * If migration fails, the source/owner does not free the paxos lease, but
+ * the source/owner continues running and renewing its host_id, then no
+ * other host will be able to take ownership of the lease, because they will
+ * see that the owner is alive. The source/owner will be able to acquire
+ * the lease, though. So, the source/owner needs to either
+ * 1. call setowner to clear next_owner_id, then call release to free it (TODO above)
+ * 2. call acquire to acquire the lease, then call release to free it
+ *
+ * If migration fails, the source/owner does not free the paxos lease, and
+ * the source/owner does not continue running or renewing its host_id, then
+ * another host will be able to take ownership of the lease, because they
+ * will see that the owner is not alive (or comes back with a different
+ * generation).
+ *
+ * For migration, the paxos lease is in limbo: both owner and next_owner
+ * are set, and in this state neither the source nor the dest can free the
+ * paxos lease. The limbo state needs to be cleared (next_owner cleared)
+ * before the lease can be freed.
+ *
+ * - if migration succeeds, the dest will call setowner to clear next_owner
+ * and bring the lease out of limbo
+ *
+ * - if migration fails because the dest host fails,
+ * setowner on the source will clear next_owner, and allow source to
+ * continue running and holding the lease, or release the lease.
+ * setowner on source would have to ignore the fact that the leader
+ * will have been changed since it last read or wrote it (by the dest
+ * writing itself as the next_owner_id)
+ *
+ * - if migration fails because the source host fails,
+ * the paxos lease will be left on disk with owner and next_owner set,
+ * and neither source nor dest owning the lease to free it.
+ * The lease can be acquired because someone will see that the owner's
+ * host_id is not renewed (or a different generation). This acquire
+ * will clear next_owner.
+ *
+ * - if migration fails and both source and dest fail, the lease can be
+ * acquired because someone will see that the owner's host_id is not
+ * renewed (or a diff generation). This acquire will clear next_owner.
+ *
+ * - if migration fails because the dest qemu fails but dest host still ok
+ * setowner on the source will clear next_owner (same as if dest host fails)
+ *
+ * - if migration fails because the source qemu fails but source host still ok
+ * sanlock will not free the lease in release_token because next_owner is set.
+ * no other host can acquire the lease because it's owned by a live host_id.
+ * the source host can acquire the lease again, and then free it. what causes
+ * the source host to try to acquire the lease again? trying to start the vm
+ * on the source again...
+ *
+ * - if migration fails because the source and dest qemu fail but hosts ok
+ * same as prev
+ */
+
/* return < 0 on error, 1 on success */
int release_token(struct token *token)
@@ -253,12 +340,9 @@ int release_token(struct token *token)
struct leader_record leader_ret;
int rv;
- if (token->leader.owner_id != token->host_id) {
- /* this case occurs on the receiving side of migration, when
- the local host hasn't become the lease owner (just next_owner),
- and the pid fails, causing sm to clean up the pid's tokens */
- log_token(token, "release we are not owner");
- return 1;
+ if (token->migrating) {
+ log_errot(token, "release skip migrating");
+ return DP_ERROR;
}
rv = paxos_lease_release(token, &token->leader, &leader_ret);
@@ -274,96 +358,146 @@ int release_token(struct token *token)
return rv; /* DP_OK */
}
-/* migration source: writes leader_record.next_owner_id = target_host_id */
/* return < 0 on error, 1 on success */
-int migrate_token(struct token *token, uint64_t target_host_id)
+int set_next_owner_other(struct token *token, uint64_t target_host_id)
{
- struct leader_record leader_ret;
+ struct leader_record leader;
int rv;
- rv = paxos_lease_migrate(token, &token->leader, &leader_ret, target_host_id);
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
- token->migrate_result = rv;
+ if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) {
+ log_errot(token, "set_next_owner_other leader changed before migrate");
+ return DP_BAD_LEADER;
+ }
- log_token(token, "migrate rv %d", rv);
+ if (leader.num_hosts < target_host_id) {
+ log_errot(token, "set_next_owner_other num_hosts %llu "
+ "target_host_id %llu",
+ (unsigned long long)leader.num_hosts,
+ (unsigned long long)target_host_id);
+ return DP_BAD_NUMHOSTS;
+ }
+
+ leader.next_owner_id = target_host_id;
+ rv = paxos_lease_leader_write(token, &leader);
if (rv < 0)
return rv;
- memcpy(&token->leader, &leader_ret, sizeof(struct leader_record));
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
return rv; /* DP_OK */
}
+/* return < 0 on error, 1 on success */
+
+int set_next_owner_self(struct token *token)
+{
+ struct leader_record leader;
+ int rv;
+
+ rv = paxos_lease_leader_read(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ if (leader.num_hosts < token->host_id) {
+ log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu",
+ (unsigned long long)leader.num_hosts,
+ (unsigned long long)token->host_id);
+ return DP_BAD_NUMHOSTS;
+ }
+
+ leader.next_owner_id = token->host_id;
+
+ rv = paxos_lease_leader_write(token, &leader);
+ if (rv < 0)
+ return rv;
+
+ memcpy(&token->leader, &leader, sizeof(struct leader_record));
+ return rv; /* DP_OK */
+}
/*
- * migration target: verifies that the source wrote us as the next_owner_id
- *
- * When everything is working correctly, we just verify here that
- * fields in leader_ret match what we see in leader_src
- * (created from opt_str which was returned by sanlock_migrate()
- * on the source).
+ * migration destination verifies the migrate state sent from source,
+ * which needs to be consistent with the source having successfully written
+ * the next_owner itself, or having not tried or tried and failed.
*
* If we can't read the leader, return an error, and the migration
* needs to be aborted.
*/
+
/* return < 0 on error, 1 on success */
-int receive_token(struct token *token, int migrate_result,
- struct leader_record *leader_src)
+int parse_incoming_state(struct token *token, char *str, int *migrate_result,
+ struct leader_record *leader);
+
+int check_incoming_state(struct token *token, char *opt_str, int *migrate_result_out)
{
- struct leader_record leader_read;
+ struct leader_record leader_ret;
+ struct leader_record leader_src;
+ int migrate_result;
int rv;
- rv = paxos_lease_leader_read(token, &leader_read);
+ rv = paxos_lease_leader_read(token, &leader_ret);
if (rv < 0)
return rv;
+ rv = parse_incoming_state(token, opt_str, &migrate_result, &leader_src);
+ if (rv < 0) {
+ log_errot(token, "check_incoming_state parse error %d result %d len %zd",
+ rv, migrate_result, strlen(opt_str));
+ return rv;
+ }
+
+ *migrate_result_out = migrate_result;
+
+ /* source successfully wrote next_owner */
+
if (migrate_result == DP_OK) {
- if (leader_src->next_owner_id == token->host_id &&
- leader_read.next_owner_id == token->host_id &&
- leader_src->lver == leader_read.lver &&
- leader_src->timestamp == leader_read.timestamp) {
- log_token(token, "receive_token all match");
+ if (leader_src.next_owner_id == token->host_id &&
+ leader_ret.next_owner_id == token->host_id &&
+ leader_src.lver == leader_ret.lver &&
+ leader_src.timestamp == leader_ret.timestamp) {
+ log_token(token, "check_incoming_state all match");
return DP_OK;
} else {
- log_errot(token, "receive_token mismatch "
+ log_errot(token, "check_incoming_state mismatch "
"next_owner %llu %llu %llu "
"lver %llu %llu "
"timestamp %llu %llu",
(unsigned long long)token->host_id,
- (unsigned long long)leader_src->next_owner_id,
- (unsigned long long)leader_read.next_owner_id,
- (unsigned long long)leader_src->lver,
- (unsigned long long)leader_read.lver,
- (unsigned long long)leader_src->timestamp,
- (unsigned long long)leader_read.timestamp);
+ (unsigned long long)leader_src.next_owner_id,
+ (unsigned long long)leader_ret.next_owner_id,
+ (unsigned long long)leader_src.lver,
+ (unsigned long long)leader_ret.lver,
+ (unsigned long long)leader_src.timestamp,
+ (unsigned long long)leader_ret.timestamp);
return -1;
}
}
- /* migrate_result < 0, source could not write next_owner_id, so it
- should still be 0 */
+ /* migrate_result <= 0, source could not (or did not) write next_owner_id,
+ so it should still be 0 */
- if (leader_src->owner_id != leader_read.owner_id ||
- leader_src->timestamp != leader_read.timestamp ||
- leader_read.next_owner_id != 0) {
+ if (leader_src.owner_id != leader_ret.owner_id ||
+ leader_src.timestamp != leader_ret.timestamp ||
+ leader_ret.next_owner_id != 0) {
- log_errot(token, "receive_token mismatch migrate_result %d "
+ log_errot(token, "check_incoming_state mismatch migrate_result %d "
"next_owner %llu owner %llu %llu timestamp %llu %llu",
migrate_result,
- (unsigned long long)leader_read.next_owner_id,
- (unsigned long long)leader_src->owner_id,
- (unsigned long long)leader_read.owner_id,
- (unsigned long long)leader_src->timestamp,
- (unsigned long long)leader_read.timestamp);
+ (unsigned long long)leader_ret.next_owner_id,
+ (unsigned long long)leader_src.owner_id,
+ (unsigned long long)leader_ret.owner_id,
+ (unsigned long long)leader_src.timestamp,
+ (unsigned long long)leader_ret.timestamp);
return -1;
}
- /* since the source failed to write next_owner_id to be us, we do it
- instead */
-
- return migrate_token(token, token->host_id);
+ return DP_OK;
}
int create_token(int num_disks, struct token **token_out)
diff --git a/src/token_manager.h b/src/token_manager.h
index 7119936..34b3c5b 100644
--- a/src/token_manager.h
+++ b/src/token_manager.h
@@ -12,11 +12,13 @@
int acquire_token(struct token *token, uint64_t reacquire_lver,
int new_num_hosts);
int release_token(struct token *token);
-int migrate_token(struct token *token, uint64_t target_host_id);
-int receive_token(struct token *token, int migrate_result,
- struct leader_record *leader_src);
int setowner_token(struct token *token);
+int check_incoming_state(struct token *token, char *opt_str,
+ int *migrate_result_out);
+int set_next_owner_other(struct token *token, uint64_t target_host_id);
+int set_next_owner_self(struct token *token);
+
int create_token(int num_disks, struct token **token_out);
void free_token(struct token *token);
void release_token_async(struct token *token);