src/delta_lease.c | 18 -
src/leader.h | 5
src/main.c | 2
src/paxos_lease.c | 571 +++++++++++++++++++++++++++++++++---------------------
src/sanlock_rv.h | 67 +++---
5 files changed, 402 insertions(+), 261 deletions(-)
New commits:
commit 5a56aa105d308615661e249ff9e486ddda7065ff
Author: David Teigland <teigland(a)redhat.com>
Date: Tue May 3 16:28:14 2011 -0500
sanlock: correct paxos usage
Rework how the disk paxos algorithm is applied to the
ownership and updating of the leader_record. The previous
approach had never been thoroughly understood and was
wrong in a number of general ways.
Some main concepts driving these changes:
. ballots are run to commit each new lver
. a node that successfully completes the ballot needs to
commit the result from that ballot, which means
writing the lver from that ballot and the dblock values
to the leader_record
. nodes need to monitor the leader_record to learn if
the next lver has been committed, and if so, return that
result
. the hosts participating in a ballot are the ones with
the latest matching lver in their dblocks
. each node's mbal should always increase
This includes a new incompatible disk format change,
due to the addition of some extra info fields to the leader
and dblock structs.
Error codes are also changed to describe the new conditions.
diff --git a/src/delta_lease.c b/src/delta_lease.c
index 63fb2a0..036d7dc 100644
--- a/src/delta_lease.c
+++ b/src/delta_lease.c
@@ -83,7 +83,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong magic %x %s",
(unsigned long long)host_id,
lr->magic, disk->path);
- result = SANLK_BAD_MAGIC;
+ result = SANLK_LEADER_MAGIC;
goto fail;
}
@@ -91,7 +91,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong version %x %s",
(unsigned long long)host_id,
lr->version, disk->path);
- result = SANLK_BAD_VERSION;
+ result = SANLK_LEADER_VERSION;
goto fail;
}
@@ -99,7 +99,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong sector size %d %d %s",
(unsigned long long)host_id,
lr->sector_size, disk->sector_size, disk->path);
- result = SANLK_BAD_SECTORSIZE;
+ result = SANLK_LEADER_SECTORSIZE;
goto fail;
}
@@ -107,7 +107,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong space name %.48s %.48s %s",
(unsigned long long)host_id,
lr->space_name, space_name, disk->path);
- result = SANLK_BAD_LOCKSPACE;
+ result = SANLK_LEADER_LOCKSPACE;
goto fail;
}
@@ -119,7 +119,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong resource name %.48s %.48s %s",
(unsigned long long)host_id,
lr->resource_name, resource_name, disk->path);
- result = SANLK_BAD_RESOURCEID;
+ result = SANLK_LEADER_RESOURCE;
goto fail;
}
@@ -129,7 +129,7 @@ static int verify_leader(struct sync_disk *disk,
log_error("verify_leader %llu wrong checksum %x %x %s",
(unsigned long long)host_id,
lr->checksum, sum, disk->path);
- result = SANLK_BAD_CHECKSUM;
+ result = SANLK_LEADER_CHECKSUM;
goto fail;
}
@@ -168,7 +168,7 @@ int delta_lease_leader_read(struct timeout *ti,
sizeof(struct leader_record),
ti->io_timeout_seconds, ti->use_aio, "delta_leader");
if (rv < 0)
- return SANLK_READ_LEADERS;
+ return SANLK_LEADER_READ;
error = verify_leader(disk, space_name, host_id, &leader, caller);
if (error < 0)
@@ -293,7 +293,7 @@ int delta_lease_renew(struct timeout *ti,
return error;
if (leader.owner_id != our_host_id)
- return SANLK_BAD_LEADER;
+ return SANLK_RENEW_OWNER;
new_ts = time(NULL);
@@ -331,7 +331,7 @@ int delta_lease_renew(struct timeout *ti,
(unsigned long long)host_id);
log_leader_error(0, space_name, host_id, disk, &leader,
"delta_renew_write");
log_leader_error(0, space_name, host_id, disk, &leader_read,
"delta_renew_reread");
- return SANLK_BAD_LEADER;
+ return SANLK_RENEW_DIFF;
}
memcpy(leader_ret, &leader, sizeof(struct leader_record));
diff --git a/src/leader.h b/src/leader.h
index 87b3d31..e100bb6 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -16,7 +16,7 @@
#define NAME_ID_SIZE 48
#define PAXOS_DISK_MAGIC 0x06152010
-#define PAXOS_DISK_VERSION_MAJOR 0x00040000
+#define PAXOS_DISK_VERSION_MAJOR 0x00050000
#define PAXOS_DISK_VERSION_MINOR 0x00000001
#define DELTA_DISK_MAGIC 0x12212010
@@ -58,6 +58,9 @@ struct leader_record {
uint64_t unused1;
uint32_t checksum;
uint32_t unused2;
+ uint64_t write_id; /* for extra info, debug */
+ uint64_t write_generation; /* for extra info, debug */
+ uint64_t write_timestamp; /* for extra info, debug */
};
#endif
diff --git a/src/main.c b/src/main.c
index d30dd00..40cd20c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -920,7 +920,7 @@ static void *cmd_acquire_thread(void *args_in)
rv = acquire_token(token, acquire_lver, new_num_hosts);
if (rv < 0) {
- if (rv == SANLK_LIVE_LEADER && com.quiet_fail) {
+ if (rv == SANLK_ACQUIRE_IDLIVE && com.quiet_fail) {
log_token(token, "cmd_acquire %d,%d,%d paxos_lease %d",
cl_ci, cl_fd, cl_pid, rv);
} else {
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index c846d47..1e84dc7 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -29,23 +29,18 @@
#include "delta_lease.h"
#include "paxos_lease.h"
-/*
- * largely copied from vdsm.git/sync_manager/
- */
-
-#define NO_VAL 0
-
struct request_record {
uint64_t lver;
uint8_t force_mode;
};
-/* ref: ballot_ticket_record */
struct paxos_dblock {
- uint64_t mbal; /* aka curr_bal */
- uint64_t bal; /* aka inp_bal */
- uint64_t inp; /* aka inp_val */
- uint64_t lver; /* leader version */
+ uint64_t mbal;
+ uint64_t bal;
+ uint64_t inp; /* host_id */
+ uint64_t inp2; /* host_id generation */
+ uint64_t inp3; /* host_id's timestamp */
+ uint64_t lver;
};
int majority_disks(struct token *token, int num)
@@ -132,7 +127,7 @@ static int read_dblocks(struct timeout *ti,
data = malloc(data_len);
if (!data) {
log_error("read_dblocks malloc %d %s", data_len, disk->path);
- rv = -1;
+ rv = -ENOMEM;
goto out;
}
@@ -188,12 +183,9 @@ static int read_request(struct timeout *ti,
}
#endif
-/* host_id and inp are both generally our_host_id */
-
-static int run_disk_paxos(struct timeout *ti,
- struct token *token, uint64_t host_id, uint64_t inp,
- int num_hosts, uint64_t lver,
- struct paxos_dblock *dblock_out)
+static int run_ballot(struct timeout *ti, struct token *token, int num_hosts,
+ uint64_t next_lver, uint64_t our_mbal,
+ struct paxos_dblock *dblock_out)
{
struct paxos_dblock bk[num_hosts];
struct paxos_dblock bk_max;
@@ -201,48 +193,7 @@ static int run_disk_paxos(struct timeout *ti,
int num_disks = token->r.num_disks;
int num_writes, num_reads;
int d, q, rv;
-
- if (!host_id) {
- log_errot(token, "invalid host_id");
- return SANLK_INVAL;
- }
-
- if (!inp) {
- log_errot(token, "invalid inp");
- return SANLK_INVAL;
- }
-
- /* read one of our own dblock's to get initial dblock values */
-
- memset(&dblock, 0, sizeof(struct paxos_dblock));
-
- for (d = 0; d < num_disks; d++) {
- rv = read_dblock(ti, &token->disks[d], host_id, &dblock);
- if (rv < 0)
- continue;
- /* need only one dblock to get initial values */
- break;
- }
-
- if (rv < 0) {
- log_errot(token, "no initial dblock found");
- return SANLK_OWN_DBLOCK;
- }
-
- log_token(token, "initial dblock %u mbal %llu bal %llu inp %llu lver %llu", d,
- (unsigned long long)dblock.mbal,
- (unsigned long long)dblock.bal,
- (unsigned long long)dblock.inp,
- (unsigned long long)dblock.lver);
-
- if (lver > dblock.lver) {
- dblock.mbal = host_id;
- dblock.bal = 0; /* or NO_VAL? lamport paper has 0 */
- dblock.inp = NO_VAL;
- dblock.lver = lver;
- } else {
- dblock.mbal += num_hosts;
- }
+ int q_max = -1;
/*
* phase 1
@@ -255,22 +206,29 @@ static int run_disk_paxos(struct timeout *ti,
* component is greater than dblock[p].mbal."
*/
+ log_token(token, "ballot %llu phase1 mbal %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)our_mbal);
+
+ memset(&dblock, 0, sizeof(struct paxos_dblock));
+ dblock.mbal = our_mbal;
+ dblock.lver = next_lver;
+
memset(&bk_max, 0, sizeof(struct paxos_dblock));
- bk_max.bal = NO_VAL;
- bk_max.inp = NO_VAL;
num_writes = 0;
for (d = 0; d < num_disks; d++) {
- rv = write_dblock(ti, &token->disks[d], host_id, &dblock);
+ rv = write_dblock(ti, &token->disks[d], token->host_id, &dblock);
if (rv < 0)
continue;
num_writes++;
}
if (!majority_disks(token, num_writes)) {
- log_errot(token, "cannot write dblock to majority of disks");
- return SANLK_WRITE1_DBLOCKS;
+ log_errot(token, "ballot %llu dblock write error %d",
+ (unsigned long long)next_lver, rv);
+ return SANLK_DBLOCK_WRITE;
}
num_reads = 0;
@@ -286,38 +244,49 @@ static int run_disk_paxos(struct timeout *ti,
continue;
if (bk[q].lver > dblock.lver) {
- log_errot(token, "bk %d %d lver %llu dblock lver %llu",
- d, q,
- (unsigned long long)bk[q].lver,
- (unsigned long long)dblock.lver);
- return SANLK_READ1_LVER;
+ /* I don't think this should happen */
+ log_errot(token, "ballot %llu larger lver[%d] %llu",
+ (unsigned long long)next_lver, q,
+ (unsigned long long)bk[q].lver);
+ return SANLK_DBLOCK_LVER;
}
/* see "It aborts the ballot" in comment above */
if (bk[q].mbal > dblock.mbal) {
- log_errot(token, "bk %d %d mbal %llu dblock mbal %llu",
- d, q,
- (unsigned long long)bk[q].mbal,
- (unsigned long long)dblock.mbal);
- return SANLK_READ1_MBAL;
+ log_errot(token, "ballot %llu mbal %llu larger mbal[%d] %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)our_mbal, q,
+ (unsigned long long)bk[q].mbal);
+ return SANLK_DBLOCK_MBAL;
}
/* see choosing inp for phase 2 in comment below */
- if (bk[q].inp == NO_VAL)
+ if (!bk[q].inp)
continue;
- if (bk_max.bal == NO_VAL || bk[q].bal > bk_max.bal)
+ if (!bk[q].bal) {
+ log_errot(token, "ballot %llu zero bal inp[%d] %llu",
+ (unsigned long long)next_lver, q,
+ (unsigned long long)bk[q].inp);
+ continue;
+ }
+
+ if (bk[q].bal > bk_max.bal) {
bk_max = bk[q];
+ q_max = q;
+ }
}
}
if (!majority_disks(token, num_reads)) {
- log_errot(token, "cannot read dblocks on majority of disks");
- return SANLK_READ1_DBLOCKS;
+ log_errot(token, "ballot %llu dblock read error %d",
+ (unsigned long long)next_lver, rv);
+ return SANLK_DBLOCK_READ;
}
+
/*
* "When it completes phase 1, p chooses a new value of dblock[p].inp,
* sets dblock[p].bal to dblock[p].mbal (its current ballot number),
@@ -333,31 +302,59 @@ static int run_disk_paxos(struct timeout *ti,
* nonInitBlks having the largest value of bk.bal."
*/
- log_token(token, "bk_max inp %llu bal %llu",
- (unsigned long long)bk_max.inp,
- (unsigned long long)bk_max.bal);
-
- dblock.inp = (bk_max.inp == NO_VAL) ? inp : bk_max.inp;
+ if (bk_max.inp) {
+ /* lver and mbal are already set */
+ dblock.inp = bk_max.inp;
+ dblock.inp2 = bk_max.inp2;
+ dblock.inp3 = bk_max.inp3;
+ } else {
+ /* lver and mbal are already set */
+ dblock.inp = token->host_id;
+ dblock.inp2 = token->host_generation;
+ dblock.inp3 = time(NULL);
+ }
dblock.bal = dblock.mbal;
+ if (bk_max.inp) {
+ /* not a problem, but interesting to see, so use log_error */
+ log_errot(token, "ballot %llu bk_max[%d] lver %llu mbal %llu bal %llu inp %llu %llu %llu",
+ (unsigned long long)next_lver, q_max,
+ (unsigned long long)bk_max.lver,
+ (unsigned long long)bk_max.mbal,
+ (unsigned long long)bk_max.bal,
+ (unsigned long long)bk_max.inp,
+ (unsigned long long)bk_max.inp2,
+ (unsigned long long)bk_max.inp3);
+ }
+
+
/*
* phase 2
*
* Same description as phase 1, same sequence of writes/reads.
*/
+ log_token(token, "ballot %llu phase2 bal %llu inp %llu %llu %llu q_max %d",
+ (unsigned long long)dblock.lver,
+ (unsigned long long)dblock.bal,
+ (unsigned long long)dblock.inp,
+ (unsigned long long)dblock.inp2,
+ (unsigned long long)dblock.inp3,
+ q_max);
+
num_writes = 0;
for (d = 0; d < num_disks; d++) {
- rv = write_dblock(ti, &token->disks[d], host_id, &dblock);
+ rv = write_dblock(ti, &token->disks[d], token->host_id, &dblock);
if (rv < 0)
continue;
num_writes++;
}
if (!majority_disks(token, num_writes)) {
- log_errot(token, "cannot write dblock to majority of disks 2");
- return SANLK_WRITE2_DBLOCKS;
+ log_errot(token, "ballot %llu our dblock write2 error %d",
+ (unsigned long long)next_lver, rv);
+ return SANLK_DBLOCK_WRITE;
}
num_reads = 0;
@@ -373,28 +370,29 @@ static int run_disk_paxos(struct timeout *ti,
continue;
if (bk[q].lver > dblock.lver) {
- log_errot(token, "bk %d %d lver %llu dblock lver %llu",
- d, q,
- (unsigned long long)bk[q].lver,
- (unsigned long long)dblock.lver);
- return SANLK_READ2_LVER;
+ /* I don't think this should happen */
+ log_errot(token, "ballot %llu larger2 lver[%d] %llu",
+ (unsigned long long)next_lver, q,
+ (unsigned long long)bk[q].lver);
+ return SANLK_DBLOCK_LVER;
}
/* see "It aborts the ballot" in comment above */
if (bk[q].mbal > dblock.mbal) {
- log_errot(token, "bk %d %d mbal %llu dblock mbal %llu",
- d, q,
- (unsigned long long)bk[q].mbal,
- (unsigned long long)dblock.mbal);
- return SANLK_READ2_MBAL;
+ log_errot(token, "ballot %llu mbal %llu larger2 mbal[%d] %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)our_mbal, q,
+ (unsigned long long)bk[q].mbal);
+ return SANLK_DBLOCK_MBAL;
}
}
}
if (!majority_disks(token, num_reads)) {
- log_errot(token, "cannot read dblocks from majority of disks 2");
- return SANLK_READ2_DBLOCKS;
+ log_errot(token, "ballot %llu dblock read2 error %d",
+ (unsigned long long)next_lver, rv);
+ return SANLK_DBLOCK_READ;
}
/* "When it completes phase 2, p has committed dblock[p].inp." */
@@ -440,6 +438,11 @@ static void log_leader_error(int result,
lr->resource_name,
(unsigned long long)lr->timestamp,
lr->checksum);
+
+ log_errot(token, "leader5 wi %llu wg %llu wt %llu",
+ (unsigned long long)lr->write_id,
+ (unsigned long long)lr->write_generation,
+ (unsigned long long)lr->write_timestamp);
}
static int verify_leader(struct token *token, struct sync_disk *disk,
@@ -453,35 +456,35 @@ static int verify_leader(struct token *token, struct sync_disk *disk,
if (lr->magic != PAXOS_DISK_MAGIC) {
log_errot(token, "verify_leader wrong magic %x %s",
lr->magic, disk->path);
- result = SANLK_BAD_MAGIC;
+ result = SANLK_LEADER_MAGIC;
goto fail;
}
if ((lr->version & 0xFFFF0000) != PAXOS_DISK_VERSION_MAJOR) {
log_errot(token, "verify_leader wrong version %x %s",
lr->version, disk->path);
- result = SANLK_BAD_VERSION;
+ result = SANLK_LEADER_VERSION;
goto fail;
}
if (lr->sector_size != disk->sector_size) {
log_errot(token, "verify_leader wrong sector size %d %d %s",
lr->sector_size, disk->sector_size, disk->path);
- result = SANLK_BAD_SECTORSIZE;
+ result = SANLK_LEADER_SECTORSIZE;
goto fail;
}
if (strncmp(lr->space_name, token->r.lockspace_name, NAME_ID_SIZE)) {
log_errot(token, "verify_leader wrong space name %.48s %.48s %s",
lr->space_name, token->r.lockspace_name, disk->path);
- result = SANLK_BAD_LOCKSPACE;
+ result = SANLK_LEADER_LOCKSPACE;
goto fail;
}
if (strncmp(lr->resource_name, token->r.name, NAME_ID_SIZE)) {
log_errot(token, "verify_leader wrong resource name %.48s %.48s %s",
lr->resource_name, token->r.name, disk->path);
- result = SANLK_BAD_RESOURCEID;
+ result = SANLK_LEADER_RESOURCE;
goto fail;
}
@@ -489,7 +492,7 @@ static int verify_leader(struct token *token, struct sync_disk *disk,
log_errot(token, "verify_leader num_hosts too small %llu %llu %s",
(unsigned long long)lr->num_hosts,
(unsigned long long)token->host_id, disk->path);
- result = SANLK_BAD_NUMHOSTS;
+ result = SANLK_LEADER_NUMHOSTS;
goto fail;
}
@@ -498,7 +501,7 @@ static int verify_leader(struct token *token, struct sync_disk *disk,
if (lr->checksum != sum) {
log_errot(token, "verify_leader wrong checksum %x %x %s",
lr->checksum, sum, disk->path);
- result = SANLK_BAD_CHECKSUM;
+ result = SANLK_LEADER_CHECKSUM;
goto fail;
}
@@ -529,7 +532,7 @@ int paxos_lease_leader_read(struct timeout *ti,
struct token *token, struct leader_record *leader_ret,
const char *caller)
{
- struct leader_record prev_leader;
+ struct leader_record leader;
struct leader_record *leaders;
int *leader_reps;
int leaders_len, leader_reps_len;
@@ -543,21 +546,20 @@ int paxos_lease_leader_read(struct timeout *ti,
leaders = malloc(leaders_len);
if (!leaders)
- return SANLK_NOMEM;
+ return -ENOMEM;
leader_reps = malloc(leader_reps_len);
if (!leader_reps) {
free(leaders);
- return SANLK_NOMEM;
+ return -ENOMEM;
}
/*
* find a leader block that's consistent on the majority of disks,
* so we can use as the basis for the new leader
- * ref: validate_multiple_disk_leader
*/
- memset(&prev_leader, 0, sizeof(struct leader_record));
+ memset(&leader, 0, sizeof(struct leader_record));
memset(leaders, 0, leaders_len);
memset(leader_reps, 0, leader_reps_len);
@@ -587,8 +589,8 @@ int paxos_lease_leader_read(struct timeout *ti,
}
if (!majority_disks(token, num_reads)) {
- log_errot(token, "paxos_leader_read no majority reads");
- error = SANLK_READ_LEADERS;
+ log_errot(token, "%s leader_read error %d", caller, rv);
+ error = SANLK_LEADER_READ;
goto fail;
}
@@ -601,29 +603,26 @@ int paxos_lease_leader_read(struct timeout *ti,
continue;
/* leader on d is the same on a majority of disks,
- prev_leader becomes the prototype for new_leader */
+ leader becomes the prototype for new_leader */
- memcpy(&prev_leader, &leaders[d], sizeof(struct leader_record));
+ memcpy(&leader, &leaders[d], sizeof(struct leader_record));
found = 1;
break;
}
if (!found) {
- log_errot(token, "paxos_leader_read no majority reps");
- error = SANLK_DIFF_LEADERS;
+ log_errot(token, "%s leader_read inconsistent", caller);
+ error = SANLK_LEADER_DIFF;
goto fail;
}
- log_token(token, "%s leader_read owner %llu lver %llu hosts %llu "
- "time %llu res %s",
- caller ? caller : "unknown",
- (unsigned long long)prev_leader.owner_id,
- (unsigned long long)prev_leader.lver,
- (unsigned long long)prev_leader.num_hosts,
- (unsigned long long)prev_leader.timestamp,
- prev_leader.resource_name);
-
- memcpy(leader_ret, &prev_leader, sizeof(struct leader_record));
+ log_token(token, "%s leader_read %llu owner %llu %llu %llu", caller,
+ (unsigned long long)leader.lver,
+ (unsigned long long)leader.owner_id,
+ (unsigned long long)leader.owner_generation,
+ (unsigned long long)leader.timestamp);
+
+ memcpy(leader_ret, &leader, sizeof(struct leader_record));
return SANLK_OK;
fail:
@@ -632,8 +631,8 @@ int paxos_lease_leader_read(struct timeout *ti,
return error;
}
-static int write_new_leader(struct timeout *ti,
- struct token *token, struct leader_record *nl)
+static int write_new_leader(struct timeout *ti, struct token *token,
+ struct leader_record *nl, const char *caller)
{
int num_disks = token->r.num_disks;
int num_writes = 0;
@@ -648,16 +647,23 @@ static int write_new_leader(struct timeout *ti,
}
if (!majority_disks(token, num_writes)) {
- log_errot(token, "write_new_leader no majority writes");
- error = SANLK_WRITE_LEADERS;
+ log_errot(token, "%s write_new_leader no majority writes", caller);
+ error = SANLK_LEADER_WRITE;
}
return error;
}
/*
- * acquire a lease
- * ref: obtain()
+ * If we hang or crash after completing a ballot successfully, but before
+ * committing the leader_record, then the next host that runs a ballot (with the
+ * same lver since we did not commit the new lver to the leader_record) will
+ * commit the same inp values that we were about to commit. If the inp values
+ * they commit indicate we (who crashed or hung) are the new owner, then the
+ * other hosts will begin monitoring the liveness of our host_id. Once enough
+ * time has passed, they assume we're dead, and go on with new versions. The
+ * "enough time" ensures that if we hung before writing the leader, that we
+ * won't wake up and finally write what will then be an old invalid leader.
*/
int paxos_lease_acquire(struct timeout *ti,
@@ -666,32 +672,44 @@ int paxos_lease_acquire(struct timeout *ti,
uint64_t acquire_lver,
int new_num_hosts)
{
- struct leader_record prev_leader;
+ struct leader_record cur_leader;
+ struct leader_record tmp_leader;
struct leader_record new_leader;
struct leader_record host_id_leader;
struct sync_disk host_id_disk;
struct paxos_dblock dblock;
time_t start;
+ uint64_t next_lver;
+ uint64_t our_mbal = 0;
uint64_t last_timestamp = 0;
- int error, rv, disk_open = 0;
+ int error, rv, d, num_reads, disk_open = 0;
- log_token(token, "paxos_acquire begin lver %llu flags %x",
+ log_token(token, "paxos_acquire begin acquire_lver %llu flags %x",
(unsigned long long)acquire_lver, flags);
+ restart:
- error = paxos_lease_leader_read(ti, token, &prev_leader, "paxos_acquire");
+ error = paxos_lease_leader_read(ti, token, &cur_leader, "paxos_acquire");
if (error < 0)
goto out;
if (flags & PAXOS_ACQUIRE_FORCE)
goto run;
- if (prev_leader.timestamp == LEASE_FREE) {
+ if (acquire_lver && cur_leader.lver != acquire_lver) {
+ log_errot(token, "paxos_acquire acquire_lver %llu cur_leader %llu",
+ (unsigned long long)acquire_lver,
+ (unsigned long long)cur_leader.lver);
+ error = SANLK_ACQUIRE_LVER;
+ goto out;
+ }
+
+ if (cur_leader.timestamp == LEASE_FREE) {
log_token(token, "paxos_acquire lease free");
goto run;
}
- if (prev_leader.owner_id == token->host_id &&
- prev_leader.owner_generation == token->host_generation) {
+ if (cur_leader.owner_id == token->host_id &&
+ cur_leader.owner_generation == token->host_generation) {
log_token(token, "paxos_acquire already owner id %llu gen %llu",
(unsigned long long)token->host_id,
(unsigned long long)token->host_generation);
@@ -704,37 +722,40 @@ int paxos_lease_acquire(struct timeout *ti,
* its watchdog has triggered and we can go for the paxos lease.
*/
- log_token(token, "paxos_acquire check owner_id %llu",
- (unsigned long long)prev_leader.owner_id);
+ log_token(token, "paxos_acquire check owner_id %llu gen %llu",
+ (unsigned long long)cur_leader.owner_id,
+ (unsigned long long)cur_leader.owner_generation);
- memset(&host_id_disk, 0, sizeof(host_id_disk));
+ if (!disk_open) {
+ memset(&host_id_disk, 0, sizeof(host_id_disk));
- rv = host_id_disk_info(prev_leader.space_name, &host_id_disk);
- if (rv < 0) {
- log_errot(token, "paxos_acquire no lockspace info %.48s",
- prev_leader.space_name);
- error = SANLK_BAD_SPACE_NAME;
- goto out;
- }
+ rv = host_id_disk_info(cur_leader.space_name, &host_id_disk);
+ if (rv < 0) {
+ log_errot(token, "paxos_acquire no lockspace info %.48s",
+ cur_leader.space_name);
+ error = SANLK_ACQUIRE_LOCKSPACE;
+ goto out;
+ }
- disk_open = open_disks_fd(&host_id_disk, 1);
- if (disk_open != 1) {
- log_errot(token, "paxos_acquire cannot open host_id_disk");
- error = SANLK_BAD_SPACE_DISK;
- goto out;
+ disk_open = open_disks_fd(&host_id_disk, 1);
+ if (disk_open != 1) {
+ log_errot(token, "paxos_acquire cannot open host_id_disk");
+ error = SANLK_ACQUIRE_IDDISK;
+ goto out;
+ }
}
start = time(NULL);
while (1) {
error = delta_lease_leader_read(ti, &host_id_disk,
- prev_leader.space_name,
- prev_leader.owner_id,
+ cur_leader.space_name,
+ cur_leader.owner_id,
&host_id_leader,
"paxos_acquire");
if (error < 0) {
log_errot(token, "paxos_acquire host_id %llu read %d",
- (unsigned long long)prev_leader.owner_id,
+ (unsigned long long)cur_leader.owner_id,
error);
goto out;
}
@@ -746,7 +767,7 @@ int paxos_lease_acquire(struct timeout *ti,
if (host_id_leader.timestamp == LEASE_FREE) {
log_token(token, "paxos_acquire host_id %llu free",
- (unsigned long long)prev_leader.owner_id);
+ (unsigned long long)cur_leader.owner_id);
goto run;
}
@@ -754,9 +775,9 @@ int paxos_lease_acquire(struct timeout *ti,
owned this paxos lease; acquiring a host_id also cannot be
done in less than host_id_timeout_sec */
- if (host_id_leader.owner_id != prev_leader.owner_id) {
+ if (host_id_leader.owner_id != cur_leader.owner_id) {
log_token(token, "paxos_acquire host_id %llu owner %llu",
- (unsigned long long)prev_leader.owner_id,
+ (unsigned long long)cur_leader.owner_id,
(unsigned long long)host_id_leader.owner_id);
goto run;
}
@@ -765,12 +786,12 @@ int paxos_lease_acquire(struct timeout *ti,
owned the lease in a previous generation without freeing it,
and no longer owns it */
- if (host_id_leader.owner_generation > prev_leader.owner_generation) {
+ if (host_id_leader.owner_generation > cur_leader.owner_generation) {
log_token(token, "paxos_acquire host_id %llu "
"generation now %llu old %llu",
- (unsigned long long)prev_leader.owner_id,
+ (unsigned long long)cur_leader.owner_id,
(unsigned long long)host_id_leader.owner_generation,
- (unsigned long long)prev_leader.owner_generation);
+ (unsigned long long)cur_leader.owner_generation);
goto run;
}
@@ -789,14 +810,14 @@ int paxos_lease_acquire(struct timeout *ti,
if (time(NULL) - start > ti->host_id_timeout_seconds) {
log_token(token, "paxos_acquire host_id %llu expired %llu",
- (unsigned long long)prev_leader.owner_id,
+ (unsigned long long)cur_leader.owner_id,
(unsigned long long)host_id_leader.timestamp);
goto run;
}
#if 0
if (time(NULL) - host_id_leader.timestamp > ti->host_id_timeout_seconds) {
log_token(token, "paxos_acquire host_id %llu expired %llu",
- (unsigned long long)prev_leader.owner_id,
+ (unsigned long long)cur_leader.owner_id,
(unsigned long long)host_id_leader.timestamp);
goto run;
}
@@ -807,88 +828,162 @@ int paxos_lease_acquire(struct timeout *ti,
if (last_timestamp && (host_id_leader.timestamp != last_timestamp)) {
if (flags & PAXOS_ACQUIRE_QUIET_FAIL) {
log_token(token, "paxos_acquire host_id %llu alive",
- (unsigned long long)prev_leader.owner_id);
+ (unsigned long long)cur_leader.owner_id);
} else {
log_errot(token, "paxos_acquire host_id %llu alive",
- (unsigned long long)prev_leader.owner_id);
+ (unsigned long long)cur_leader.owner_id);
}
- error = SANLK_LIVE_LEADER;
+ error = SANLK_ACQUIRE_IDLIVE;
goto out;
}
last_timestamp = host_id_leader.timestamp;
- sleep(1);
+ sleep(2);
+
+ error = paxos_lease_leader_read(ti, token, &tmp_leader, "paxos_acquire");
+ if (error < 0)
+ goto out;
+
+ if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) {
+ log_token(token, "paxos_acquire restart leader changed");
+ goto restart;
+ }
}
run:
- if (acquire_lver && prev_leader.lver != acquire_lver) {
- log_errot(token, "paxos_acquire acquire_lver %llu prev_leader %llu",
- (unsigned long long)acquire_lver,
- (unsigned long long)prev_leader.lver);
- error = SANLK_REACQUIRE_LVER;
+ /*
+ * Use the disk paxos algorithm to attempt to commit a new leader.
+ *
+ * If we complete a ballot successfully, we can commit a leader record
+ * with next_lver. If we find a higher mbal during a ballot, we increase
+ * our own mbal and try the ballot again.
+ *
+ * next_lver is derived from cur_leader with a zero or timed out owner.
+ * We need to monitor the leader record to see if another host commits
+ * a new leader_record with next_lver.
+ */
+
+ next_lver = cur_leader.lver + 1;
+
+ num_reads = 0;
+
+ for (d = 0; d < token->r.num_disks; d++) {
+ rv = read_dblock(ti, &token->disks[d], token->host_id, &dblock);
+ if (rv < 0)
+ continue;
+ num_reads++;
+
+ if (dblock.mbal > our_mbal)
+ our_mbal = dblock.mbal;
+ }
+
+ if (!num_reads) {
+ log_errot(token, "paxos_acquire cannot read our dblock %d", rv);
+ error = SANLK_DBLOCK_READ;
goto out;
}
- /* TODO: test: while we were waiting in host_id_timeout_seconds loop
- * above, another host has finished that loop, come through here
- * and become the new leader (so if we were to read the leader record
- * again right here it would be different from our prev_leader).
- * what if the other host not only acquired the leader but also
- * freed it when we get here? */
+ /* TODO: may not need to increase mbal if dblock.inp and inp2 match
+ current host_id and generation? */
- /*
- * run disk paxos to reach consensus on a new leader
- */
+ if (!our_mbal)
+ our_mbal = token->host_id;
+ else
+ our_mbal += cur_leader.max_hosts;
- memcpy(&new_leader, &prev_leader, sizeof(struct leader_record));
- new_leader.lver += 1; /* req.lver */
+ retry_ballot:
- error = run_disk_paxos(ti, token, token->host_id, token->host_id,
- new_leader.num_hosts, new_leader.lver, &dblock);
- if (error < 0) {
- log_errot(token, "paxos_acquire paxos error %d", error);
+ error = paxos_lease_leader_read(ti, token, &tmp_leader, "paxos_acquire");
+ if (error < 0)
+ goto out;
+
+ if (tmp_leader.lver == next_lver) {
+ /*
+ * another host has committed a leader_record for next_lver,
+ * check which inp (owner_id) they committed (possibly us).
+ */
+
+ if (tmp_leader.owner_id == token->host_id &&
+ tmp_leader.owner_generation == token->host_generation) {
+ /* not a problem, but interesting to see, so use log_error */
+
+ log_errot(token, "paxos_acquire %llu our id commited by %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)tmp_leader.write_id);
+
+ memcpy(leader_ret, &tmp_leader, sizeof(struct leader_record));
+ error = SANLK_OK;
+ } else {
+ /* not a problem, but interesting to see, so use log_error */
+
+ log_errot(token, "paxos_acquire %llu owner is %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)tmp_leader.owner_id);
+
+ error = SANLK_ACQUIRE_OWNED;
+ }
goto out;
}
- log_token(token, "paxos_acquire paxos result dblock mbal %llu bal %llu inp %llu lver %llu",
- (unsigned long long)dblock.mbal,
- (unsigned long long)dblock.bal,
- (unsigned long long)dblock.inp,
- (unsigned long long)dblock.lver);
+ error = run_ballot(ti, token, cur_leader.num_hosts, next_lver, our_mbal,
+ &dblock);
- /* the inp value we commited wasn't us */
+ if (error == SANLK_DBLOCK_MBAL) {
+ log_token(token, "paxos_acquire %llu retry ballot",
+ (unsigned long long)next_lver);
+ our_mbal += cur_leader.max_hosts;
+ goto retry_ballot;
+ }
- if (dblock.inp != token->host_id) {
- log_errot(token, "paxos_acquire paxos contention our_host_id %llu "
- "mbal %llu bal %llu inp %llu lver %llu",
- (unsigned long long)token->host_id,
- (unsigned long long)dblock.mbal,
- (unsigned long long)dblock.bal,
- (unsigned long long)dblock.inp,
- (unsigned long long)dblock.lver);
- error = SANLK_OTHER_INP;
+ if (error < 0) {
+ log_errot(token, "paxos_acquire %llu ballot error %d",
+ (unsigned long long)next_lver, error);
goto out;
}
- /* dblock has the disk paxos result: consensus inp and lver */
+ /* ballot success, commit next_lver with dblock values */
- new_leader.owner_id = token->host_id;
- new_leader.owner_generation = token->host_generation;
+ memcpy(&new_leader, &cur_leader, sizeof(struct leader_record));
new_leader.lver = dblock.lver;
- new_leader.timestamp = time(NULL);
+ new_leader.owner_id = dblock.inp;
+ new_leader.owner_generation = dblock.inp2;
+ new_leader.timestamp = dblock.inp3;
+
+ new_leader.write_id = token->host_id;
+ new_leader.write_generation = token->host_generation;
+ new_leader.write_timestamp = time(NULL);
+
if (new_num_hosts)
new_leader.num_hosts = new_num_hosts;
new_leader.checksum = leader_checksum(&new_leader);
- error = write_new_leader(ti, token, &new_leader);
+ error = write_new_leader(ti, token, &new_leader, "paxos_acquire");
if (error < 0)
goto out;
+ if (new_leader.owner_id != token->host_id) {
+ /* not a problem, but interesting to see, so use log_error */
+
+ log_errot(token, "paxos_acquire %llu commit other owner %llu %llu %llu",
+ (unsigned long long)new_leader.lver,
+ (unsigned long long)new_leader.owner_id,
+ (unsigned long long)new_leader.owner_generation,
+ (unsigned long long)new_leader.timestamp);
+
+ error = SANLK_ACQUIRE_OTHER;
+ goto out;
+ }
+
+ log_token(token, "paxos_acquire %llu owner %llu %llu %llu done",
+ (unsigned long long)next_lver,
+ (unsigned long long)new_leader.owner_id,
+ (unsigned long long)new_leader.owner_generation,
+ (unsigned long long)new_leader.timestamp);
+
memcpy(leader_ret, &new_leader, sizeof(struct leader_record));
+ error = SANLK_OK;
out:
- log_token(token, "paxos_acquire done %d", error);
-
if (disk_open)
close_disks(&host_id_disk, 1);
@@ -964,21 +1059,57 @@ int paxos_lease_release(struct timeout *ti,
goto out;
}
- if (memcmp(&leader, leader_last, sizeof(struct leader_record))) {
- log_errot(token, "release error leader changed");
- return SANLK_BAD_LEADER;
+ if (leader.lver != leader_last->lver) {
+ log_errot(token, "paxos_release %llu other lver %llu",
+ (unsigned long long)leader_last->lver,
+ (unsigned long long)leader.lver);
+ return SANLK_RELEASE_LVER;
}
- if (leader.owner_id != token->host_id) {
- log_errot(token, "release error other owner_id %llu",
- (unsigned long long)leader.owner_id);
- return SANLK_OTHER_OWNER;
+ if (leader.owner_id != token->host_id ||
+ leader.owner_generation != token->host_generation) {
+ log_errot(token, "paxos_release %llu other owner %llu %llu %llu",
+ (unsigned long long)leader_last->lver,
+ (unsigned long long)leader.owner_id,
+ (unsigned long long)leader.owner_generation,
+ (unsigned long long)leader.timestamp);
+ return SANLK_RELEASE_OWNER;
+ }
+
+ if (memcmp(&leader, leader_last, sizeof(struct leader_record))) {
+ /*
+ * This will happen when two hosts finish the same ballot
+ * successfully, the second committing the same inp values
+ * that the first did, as it should. But the second will
+ * write its own write_id/gen/timestamp, which will differ
+ * from what the first host wrote. So when the first host
+ * rereads here in the release, it will find different
+ * write_id/gen/timestamp from what it wrote. This is
+ * perfectly fine (use log_error since it's interesting
+ * to see when this happens.)
+ */
+ log_errot(token, "paxos_release %llu leader different "
+ "write %llu %llu %llu vs %llu %llu %llu",
+ (unsigned long long)leader_last->lver,
+ (unsigned long long)leader_last->write_id,
+ (unsigned long long)leader_last->write_generation,
+ (unsigned long long)leader_last->write_timestamp,
+ (unsigned long long)leader.write_id,
+ (unsigned long long)leader.write_generation,
+ (unsigned long long)leader.write_timestamp);
+ /*
+ log_leader_error(0, token, &token->disks[0], leader_last,
"paxos_release");
+ log_leader_error(0, token, &token->disks[0], &leader,
"paxos_release");
+ */
}
leader.timestamp = LEASE_FREE;
+ leader.write_id = token->host_id;
+ leader.write_generation = token->host_generation;
+ leader.write_timestamp = time(NULL);
leader.checksum = leader_checksum(&leader);
- error = write_new_leader(ti, token, &leader);
+ error = write_new_leader(ti, token, &leader, "paxos_release");
if (error < 0)
goto out;
diff --git a/src/sanlock_rv.h b/src/sanlock_rv.h
index 614e08c..85db9c3 100644
--- a/src/sanlock_rv.h
+++ b/src/sanlock_rv.h
@@ -12,35 +12,42 @@
#define SANLK_OK 1
#define SANLK_NONE 0 /* unused */
#define SANLK_ERROR -201
-#define SANLK_INVAL -202
-#define SANLK_NOMEM -203
-#define SANLK_LIVE_LEADER -204
-#define SANLK_DIFF_LEADERS -205
-#define SANLK_READ_LEADERS -206
-#define SANLK_OWN_DBLOCK -207
-#define SANLK_WRITE1_DBLOCKS -208
-#define SANLK_WRITE2_DBLOCKS -209
-#define SANLK_WRITE_REQUESTS -210
-#define SANLK_WRITE_LEADERS -211
-#define SANLK_READ1_MBAL -212
-#define SANLK_READ1_LVER -213
-#define SANLK_READ2_MBAL -214
-#define SANLK_READ2_LVER -215
-#define SANLK_READ1_DBLOCKS -216
-#define SANLK_READ2_DBLOCKS -217
-#define SANLK_BAD_MAGIC -218
-#define SANLK_BAD_VERSION -219
-#define SANLK_BAD_CLUSTERMODE -220
-#define SANLK_BAD_RESOURCEID -221
-#define SANLK_BAD_NUMHOSTS -222
-#define SANLK_BAD_CHECKSUM -223
-#define SANLK_BAD_LEADER -224
-#define SANLK_OTHER_INP -225
-#define SANLK_BAD_SECTORSIZE -226
-#define SANLK_REACQUIRE_LVER -227
-#define SANLK_BAD_LOCKSPACE -228
-#define SANLK_OTHER_OWNER -229
-#define SANLK_BAD_SPACE_NAME -230
-#define SANLK_BAD_SPACE_DISK -231
+
+/* run_ballot */
+
+#define SANLK_DBLOCK_READ -210
+#define SANLK_DBLOCK_WRITE -211
+#define SANLK_DBLOCK_LVER -212
+#define SANLK_DBLOCK_MBAL -213
+
+/* verify_leader, leader_read, leader_write (paxos or delta) */
+
+#define SANLK_LEADER_READ -220
+#define SANLK_LEADER_WRITE -221
+#define SANLK_LEADER_DIFF -222
+#define SANLK_LEADER_MAGIC -223
+#define SANLK_LEADER_VERSION -224
+#define SANLK_LEADER_SECTORSIZE -225
+#define SANLK_LEADER_LOCKSPACE -226
+#define SANLK_LEADER_RESOURCE -227
+#define SANLK_LEADER_NUMHOSTS -228
+#define SANLK_LEADER_CHECKSUM -229
+
+/* paxos_lease_acquire, paxos_lease_release */
+
+#define SANLK_ACQUIRE_LVER -240
+#define SANLK_ACQUIRE_LOCKSPACE -241
+#define SANLK_ACQUIRE_IDDISK -242
+#define SANLK_ACQUIRE_IDLIVE -243
+#define SANLK_ACQUIRE_OWNED -244
+#define SANLK_ACQUIRE_OTHER -245
+
+#define SANLK_RELEASE_LVER -250
+#define SANLK_RELEASE_OWNER -251
+
+/* delta_lease_renew */
+
+#define SANLK_RENEW_OWNER -260
+#define SANLK_RENEW_DIFF -261
#endif