This is an automated email from the git hooks/post-receive script.
teigland pushed a change to branch testing in repository sanlock.
at 76332a0 sanlock: fix release interference with paxos
This branch includes the following new commits:
new 63768d8 sanlock: preserve dblock values when setting shared flag new abab5e6 sanlock: add all dblock vals to debug new c10f23b sanlock: fix detection of shared lease new 76332a0 sanlock: fix release interference with paxos
The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference.
This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch testing in repository sanlock.
commit 63768d8b96763a763a134b9c7d3362995158aa94 Author: David Teigland teigland@redhat.com Date: Fri Nov 17 16:38:06 2017 -0600
sanlock: preserve dblock values when setting shared flag
steps for acquiring a shared lease: 1. acquire paxos lease 2. write shared flag into mode_block 3. release paxos lease
The mode_block follows the dblock, and when writing the new mode_block, the dblock values were cleared.
If another host is running a paxos ballot for the same lver (leader version) of the lease as was acquired in step 1, then clearing the dblock values in step 2 can cause the paxos ballot to miss the fact that the lease has already been acquired for the given version, and the second host will think that it also has acquired the lease with the same version.
This is fixed by keeping the dblock values intact when writing the mode_block in step 2. --- src/direct.c | 3 ++- src/paxos_lease.c | 5 ++++- src/paxos_lease.h | 3 +++ src/resource.c | 58 +++++++++++++++++++++++++++++++++++++------------------ 4 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/direct.c b/src/direct.c index d8bad50..3d4832a 100644 --- a/src/direct.c +++ b/src/direct.c @@ -80,6 +80,7 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct { struct token *token; struct leader_record leader; + struct paxos_dblock dblock; int disks_len, token_len; int j, rv = 0;
@@ -123,7 +124,7 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct token->host_id = local_host_id; token->host_generation = local_host_generation;
- rv = paxos_lease_acquire(task, token, 0, leader_ret, 0, num_hosts); + rv = paxos_lease_acquire(task, token, 0, leader_ret, &dblock, 0, num_hosts); break;
case ACT_RELEASE: diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 3666e21..726056f 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -52,7 +52,7 @@ uint32_t leader_checksum(struct leader_record *lr) return crc32c((uint32_t)~1, (uint8_t *)lr, LEADER_CHECKSUM_LEN); }
-static uint32_t dblock_checksum(struct paxos_dblock *pd) +uint32_t dblock_checksum(struct paxos_dblock *pd) { return crc32c((uint32_t)~1, (uint8_t *)pd, DBLOCK_CHECKSUM_LEN); } @@ -765,6 +765,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, (unsigned long long)next_lver, error); }
+ memcpy(dblock_out, &dblock, sizeof(struct paxos_dblock)); return error; }
@@ -1396,6 +1397,7 @@ int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, + struct paxos_dblock *dblock_ret, uint64_t acquire_lver, int new_num_hosts) { @@ -1881,6 +1883,7 @@ int paxos_lease_acquire(struct task *task, (unsigned long long)new_leader.timestamp);
memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); + memcpy(dblock_ret, &dblock, sizeof(struct paxos_dblock)); error = SANLK_OK;
out: diff --git a/src/paxos_lease.h b/src/paxos_lease.h index ec015dd..cce85ed 100644 --- a/src/paxos_lease.h +++ b/src/paxos_lease.h @@ -16,6 +16,8 @@
uint32_t leader_checksum(struct leader_record *lr);
+uint32_t dblock_checksum(struct paxos_dblock *pd); + int paxos_lease_leader_read(struct task *task, struct token *token, struct leader_record *leader_ret, @@ -25,6 +27,7 @@ int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, + struct paxos_dblock *dblock_ret, uint64_t acquire_lver, int new_num_hosts);
diff --git a/src/resource.c b/src/resource.c index 584b66f..317fcb7 100644 --- a/src/resource.c +++ b/src/resource.c @@ -297,13 +297,16 @@ void check_mode_block(struct token *token, uint64_t next_lver, int q, char *dblo }
static int write_host_block(struct task *task, struct token *token, - uint64_t host_id, uint64_t mb_gen, uint32_t mb_flags) + uint64_t host_id, uint64_t mb_gen, uint32_t mb_flags, + struct paxos_dblock *pd) { struct sync_disk *disk; struct mode_block mb; struct mode_block mb_end; + struct paxos_dblock pd_end; char *iobuf, **p_iobuf; uint64_t offset; + uint32_t checksum; int num_disks = token->r.num_disks; int iobuf_len, rv, d;
@@ -321,6 +324,20 @@ static int write_host_block(struct task *task, struct token *token,
memset(iobuf, 0, iobuf_len);
+ /* + * When writing our mode block, we need to keep our dblock + * values intact because other hosts may be running the + * paxos algorithm and these values need to remain intact + * for them to reach the correct result. + */ + if (pd) { + paxos_dblock_out(pd, &pd_end); + checksum = dblock_checksum(&pd_end); + pd->checksum = checksum; + pd_end.checksum = cpu_to_le32(checksum); + memcpy(iobuf, (char *)&pd_end, sizeof(struct paxos_dblock)); + } + if (mb_gen || mb_flags) { memset(&mb, 0, sizeof(mb)); mb.flags = mb_flags; @@ -350,7 +367,6 @@ static int write_host_block(struct task *task, struct token *token, if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; - }
static int read_mode_block(struct task *task, struct token *token, @@ -433,7 +449,7 @@ static int clear_dead_shared(struct task *task, struct token *token, continue; }
- rv = write_host_block(task, token, host_id, 0, 0); + rv = write_host_block(task, token, host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "clear_dead_shared host_id %llu write_host_block %d", (unsigned long long)host_id, rv); @@ -569,7 +585,8 @@ int res_get_lvb(struct sanlk_resource *res, char **lvb_out, int *lvblen)
static int acquire_disk(struct task *task, struct token *token, uint64_t acquire_lver, int new_num_hosts, - int owner_nowait, struct leader_record *leader) + int owner_nowait, struct leader_record *leader, + struct paxos_dblock *dblock) { struct leader_record leader_tmp; int rv; @@ -586,8 +603,8 @@ static int acquire_disk(struct task *task, struct token *token,
memset(&leader_tmp, 0, sizeof(leader_tmp));
- rv = paxos_lease_acquire(task, token, flags, &leader_tmp, acquire_lver, - new_num_hosts); + rv = paxos_lease_acquire(task, token, flags, &leader_tmp, dblock, + acquire_lver, new_num_hosts);
log_token(token, "acquire_disk rv %d lver %llu at %llu", rv, (unsigned long long)leader_tmp.lver, @@ -798,7 +815,7 @@ static int _release_token(struct task *task, struct token *token, */
if (r_flags & R_ERASE_ALL) { - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "release_token erase all write_host_block %d", rv); ret = rv; @@ -828,7 +845,7 @@ static int _release_token(struct task *task, struct token *token, (unsigned long long)lver, rv);
} else if (r_flags & R_UNDO_SHARED) { - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "release_token undo shared write_host_block %d", rv); ret = rv; @@ -849,7 +866,7 @@ static int _release_token(struct task *task, struct token *token, } else if (r_flags & R_SHARED) { /* normal release of sh lease */
- rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "release_token shared write_host_block %d", rv); ret = rv; @@ -879,7 +896,7 @@ static int _release_token(struct task *task, struct token *token, }
/* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release_token write_host_block %d", rv);
@@ -1094,6 +1111,7 @@ static struct resource *new_resource(struct token *token) static int convert_sh2ex_token(struct task *task, struct resource *r, struct token *token) { struct leader_record leader; + struct paxos_dblock dblock; int live_count = 0; int retries; int error; @@ -1114,7 +1132,7 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
token->flags |= T_WRITE_DBLOCK_MBLOCK_SH;
- rv = paxos_lease_acquire(task, token, 0, &leader, 0, 0); + rv = paxos_lease_acquire(task, token, 0, &leader, &dblock, 0, 0);
token->flags &= ~T_WRITE_DBLOCK_MBLOCK_SH;
@@ -1186,7 +1204,7 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok }
do_mb: - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, &dblock); if (rv < 0) { log_errot(token, "convert_sh2ex write_host_block error %d", rv);
@@ -1259,7 +1277,8 @@ static int convert_ex2sh_token(struct task *task, struct resource *r, struct tok if (r->flags & R_LVB_WRITE_RELEASE) write_lvb_block(task, r, token);
- rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED); + /* FIXME: preserve dblock */ + rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, NULL); if (rv < 0) { log_errot(token, "convert_ex2sh write_host_block error %d", rv); return rv; @@ -1382,6 +1401,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, char *killpath, char *killargs) { struct leader_record leader; + struct paxos_dblock dblock; struct resource *r; uint64_t acquire_lver = 0; uint32_t new_num_hosts = 0; @@ -1565,7 +1585,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, retry: memset(&leader, 0, sizeof(struct leader_record));
- rv = acquire_disk(task, token, acquire_lver, new_num_hosts, owner_nowait, &leader); + rv = acquire_disk(task, token, acquire_lver, new_num_hosts, owner_nowait, &leader, &dblock);
if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { /* @@ -1631,7 +1651,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, */
if (token->acquire_flags & SANLK_RES_SHARED) { - rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED); + rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, &dblock); if (rv < 0) { log_errot(token, "acquire_token sh write_host_block error %d", rv); r->flags &= ~R_SHARED; @@ -1990,7 +2010,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc log_token(token, "release async r_flags %x", r_flags);
if (r_flags & R_ERASE_ALL) { - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async erase all write_host_block %d", rv);
@@ -2015,7 +2035,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc (unsigned long long)r->leader.lver, rv);
} else if (r_flags & R_UNDO_SHARED) { - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async undo shared write_host_block %d", rv);
@@ -2032,7 +2052,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc } else if (r_flags & R_SHARED) { /* normal release of sh lease */
- rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async shared write_host_block %d", rv);
@@ -2051,7 +2071,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc }
/* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0); + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async write_host_block %d", rv);
This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch testing in repository sanlock.
commit abab5e6f19abc927dba11635614c19a5590f9c04 Author: David Teigland teigland@redhat.com Date: Mon Nov 20 15:00:54 2017 -0600
sanlock: add all dblock vals to debug
for first four hosts.
TODO: make configurable --- src/paxos_lease.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 7 deletions(-)
diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 726056f..777757b 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -431,6 +431,8 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, uint64_t next_lver, uint64_t our_mbal, struct paxos_dblock *dblock_out) { + char bk_debug[256]; + char bk_str[64]; struct paxos_dblock dblock; struct paxos_dblock bk_in; struct paxos_dblock bk_max; @@ -477,7 +479,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, * component is greater than dblock[p].mbal." */
- log_token(token, "ballot %llu phase1 mbal %llu", + log_token(token, "ballot %llu phase1 write mbal %llu", (unsigned long long)next_lver, (unsigned long long)our_mbal);
@@ -491,6 +493,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, num_writes = 0;
for (d = 0; d < num_disks; d++) { + /* acquire io: write 1 */ rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; @@ -504,6 +507,8 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, goto out; }
+ memset(bk_debug, 0, sizeof(bk_debug)); + num_reads = 0;
for (d = 0; d < num_disks; d++) { @@ -513,6 +518,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, continue; memset(iobuf[d], 0, iobuf_len);
+ /* acquire io: read 2 */ rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task, token->io_timeout, NULL); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; @@ -528,6 +534,19 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, paxos_dblock_in(bk_end, &bk_in); bk = &bk_in;
+ if (q < 4) { + memset(bk_str, 0, sizeof(bk_str)); + snprintf(bk_str, 64, "%d:%llu:%llu:%llu:%llu:%llu:%llu,", q, + (unsigned long long)bk_in.mbal, + (unsigned long long)bk_in.bal, + (unsigned long long)bk_in.inp, + (unsigned long long)bk_in.inp2, + (unsigned long long)bk_in.inp3, + (unsigned long long)bk_in.lver); + bk_str[63] = '\0'; + strcat(bk_debug, bk_str); + } + rv = verify_dblock(token, bk, checksum); if (rv < 0) continue; @@ -576,6 +595,9 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, } }
+ log_token(token, "ballot %llu phase1 read %s", + (unsigned long long)next_lver, bk_debug); + if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot %llu dblock read error %d", (unsigned long long)next_lver, rv); @@ -634,7 +656,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts,
phase2 = 1;
- log_token(token, "ballot %llu phase2 bal %llu inp %llu %llu %llu q_max %d", + log_token(token, "ballot %llu phase2 write bal %llu inp %llu %llu %llu q_max %d", (unsigned long long)dblock.lver, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, @@ -645,6 +667,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, num_writes = 0;
for (d = 0; d < num_disks; d++) { + /* acquire io: write 2 */ rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; @@ -658,6 +681,8 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, goto out; }
+ memset(bk_debug, 0, sizeof(bk_debug)); + num_reads = 0;
for (d = 0; d < num_disks; d++) { @@ -667,6 +692,7 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, continue; memset(iobuf[d], 0, iobuf_len);
+ /* acquire io: read 3 */ rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task, token->io_timeout, NULL); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; @@ -682,6 +708,19 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, paxos_dblock_in(bk_end, &bk_in); bk = &bk_in;
+ if (q < 4) { + memset(bk_str, 0, sizeof(bk_str)); + snprintf(bk_str, 64, "%d:%llu:%llu:%llu:%llu:%llu:%llu,", q, + (unsigned long long)bk->mbal, + (unsigned long long)bk->bal, + (unsigned long long)bk->inp, + (unsigned long long)bk->inp2, + (unsigned long long)bk->inp3, + (unsigned long long)bk->lver); + bk_str[63] = '\0'; + strcat(bk_debug, bk_str); + } + rv = verify_dblock(token, bk, checksum); if (rv < 0) continue; @@ -731,6 +770,9 @@ static int run_ballot(struct task *task, struct token *token, int num_hosts, } }
+ log_token(token, "ballot %llu phase2 read %s", + (unsigned long long)next_lver, bk_debug); + if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot %llu dblock read2 error %d", (unsigned long long)next_lver, rv); @@ -1120,8 +1162,11 @@ static int _lease_read_one(struct task *task, struct paxos_dblock *our_dblock, uint64_t *max_mbal, int *max_q, - const char *caller) + const char *caller, + int log_bk_vals) { + char bk_debug[256]; + char bk_str[64]; struct leader_record leader_end; struct paxos_dblock our_dblock_end; struct paxos_dblock bk; @@ -1162,6 +1207,8 @@ static int _lease_read_one(struct task *task, if (rv < 0) goto out;
+ memset(bk_debug, 0, sizeof(bk_debug)); + for (q = 0; q < leader_ret->num_hosts; q++) { bk_end = (struct paxos_dblock *)(iobuf + ((2 + q) * sector_size));
@@ -1169,6 +1216,19 @@ static int _lease_read_one(struct task *task,
paxos_dblock_in(bk_end, &bk);
+ if (log_bk_vals && (q < 4)) { + memset(bk_str, 0, sizeof(bk_str)); + snprintf(bk_str, 64, "%d:%llu:%llu:%llu:%llu:%llu:%llu,", q, + (unsigned long long)bk.mbal, + (unsigned long long)bk.bal, + (unsigned long long)bk.inp, + (unsigned long long)bk.inp2, + (unsigned long long)bk.inp3, + (unsigned long long)bk.lver); + bk_str[63] = '\0'; + strcat(bk_debug, bk_str); + } + rv = verify_dblock(token, &bk, checksum); if (rv < 0) goto out; @@ -1181,6 +1241,14 @@ static int _lease_read_one(struct task *task, *max_mbal = tmp_mbal; *max_q = tmp_q;
+ if (log_bk_vals) + log_token(token, "leader %llu owner %llu %llu %llu dblocks %s", + (unsigned long long)leader_ret->lver, + (unsigned long long)leader_ret->owner_id, + (unsigned long long)leader_ret->owner_generation, + (unsigned long long)leader_ret->timestamp, + bk_debug); + out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); @@ -1227,7 +1295,7 @@ static int _lease_read_num(struct task *task,
for (d = 0; d < num_disks; d++) { rv = _lease_read_one(task, token, &token->disks[d], &leader_one, - &dblock_one, &mbal_one, &q_one, caller); + &dblock_one, &mbal_one, &q_one, caller, 0); if (rv < 0) continue;
@@ -1297,7 +1365,7 @@ static int _lease_read_num(struct task *task,
static int paxos_lease_read(struct task *task, struct token *token, struct leader_record *leader_ret, - uint64_t *max_mbal, const char *caller) + uint64_t *max_mbal, const char *caller, int log_bk_vals) { struct paxos_dblock our_dblock; int rv, q = -1; @@ -1307,7 +1375,7 @@ static int paxos_lease_read(struct task *task, struct token *token, leader_ret, &our_dblock, max_mbal, &q, caller); else rv = _lease_read_one(task, token, &token->disks[0], - leader_ret, &our_dblock, max_mbal, &q, caller); + leader_ret, &our_dblock, max_mbal, &q, caller, log_bk_vals);
if (rv == SANLK_OK) log_token(token, "%s leader %llu owner %llu %llu %llu max mbal[%d] %llu " @@ -1427,7 +1495,8 @@ int paxos_lease_acquire(struct task *task,
restart:
- error = paxos_lease_read(task, token, &cur_leader, &max_mbal, "paxos_acquire"); + /* acquire io: read 1 */ + error = paxos_lease_read(task, token, &cur_leader, &max_mbal, "paxos_acquire", 1); if (error < 0) goto out;
@@ -1730,6 +1799,7 @@ int paxos_lease_acquire(struct task *task, copy_cur_leader = 0; memcpy(&tmp_leader, &cur_leader, sizeof(struct leader_record)); } else { + /* acquire io: read 1 (for retry) */ error = paxos_lease_leader_read(task, token, &tmp_leader, "paxos_acquire"); if (error < 0) goto out;
This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch testing in repository sanlock.
commit c10f23b5c19eaa0ca201e4f8a8d526d2543f07cb Author: David Teigland teigland@redhat.com Date: Tue Nov 21 11:02:17 2017 -0600
sanlock: fix detection of shared lease
When a host is acquiring a lease and detects that another host holds it shared, it will check if the host with the shared lease is dead. Before the dead-host check, the shared lease holder may have released its shared lease by clearing its mode_block. The host checking the shared lease needs to check if the shared lease has been released. --- src/resource.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-)
diff --git a/src/resource.c b/src/resource.c index 317fcb7..6309b9c 100644 --- a/src/resource.c +++ b/src/resource.c @@ -291,8 +291,9 @@ void check_mode_block(struct token *token, uint64_t next_lver, int q, char *dblo if (mb.flags & MBLOCK_SHARED) { set_id_bit(q + 1, token->shared_bitmap, NULL); token->shared_count++; - log_token(token, "ballot %llu mode[%d] shared %d", - (unsigned long long)next_lver, q, token->shared_count); + log_token(token, "ballot %llu mode[%d] shared %d gen %llu", + (unsigned long long)next_lver, q, token->shared_count, + (unsigned long long)mb.generation); } }
@@ -370,14 +371,13 @@ static int write_host_block(struct task *task, struct token *token, }
static int read_mode_block(struct task *task, struct token *token, - uint64_t host_id, uint64_t *max_gen) + uint64_t host_id, struct mode_block *mb_out) { struct sync_disk *disk; struct mode_block *mb_end; struct mode_block mb; char *iobuf, **p_iobuf; uint64_t offset; - uint64_t max = 0; int num_disks = token->r.num_disks; int iobuf_len, rv, d;
@@ -406,24 +406,23 @@ static int read_mode_block(struct task *task, struct token *token,
mode_block_in(mb_end, &mb);
- if (!(mb.flags & MBLOCK_SHARED)) - continue; + memcpy(mb_out, &mb, sizeof(struct mode_block));
- if (!max || mb.generation > max) - max = mb.generation; + /* FIXME: combine results for multi-disk case */ + break; }
if (rv != SANLK_AIO_TIMEOUT) free(iobuf);
- *max_gen = max; return rv; }
static int clear_dead_shared(struct task *task, struct token *token, int num_hosts, int *live_count) { - uint64_t host_id, max_gen = 0; + struct mode_block mb; + uint64_t host_id; int i, rv = 0, live = 0;
for (i = 0; i < num_hosts; i++) { @@ -435,16 +434,36 @@ static int clear_dead_shared(struct task *task, struct token *token, if (!test_id_bit(host_id, token->shared_bitmap)) continue;
- rv = read_mode_block(task, token, host_id, &max_gen); + memset(&mb, 0, sizeof(mb)); + + rv = read_mode_block(task, token, host_id, &mb); if (rv < 0) { log_errot(token, "clear_dead_shared read_mode_block %llu %d", (unsigned long long)host_id, rv); return rv; }
- if (host_live(token->r.lockspace_name, host_id, max_gen)) { + log_token(token, "clear_dead_shared host_id %llu mode_block: flags %x gen %llu", + (unsigned long long)host_id, mb.flags, (unsigned long long)mb.generation); + + /* + * We get to this function because we saw the shared flag during + * paxos, but the holder of the shared lease may have dropped their + * shared lease and cleared the mode_block since then. + */ + if (!(mb.flags & MBLOCK_SHARED)) + continue; + + if (!mb.generation) { + /* shouldn't happen; if the shared flag is set, the generation should also be set. */ + log_errot(token, "clear_dead_shared host_id %llu mode_block: flags %x gen %llu", + (unsigned long long)host_id, mb.flags, (unsigned long long)mb.generation); + continue; + } + + if (host_live(token->r.lockspace_name, host_id, mb.generation)) { log_token(token, "clear_dead_shared host_id %llu gen %llu alive", - (unsigned long long)host_id, (unsigned long long)max_gen); + (unsigned long long)host_id, (unsigned long long)mb.generation); live++; continue; } @@ -456,8 +475,12 @@ static int clear_dead_shared(struct task *task, struct token *token, return rv; }
- log_token(token, "clear_dead_shared host_id %llu gen %llu dead and cleared", - (unsigned long long)host_id, (unsigned long long)max_gen); + /* + * not an error, just useful to have a record of when we clear a shared + * lock that was left by a failed host. + */ + log_errot(token, "cleared shared lease for dead host_id %llu gen %llu", + (unsigned long long)host_id, (unsigned long long)mb.generation); }
*live_count = live;
This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch testing in repository sanlock.
commit 76332a0c5364578ea01adc801c743984aaf62e9f Author: David Teigland teigland@redhat.com Date: Wed Nov 22 16:04:36 2017 -0600
sanlock: fix release interference with paxos
The process of releasing a lease on disk involved first zeroing our own dblock values prior to zeroing the leader record. Clearing dblock values can interfere with an in-progress paxos ballot, and can result in a second host owning the same lease version that we released.
The incorrect zeroing of dblocks was added in commit d6bef45b9716c581d99466280a52a01c9ebe3bf7
The reason for zeroing was to fix an issue with shared leases here: e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8 That issue is now resolved by setting a new flag in the dblock to indicate we are releasing the lease. The new flag is set, along with the unchanged dblock values, before clearing the leader block. --- src/leader.h | 2 +- src/ondisk.c | 2 ++ src/paxos_dblock.h | 8 +++++ src/paxos_lease.c | 86 ++++++++++++++++++++++++++++++++++++++++++++------ src/resource.c | 64 +++++++++++++++++++++++++++++++------ src/sanlock_internal.h | 1 + 6 files changed, 143 insertions(+), 20 deletions(-)
diff --git a/src/leader.h b/src/leader.h index b95866a..d67c243 100644 --- a/src/leader.h +++ b/src/leader.h @@ -18,7 +18,7 @@ #define PAXOS_DISK_MAGIC 0x06152010 #define PAXOS_DISK_CLEAR 0x11282016 #define PAXOS_DISK_VERSION_MAJOR 0x00060000 -#define PAXOS_DISK_VERSION_MINOR 0x00000002 +#define PAXOS_DISK_VERSION_MINOR 0x00000003
#define DELTA_DISK_MAGIC 0x12212010 #define DELTA_DISK_VERSION_MAJOR 0x00030000 diff --git a/src/ondisk.c b/src/ondisk.c index 83b6832..21f9b64 100644 --- a/src/ondisk.c +++ b/src/ondisk.c @@ -92,6 +92,7 @@ void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock *pd) pd->inp3 = le64_to_cpu(end->inp3); pd->lver = le64_to_cpu(end->lver); pd->checksum = le32_to_cpu(end->checksum); + pd->flags = le32_to_cpu(end->flags); }
void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end) @@ -104,6 +105,7 @@ void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end) end->lver = cpu_to_le64(pd->lver); /* N.B. the checksum must be computed after the byte swapping */ /* paxos_dblock_out(pd, end); checksum = compute(end), end->checksum = cpu_to_le32(checksum); */ + end->flags = cpu_to_le32(pd->flags); }
void mode_block_in(struct mode_block *end, struct mode_block *mb) diff --git a/src/paxos_dblock.h b/src/paxos_dblock.h index a07abc4..7d08c95 100644 --- a/src/paxos_dblock.h +++ b/src/paxos_dblock.h @@ -12,6 +12,8 @@
#define DBLOCK_CHECKSUM_LEN 48 /* ends before checksum field */
+#define DBLOCK_FL_RELEASED 0x00000001 + struct paxos_dblock { uint64_t mbal; uint64_t bal; @@ -20,6 +22,12 @@ struct paxos_dblock { uint64_t inp3; /* host_id's timestamp */ uint64_t lver; uint32_t checksum; + uint32_t flags; /* DBLOCK_FL_ */ };
+/* + * This struct cannot grow any larger than MBLOCK_OFFSET (128) + * because the mode_block starts at that offset in the same sector. + */ + #endif diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 777757b..59a854b 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -1696,9 +1696,9 @@ int paxos_lease_acquire(struct task *task, if (cur_leader.write_id != cur_leader.owner_id) { rv = read_dblock(task, token, &token->disks[0], cur_leader.owner_id, &owner_dblock); - if (!rv && !owner_dblock.lver) { + if (!rv && (owner_dblock.flags & DBLOCK_FL_RELEASED)) { /* not an error, but interesting to see */ - log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock free", + log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock released", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, @@ -2026,28 +2026,68 @@ int paxos_lease_release(struct task *task, last = leader_last;
if (leader.lver != last->lver) { - log_errot(token, "paxos_release %llu other lver %llu", + log_errot(token, "paxos_release other lver " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, - (unsigned long long)leader.lver); + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, + (unsigned long long)leader.owner_id, + (unsigned long long)leader.owner_generation, + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_LVER; }
if (leader.timestamp == LEASE_FREE) { - log_errot(token, "paxos_release %llu already free %llu %llu %llu", + log_errot(token, "paxos_release already free " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, - (unsigned long long)leader.timestamp); + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); + return SANLK_RELEASE_OWNER; }
if (leader.owner_id != token->host_id || leader.owner_generation != token->host_generation) { - log_errot(token, "paxos_release %llu other owner %llu %llu %llu", + log_errot(token, "paxos_release other owner " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, + (unsigned long long)last->write_id, + (unsigned long long)last->write_generation, + (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, - (unsigned long long)leader.timestamp); + (unsigned long long)leader.timestamp, + (unsigned long long)leader.write_id, + (unsigned long long)leader.write_generation, + (unsigned long long)leader.write_timestamp); + return SANLK_RELEASE_OWNER; }
@@ -2063,19 +2103,45 @@ int paxos_lease_release(struct task *task, * perfectly fine (use log_error since it's interesting * to see when this happens.) */ - log_errot(token, "paxos_release %llu leader different " - "write %llu %llu %llu vs %llu %llu %llu", + log_errot(token, "paxos_release different vals " + "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " + "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, + (unsigned long long)last->owner_id, + (unsigned long long)last->owner_generation, + (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, + (unsigned long long)leader.lver, + (unsigned long long)leader.owner_id, + (unsigned long long)leader.owner_generation, + (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); + /* log_leader_error(0, token, &token->disks[0], last, "paxos_release"); log_leader_error(0, token, &token->disks[0], &leader, "paxos_release"); */ + + /* + * If another host was the writer and committed us as the + * owner, then we don't zero the leader record when we release, + * we just release our dblock (by setting the release flag, + * already done prior to calling paxos_lease_release). This is + * because other hosts will ignore our leader record if we were + * not the writer once we release our dblock. Those other + * hosts will then run a ballot and commit/write a new leader. + * If we are also zeroing the leader, that can race with + * another host writing a new leader, and we could clobber the + * new leader. + */ + if (leader.write_id != token->host_id) { + log_token(token, "paxos_release %llu skip leader write", (unsigned long long)last->lver); + return 0; + } }
if (resrename) diff --git a/src/resource.c b/src/resource.c index 6309b9c..3339422 100644 --- a/src/resource.c +++ b/src/resource.c @@ -361,8 +361,21 @@ static int write_host_block(struct task *task, struct token *token, log_errot(token, "write_host_block host_id %llu flags %x gen %llu rv %d", (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, rv); } else { - log_token(token, "write_host_block host_id %llu flags %x gen %llu", - (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen); + if (pd) + log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock %llu:%llu:%llu:%llu:%llu:%llu%s", + (unsigned long long)host_id, + mb_flags, + (unsigned long long)mb_gen, + (unsigned long long)pd->mbal, + (unsigned long long)pd->bal, + (unsigned long long)pd->inp, + (unsigned long long)pd->inp2, + (unsigned long long)pd->inp3, + (unsigned long long)pd->lver, + (pd->flags & DBLOCK_FL_RELEASED) ? ":RELEASED." : "."); + else + log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock 0", + (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen); }
if (rv != SANLK_AIO_TIMEOUT) @@ -370,6 +383,29 @@ static int write_host_block(struct task *task, struct token *token, return rv; }
+static int write_host_block_zero_dblock_release(struct task *task, struct token *token) +{ + struct paxos_dblock dblock; + + memcpy(&dblock, &token->resource->acquire_dblock, sizeof(dblock)); + + dblock.flags = DBLOCK_FL_RELEASED; + + return write_host_block(task, token, token->host_id, 0, 0, &dblock); +} + +static int write_host_block_shared_dblock_release(struct task *task, struct token *token) +{ + struct paxos_dblock dblock; + + memcpy(&dblock, &token->resource->acquire_dblock, sizeof(dblock)); + + dblock.flags = DBLOCK_FL_RELEASED; + + return write_host_block(task, token, token->host_id, token->host_generation, + MBLOCK_SHARED, &dblock); +} + static int read_mode_block(struct task *task, struct token *token, uint64_t host_id, struct mode_block *mb_out) { @@ -838,6 +874,8 @@ static int _release_token(struct task *task, struct token *token, */
if (r_flags & R_ERASE_ALL) { + /* FIXME: figure out what to clear to avoid disrupting ongoing paxos */ + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "release_token erase all write_host_block %d", rv); @@ -868,6 +906,8 @@ static int _release_token(struct task *task, struct token *token, (unsigned long long)lver, rv);
} else if (r_flags & R_UNDO_SHARED) { + /* FIXME: figure out what to clear to avoid disrupting ongoing paxos */ + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "release_token undo shared write_host_block %d", rv); @@ -889,7 +929,7 @@ static int _release_token(struct task *task, struct token *token, } else if (r_flags & R_SHARED) { /* normal release of sh lease */
- rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_host_block_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token shared write_host_block %d", rv); ret = rv; @@ -919,7 +959,7 @@ static int _release_token(struct task *task, struct token *token, }
/* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_host_block_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release_token write_host_block %d", rv);
@@ -1300,8 +1340,7 @@ static int convert_ex2sh_token(struct task *task, struct resource *r, struct tok if (r->flags & R_LVB_WRITE_RELEASE) write_lvb_block(task, r, token);
- /* FIXME: preserve dblock */ - rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, NULL); + rv = write_host_block_shared_dblock_release(task, token); if (rv < 0) { log_errot(token, "convert_ex2sh write_host_block error %d", rv); return rv; @@ -1668,13 +1707,15 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, if (!(token->acquire_flags & SANLK_RES_SHARED)) token->r.lver = leader.lver;
+ memcpy(&token->resource->acquire_dblock, &dblock, sizeof(dblock)); + /* * acquiring shared lease, so we set SHARED in our mode_block * and release the leader owner. */
if (token->acquire_flags & SANLK_RES_SHARED) { - rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, &dblock); + rv = write_host_block_shared_dblock_release(task, token); if (rv < 0) { log_errot(token, "acquire_token sh write_host_block error %d", rv); r->flags &= ~R_SHARED; @@ -2033,6 +2074,8 @@ static void resource_thread_release(struct task *task, struct resource *r, struc log_token(token, "release async r_flags %x", r_flags);
if (r_flags & R_ERASE_ALL) { + /* FIXME: figure out what to clear to avoid disrupting ongoing paxos */ + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async erase all write_host_block %d", rv); @@ -2058,6 +2101,8 @@ static void resource_thread_release(struct task *task, struct resource *r, struc (unsigned long long)r->leader.lver, rv);
} else if (r_flags & R_UNDO_SHARED) { + /* FIXME: figure out what to clear to avoid disrupting ongoing paxos */ + rv = write_host_block(task, token, token->host_id, 0, 0, NULL); if (rv < 0) log_errot(token, "release async undo shared write_host_block %d", rv); @@ -2075,7 +2120,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc } else if (r_flags & R_SHARED) { /* normal release of sh lease */
- rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_host_block_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async shared write_host_block %d", rv);
@@ -2094,7 +2139,7 @@ static void resource_thread_release(struct task *task, struct resource *r, struc }
/* Failure here is not a big deal and can be ignored. */ - rv = write_host_block(task, token, token->host_id, 0, 0, NULL); + rv = write_host_block_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async write_host_block %d", rv);
@@ -2256,6 +2301,7 @@ static void *resource_thread(void *arg GNUC_UNUSED) tt->host_generation = r->host_generation; tt->token_id = r->release_token_id; tt->io_timeout = r->io_timeout; + tt->resource = r;
/* * Set the time after which we should try to release this diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 472380f..279b56d 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -130,6 +130,7 @@ struct resource { char killpath[SANLK_HELPER_PATH_LEN]; /* copied from client */ char killargs[SANLK_HELPER_ARGS_LEN]; /* copied from client */ struct leader_record leader; /* copy of last leader_record we wrote */ + struct paxos_dblock acquire_dblock; /* dblock we wrote in acquire */ struct sanlk_resource r; };
sanlock-devel@lists.fedorahosted.org