src/paxos_lease.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 7 deletions(-)
New commits: commit e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8 Author: David Teigland teigland@redhat.com Date: Wed Jan 28 13:20:35 2015 -0600
sanlock: fix paxos release
Fix a case where a paxos release is clobbered, leaving the lease held on disk, but not by the host. Other hosts cannot acquire the lease until the owner is gone, or acquires and releases it again.
An extra write is added to every paxos release, making a sanlock_release two writes instead of one. A host releasing a paxos lease now writes to its own dblock, zeroing it, then writes the leader record with the owner field zeroed as before. (A side effect is that the mbal values in dblocks restart from zero at each acquire, rather than increasing indefinately.)
The clobbered release only happened when: - multiple hosts are attempting to acquire the lease at once, running the paxos ballot concurrently - two or more hosts, say A and B, complete the ballot, and both write the leader setting the owner to A - A writes the leader first with itself as owner - A then quickly releases the lease, writing the leader again, with the owner zeroed - B writes the ballot result to the leader, setting the owner to A - the leader write from B arrives after the leader write from A which released the lease - so B changes the owner back from zero to A again
This is the correct operation of the algorithm, and there is no clear way to prevent it (at least without a major rethinking of the implementation).
The solution is to: 1. zero the dblock during release, in addition to the leader owner 2. detect when the leader is clobbered by checking the dblock
A host trying to acquire a lease after the release was clobbered will: 1. see that the lease is owned by A 2. see that the owner of the lease, A, is alive 3. see that last writer of the leader (B) is not the same host as the owner of the lease (A) 4. read the dblock of the owner (A) to see if it is zeroed or not 5. if the owner's dblock is zero, it means that the release was clobbered as described above 6. the host trying to acquire the lease will go ahead and run a new ballot to acquire the lease
This solution uses the write_id part of the leader record which has previously only been used as a debugging aid. Using the write_id is not essential to the solution, but is an optimization. Without it, the owner's dblock would need to be read more often (any time acquire sees a lease held by a live host).
The problem of clobbered releases will occur when a lease is released very quickly after being acquired. This quick release is what happens when acquiring a shared lease, so this problem is most likely to be visible when multiple hosts all attempt to acquire a shared lease at once.
Signed-off-by: David Teigland teigland@redhat.com
diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 3e9c306..560392c 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -279,8 +279,8 @@ int paxos_lease_leader_clobber(struct task *task, return rv; }
-#if 0 static int read_dblock(struct task *task, + struct token *token, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) @@ -291,13 +291,14 @@ static int read_dblock(struct task *task, /* 1 leader block + 1 request block; host_id N is block offset N-1 */
rv = read_sectors(disk, 2 + host_id - 1, 1, (char *)&pd_end, sizeof(struct paxos_dblock), - task, "dblock"); + task, token->io_timeout, "dblock");
paxos_dblock_in(&pd_end, pd);
return rv; }
+#if 0 static int read_dblocks(struct task *task, struct sync_disk *disk, struct paxos_dblock *pds, @@ -1428,6 +1429,7 @@ int paxos_lease_acquire(struct task *task, struct leader_record tmp_leader; struct leader_record new_leader; struct paxos_dblock dblock; + struct paxos_dblock owner_dblock; struct host_status hs; uint64_t wait_start, now; uint64_t last_timestamp; @@ -1473,7 +1475,10 @@ int paxos_lease_acquire(struct task *task,
if (cur_leader.owner_id == token->host_id && cur_leader.owner_generation == token->host_generation) { - log_token(token, "paxos_acquire already owner id %llu gen %llu", + log_token(token, "paxos_acquire owner %llu %llu %llu is already local", + (unsigned long long)cur_leader.owner_id, + (unsigned long long)cur_leader.owner_generation, + (unsigned long long)cur_leader.timestamp, (unsigned long long)token->host_id, (unsigned long long)token->host_generation); copy_cur_leader = 1; @@ -1488,10 +1493,11 @@ int paxos_lease_acquire(struct task *task,
if (cur_leader.owner_id == token->host_id && cur_leader.owner_generation < token->host_generation) { - log_token(token, "paxos_acquire past owner id %llu gen %llu %llu", - (unsigned long long)token->host_id, - (unsigned long long)token->host_generation, - (unsigned long long)cur_leader.owner_generation); + log_token(token, "paxos_acquire owner %llu %llu %llu was old local new is %llu", + (unsigned long long)cur_leader.owner_id, + (unsigned long long)cur_leader.owner_generation, + (unsigned long long)cur_leader.timestamp, + (unsigned long long)token->host_generation); copy_cur_leader = 1; goto run; } @@ -1618,6 +1624,32 @@ int paxos_lease_acquire(struct task *task, (unsigned long long)host_id_leader.timestamp); } memcpy(leader_ret, &cur_leader, sizeof(struct leader_record)); + + /* It's possible that the live owner has released the + lease, but its release was clobbered by another host + that was running the ballot with it and wrote it as + the owner. If the leader writer was not the owner, + check if the owner's dblock is cleared. If so, then + the owner released the lease and we can run a + ballot. Comparing the write_id and owner_id is not + required; we could always read the owner dblock + here, but comparing the writer and owner can + eliminate many unnecessary dblock reads. */ + + if (cur_leader.write_id != cur_leader.owner_id) { + rv = read_dblock(task, token, &token->disks[0], + cur_leader.owner_id, &owner_dblock); + if (!rv && !owner_dblock.lver) { + /* not an error, but interesting to see */ + log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock free", + (unsigned long long)cur_leader.owner_id, + (unsigned long long)cur_leader.owner_generation, + (unsigned long long)cur_leader.timestamp, + (unsigned long long)cur_leader.write_id); + goto run; + } + } + error = SANLK_ACQUIRE_IDLIVE; goto out; } @@ -1911,6 +1943,24 @@ int paxos_lease_release(struct task *task, struct leader_record *last; int error;
+ /* + * If we are releasing this lease very quickly after acquiring it, + * there's a chance that another host was running the same acquire + * ballot that we were and also committed us as the owner of this + * lease, writing our inp values to the leader after we did ourself. + * That leader write from the other host may happen after the leader + * write we will do here releasing ownership. So the release we do + * here may be clobbered and lost. The result is that we own the lease + * on disk, but don't know it, so it won't be released unless we happen + * to acquire and release it again. The solution is that we clear our + * dblock in addition to clearing the leader record. Other hosts can + * then check our dblock to see if we really do own the lease. If the + * leader says we own the lease, but our dblock is cleared, then our + * leader write in release was clobbered, and other hosts will run a + * ballot to set a new owner. + */ + paxos_erase_dblock(task, token, token->host_id); + error = paxos_lease_leader_read(task, token, &leader, "paxos_release"); if (error < 0) { log_errot(token, "paxos_release leader_read error %d", error);
sanlock-devel@lists.fedorahosted.org