src/paxos_lease.c src/resource.c
by David Teigland
src/paxos_lease.c | 18 --
src/resource.c | 458 ++++++++++++++++++++++++++++++++----------------------
2 files changed, 272 insertions(+), 204 deletions(-)
New commits:
commit d6bef45b9716c581d99466280a52a01c9ebe3bf7
Author: David Teigland <teigland(a)redhat.com>
Date: Thu Dec 10 16:41:04 2015 -0600
sanlock: fix release clearing of host block
This is a comprehensive fix for the regression
introduced by e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8,
which made paxos_lease_release zero the dblock.
That commit failed to account for the fact that clearing
the dblock also cleared the mblock (mode_block) that has
the SHARED flag. This broke shared leases by clearing
the SHARED flag immediately after setting it.
Clearing the dblock and mblock is now moved up to
the resource layer above the paxos layer. Every
place that releases paxos leases or modifies the
mode block now needs to choose what should be
done with the "host block" (the host's sector
containing both the dblock and the mblock).
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index eae2fd9..2206c95 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1934,24 +1934,6 @@ int paxos_lease_release(struct task *task,
struct leader_record *last;
int error;
- /*
- * If we are releasing this lease very quickly after acquiring it,
- * there's a chance that another host was running the same acquire
- * ballot that we were and also committed us as the owner of this
- * lease, writing our inp values to the leader after we did ourself.
- * That leader write from the other host may happen after the leader
- * write we will do here releasing ownership. So the release we do
- * here may be clobbered and lost. The result is that we own the lease
- * on disk, but don't know it, so it won't be released unless we happen
- * to acquire and release it again. The solution is that we clear our
- * dblock in addition to clearing the leader record. Other hosts can
- * then check our dblock to see if we really do own the lease. If the
- * leader says we own the lease, but our dblock is cleared, then our
- * leader write in release was clobbered, and other hosts will run a
- * ballot to set a new owner.
- */
- paxos_erase_dblock(task, token, token->host_id);
-
error = paxos_lease_leader_read(task, token, &leader, "paxos_release");
if (error < 0) {
log_errot(token, "paxos_release leader_read error %d", error);
diff --git a/src/resource.c b/src/resource.c
index 8767746..55472d9 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -296,8 +296,8 @@ void check_mode_block(struct token *token, uint64_t next_lver, int q, char *dblo
}
}
-static int set_mode_block(struct task *task, struct token *token,
- uint64_t host_id, uint64_t gen, uint32_t flags)
+static int write_host_block(struct task *task, struct token *token,
+ uint64_t host_id, uint64_t mb_gen, uint32_t mb_flags)
{
struct sync_disk *disk;
struct mode_block mb;
@@ -319,21 +319,20 @@ static int set_mode_block(struct task *task, struct token *token,
if (rv)
return -ENOMEM;
- for (d = 0; d < num_disks; d++) {
- disk = &token->disks[d];
-
- offset = disk->offset + ((2 + host_id - 1) * disk->sector_size);
-
- rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);
- if (rv < 0)
- break;
+ memset(iobuf, 0, iobuf_len);
+ if (mb_gen || mb_flags) {
memset(&mb, 0, sizeof(mb));
- mb.flags = flags;
- mb.generation = gen;
-
+ mb.flags = mb_flags;
+ mb.generation = mb_gen;
mode_block_out(&mb, &mb_end);
memcpy(iobuf + MBLOCK_OFFSET, &mb_end, sizeof(struct mode_block));
+ }
+
+ for (d = 0; d < num_disks; d++) {
+ disk = &token->disks[d];
+
+ offset = disk->offset + ((2 + host_id - 1) * disk->sector_size);
rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);
if (rv < 0)
@@ -341,16 +340,17 @@ static int set_mode_block(struct task *task, struct token *token,
}
if (rv < 0) {
- log_errot(token, "set_mode_block host_id %llu flags %x gen %llu d %d rv %d",
- (unsigned long long)host_id, flags, (unsigned long long)gen, d, rv);
+ log_errot(token, "write_host_block host_id %llu flags %x gen %llu rv %d",
+ (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, rv);
} else {
- log_token(token, "set_mode_block host_id %llu flags %x gen %llu",
- (unsigned long long)host_id, flags, (unsigned long long)gen);
+ log_token(token, "write_host_block host_id %llu flags %x gen %llu",
+ (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen);
}
if (rv != SANLK_AIO_TIMEOUT)
free(iobuf);
return rv;
+
}
static int read_mode_block(struct task *task, struct token *token,
@@ -433,9 +433,9 @@ static int clear_dead_shared(struct task *task, struct token *token,
continue;
}
- rv = set_mode_block(task, token, host_id, 0, 0);
+ rv = write_host_block(task, token, host_id, 0, 0);
if (rv < 0) {
- log_errot(token, "clear_dead_shared host_id %llu set_mode_block %d",
+ log_errot(token, "clear_dead_shared host_id %llu write_host_block %d",
(unsigned long long)host_id, rv);
return rv;
}
@@ -624,34 +624,56 @@ static int release_disk(struct task *task, struct token *token,
* 2. perform on-disk operations to remove this host's ownership of the lease
* 3. list_del and free the struct resource
*
- * The on-disk operations in step 2 include five variations:
- * . skip all on-disk operations
- * . write zero mode block
- * . write zero leader record
- * . write zero mode block and write zero leader record
- * . write zero dblock and write zero leader record
+ * Normal cases:
+ *
+ * 1. release ex lease
+ *
+ * . zero our dblock values [see *]
+ * (zeroing our mblock at the same time is ok because it's not used)
+ * . Use paxos_lease_release to set LEASE_FREE in leader_record.
+ * . (If r->leader is zero, it implies that the on-disk lease was never
+ * acquired, so all on-disk operations are skipped.)
+ *
+ * 2. release sh lease (R_SHARED is set in r_flags)
+ *
+ * . As a shared lease holder we do not own the leader, so no
+ * change to the leader is needed.
+ * . zero our mblock values (our SHARED flag)
+ * (zeroing our dblock at the same time is ok because it's not used)
*
- * These on-disk variations are controlled by the following:
+ * Unusual cases:
*
- * - if the token indicates it's being released because the lockspace is
- * failed/dead, then all on-disk operations are skipped.
+ * 3. skip all disk operations
*
- * - if r->leader is zero, it implies that the on-disk lease was never
- * acquired, so all on-disk operations are skipped.
+ * . "nodisk" is used when the caller only needs to remove the token (step 1),
+ * i.e. on an error path prior to any disk operations having been started.
*
- * - if the caller has specified nodisk, then all on-disk operations are
- * skipped.
+ * . the token is being released because the lockspace is failed/dead,
+ * so disk operations are skipped since they'll fail.
*
- * - if R_SHARED is set, then only the host's mode_block is zeroed.
+ * . the token is being released after acquiring the lease failed,
+ * e.g. it was owned by another host.
*
- * - if R_SHARED is not set, then the leader_record is released
- * (involves reading it, verifying it's owned, then writing it)
+ * 4. try to unwind from failed acquire of a shared lease (R_UNDO_SHARED)
*
- * - if R_UNDO_SHARED is set, then the mode_block and leader_record
- * are both cleared
+ * . A disk operation failed while trying to acquire a shared lease,
+ * so we want to back out and leave the lease unowned. This means
+ * ensuring that our mblock does not have SHARED set and that we
+ * don't own the leader.
+ * . zero our mblock values
+ * . zero our dblock values [see *]
+ * . Use paxos_lease_release to set LEASE_FREE in leader_record.
+ *
+ * 5. try to unwind from failed acquire (R_ERASE_ALL)
+ *
+ * . A disk operation failed at some point while changing a lease,
+ * and we want to clear all ownership/state we have in the lease.
+ * . zero our mblock values
+ * . zero our dblock values [see * and **]
+ * . Use paxos_lease_release to set LEASE_FREE in leader_record.
+ *
+ * (4 and 5 are basically the same and should be combined)
*
- * - if R_ERASE_ALL is set, then the dblock and leader_record
- * are both cleared
*
* Error handling:
*
@@ -660,7 +682,23 @@ static int release_disk(struct task *task, struct token *token,
* The resource_thread will retry the on-disk operations until they succeed,
* then free the resource.
*
- * For ERASE_ALL we don't want another host running the ballot to select
+ * [*] Reason for clearing our dblock when releasing an ex/owned lease:
+ * If we are releasing this lease very quickly after acquiring it,
+ * there's a chance that another host was running the same acquire
+ * ballot that we were and also committed us as the owner of this
+ * lease, writing our inp values to the leader after we did ourself.
+ * That leader write from the other host may happen after the leader
+ * write we will do here releasing ownership. So the release we do
+ * here may be clobbered and lost. The result is that we own the lease
+ * on disk, but don't know it, so it won't be released unless we happen
+ * to acquire and release it again. The solution is that we clear our
+ * dblock in addition to clearing the leader record. Other hosts can
+ * then check our dblock to see if we really do own the lease. If the
+ * leader says we own the lease, but our dblock is cleared, then our
+ * leader write in release was clobbered, and other hosts will run a
+ * ballot to set a new owner.
+ *
+ * [**] For ERASE_ALL we don't want another host running the ballot to select
* our dblock values and commit them, making us the owner after we've aborted
* the acquire. So, we clear our dblock values first to prevent that from
* happening from this point forward. However, another host contending for the
@@ -691,8 +729,10 @@ static int _release_token(struct task *task, struct token *token,
struct resource *r = token->resource;
uint64_t lver;
uint32_t r_flags = 0;
+ int retry_async = 0;
int last_token = 0;
- int rv = 0;
+ int ret = SANLK_OK;
+ int rv;
/* We keep r on the resources_rem list while doing the actual release
on disk so another acquire for the same resource will see it on
@@ -727,31 +767,45 @@ static int _release_token(struct task *task, struct token *token,
if (token->space_dead) {
/* don't bother trying disk op which will probably timeout */
close_disks(token->disks, token->r.num_disks);
- rv = SANLK_OK;
goto out;
}
- if (nodisk) {
- rv = SANLK_OK;
+ if (nodisk)
goto out;
- }
if (!opened) {
rv = open_disks_fd(token->disks, token->r.num_disks);
if (rv < 0) {
log_errot(token, "release_token open error %d", rv);
+ ret = rv;
goto out;
}
}
+ log_token(token, "release_token r_flags %x lver %llu",
+ r_flags, (unsigned long long)lver);
+
+ /*
+ * In all cases we want to (or can) clear both dblock and mblock.
+ *
+ * Cases where we want to release ownership of the leader:
+ * . releasing ex lease !(r_flags & R_SHARED)
+ * . R_UNDO_SHARED
+ * . R_ERASE_ALL
+ *
+ * Cases where we don't want to release ownership of the leader:
+ * . releasing sh lease: (r_flags & R_SHARED)
+ */
+
if (r_flags & R_ERASE_ALL) {
- rv = paxos_erase_dblock(task, token, token->host_id);
+ rv = write_host_block(task, token, token->host_id, 0, 0);
if (rv < 0) {
- log_errot(token, "release_token erase_dblock error %d r_flags %x",
- rv, r_flags);
- goto out_close;
+ log_errot(token, "release_token erase all write_host_block %d", rv);
+ ret = rv;
}
- log_token(token, "release_token erase dblock done");
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
/* Even when acquire did not get far enough to get a copy of the
leader (!lver), we still want to try to release the leader
@@ -762,32 +816,59 @@ static int _release_token(struct task *task, struct token *token,
else
rv = paxos_lease_release(task, token, NULL, &r->leader, &leader);
+ if (rv < 0)
+ ret = rv;
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
/* want to see this result in sanlock.log but not worry people with error */
log_level(0, token->token_id, NULL, LOG_WARNING,
- "release_token erase leader lver %llu rv %d",
+ "release_token erase all leader lver %llu rv %d",
(unsigned long long)lver, rv);
- goto out_close;
- }
+ } else if (r_flags & R_UNDO_SHARED) {
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0) {
+ log_errot(token, "release_token undo shared write_host_block %d", rv);
+ ret = rv;
+ }
- if ((r_flags & R_SHARED) || (r_flags & R_UNDO_SHARED)) {
- rv = set_mode_block(task, token, token->host_id, 0, 0);
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
+ rv = release_disk(task, token, resrename, &r->leader);
if (rv < 0) {
- log_errot(token, "release_token set_mode_block error %d r_flags %x",
- rv, r_flags);
- goto out_close;
+ log_errot(token, "release_token undo shared release leader %d", rv);
+ ret = rv;
}
- }
- if (!lver) {
- /* zero lver means acquire did not get to the point of writing a leader,
- so we don't need to release the lease on disk. */
- close_disks(token->disks, token->r.num_disks);
- rv = SANLK_OK;
- goto out;
- }
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
+ } else if (r_flags & R_SHARED) {
+ /* normal release of sh lease */
+
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0) {
+ log_errot(token, "release_token shared write_host_block %d", rv);
+ ret = rv;
+ }
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
+ } else {
+ /* normal release of ex lease */
+
+ if (!lver) {
+ /* zero lver means acquire did not get to the point of writing a leader,
+ so we don't need to release the lease on disk. */
+ close_disks(token->disks, token->r.num_disks);
+ ret = SANLK_OK;
+ goto out;
+ }
- if (!(r_flags & R_SHARED) || (r_flags & R_UNDO_SHARED)) {
if (r_flags & R_LVB_WRITE_RELEASE) {
rv = write_lvb_block(task, r, token);
if (!rv)
@@ -797,20 +878,35 @@ static int _release_token(struct task *task, struct token *token,
/* do we want to give more effort to writing lvb? */
}
+ /* Failure here is not a big deal and can be ignored. */
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0)
+ log_errot(token, "release_token write_host_block %d", rv);
+
rv = release_disk(task, token, resrename, &r->leader);
if (rv < 0) {
- log_errot(token, "release_token disk error %d r_flags %x lver %llu",
- rv, r_flags, (unsigned long long)lver);
- goto out_close;
+ log_errot(token, "release_token release leader %d", rv);
+ ret = rv;
}
- }
- log_token(token, "release_token done %d r_flags %x t_flags %x", rv, r->flags, token->flags);
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+ }
- out_close:
close_disks(token->disks, token->r.num_disks);
-
out:
+ if (!retry_async) {
+ if (ret != SANLK_OK)
+ log_token(token, "release_token error %d r_flags %x", ret, r_flags);
+ else
+ log_token(token, "release_token done r_flags %x", r_flags);
+ pthread_mutex_lock(&resource_mutex);
+ list_del(&r->list);
+ pthread_mutex_unlock(&resource_mutex);
+ free_resource(r);
+ return ret;
+ }
+
/*
* If a transient i/o error prevented the release on disk,
* then handle this like an async release; set R_THREAD_RELEASE,
@@ -819,19 +915,12 @@ static int _release_token(struct task *task, struct token *token,
* disk, preventing others from acquiring it.
*/
+ log_errot(token, "release_token timeout r_flags %x", r_flags);
pthread_mutex_lock(&resource_mutex);
- if (rv == SANLK_AIO_TIMEOUT) {
- r->flags |= R_THREAD_RELEASE;
- r->release_token_id = token->token_id;
- } else {
- list_del(&r->list);
- }
+ r->flags |= R_THREAD_RELEASE;
+ r->release_token_id = token->token_id;
pthread_mutex_unlock(&resource_mutex);
-
- if (rv != SANLK_AIO_TIMEOUT)
- free_resource(r);
-
- return rv;
+ return SANLK_AIO_TIMEOUT;
}
static int release_token_nodisk(struct task *task, struct token *token)
@@ -1006,8 +1095,7 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
{
struct leader_record leader;
int live_count = 0;
- int fail_count = 0;
- int undo_dblock = 0;
+ int retries;
int error;
int rv;
@@ -1033,12 +1121,15 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
if (rv < 0) {
log_errot(token, "convert_sh2ex acquire error %d t_flags %x", rv, token->flags);
- /* If we might own the lease, then we need to do on-disk release
- of owner and dblock. Keep token and SH mblock. */
+ /* If the acquire failed before anything important was written,
+ then this RETRACT flag will not be set, and there is nothing
+ to undo/cleanup; we can simply return an error. Otherwise,
+ the acquire failed part way through, and we need to try to
+ clean up our state on disk. Do on-disk release of owner.
+ Keep token and SH mblock. */
if (token->flags & T_RETRACT_PAXOS) {
token->flags &= ~T_RETRACT_PAXOS;
- undo_dblock = 1;
error = rv;
goto fail;
}
@@ -1071,10 +1162,12 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
if (live_count) {
/*
- * A live host with a sh lock exists. The token is kept, the
- * lease owner is released.
+ * The convert fails because a live host with a sh lock exists.
+ * The token/lease is kept shared, the lease owner is released.
+ * Our SHARED mblock bit is still set on disk because
+ * T_WRITE_DBLOCK_MBLOCK_SH kept it set during acquire,
+ * so we only need to release the lease owner.
*/
-
rv = release_disk(task, token, NULL, &leader);
if (rv < 0) {
log_errot(token, "convert_sh2ex release_disk error %d", rv);
@@ -1088,27 +1181,14 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
}
do_mb:
- rv = set_mode_block(task, token, token->host_id, token->host_generation, 0);
+ rv = write_host_block(task, token, token->host_id, 0, 0);
if (rv < 0) {
- log_errot(token, "convert_sh2ex set_mode_block error %d %d", rv, fail_count);
-
- /* We have the ex lease, so return success. We just need to clear
- our SH mblock. We retry a couple times, and then set ERASE_ALL
- so that when the token is later released, both owner and mblock will
- be cleared. */
-
- if ((rv == SANLK_AIO_TIMEOUT) && (fail_count < 2)) {
- fail_count++;
- sleep(1);
- goto do_mb;
- }
-
- if (rv < 0 && (rv != SANLK_AIO_TIMEOUT)) {
- error = rv;
- goto fail;
- }
+ log_errot(token, "convert_sh2ex write_host_block error %d", rv);
- r->flags |= R_ERASE_ALL;
+ /* We have the ex lease, so return success. We just failed to
+ clear our SH mblock. When we later release this lease,
+ the release includes clearing the dblock/mblock, so there's
+ not really anything we need to do. */
}
/* TODO: clean up the duplication of stuff among: t, t->r, r, r->r */
@@ -1140,38 +1220,19 @@ static int convert_sh2ex_token(struct task *task, struct resource *r, struct tok
* lockspace/leases should be considered invalid.
*/
- fail_count++;
-
if (token->space_dead)
return error;
- if (undo_dblock) {
- token->flags |= T_WRITE_DBLOCK_MBLOCK_SH;
-
- rv = paxos_erase_dblock(task, token, token->host_id);
-
- token->flags &= ~T_WRITE_DBLOCK_MBLOCK_SH;
-
- if ((rv == SANLK_AIO_TIMEOUT) && (fail_count < token->io_timeout)) {
- log_errot(token, "convert_sh2ex fail %d undo dblock timeout", fail_count);
- sleep(fail_count);
- goto fail;
- } else if (rv < 0) {
- log_errot(token, "convert_sh2ex fail %d undo dblock error %d", fail_count, rv);
- r->flags |= R_ERASE_ALL;
- return error;
- }
-
- undo_dblock = 0;
- }
-
+ retries = 0;
+ retry:
rv = paxos_lease_release(task, token, NULL, leader.lver ? &leader : NULL, &leader);
- if ((rv == SANLK_AIO_TIMEOUT) && (fail_count < token->io_timeout)) {
- log_errot(token, "convert_sh2ex fail %d undo owner timeout", fail_count);
- sleep(fail_count);
- goto fail;
+ if ((rv == SANLK_AIO_TIMEOUT) && (retries < 3)) {
+ retries++;
+ log_errot(token, "convert_sh2ex fail %d undo owner timeout", retries);
+ sleep(token->io_timeout);
+ goto retry;
} else if (rv < 0) {
- log_errot(token, "convert_sh2ex fail %d undo owner error %d", fail_count, rv);
+ log_errot(token, "convert_sh2ex fail %d undo owner error %d", retries, rv);
r->flags |= R_ERASE_ALL;
return error;
}
@@ -1193,9 +1254,9 @@ static int convert_ex2sh_token(struct task *task, struct resource *r, struct tok
if (r->flags & R_LVB_WRITE_RELEASE)
write_lvb_block(task, r, token);
- rv = set_mode_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED);
+ rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED);
if (rv < 0) {
- log_errot(token, "convert_ex2sh set_mode_block error %d", rv);
+ log_errot(token, "convert_ex2sh write_host_block error %d", rv);
return rv;
}
@@ -1565,10 +1626,9 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags,
*/
if (token->acquire_flags & SANLK_RES_SHARED) {
- rv = set_mode_block(task, token, token->host_id,
- token->host_generation, MBLOCK_SHARED);
+ rv = write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED);
if (rv < 0) {
- log_errot(token, "acquire_token sh set_mode_block error %d", rv);
+ log_errot(token, "acquire_token sh write_host_block error %d", rv);
r->flags &= ~R_SHARED;
r->flags |= R_UNDO_SHARED;
release_token_opened(task, token);
@@ -1576,13 +1636,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags,
}
/* the token is kept, the paxos lease is released but with shared set */
-
- token->flags |= T_WRITE_DBLOCK_MBLOCK_SH;
-
rv = release_disk(task, token, NULL, &leader);
-
- token->flags &= T_WRITE_DBLOCK_MBLOCK_SH;
-
if (rv < 0) {
log_errot(token, "acquire_token sh release_disk error %d", rv);
r->flags &= ~R_SHARED;
@@ -1902,7 +1956,10 @@ static void resource_thread_release(struct task *task, struct resource *r, struc
struct leader_record leader;
struct space_info spi;
uint32_t r_flags;
- int rv = 0;
+ int retry_async = 0;
+ int rv;
+
+ r_flags = r->flags;
rv = open_disks_fd(token->disks, token->r.num_disks);
if (rv < 0) {
@@ -1915,20 +1972,25 @@ static void resource_thread_release(struct task *task, struct resource *r, struc
rv = lockspace_info(token->r.lockspace_name, &spi);
if (rv < 0 || spi.killing_pids) {
+ log_token(token, "release async info %d %d", rv, spi.killing_pids);
rv = -1;
goto out_close;
}
- r_flags = r->flags;
+ /*
+ * See comments in _release_token.
+ * FIXME: avoid duplicating all this from _release_token.
+ */
+
+ log_token(token, "release async r_flags %x", r_flags);
if (r_flags & R_ERASE_ALL) {
- rv = paxos_erase_dblock(task, token, token->host_id);
- if (rv < 0) {
- log_errot(token, "release async erase_dblock error %d r_flags %x",
- rv, r_flags);
- goto out_close;
- }
- log_token(token, "release async erase dblock done");
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0)
+ log_errot(token, "release async erase all write_host_block %d", rv);
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
/* Even when acquire did not get far enough to get a copy of the
leader (!lver), we still want to try to release the leader
@@ -1939,56 +2001,80 @@ static void resource_thread_release(struct task *task, struct resource *r, struc
else
rv = paxos_lease_release(task, token, NULL, &r->leader, &leader);
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
/* want to see this result in sanlock.log but not worry people with error */
log_level(0, token->token_id, NULL, LOG_WARNING,
- "release async erase leader lver %llu rv %d",
+ "release async erase all leader lver %llu rv %d",
(unsigned long long)r->leader.lver, rv);
- goto out_close;
- }
+ } else if (r_flags & R_UNDO_SHARED) {
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0)
+ log_errot(token, "release async undo shared write_host_block %d", rv);
- if ((r_flags & R_SHARED) || (r_flags & R_UNDO_SHARED)) {
- rv = set_mode_block(task, token, token->host_id, 0, 0);
- if (rv < 0) {
- log_errot(token, "release async set_mode_block error %d r_flags %x",
- rv, r_flags);
- goto out_close;
- }
- }
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
+ rv = release_disk(task, token, NULL, &r->leader);
+ if (rv < 0)
+ log_errot(token, "release async undo shared release leader %d", rv);
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+
+ } else if (r_flags & R_SHARED) {
+ /* normal release of sh lease */
+
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0)
+ log_errot(token, "release async shared write_host_block %d", rv);
+
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+ } else {
+ /* normal release of ex lease */
- if (!(r_flags & R_SHARED) || (r_flags & R_UNDO_SHARED)) {
if (r_flags & R_LVB_WRITE_RELEASE) {
rv = write_lvb_block(task, r, token);
if (!rv)
r->flags &= ~R_LVB_WRITE_RELEASE;
else
log_errot(token, "release async write_lvb error %d", rv);
+ /* do we want to give more effort to writing lvb? */
}
+ /* Failure here is not a big deal and can be ignored. */
+ rv = write_host_block(task, token, token->host_id, 0, 0);
+ if (rv < 0)
+ log_errot(token, "release async write_host_block %d", rv);
+
rv = release_disk(task, token, NULL, &r->leader);
- if (rv < 0) {
- log_errot(token, "release async disk error %d r_flags %x lver %llu",
- rv, r_flags, (unsigned long long)r->leader.lver);
- goto out_close;
- }
- }
+ if (rv < 0)
+ log_errot(token, "release async release leader %d", rv);
- log_token(token, "release async done %d r_flags %x", rv, r_flags);
+ if (rv == SANLK_AIO_TIMEOUT)
+ retry_async = 1;
+ }
out_close:
close_disks(token->disks, token->r.num_disks);
-
out:
- pthread_mutex_lock(&resource_mutex);
- if (rv == SANLK_AIO_TIMEOUT) {
- r->flags |= R_THREAD_RELEASE;
- } else {
+ if (!retry_async) {
+ log_token(token, "release async done r_flags %x", r_flags);
+ pthread_mutex_lock(&resource_mutex);
list_del(&r->list);
+ pthread_mutex_unlock(&resource_mutex);
+ free_resource(r);
+ return;
}
- pthread_mutex_unlock(&resource_mutex);
- if (rv != SANLK_AIO_TIMEOUT)
- free_resource(r);
+ /* Keep the resource on the list to keep trying. */
+ log_token(token, "release async timeout r_flags %x", r_flags);
+ pthread_mutex_lock(&resource_mutex);
+ r->flags |= R_THREAD_RELEASE;
+ pthread_mutex_unlock(&resource_mutex);
}
static void resource_thread_examine(struct task *task, struct token *tt, int pid, uint64_t lver)
7 years, 11 months
src/sanlock.8
by David Teigland
src/sanlock.8 | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 168 insertions(+)
New commits:
commit 47cf665d310c726f93a07b9c139f8be5dcb4a65a
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Dec 4 17:01:42 2015 -0600
sanlock: add man page section about internals
diff --git a/src/sanlock.8 b/src/sanlock.8
index eb735aa..c605b64 100644
--- a/src/sanlock.8
+++ b/src/sanlock.8
@@ -870,6 +870,174 @@ using the ORPHAN flag (-O 1), or release the orphan lease using the ORPHAN
flag (-O 1). All orphan leases can be released by setting the lockspace
name (-s lockspace_name) with no resource name.
+.SH INTERNALS
+
+.SS Disk Format
+
+.IP \[bu] 2
+This example uses 512 byte sectors.
+.IP \[bu] 2
+Each lockspace is 1MB. It holds 2000 delta_leases, one per sector,
+supporting up to 2000 hosts.
+.IP \[bu] 2
+Each paxos_lease is 1MB. It is used as a lease for one resource.
+.IP \[bu] 2
+The leader_record structure is used differently by each lease type.
+.IP \[bu] 2
+To display all leader_record fields, see sanlock direct read_leader.
+.IP \[bu] 2
+A lockspace is often followed on disk by the paxos_leases used within that
+lockspace, but this layout is not required.
+.IP \[bu] 2
+The request_record and host_id bitmap are used for requests/events.
+.IP \[bu] 2
+The mode_block contains the SHARED flag indicating a lease is held in the
+shared mode.
+.IP \[bu] 2
+In a lockspace, the host using host_id N writes to a single delta_lease in
+sector N-1. No other hosts write to this sector. All hosts read all
+lockspace sectors when renewing their own delta_lease, and are able to
+monitor renewals of all delta_leases.
+.IP \[bu] 2
+In a paxos_lease, each host has a dedicated sector it writes to,
+containing its own paxos_dblock and mode_block structures. Its sector is
+based on its host_id; host_id 1 writes to the dblock/mode_block in sector
+2 of the paxos_lease.
+.IP \[bu] 2
+The paxos_dblock structures are used by the paxos_lease algorithm, and the
+result is written to the leader_record.
+
+.P
+
+.B 0x000000 lockspace foo:0:/path:0
+
+(There is no representation on disk of the lockspace in general, only the
+sequence of specific delta_leases which collectively represent the
+lockspace.)
+
+.B delta_lease foo:1:/path:0
+.nf
+0x000 0 leader_record (sector 0, for host_id 1)
+ magic: 0x12212010
+ space_name: foo
+ resource_name: host uuid/name
+ \.\.\.
+ host_id bitmap (leader_record + 256)
+.fi
+
+.B delta_lease foo:2:/path:0
+.nf
+0x200 512 leader_record (sector 1, for host_id 2)
+ magic: 0x12212010
+ space_name: foo
+ resource_name: host uuid/name
+ \.\.\.
+ host_id bitmap (leader_record + 256)
+.fi
+
+.B delta_lease foo:3:/path:0
+.nf
+0x400 1024 leader_record (sector 2, for host_id 3)
+ magic: 0x12212010
+ space_name: foo
+ resource_name: host uuid/name
+ \.\.\.
+ host_id bitmap (leader_record + 256)
+.fi
+
+.B delta_lease foo:2000:/path:0
+.nf
+0xF9E00 leader_record (sector 1999, for host_id 2000)
+ magic: 0x12212010
+ space_name: foo
+ resource_name: host uuid/name
+ \.\.\.
+ host_id bitmap (leader_record + 256)
+.fi
+
+.B 0x100000 paxos_lease foo:example1:/path:1048576
+.nf
+0x000 0 leader_record (sector 0)
+ magic: 0x06152010
+ space_name: foo
+ resource_name: example1
+
+0x200 512 request_record (sector 1)
+ magic: 0x08292011
+
+0x400 1024 paxos_dblock (sector 2, for host_id 1)
+0x480 1152 mode_block (paxos_dblock + 128)
+
+0x600 1536 paxos_dblock (sector 3, for host_id 2)
+0x680 1664 mode_block (paxos_dblock + 128)
+
+0x800 2048 paxos_dblock (sector 4, for host_id 3)
+0x880 2176 mode_block (paxos_dblock + 128)
+
+0xFA200 paxos_dblock (sector 2001, for host_id 2000)
+0xFA280 mode_block (paxos_dblock + 128)
+.fi
+
+.B 0x200000 paxos_lease foo:example2:/path:2097152
+.nf
+0x000 0 leader_record (sector 0)
+ magic: 0x06152010
+ space_name: foo
+ resource_name: example2
+
+0x200 512 request_record (sector 1)
+ magic: 0x08292011
+
+0x400 1024 paxos_dblock (sector 2, for host_id 1)
+0x480 1152 mode_block (paxos_dblock + 128)
+
+0x600 1536 paxos_dblock (sector 3, for host_id 2)
+0x680 1664 mode_block (paxos_dblock + 128)
+
+0x800 2048 paxos_dblock (sector 4, for host_id 3)
+0x880 2176 mode_block (paxos_dblock + 128)
+
+0xFA200 paxos_dblock (sector 2001, for host_id 2000)
+0xFA280 mode_block (paxos_dblock + 128)
+.fi
+
+.SS Lease ownership
+
+Not shown in the leader_record structures above are the owner_id,
+owner_generation and timestamp fields. These are the fields that define
+the lease owner.
+
+The delta_lease at sector N for host_id N+1 has leader_record.owner_id
+N+1. The leader_record.owner_generation is incremented each time the
+delta_lease is acquired. When a delta_lease is acquired, the
+leader_record.timestamp field is set to the time of the host and the
+leader_record.resource_name is set to the unique name of the host. When
+the host renews the delta_lease, it writes a new leader_record.timestamp.
+When a host releases a delta_lease, it writes zero to
+leader_record.timestamp.
+
+When a host acquires a paxos_lease, it uses the host_id/generation value
+from the delta_lease it holds in the lockspace. It uses this
+host_id/generation to identify itself in the paxos_dblock when running the
+paxos algorithm. The result of the algorithm is the winning
+host_id/generation - the new owner of the paxos_lease. The winning
+host_id/generation are written to the paxos_lease leader_record.owner_id
+and leader_record.owner_generation fields and leader_record.timestamp is
+set. When a host releases a paxos_lease, it sets leader_record.timestamp
+to 0.
+
+When a paxos_lease is free (leader_record.timestamp is 0), multiple hosts
+may attempt to acquire it. The paxos algorithm, using the paxos_dblock
+structures, will select only one of the hosts as the new owner, and that
+owner is written in the leader_record. The paxos_lease will no longer be
+free (non-zero timestamp). Other hosts will see this and will not attempt
+to acquire the paxos_lease until it is free again.
+
+If a paxos_lease is owned (non-zero timestamp), but the owner has not
+renewed its delta_lease for a specific length of time, then the owner
+value in the paxos_lease becomes expired, and other hosts will use the
+paxos algorithm to acquire the paxos_lease, and set a new owner.
+
.SH SEE ALSO
.BR wdmd (8)
7 years, 12 months
src/resource.c
by David Teigland
src/resource.c | 5 +++++
1 file changed, 5 insertions(+)
New commits:
commit ce98f5f8141bdf012caf40046dab7fa69f433fdf
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Dec 4 14:21:08 2015 -0600
sanlock: don't clear the shared flag on disk
When a shared lease is acquired, the shared flag is
written to the mode_block (following the dblock structure),
and then the leader record is released.
In commit e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8,
releasing the leader record also began to clear the
dblock structure. Clearing the dblock also wrongly
cleared the mode_block, causing the shared flag to
be lost. This means that sanlock believes a shared
lease is held, when in fact on disk it is not held,
and can be acquired exclusively by another host.
diff --git a/src/resource.c b/src/resource.c
index b378f62..8767746 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -1577,7 +1577,12 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags,
/* the token is kept, the paxos lease is released but with shared set */
+ token->flags |= T_WRITE_DBLOCK_MBLOCK_SH;
+
rv = release_disk(task, token, NULL, &leader);
+
+ token->flags &= T_WRITE_DBLOCK_MBLOCK_SH;
+
if (rv < 0) {
log_errot(token, "acquire_token sh release_disk error %d", rv);
r->flags &= ~R_SHARED;
8 years