invalid lockspace found
by Frido Roose
Hello,
I'm not sure if this is the appropriate place to ask, but we're running
libvirtd with the sanlock plugin.
After a while, I noticed the following in the sanlock logfile after I
couldn't create a new VM:
838555 sanlock daemon started aio 1 10 renew 20 80 host
144ce1f8-ddf7-4139-8e97-291aeddd7b81.arqua time 1330086549
838556 s1 lockspace
__LIBVIRT__DISKS__:13:/var/lib/libvirt/sanlock/__LIBVIRT__DISKS__:0
838616 s1:r1 resource
__LIBVIRT__DISKS__:7cf11c7a5da5d530521646d738636e84:/var/lib/libvirt/sanlock/7cf11c7a5da5d530521646d738636e84:0
for 1,9,31377
838632 s1:r2 resource
__LIBVIRT__DISKS__:b2ad1e0874ba7c316cc848d3e0a98439:/var/lib/libvirt/sanlock/b2ad1e0874ba7c316cc848d3e0a98439:0
for 2,12,31499
839477 s1 check_our_lease warning 60 last_success 839417
839478 s1 check_our_lease warning 61 last_success 839417
839479 s1 check_our_lease warning 62 last_success 839417
839480 s1 check_our_lease warning 63 last_success 839417
839481 s1 check_our_lease warning 64 last_success 839417
839482 s1 check_our_lease warning 65 last_success 839417
839483 s1 check_our_lease warning 66 last_success 839417
839484 s1 check_our_lease warning 67 last_success 839417
839485 s1 check_our_lease warning 68 last_success 839417
839486 s1 check_our_lease warning 69 last_success 839417
839487 s1 check_our_lease warning 70 last_success 839417
839488 s1 check_our_lease warning 71 last_success 839417
839489 s1 check_our_lease warning 72 last_success 839417
839490 s1 check_our_lease warning 73 last_success 839417
839491 s1 check_our_lease warning 74 last_success 839417
839492 s1 check_our_lease warning 75 last_success 839417
839493 s1 check_our_lease warning 76 last_success 839417
839494 s1 check_our_lease warning 77 last_success 839417
839495 s1 check_our_lease warning 78 last_success 839417
839496 s1 check_our_lease warning 79 last_success 839417
839497 s1 check_our_lease failed 80
841667 s1 renewed 841667 delta_length 2229 too long
841671 r3 cmd_acquire 1,9,7234 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
841672 r4 cmd_acquire 2,10,7268 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
841674 r5 cmd_acquire 2,10,7765 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
841675 r6 cmd_acquire 1,9,7735 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842012 r7 cmd_acquire 1,9,8495 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842013 r8 cmd_acquire 1,9,8622 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842015 r9 cmd_acquire 1,9,9130 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842016 r10 cmd_acquire 1,9,9241 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842019 r11 cmd_acquire 1,9,9669 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842020 r12 cmd_acquire 2,10,9697 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842021 r13 cmd_acquire 1,9,10146 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842022 r14 cmd_acquire 1,9,10240 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842115 r15 cmd_acquire 1,9,10755 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842117 r16 cmd_acquire 1,9,11008 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842410 r17 cmd_acquire 1,9,11511 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
842410 r18 cmd_acquire 1,9,11616 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
843609 r19 cmd_acquire 1,9,12835 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
843616 r20 cmd_acquire 1,9,13093 invalid lockspace found -1 failed 0 name
__LIBVIRT__DISKS__
So for some reason, the lockspace became invalid (and it looks like the
host lease was not renewed).
Running sanlock-1.8-2.el6.x86_64.
Anything else I can check?
Best regards,
Frido
11 years, 9 months
2 commits - src/paxos_lease.c
by David Teigland
src/paxos_lease.c | 7 +++++++
1 file changed, 7 insertions(+)
New commits:
commit 12f4276f4adaecdd3a6b4ef409c25ac2e5c13a2a
Author: David Teigland <teigland(a)redhat.com>
Date: Mon Feb 27 12:52:11 2012 -0600
sanlock: fix leaking fd
Recent shared lease commit started leaking this fd.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index 19218eb..807972f 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1064,6 +1064,7 @@ int paxos_lease_acquire(struct task *task,
error = SANLK_ACQUIRE_IDDISK;
goto out;
}
+ disk_open = 1;
}
rv = host_info(cur_leader.space_name, cur_leader.owner_id, &hs);
commit dc0e0927d9f1e63f8f53feffe14cee1aede30e3f
Author: David Teigland <teigland(a)redhat.com>
Date: Mon Feb 27 11:00:59 2012 -0600
sanlock: add paxos sanity check
Check for a condition that I don't think should ever happen.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index d1243cb..19218eb 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1293,6 +1293,12 @@ int paxos_lease_acquire(struct task *task,
goto restart;
}
+ if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) {
+ /* I don't think this should ever happen. */
+ log_errot(token, "paxos_acquire restart leader changed2");
+ goto restart;
+ }
+
error = run_ballot(task, token, cur_leader.num_hosts, next_lver, our_mbal,
&dblock);
11 years, 9 months
2 commits - src/paxos_lease.c src/resource.c
by David Teigland
src/paxos_lease.c | 22 ++++++++++++++++++++++
src/resource.c | 3 +++
2 files changed, 25 insertions(+)
New commits:
commit fe03cafde177757ffe3c9e24d269c4ad5328f538
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Feb 24 16:25:51 2012 -0600
sanlock: fix problem in paxos
The special case in the comment should be recognized
and handled properly.
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index 9fa62b1..d1243cb 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1220,6 +1220,11 @@ int paxos_lease_acquire(struct task *task,
* current host_id and generation?
*/
+ /* This next_lver assignment is based on the original cur_leader, not a
+ re-reading of the leader here, i.e. we cannot just re-read the leader
+ here, and make next_lver one more than that. This is because another
+ node may have made us the owner of next_lver as it is now. */
+
next_lver = cur_leader.lver + 1;
if (!our_dblock.mbal)
@@ -1271,6 +1276,23 @@ int paxos_lease_acquire(struct task *task,
goto out;
}
+ if (tmp_leader.lver > next_lver) {
+ /*
+ * A case where this was observed: for next_lver 65 we abort1, and delay.
+ * While sleeping, the lease v65 (which was acquired during our abort1) is
+ * released and then reacquired as v66. When we goto retry_ballot, our
+ * next_lver is 65, but the current lver on disk is 66, causing us to
+ * we fail in the larger1 check.)
+ */
+ log_token(token, "paxos_acquire stale next_lver %llu now %llu owner %llu %llu %llu",
+ (unsigned long long)next_lver,
+ (unsigned long long)tmp_leader.lver,
+ (unsigned long long)tmp_leader.owner_id,
+ (unsigned long long)tmp_leader.owner_generation,
+ (unsigned long long)tmp_leader.timestamp);
+ goto restart;
+ }
+
error = run_ballot(task, token, cur_leader.num_hosts, next_lver, our_mbal,
&dblock);
commit 74383a19e5a1254fc661630b2c6bc1d9b3b098d6
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Feb 24 15:25:13 2012 -0600
sanlock: fix missing close_disks
which was leaking fd's
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/resource.c b/src/resource.c
index 0df17a4..a76b88d 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -349,17 +349,20 @@ static int _release_token(struct task *task, struct token *token, int opened,
if ((r->flags & R_SHARED) && !last_token) {
/* will release when final sh token is released */
log_token(token, "release_token more shared");
+ close_disks(token->disks, token->r.num_disks);
return SANLK_OK;
}
if (!last_token) {
/* should never happen */
log_errot(token, "release_token exclusive not last");
+ close_disks(token->disks, token->r.num_disks);
return SANLK_ERROR;
}
if (!lver) {
/* never acquired on disk so no need to release on disk */
+ close_disks(token->disks, token->r.num_disks);
rv = SANLK_OK;
goto out;
}
11 years, 9 months
Changes to 'shared'
by David Teigland
New branch 'shared' available with the following commits:
commit 950734136e384a33e4ab2c68f3ab7b38d222cb0b
Author: David Teigland <teigland(a)redhat.com>
Date: Tue Feb 21 16:43:48 2012 -0600
sanlock: add shared leases
Shared lease will fail if the lease is held normally
(exclusive). Normal lease will fail if a host holds
it shared.
Signed-off-by: David Teigland <teigland(a)redhat.com>
commit 6cdf956b03662e659e7c11188aa1d3dacd483f5d
Author: David Teigland <teigland(a)redhat.com>
Date: Tue Feb 21 10:24:47 2012 -0600
init scripts: fix path to restorecon
Signed-off-by: David Teigland <teigland(a)redhat.com>
11 years, 9 months
2 commits - src/cmd.c src/resource.c src/resource.h
by David Teigland
src/cmd.c | 11 ++++++-----
src/resource.c | 7 +++----
src/resource.h | 5 +++--
3 files changed, 12 insertions(+), 11 deletions(-)
New commits:
commit db393f0dcf036320d4d524a27daefce21ba3c862
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Feb 15 14:01:11 2012 -0600
sanlock: fix debug line
and shift where save_resource_lver is called (no change in behavior)
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/cmd.c b/src/cmd.c
index d44b835..0c65b06 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -350,6 +350,7 @@ static void cmd_acquire(struct task *task, struct cmd_args *ca)
result = rv;
goto done;
}
+ save_resource_lver(token, token->leader.lver);
acquire_count++;
}
diff --git a/src/resource.c b/src/resource.c
index 9002f06..ea9d7bb 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -83,7 +83,7 @@ static struct resource *find_resource(struct token *token,
return NULL;
}
-static void save_resource_lver(struct token *token, uint64_t lver)
+void save_resource_lver(struct token *token, uint64_t lver)
{
struct resource *r;
@@ -190,15 +190,14 @@ int acquire_token(struct task *task, struct token *token,
close_disks(token->disks, token->r.num_disks);
log_token(token, "acquire rv %d lver %llu at %llu", rv,
- (unsigned long long)token->leader.lver,
- (unsigned long long)token->leader.timestamp);
+ (unsigned long long)leader_ret.lver,
+ (unsigned long long)leader_ret.timestamp);
if (rv < 0)
return rv;
memcpy(&token->leader, &leader_ret, sizeof(struct leader_record));
token->r.lver = token->leader.lver;
- save_resource_lver(token, token->leader.lver);
return rv; /* SANLK_OK */
}
diff --git a/src/resource.h b/src/resource.h
index 72de40a..85f3be5 100644
--- a/src/resource.h
+++ b/src/resource.h
@@ -6,8 +6,8 @@
* of the GNU General Public License v2 or (at your option) any later version.
*/
-#ifndef __TOKEN_MANAGER_H__
-#define __TOKEN_MANAGER_H__
+#ifndef __RESOURCE_H__
+#define __RESOURCE_H__
int acquire_token(struct task *task, struct token *token,
uint64_t acquire_lver, int new_num_hosts);
@@ -21,6 +21,7 @@ int request_token(struct task *task, struct token *token, uint32_t force_mode,
int add_resource(struct token *token, int pid, uint32_t cl_restrict);
void del_resource(struct token *token);
+void save_resource_lver(struct token *token, uint64_t lver);
int set_resource_examine(char *space_name, char *res_name);
commit 24009b9f13d4c5a1d9becf9e856159016c111b75
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Feb 15 13:57:23 2012 -0600
sanlock: fix error exit
missing free
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/src/cmd.c b/src/cmd.c
index 3cef77d..d44b835 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -766,9 +766,8 @@ static void cmd_request(struct task *task, struct cmd_args *ca)
rv = recv(fd, token->disks, disks_len, MSG_WAITALL);
if (rv != disks_len) {
- free(token);
result = -ENOTCONN;
- goto reply;
+ goto reply_free;
}
/* zero out pad1 and pad2, see WARNING above */
@@ -787,18 +786,19 @@ static void cmd_request(struct task *task, struct cmd_args *ca)
error = request_token(task, token, force_mode, &owner_id);
if (error < 0) {
result = error;
- goto reply;
+ goto reply_free;
}
result = 0;
if (!token->acquire_lver && !force_mode)
- goto reply;
+ goto reply_free;
if (owner_id)
host_status_set_bit(token->r.lockspace_name, owner_id);
- reply:
+ reply_free:
free(token);
+ reply:
log_debug("cmd_request %d,%d done %d", ca->ci_in, fd, result);
send_result(fd, &ca->header, result);
11 years, 10 months