src/client_msg.c | 6 -
src/delta_lease.c | 9 -
src/delta_lease.h | 1
src/direct.c | 3
src/host_id.c | 276 +++++++++++++++++++++++++++++++++++++------------
src/host_id.h | 3
src/leader.h | 8 +
src/main.c | 39 ++++--
src/sanlock_internal.h | 21 +++
src/task.c | 1
10 files changed, 280 insertions(+), 87 deletions(-)
New commits:
commit 258db5c545d4f9ac8d8a041a298867e6a950b4f1
Author: Federico Simoncelli <fsimonce(a)redhat.com>
Date: Wed Aug 17 15:35:37 2011 -0500
client: return appropriate errno on failure
diff --git a/src/client_msg.c b/src/client_msg.c
index 21b18db..96ad970 100644
--- a/src/client_msg.c
+++ b/src/client_msg.c
@@ -38,7 +38,7 @@ int connect_socket(int *sock_fd)
s = socket(AF_LOCAL, SOCK_STREAM, 0);
if (s < 0)
- return -1;
+ return -errno;
rv = sanlock_socket_address(&addr);
if (rv < 0)
@@ -46,7 +46,7 @@ int connect_socket(int *sock_fd)
rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
if (rv < 0) {
- rv = -1;
+ rv = -errno;
close(s);
return rv;
}
@@ -70,7 +70,7 @@ int send_header(int sock, int cmd, uint32_t cmd_flags, int datalen,
rv = send(sock, (void *) &header, sizeof(struct sm_header), 0);
if (rv < 0)
- return -1;
+ return -errno;
return 0;
}
commit 9fc87ead232eb7ac07dfb98be91d18ce772f4ccb
Author: David Teigland <teigland(a)redhat.com>
Date: Wed Aug 17 15:22:17 2011 -0500
sanlock: check other host_id leases
keep track of the owner and timestamp changes of other host_id
leases, and check if our host_id bit has been set in their
bitmap (there's no code yet that would set it)
diff --git a/src/delta_lease.c b/src/delta_lease.c
index b210d7f..12386ce 100644
--- a/src/delta_lease.c
+++ b/src/delta_lease.c
@@ -325,6 +325,7 @@ int delta_lease_renew(struct task *task,
struct sync_disk *disk,
char *space_name,
int prev_result,
+ int *read_result,
struct leader_record *leader_last,
struct leader_record *leader_ret)
{
@@ -338,12 +339,11 @@ int delta_lease_renew(struct task *task,
if (!leader_last)
return -EINVAL;
+ *read_result = SANLK_ERROR;
+
host_id = leader_last->owner_id;
- /* read all delta leases */
- iobuf_len = direct_align(disk);
- if (iobuf_len <= 0)
- return -EINVAL;
+ iobuf_len = sp->align_size;
/* offset of our leader_record */
offset = (host_id - 1) * disk->sector_size;
@@ -428,6 +428,7 @@ int delta_lease_renew(struct task *task,
}
read_done:
+ *read_result = SANLK_OK;
memcpy(&leader, task->iobuf+offset, sizeof(struct leader_record));
rv = verify_leader(disk, space_name, host_id, &leader, "delta_renew");
diff --git a/src/delta_lease.h b/src/delta_lease.h
index 9206c76..4a0cad2 100644
--- a/src/delta_lease.h
+++ b/src/delta_lease.h
@@ -29,6 +29,7 @@ int delta_lease_renew(struct task *task,
struct sync_disk *disk,
char *space_name,
int prev_result,
+ int *read_result,
struct leader_record *leader_last,
struct leader_record *leader_ret);
diff --git a/src/direct.c b/src/direct.c
index 8b09fe8..f8f15db 100644
--- a/src/direct.c
+++ b/src/direct.c
@@ -191,7 +191,7 @@ static int do_delta_action(int action,
struct leader_record leader;
struct sync_disk sd;
struct space space;
- int rv;
+ int read_result, rv;
/* for log_space in delta functions */
memset(&space, 0, sizeof(space));
@@ -230,6 +230,7 @@ static int do_delta_action(int action,
rv = delta_lease_renew(task, &space, &sd,
ls->name,
-1,
+ &read_result,
&leader,
&leader);
break;
diff --git a/src/host_id.c b/src/host_id.c
index 67e532a..a16194e 100644
--- a/src/host_id.c
+++ b/src/host_id.c
@@ -31,6 +31,7 @@
#include "host_id.h"
#include "watchdog.h"
#include "task.h"
+#include "direct.h"
static unsigned int space_id_counter = 1;
@@ -161,11 +162,107 @@ int host_id_disk_info(char *name, struct sync_disk *disk)
return rv;
}
+/*
+ * Host_id bitmap helpers.
+ *
+ * host_ids are 1-based: host_id N lives in byte ((N - 1) / 8) of bitmap,
+ * at bit position ((N - 1) % 8) within that byte. Each helper builds a
+ * one-bit mask by shifting; using (host_id % 8) itself as the mask would
+ * touch the wrong bits, and no bit at all for multiples of 8.
+ *
+ * A host_id below 1 is ignored (test_bit reports it as not set). The
+ * caller must pass a bitmap large enough for host_id; its size is not
+ * checked here.
+ */
+
+/* clear the bit that belongs to host_id */
+static void clear_bit(int host_id, char *bitmap)
+{
+ unsigned char *byte;
+
+ if (host_id < 1)
+ return;
+
+ byte = (unsigned char *)bitmap + ((host_id - 1) / 8);
+ *byte &= ~(1u << ((host_id - 1) % 8));
+}
+
+/* set the bit that belongs to host_id */
+static void set_bit(int host_id, char *bitmap)
+{
+ unsigned char *byte;
+
+ if (host_id < 1)
+ return;
+
+ byte = (unsigned char *)bitmap + ((host_id - 1) / 8);
+ *byte |= 1u << ((host_id - 1) % 8);
+}
+
+/* return 1 if the bit that belongs to host_id is set, otherwise 0 */
+static int test_bit(int host_id, char *bitmap)
+{
+ unsigned char byte;
+
+ if (host_id < 1)
+ return 0;
+
+ /* read through unsigned char so bit 7 is not affected by char signedness */
+ byte = (unsigned char)bitmap[(host_id - 1) / 8];
+ return (byte >> ((host_id - 1) % 8)) & 1;
+}
+
+/*
+ * check_other_leases: scan a copy of the host_id lease area (buf) and
+ * update sp->host_info[] for every host_id except our own.
+ *
+ * For slot i, buf + i * sector_size is read as the leader_record of
+ * host_id i+1. last_check is always set to the current monotime. If
+ * owner_id, owner_generation and timestamp all match what was recorded
+ * before, nothing else happens for that slot. Otherwise the new values
+ * are recorded and last_live is set to now. For a changed record whose
+ * bitmap (HOSTID_BITMAP_OFFSET bytes past the leader_record) has our
+ * host_id bit set, a new request is noted unless the previous one from
+ * that host was taken less than task->request_finish_seconds ago. If any
+ * new request was noted, sp->req_count is incremented once, under
+ * sp->mutex.
+ *
+ * when entering the monitor loop in paxos_lease, once
+ * last_check - last_live > host_dead_seconds, it's expired
+ *
+ * at local time t=last_live, we read timestamp=X
+ * at local time t=last_check, we read timestamp=X
+ * so once the difference between last_live and last_check
+ * is > host_dead_seconds, the host has not renewed its
+ * timestamp in host_dead_seconds.
+ */
+
+void check_other_leases(struct task *task, struct space *sp, char *buf)
+{
+ struct leader_record *leader;
+ struct sync_disk *disk;
+ struct host_info *info;
+ char *bitmap;
+ uint64_t now;
+ int i, new;
+
+ disk = &sp->host_id_disk;
+
+ now = monotime();
+ new = 0; /* becomes 1 once any new request is seen in this scan */
+
+ for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { /* slot i holds host_id i+1 */
+ if (i+1 == sp->host_id) /* skip our own lease */
+ continue;
+
+ info = &sp->host_info[i];
+ info->last_check = now;
+
+ leader = (struct leader_record *)(buf + (i * disk->sector_size));
+
+ /* nothing changed since the last scan: last_live is left alone */
+ if (info->owner_id == leader->owner_id &&
+ info->owner_generation == leader->owner_generation &&
+ info->timestamp == leader->timestamp) {
+ continue;
+ }
+
+ /* the record changed: remember the new values and when we saw them */
+ info->owner_id = leader->owner_id;
+ info->owner_generation = leader->owner_generation;
+ info->timestamp = leader->timestamp;
+ info->last_live = now;
+
+ bitmap = (char *)leader + HOSTID_BITMAP_OFFSET;
+
+ if (!test_bit(sp->host_id, bitmap)) /* no request addressed to us */
+ continue;
+
+ /* this host has made a request for us, we won't take a new
+ request from this host for another request_finish_seconds */
+
+ if (now - info->last_req < task->request_finish_seconds)
+ continue;
+
+ log_space(sp, "request from host_id %d", i+1);
+ info->last_req = now;
+ new = 1;
+ }
+
+ /* TODO: add a thread that will periodically scan spaces and
+ for any with req_count > req_check, scan request blocks for
+ all locally held paxos leases in that lockspace. */
+
+ if (new) { /* req_count goes up by one per scan, however many hosts asked */
+ pthread_mutex_lock(&sp->mutex);
+ sp->req_count++;
+ pthread_mutex_unlock(&sp->mutex);
+ }
+}
+
/*
* check if our_host_id_thread has renewed within timeout
*/
-int host_id_check(struct task *task, struct space *sp)
+int check_our_lease(struct task *task, struct space *sp, int *check_all, char
*check_buf)
{
uint64_t last_success;
int corrupt_result;
@@ -174,31 +271,39 @@ int host_id_check(struct task *task, struct space *sp)
pthread_mutex_lock(&sp->mutex);
last_success = sp->lease_status.renewal_last_success;
corrupt_result = sp->lease_status.corrupt_result;
+
+ if (sp->lease_status.renewal_read_count > sp->lease_status.renewal_read_check)
{
+ /* main loop will pass this buf to check_other_leases next */
+ sp->lease_status.renewal_read_check = sp->lease_status.renewal_read_count;
+ *check_all = 1;
+ if (check_buf)
+ memcpy(check_buf, sp->lease_status.renewal_read_buf, sp->align_size);
+ }
pthread_mutex_unlock(&sp->mutex);
if (corrupt_result) {
- log_erros(sp, "host_id_check corrupt %d", corrupt_result);
- return 0;
+ log_erros(sp, "check_our_lease corrupt %d", corrupt_result);
+ return -1;
}
gap = monotime() - last_success;
if (gap >= task->id_renewal_fail_seconds) {
- log_erros(sp, "host_id_check failed %d", gap);
- return 0;
+ log_erros(sp, "check_our_lease failed %d", gap);
+ return -1;
}
if (gap >= task->id_renewal_warn_seconds) {
- log_erros(sp, "host_id_check warning %d last_success %llu",
+ log_erros(sp, "check_our_lease warning %d last_success %llu",
gap, (unsigned long long)last_success);
}
if (com.debug_renew > 1) {
- log_space(sp, "host_id_check good %d %llu",
+ log_space(sp, "check_our_lease good %d %llu",
gap, (unsigned long long)last_success);
}
- return 1;
+ return 0;
}
/* If a renewal result is one of the listed errors, it means our
@@ -227,9 +332,9 @@ static void *lockspace_thread(void *arg_in)
struct task task;
struct space *sp;
struct leader_record leader;
- time_t last_attempt, last_success;
- int rv, result, delta_length, gap;
- int delta_result = 0;
+ uint64_t delta_begin, last_success;
+ int rv, delta_length, renewal_interval;
+ int acquire_result, delta_result, read_result;
int opened = 0;
int stop = 0;
@@ -240,49 +345,64 @@ static void *lockspace_thread(void *arg_in)
setup_task_aio(&task, main_task.use_aio, HOSTID_AIO_CB_SIZE);
memcpy(task.name, sp->space_name, NAME_ID_SIZE);
- last_attempt = monotime();
+ delta_begin = monotime();
rv = open_disk(&sp->host_id_disk);
if (rv < 0) {
log_erros(sp, "open_disk %s error %d", sp->host_id_disk.path, rv);
- result = -ENODEV;
+ acquire_result = -ENODEV;
goto set_status;
}
opened = 1;
- result = delta_lease_acquire(&task, sp, &sp->host_id_disk,
- sp->space_name, our_host_name_global,
- sp->host_id, &leader);
- delta_result = result;
- delta_length = monotime() - last_attempt;
+ sp->align_size = direct_align(&sp->host_id_disk);
+
+ sp->lease_status.renewal_read_buf = malloc(sp->align_size);
+ if (!sp->lease_status.renewal_read_buf) {
+ acquire_result = -ENOMEM;
+ goto set_status;
+ }
+
+ /*
+ * acquire the delta lease
+ */
+
+ delta_begin = monotime();
+
+ delta_result = delta_lease_acquire(&task, sp, &sp->host_id_disk,
+ sp->space_name, our_host_name_global,
+ sp->host_id, &leader);
+ delta_length = monotime() - delta_begin;
- if (result == SANLK_OK)
+ if (delta_result == SANLK_OK)
last_success = leader.timestamp;
+ acquire_result = delta_result;
+
/* we need to start the watchdog after we acquire the host_id but
before we allow any pid's to begin running */
- if (result == SANLK_OK) {
+ if (delta_result == SANLK_OK) {
rv = create_watchdog_file(sp, last_success);
if (rv < 0) {
log_erros(sp, "create_watchdog failed %d", rv);
- result = SANLK_ERROR;
+ acquire_result = SANLK_ERROR;
}
}
set_status:
pthread_mutex_lock(&sp->mutex);
- sp->lease_status.acquire_last_result = result;
- sp->lease_status.acquire_last_attempt = last_attempt;
- if (result == SANLK_OK)
+ sp->lease_status.acquire_last_result = acquire_result;
+ sp->lease_status.acquire_last_attempt = delta_begin;
+ if (delta_result == SANLK_OK)
sp->lease_status.acquire_last_success = last_success;
- sp->lease_status.renewal_last_result = result;
- sp->lease_status.renewal_last_attempt = last_attempt;
- if (result == SANLK_OK)
+ sp->lease_status.renewal_last_result = acquire_result;
+ sp->lease_status.renewal_last_attempt = delta_begin;
+ if (delta_result == SANLK_OK)
sp->lease_status.renewal_last_success = last_success;
pthread_mutex_unlock(&sp->mutex);
- if (result < 0)
+ if (acquire_result < 0)
goto out;
sp->host_generation = leader.owner_generation;
@@ -291,10 +411,14 @@ static void *lockspace_thread(void *arg_in)
pthread_mutex_lock(&sp->mutex);
stop = sp->thread_stop;
pthread_mutex_unlock(&sp->mutex);
-
if (stop)
break;
+
+ /*
+ * wait between each renewal
+ */
+
if (monotime() - last_success < task.id_renewal_seconds) {
sleep(1);
continue;
@@ -304,54 +428,69 @@ static void *lockspace_thread(void *arg_in)
usleep(500000);
}
- last_attempt = monotime();
- result = delta_lease_renew(&task, sp, &sp->host_id_disk,
- sp->space_name, delta_result,
- &leader, &leader);
- delta_result = result;
- delta_length = monotime() - last_attempt;
+ /*
+ * do a renewal, measuring length of time spent in renewal,
+ * and the length of time between successful renewals
+ */
+
+ delta_begin = monotime();
- if (result == SANLK_OK)
+ delta_result = delta_lease_renew(&task, sp, &sp->host_id_disk,
+ sp->space_name, delta_result,
+ &read_result, &leader, &leader);
+ delta_length = monotime() - delta_begin;
+
+ if (delta_result == SANLK_OK) {
+ renewal_interval = leader.timestamp - last_success;
last_success = leader.timestamp;
+ }
+
+
+ /*
+ * publish the results
+ */
pthread_mutex_lock(&sp->mutex);
- sp->lease_status.renewal_last_result = result;
- sp->lease_status.renewal_last_attempt = last_attempt;
+ sp->lease_status.renewal_last_result = delta_result;
+ sp->lease_status.renewal_last_attempt = delta_begin;
- if (result == SANLK_OK) {
- gap = last_success - sp->lease_status.renewal_last_success;
+ if (delta_result == SANLK_OK)
sp->lease_status.renewal_last_success = last_success;
- if (delta_length > task.id_renewal_seconds) {
- log_erros(sp, "renewed %llu delta_length %d too long",
- (unsigned long long)last_success,
- delta_length);
- } else if (com.debug_renew) {
- log_space(sp, "renewed %llu delta_length %d interval %d",
- (unsigned long long)last_success,
- delta_length, gap);
- }
-
- if (!sp->thread_stop)
- update_watchdog_file(sp, last_success);
- } else {
- log_erros(sp, "renewal error %d delta_length %d last_success %llu",
- result, delta_length,
- (unsigned long long)sp->lease_status.renewal_last_success);
+ if (delta_result != SANLK_OK && !sp->lease_status.corrupt_result)
+ sp->lease_status.corrupt_result = corrupt_result(delta_result);
- if (!sp->lease_status.corrupt_result)
- sp->lease_status.corrupt_result = corrupt_result(result);
+ if (read_result == SANLK_OK && task.iobuf) {
+ memcpy(sp->lease_status.renewal_read_buf, task.iobuf, sp->align_size);
+ sp->lease_status.renewal_read_count++;
}
+
+
+ /*
+ * pet the watchdog
+ */
+
+ if (delta_result == SANLK_OK && !sp->thread_stop)
+ update_watchdog_file(sp, last_success);
+
pthread_mutex_unlock(&sp->mutex);
- /* TODO: pass off all the delta leases we read (in task->iobuf)
- for analysis by another thread */
/*
- if (result == SANLK_OK)
- queue_delta_lease_analysis(sp, task->iobuf);
- */
+ * log the results
+ */
+
+ if (delta_result != SANLK_OK) {
+ log_erros(sp, "renewal error %d delta_length %d last_success %llu",
+ delta_result, delta_length, (unsigned long long)last_success);
+ } else if (delta_length > task.id_renewal_seconds) {
+ log_erros(sp, "renewed %llu delta_length %d too long",
+ (unsigned long long)last_success, delta_length);
+ } else if (com.debug_renew) {
+ log_space(sp, "renewed %llu delta_length %d interval %d",
+ (unsigned long long)last_success, delta_length, renewal_interval);
+ }
}
/* unlink called below to get it done ASAP */
@@ -368,6 +507,13 @@ static void *lockspace_thread(void *arg_in)
return NULL;
}
+static void free_sp(struct space *sp)
+{
+ /* release the renewal read buffer first (it may still be NULL, and
+ free(NULL) does nothing), then the space itself */
+ free(sp->lease_status.renewal_read_buf);
+ free(sp);
+}
+
/*
* When this function returns, it needs to be safe to being processing lease
* requests and allowing pid's to run, so we need to own our host_id, and the
@@ -498,7 +644,7 @@ int add_lockspace(struct sanlk_lockspace *ls)
list_del(&sp->list);
pthread_mutex_unlock(&spaces_mutex);
fail_free:
- free(sp);
+ free_sp(sp);
return rv;
}
@@ -601,7 +747,7 @@ void free_lockspaces(int wait)
if (!rv) {
log_space(sp, "free lockspace");
list_del(&sp->list);
- free(sp);
+ free_sp(sp);
}
}
pthread_mutex_unlock(&spaces_mutex);
diff --git a/src/host_id.h b/src/host_id.h
index 5cc8b50..69cb43f 100644
--- a/src/host_id.h
+++ b/src/host_id.h
@@ -13,7 +13,8 @@ int print_space_state(struct space *sp, char *str);
int _get_space_info(char *space_name, struct space *sp_out);
int get_space_info(char *space_name, struct space *sp_out);
int host_id_disk_info(char *name, struct sync_disk *disk);
-int host_id_check(struct task *task, struct space *sp);
+int check_our_lease(struct task *task, struct space *sp, int *check_all, char
*check_buf);
+void check_other_leases(struct task *task, struct space *sp, char *buf);
int add_lockspace(struct sanlk_lockspace *ls);
int rem_lockspace(struct sanlk_lockspace *ls);
void free_lockspaces(int wait);
diff --git a/src/leader.h b/src/leader.h
index dca9b62..2830ae1 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -17,7 +17,7 @@
#define PAXOS_DISK_MAGIC 0x06152010
#define PAXOS_DISK_VERSION_MAJOR 0x00050000
-#define PAXOS_DISK_VERSION_MINOR 0x00000001
+#define PAXOS_DISK_VERSION_MINOR 0x00000001
#define DELTA_DISK_MAGIC 0x12212010
#define DELTA_DISK_VERSION_MAJOR 0x00030000
@@ -63,4 +63,10 @@ struct leader_record {
uint64_t write_timestamp; /* for extra info, debug */
};
+/* leader_record can use first 256 bytes of a sector,
+ bitmap uses the last 256 bytes */
+
+#define LEADER_RECORD_MAX 256
+#define HOSTID_BITMAP_OFFSET 256
+
#endif
diff --git a/src/main.c b/src/main.c
index c17b18f..3f0f448 100644
--- a/src/main.c
+++ b/src/main.c
@@ -546,7 +546,9 @@ static int main_loop(void)
struct timeval now, last_check;
int poll_timeout, check_interval;
unsigned int ms;
- int i, rv, empty, space_dead;
+ int i, rv, empty, check_all;
+ char *check_buf = NULL;
+ int check_buf_len = 0;
gettimeofday(&last_check, NULL);
poll_timeout = STANDARD_CHECK_INTERVAL;
@@ -582,9 +584,21 @@ static int main_loop(void)
continue;
}
last_check = now;
+ check_interval = STANDARD_CHECK_INTERVAL;
pthread_mutex_lock(&spaces_mutex);
list_for_each_entry_safe(sp, safe, &spaces, list) {
+ check_all = 0;
+
+ if (sp->align_size > check_buf_len) {
+ if (check_buf)
+ free(check_buf);
+ check_buf_len = sp->align_size;
+ check_buf = malloc(check_buf_len);
+ }
+ if (check_buf)
+ memset(check_buf, 0, check_buf_len);
+
if (sp->killing_pids) {
if (all_pids_dead(sp)) {
log_space(sp, "set thread_stop");
@@ -596,24 +610,26 @@ static int main_loop(void)
} else {
kill_pids(sp);
}
- check_interval = RECOVERY_CHECK_INTERVAL;
} else {
- space_dead = !host_id_check(&main_task, sp);
+ rv = check_our_lease(&main_task, sp,
+ &check_all, check_buf);
- if (space_dead || external_shutdown ||
- sp->external_remove) {
- log_space(sp, "set killing_pids dead %d "
+ if (rv || external_shutdown || sp->external_remove) {
+ log_space(sp, "set killing_pids check %d "
"shutdown %d remove %d",
- space_dead, external_shutdown,
+ rv, external_shutdown,
sp->external_remove);
- sp->space_dead = space_dead;
+ sp->space_dead = 1;
sp->killing_pids = 1;
kill_pids(sp);
- check_interval = RECOVERY_CHECK_INTERVAL;
- } else {
- check_interval = STANDARD_CHECK_INTERVAL;
}
}
+
+ if (!sp->killing_pids && check_all)
+ check_other_leases(&main_task, sp, check_buf);
+
+ if (sp->killing_pids)
+ check_interval = RECOVERY_CHECK_INTERVAL;
}
empty = list_empty(&spaces);
pthread_mutex_unlock(&spaces_mutex);
@@ -3018,6 +3034,7 @@ int main(int argc, char *argv[])
int rv;
BUILD_BUG_ON(sizeof(struct sanlk_disk) != sizeof(struct sync_disk));
+ BUILD_BUG_ON(sizeof(struct leader_record) > LEADER_RECORD_MAX);
memset(&com, 0, sizeof(com));
com.use_watchdog = DEFAULT_USE_WATCHDOG;
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 7f6f740..0356807 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -102,14 +102,28 @@ struct lease_status {
uint64_t acquire_last_success;
uint64_t renewal_last_attempt;
uint64_t renewal_last_success;
+
+ uint32_t renewal_read_count;
+ uint32_t renewal_read_check;
+ char *renewal_read_buf;
+};
+
+struct host_info { /* what check_other_leases() last saw in another host's lease */
+ uint64_t last_check; /* local monotime of the most recent scan of this host's lease */
+ uint64_t last_live; /* local monotime when the lease contents were last seen to change */
+ uint64_t last_req; /* local monotime when a request from this host was last taken */
+ uint64_t owner_id; /* copied from the host's leader_record */
+ uint64_t owner_generation; /* copied from the host's leader_record */
+ uint64_t timestamp; /* remote monotime, copied from the host's leader_record */
};
struct space {
+ struct list_head list;
char space_name[NAME_ID_SIZE];
uint64_t host_id;
uint64_t host_generation;
struct sync_disk host_id_disk;
- struct list_head list;
+ int align_size;
int space_id; /* used to refer to this space instance in log messages */
int space_dead;
int killing_pids;
@@ -119,6 +133,9 @@ struct space {
pthread_mutex_t mutex; /* protects lease_status, thread_stop */
struct lease_status lease_status;
int wd_fd;
+ uint32_t req_count;
+ uint32_t req_check;
+ struct host_info host_info[DEFAULT_MAX_HOSTS];
};
struct sm_header {
@@ -440,6 +457,8 @@ struct task {
int host_dead_seconds; /* calculated */
+ int request_finish_seconds; /* calculated */
+
unsigned int io_count; /* stats */
unsigned int to_count; /* stats */
diff --git a/src/task.c b/src/task.c
index c0ab44b..d54dabe 100644
--- a/src/task.c
+++ b/src/task.c
@@ -51,6 +51,7 @@ void setup_task_timeouts(struct task *task, int io_timeout_arg)
task->id_renewal_fail_seconds = id_renewal_fail_seconds;
task->id_renewal_warn_seconds = id_renewal_warn_seconds;
task->host_dead_seconds = host_dead_seconds;
+ task->request_finish_seconds = 3 * id_renewal_seconds; /* random */
/* the rest are calculated as needed in place */
/* hack to make just main thread log this info */