March 2011 - sanlock-devel - Fedora Mailing-Lists

src/client_resource.c src/direct.c src/main.c src/paxos_lease.c src/sanlock_internal.h src/token_manager.c src/token_manager.h

by David Teigland

src/client_resource.c | 2 src/direct.c | 48 +++++----- src/main.c | 216 +++++++++++++++++++++++-------------------------- src/paxos_lease.c | 28 +++--- src/sanlock_internal.h | 13 +- src/token_manager.c | 43 +-------- src/token_manager.h | 2 7 files changed, 157 insertions(+), 195 deletions(-) New commits: commit dabfd65b1da818a68d5a4e9610cfb2d4863a7ab4 Author: David Teigland <teigland(a)redhat.com> Date: Wed Mar 30 16:09:16 2011 -0500 sanlock: rework internal structs Use a sanlk_resource struct in the token instead of duplicating the fields. This allows the use of the recent res_to_str function, and we can remove the second parallel implementation. diff --git a/src/client_resource.c b/src/client_resource.c index 8bab02b..46eceae 100644 --- a/src/client_resource.c +++ b/src/client_resource.c @@ -465,7 +465,7 @@ int sanlock_args_to_state(int res_count, return rv; } - if (strlen(str) > SANLK_MAX_RES_STR) { + if (strlen(str) > SANLK_MAX_RES_STR - 1) { free(str); free(state); return -EINVAL; diff --git a/src/direct.c b/src/direct.c index a4f70b0..2bce141 100644 --- a/src/direct.c +++ b/src/direct.c @@ -35,15 +35,33 @@ static int do_paxos_action(void) struct sanlk_resource *res; struct token *token; struct leader_record leader_read, leader_ret; + int disks_len, token_len; int num_opened; int i, j, rv = 0; for (i = 0; i < com.res_count; i++) { res = com.res_args[i]; - rv = create_token(res->num_disks, &token); - if (rv < 0) - return rv; + disks_len = res->num_disks * sizeof(struct sync_disk); + token_len = sizeof(struct token) + disks_len; + + token = malloc(token_len); + if (!token) + return -ENOMEM; + memset(token, 0, token_len); + token->disks = (struct sync_disk *)&token->r.disks[0]; + token->r.num_disks = res->num_disks; + memcpy(token->r.lockspace_name, res->lockspace_name, SANLK_NAME_LEN); + memcpy(token->r.name, res->name, SANLK_NAME_LEN); + + /* WARNING sync_disk == sanlk_disk */ + + memcpy(token->disks, &res->disks, disks_len); + + for (j = 0; j < 
token->r.num_disks; j++) { + token->disks[j].sector_size = 0; + token->disks[j].fd = 0; + } /* * TODO: verify all resources are in the same lockspace? @@ -54,21 +72,7 @@ static int do_paxos_action(void) token->host_id = com.local_host_id; token->host_generation = com.local_host_generation; - strncpy(token->space_name, res->lockspace_name, NAME_ID_SIZE); - strncpy(token->resource_name, res->name, NAME_ID_SIZE); - - /* see WARNING above about sync_disk == sanlk_disk */ - - memcpy(token->disks, &res->disks, - token->num_disks * sizeof(struct sync_disk)); - - /* zero out pad1 and pad2, see WARNING above */ - for (j = 0; j < token->num_disks; j++) { - token->disks[j].sector_size = 0; - token->disks[j].fd = 0; - } - - num_opened = open_disks(token->disks, token->num_disks); + num_opened = open_disks(token->disks, token->r.num_disks); if (!majority_disks(token, num_opened)) { log_tool("cannot open majority of disks"); return -1; @@ -87,7 +91,7 @@ static int do_paxos_action(void) rv = paxos_lease_acquire(token, 0, &leader_ret, 0, com.num_hosts); if (rv < 0) { log_tool("cannot acquire lease on %s", - token->resource_name); + token->r.name); return -1; } break; @@ -96,20 +100,20 @@ static int do_paxos_action(void) rv = paxos_lease_leader_read(token, &leader_read); if (rv < 0) { log_tool("cannot read lease on %s", - token->resource_name); + token->r.name); return -1; } rv = paxos_lease_release(token, &leader_read, &leader_ret); if (rv < 0) { log_tool("cannot release lease on %s", - token->resource_name); + token->r.name); return -1; } break; } - free_token(token); + free(token); } return 0; diff --git a/src/main.c b/src/main.c index 9d38fc1..4ab96ed 100644 --- a/src/main.c +++ b/src/main.c @@ -237,7 +237,7 @@ static int client_using_space(struct client *cl, struct space *sp) token = cl->tokens[i]; if (!token) continue; - if (strncmp(token->space_name, sp->space_name, NAME_ID_SIZE)) + if (strncmp(token->r.lockspace_name, sp->space_name, NAME_ID_SIZE)) continue; rv = 1; 
log_spoke(sp, token, "client_using_space pid %d", cl->pid); @@ -482,7 +482,6 @@ static void *cmd_acquire_thread(void *args_in) struct cmd_args *ca = args_in; struct sm_header h; struct client *cl; - struct sync_disk *disks = NULL; struct token *token = NULL; struct token *new_tokens[SANLK_MAX_RESOURCES]; struct sanlk_resource res; @@ -491,7 +490,8 @@ static void *cmd_acquire_thread(void *args_in) char *opt_str; uint64_t acquire_lver = 0; uint32_t new_num_hosts = 0; - int fd, rv, i, j, disks_len, num_disks, empty_slots, opened; + int token_len, disks_len; + int fd, rv, i, j, empty_slots, opened; int alloc_count = 0, add_count = 0, open_count = 0, acquire_count = 0; int pos = 0, pid_dead = 0; int new_tokens_count; @@ -534,80 +534,69 @@ static void *cmd_acquire_thread(void *args_in) */ for (i = 0; i < new_tokens_count; i++) { - token = malloc(sizeof(struct token)); - if (!token) { - rv = -ENOMEM; - goto fail_free; - } - memset(token, 0, sizeof(struct token)); - /* - * receive sanlk_resource, copy into token + * receive sanlk_resource, create token for it */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_resource)) { - log_error("cmd_acquire recv %d %d", rv, errno); - free(token); + log_error("cmd_acquire recv res %d %d", rv, errno); rv = -EIO; goto fail_free; } - strncpy(token->space_name, res.lockspace_name, SANLK_NAME_LEN); - strncpy(token->resource_name, res.name, SANLK_NAME_LEN); + if (res.num_disks > MAX_DISKS) { + rv = -ERANGE; + goto fail_free; + } + + disks_len = res.num_disks * sizeof(struct sync_disk); + token_len = sizeof(struct token) + disks_len; + + token = malloc(token_len); + if (!token) { + rv = -ENOMEM; + goto fail_free; + } + memset(token, 0, token_len); + token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ + token->r.num_disks = res.num_disks; + memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); + memcpy(token->r.name, res.name, 
SANLK_NAME_LEN); + token->acquire_lver = res.lver; token->acquire_data64 = res.data64; token->acquire_data32 = res.data32; token->acquire_flags = res.flags; - token->num_disks = res.num_disks; /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields - * in sanlk_disk (TODO: let these differ) + * in sanlk_disk (TODO: let these differ?) */ - num_disks = token->num_disks; - if (num_disks > MAX_DISKS) { - free(token); - rv = -ERANGE; - goto fail_free; - } - - disks = malloc(num_disks * sizeof(struct sync_disk)); - if (!disks) { - free(token); - rv = -ENOMEM; - goto fail_free; - } - - disks_len = num_disks * sizeof(struct sync_disk); - memset(disks, 0, disks_len); - - rv = recv(fd, disks, disks_len, MSG_WAITALL); + rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv > 0) pos += rv; if (rv != disks_len) { - log_error("cmd_acquire recv %d %d", rv, errno); - free(disks); + log_error("cmd_acquire recv disks %d %d", rv, errno); free(token); rv = -EIO; goto fail_free; } /* zero out pad1 and pad2, see WARNING above */ - for (j = 0; j < num_disks; j++) { - disks[j].sector_size = 0; - disks[j].fd = 0; + for (j = 0; j < token->r.num_disks; j++) { + token->disks[j].sector_size = 0; + token->disks[j].fd = 0; } token->token_id = token_id_counter++; - token->disks = disks; new_tokens[i] = token; alloc_count++; @@ -618,7 +607,7 @@ static void *cmd_acquire_thread(void *args_in) * represents for reference from later log messages. 
*/ log_errot(token, "lockspace %.48s resource %.48s has token_id %u for pid %u", - token->space_name, token->resource_name, token->token_id, cl->pid); + token->r.lockspace_name, token->r.name, token->token_id, cl->pid); } /* @@ -629,7 +618,7 @@ static void *cmd_acquire_thread(void *args_in) if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_options)) { - log_error("cmd_acquire recv %d %d", rv, errno); + log_error("cmd_acquire recv opt %d %d", rv, errno); rv = -EIO; goto fail_free; } @@ -651,7 +640,7 @@ static void *cmd_acquire_thread(void *args_in) if (rv > 0) pos += rv; if (rv != opt.len) { - log_error("cmd_acquire recv %d %d", rv, errno); + log_error("cmd_acquire recv opt_str %d %d", rv, errno); free(opt_str); rv = -EIO; goto fail_free; @@ -669,10 +658,10 @@ static void *cmd_acquire_thread(void *args_in) for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; - rv = get_space_info(token->space_name, &space); + rv = get_space_info(token->r.lockspace_name, &space); if (rv < 0 || space.killing_pids) { log_errot(token, "cmd_acquire bad space %.48s", - token->space_name); + token->r.lockspace_name); goto fail_free; } token->host_id = space.host_id; @@ -691,7 +680,7 @@ static void *cmd_acquire_thread(void *args_in) for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; - opened = open_disks(token->disks, token->num_disks); + opened = open_disks(token->disks, token->r.num_disks); if (!majority_disks(token, opened)) { log_errot(token, "cmd_acquire open_disks %d", opened); rv = -ENODEV; @@ -751,11 +740,12 @@ static void *cmd_acquire_thread(void *args_in) /* space may have failed while new tokens were being acquired */ for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; - rv = get_space_info(token->space_name, &space); + rv = get_space_info(token->r.lockspace_name, &space); if (!rv && !space.killing_pids && space.host_id == token->host_id) continue; pthread_mutex_unlock(&cl->mutex); - log_errot(token, "cmd_acquire bad space %.48s", 
token->space_name); + log_errot(token, "cmd_acquire bad space %.48s", + token->r.lockspace_name); rv = -EINVAL; goto fail_release; } @@ -792,9 +782,9 @@ static void *cmd_acquire_thread(void *args_in) if (!token) continue; release_token(token); - close_disks(token->disks, token->num_disks); + close_disks(token->disks, token->r.num_disks); del_resource(token); - free_token(token); + free(token); } fail_release: @@ -803,7 +793,7 @@ static void *cmd_acquire_thread(void *args_in) fail_close: for (i = 0; i < open_count; i++) - close_disks(new_tokens[i]->disks, new_tokens[i]->num_disks); + close_disks(new_tokens[i]->disks, new_tokens[i]->r.num_disks); fail_del: for (i = 0; i < add_count; i++) @@ -811,7 +801,7 @@ static void *cmd_acquire_thread(void *args_in) fail_free: for (i = 0; i < alloc_count; i++) - free_token(new_tokens[i]); + free(new_tokens[i]); fail_reply: set_cmd_active(ca->ci_target, 0); @@ -858,7 +848,7 @@ static void *cmd_release_thread(void *args_in) rv = release_token(token); if (rv < 0) result = -1; - free_token(token); + free(token); cl->tokens[j] = NULL; } goto reply; @@ -881,15 +871,15 @@ static void *cmd_release_thread(void *args_in) if (!token) continue; - if (memcmp(token->space_name, res.lockspace_name, NAME_ID_SIZE)) + if (memcmp(token->r.lockspace_name, res.lockspace_name, NAME_ID_SIZE)) continue; - if (memcmp(token->resource_name, res.name, NAME_ID_SIZE)) + if (memcmp(token->r.name, res.name, NAME_ID_SIZE)) continue; rv = release_token(token); if (rv < 0) result = -1; - free_token(token); + free(token); cl->tokens[j] = NULL; found = 1; break; @@ -923,10 +913,12 @@ static void *cmd_inquire_thread(void *args_in) struct cmd_args *ca = args_in; struct sm_header h; struct token *token; - struct sanlk_resource *res; - char *reply_str; struct client *cl; - int fd, i, d, reply_len, result = 0, total = 0, ret, pos, reply_str_len = 0; + char *state, *str; + int state_maxlen = 0, state_strlen = 0; + int res_count = 0, cat_count = 0; + int result = 0; + int 
fd, i, rv; cl = &client[ca->ci_target]; fd = client[ca->ci_in].fd; @@ -936,77 +928,85 @@ static void *cmd_inquire_thread(void *args_in) for (i = 0; i < SANLK_MAX_RESOURCES; i++) { if (cl->tokens[i]) - total++; + res_count++; } - /* TODO: use sanlock_res_to_str() */ + state_maxlen = res_count * (SANLK_MAX_RES_STR + 1); - reply_len = total * SANLK_MAX_RES_STR; - reply_str = malloc(reply_len); - if (!reply_str) { + state = malloc(state_maxlen); + if (!state) { result = -ENOMEM; goto reply; } - memset(reply_str, 0, reply_len); - res = (struct sanlk_resource *)reply_str; - pos = 0; - ret = 0; + memset(state, 0, state_maxlen); + + /* should match sanlock_args_to_state() */ for (i = 0; i < SANLK_MAX_RESOURCES; i++) { token = cl->tokens[i]; if (!token) continue; - ret = snprintf(reply_str + pos, reply_len - pos, "%s:%s", - token->space_name, token->resource_name); + /* check number of tokens hasn't changed since first count */ - if (ret >= reply_len - pos) - goto out; - pos += ret; + if (cat_count >= res_count) { + log_error("cmd_inquire count changed %d %d", + res_count, cat_count); + result = -1; + goto reply; + } - for (d = 0; d < token->num_disks; d++) { - ret = snprintf(reply_str + pos, reply_len - pos, ":%s:%llu", - token->disks[d].path, - (unsigned long long)token->disks[d].offset); + str = NULL; - if (ret >= reply_len - pos) - goto out; - pos += ret; + rv = sanlock_res_to_str(&token->r, &str); + if (rv < 0 || !str) { + log_errot(token, "cmd_inquire res_to_str error %d", rv); + result = -2; + goto reply; } - ret = snprintf(reply_str + pos, reply_len - pos, ":%llu ", - (unsigned long long)token->leader.lver); + if (strlen(str) > SANLK_MAX_RES_STR - 1) { + log_errot(token, "cmd_inquire str too long %zu", strlen(str)); + free(str); + result = -3; + goto reply; + } - if (ret >= reply_len - pos) - goto out; - pos += ret; - } + /* space is str separator, so it's invalid within each str */ - /* remove trailing space */ - pos--; - reply_str[pos++] = '\0'; - reply_str_len = 
strlen(reply_str); + if (strstr(str, " ")) { + log_errot(token, "cmd_inquire str has space"); + free(str); + result = -4; + goto reply; + } - ret = 0; - out: - if (ret) - result = -ENOSPC; + if (i) + strcat(state, " "); + strcat(state, str); + cat_count++; + free(str); + } + + state[state_maxlen - 1] = '\0'; + state_strlen = strlen(state); + result = 0; reply: set_cmd_active(ca->ci_target, 0); - log_debug("cmd_inquire done result %d total %d pos %d reply_str_len %d", - result, total, pos, reply_str_len); + log_debug("cmd_inquire done result %d res_count %d strlen %d", + result, res_count, state_strlen); memcpy(&h, &ca->header, sizeof(struct sm_header)); h.data = result; - h.data2 = total; + h.data2 = res_count; - if (reply_str) { - h.length = sizeof(h) + pos; + if (state) { + h.length = sizeof(h) + state_strlen + 1; send(fd, &h, sizeof(h), MSG_NOSIGNAL); - send(fd, reply_str, pos, MSG_NOSIGNAL); - free(reply_str); + send(fd, state, state_strlen + 1, MSG_NOSIGNAL); + free(state); } else { h.length = sizeof(h); send(fd, &h, sizeof(h), MSG_NOSIGNAL); @@ -1208,7 +1208,6 @@ static void cmd_status(int fd, struct sm_header *h_recv) struct sanlk_state cst; struct sanlk_state rst; struct sanlk_lockspace lockspace; - struct sanlk_resource resource; char str[SANLK_STATE_MAXSTR]; struct token *token; struct space *sp; @@ -1301,21 +1300,16 @@ static void cmd_status(int fd, struct sm_header *h_recv) str_len = print_token_state(token, str); memset(&rst, 0, sizeof(rst)); rst.type = SANLK_STATE_RESOURCE; - strncpy(rst.name, token->resource_name, NAME_ID_SIZE); + strncpy(rst.name, token->r.name, NAME_ID_SIZE); rst.str_len = str_len; send(fd, &rst, sizeof(rst), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); - memset(&resource, 0, sizeof(resource)); - strncpy(resource.lockspace_name, token->space_name, NAME_ID_SIZE); - strncpy(resource.name, token->resource_name, NAME_ID_SIZE); - resource.num_disks = token->num_disks; - - send(fd, &resource, sizeof(resource), 
MSG_NOSIGNAL); + send(fd, &token->r, sizeof(struct sanlk_resource), MSG_NOSIGNAL); - for (j = 0; j < token->num_disks; j++) { + for (j = 0; j < token->r.num_disks; j++) { send(fd, &token->disks[j], sizeof(struct sanlk_disk), MSG_NOSIGNAL); } } diff --git a/src/paxos_lease.c b/src/paxos_lease.c index ad54920..c9c07cd 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -49,7 +49,7 @@ struct paxos_dblock { int majority_disks(struct token *token, int num) { - int num_disks = token->num_disks; + int num_disks = token->r.num_disks; /* odd number of disks */ @@ -187,7 +187,7 @@ static int run_disk_paxos(struct token *token, uint64_t host_id, uint64_t inp, struct paxos_dblock bk[num_hosts]; struct paxos_dblock bk_max; struct paxos_dblock dblock; - int num_disks = token->num_disks; + int num_disks = token->r.num_disks; int num_writes, num_reads; int d, q, rv; @@ -427,15 +427,15 @@ static int verify_leader(struct token *token, struct sync_disk *disk, return DP_BAD_SECTORSIZE; } - if (strncmp(lr->space_name, token->space_name, NAME_ID_SIZE)) { + if (strncmp(lr->space_name, token->r.lockspace_name, NAME_ID_SIZE)) { log_errot(token, "verify_leader wrong space name %.48s %.48s %s", - lr->space_name, token->space_name, disk->path); + lr->space_name, token->r.lockspace_name, disk->path); return DP_BAD_LOCKSPACE; } - if (strncmp(lr->resource_name, token->resource_name, NAME_ID_SIZE)) { + if (strncmp(lr->resource_name, token->r.name, NAME_ID_SIZE)) { log_errot(token, "verify_leader wrong resource name %.48s %.48s %s", - lr->resource_name, token->resource_name, disk->path); + lr->resource_name, token->r.name, disk->path); return DP_BAD_RESOURCEID; } @@ -471,7 +471,7 @@ int paxos_lease_leader_read(struct token *token, struct leader_record *leader_re int *leader_reps; int leaders_len, leader_reps_len; int num_reads; - int num_disks = token->num_disks; + int num_disks = token->r.num_disks; int rv, d, i, found; int error; @@ -570,7 +570,7 @@ int paxos_lease_leader_read(struct token 
*token, struct leader_record *leader_re static int write_new_leader(struct token *token, struct leader_record *nl) { - int num_disks = token->num_disks; + int num_disks = token->r.num_disks; int num_writes = 0; int error = DP_OK; int rv, d; @@ -824,7 +824,7 @@ int paxos_lease_renew(struct token *token, int rv, d; int error; - for (d = 0; d < token->num_disks; d++) { + for (d = 0; d < token->r.num_disks; d++) { memset(&new_leader, 0, sizeof(struct leader_record)); rv = read_leader(&token->disks[d], &new_leader); @@ -896,8 +896,8 @@ int paxos_lease_init(struct token *token, int num_hosts, int max_hosts) uint32_t offset, ss; uint64_t bb, be, sb, se; - printf("initialize lease for resource %.48s\n", token->resource_name); - for (d = 0; d < token->num_disks; d++) { + printf("initialize lease for resource %.48s\n", token->r.name); + for (d = 0; d < token->r.num_disks; d++) { printf("disk %s offset %llu/%llu sector_size %d\n", token->disks[d].path, (unsigned long long)token->disks[d].offset, @@ -931,11 +931,11 @@ int paxos_lease_init(struct token *token, int num_hosts, int max_hosts) leader.num_hosts = num_hosts; leader.max_hosts = max_hosts; leader.timestamp = LEASE_FREE; - strncpy(leader.space_name, token->space_name, NAME_ID_SIZE); - strncpy(leader.resource_name, token->resource_name, NAME_ID_SIZE); + strncpy(leader.space_name, token->r.lockspace_name, NAME_ID_SIZE); + strncpy(leader.resource_name, token->r.name, NAME_ID_SIZE); leader.checksum = leader_checksum(&leader); - for (d = 0; d < token->num_disks; d++) { + for (d = 0; d < token->r.num_disks; d++) { write_leader(&token->disks[d], &leader); write_request(&token->disks[d], &req); for (q = 0; q < max_hosts; q++) diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 4a1a2f4..0fa14e5 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -67,27 +67,24 @@ struct sync_disk { threads. 
*/ struct token { - /* mirror external sanlk_resource from acquire */ - char space_name[NAME_ID_SIZE]; - char resource_name[NAME_ID_SIZE]; + /* values copied from acquire res arg */ uint64_t acquire_lver; uint64_t acquire_data64; uint32_t acquire_data32; uint32_t acquire_flags; - int num_disks; - /* copied from the sp with space_name */ + /* copied from the sp with r.lockspace_name */ uint64_t host_id; uint64_t host_generation; - /* disks from acquire */ - struct sync_disk *disks; - /* internal */ int token_id; /* used to refer to this token instance in log messages */ int acquire_result; int release_result; struct leader_record leader; /* copy of last leader_record we wrote */ + + struct sync_disk *disks; /* shorthand, points to r.disks[0] */ + struct sanlk_resource r; }; struct lease_status { diff --git a/src/token_manager.c b/src/token_manager.c index 2cdc1fd..2c88560 100644 --- a/src/token_manager.c +++ b/src/token_manager.c @@ -49,9 +49,9 @@ static struct resource *find_resource(struct token *token, struct resource *r; list_for_each_entry(r, head, list) { - if (strncmp(r->space_name, token->space_name, NAME_ID_SIZE)) + if (strncmp(r->space_name, token->r.lockspace_name, NAME_ID_SIZE)) continue; - if (strncmp(r->resource_name, token->resource_name, NAME_ID_SIZE)) + if (strncmp(r->resource_name, token->r.name, NAME_ID_SIZE)) continue; return r; } @@ -86,8 +86,8 @@ int add_resource(struct token *token, int pid) } memset(r, 0, sizeof(struct resource)); - strncpy(r->space_name, token->space_name, NAME_ID_SIZE); - strncpy(r->resource_name, token->resource_name, NAME_ID_SIZE); + strncpy(r->space_name, token->r.lockspace_name, NAME_ID_SIZE); + strncpy(r->resource_name, token->r.name, NAME_ID_SIZE); r->token = token; r->pid = pid; list_add_tail(&r->list, &resources); @@ -160,37 +160,6 @@ int release_token(struct token *token) return rv; /* DP_OK */ } -/* return < 0 on error, 1 on success */ - -int create_token(int num_disks, struct token **token_out) -{ - struct token 
*token; - struct sync_disk *disks; - - token = malloc(sizeof(struct token)); - if (!token) - return -ENOMEM; - memset(token, 0, sizeof(struct token)); - - disks = malloc(num_disks * sizeof(struct sync_disk)); - if (!disks) { - free(token); - return -ENOMEM; - } - - token->disks = disks; - token->num_disks = num_disks; - *token_out = token; - return 0; -} - -void free_token(struct token *token) -{ - if (token->disks) - free(token->disks); - free(token); -} - /* thread that releases tokens of pid's that die */ static void *async_release_thread(void *arg GNUC_UNUSED) @@ -216,7 +185,7 @@ static void *async_release_thread(void *arg GNUC_UNUSED) if (token->acquire_result == 1) release_token(token); - close_disks(token->disks, token->num_disks); + close_disks(token->disks, token->r.num_disks); /* we don't want to remove r from dispose_list until after the lease is released because we don't want a new token for @@ -226,7 +195,7 @@ static void *async_release_thread(void *arg GNUC_UNUSED) pthread_mutex_lock(&resource_mutex); _del_resource(r); pthread_mutex_unlock(&resource_mutex); - free_token(token); + free(token); } out: return NULL; diff --git a/src/token_manager.h b/src/token_manager.h index 64f95b9..3fddfe1 100644 --- a/src/token_manager.h +++ b/src/token_manager.h @@ -12,8 +12,6 @@ int acquire_token(struct token *token, uint64_t acquire_lver, int new_num_hosts); int release_token(struct token *token); -int create_token(int num_disks, struct token **token_out); -void free_token(struct token *token); void release_token_async(struct token *token); int add_resource(struct token *token, int pid);

13 years, 1 month

1
0
0 / 0

src/client_resource.c src/diskio.c src/main.c src/sanlock.h src/sanlock_internal.h src/sanlock_resource.h tests/res_string.c

by David Teigland

src/client_resource.c | 289 ++++++++++++++++++++++++++++++++++++++ src/diskio.c | 17 -- src/main.c | 366 ++++++++++--------------------------------------- src/sanlock.h | 24 ++- src/sanlock_internal.h | 12 - src/sanlock_resource.h | 50 +++++- tests/res_string.c | 80 ++++++++++ 7 files changed, 499 insertions(+), 339 deletions(-) New commits: commit 8a887a9878871209d8cde83f249ca3bbb9ccbd1f Author: David Teigland <teigland(a)redhat.com> Date: Wed Mar 30 10:20:59 2011 -0500 sanlock: add resource string conversion functions Make sanlock_inquire always return a state string, and add api functions to convert between string and struct resource formats. Start using these conversion functions elsewhere to avoid string parsing and formatting in multiple places. diff --git a/src/client_resource.c b/src/client_resource.c index 57b9588..8bab02b 100644 --- a/src/client_resource.c +++ b/src/client_resource.c @@ -145,7 +145,8 @@ int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, return rv; } -int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, void **res_out) +int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, + char **res_state) { struct sm_header h; char *reply_data = NULL; @@ -153,8 +154,8 @@ int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, void **re *res_count = 0; - if (res_out) - *res_out = NULL; + if (res_state) + *res_state = NULL; if (sock == -1) { /* connect to daemon and ask it to acquire a lease for @@ -206,8 +207,8 @@ int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, void **re goto out; } - if (res_out) - *res_out = reply_data; + if (res_state) + *res_state = reply_data; else free(reply_data); @@ -276,3 +277,281 @@ int sanlock_release(int sock, int pid, uint32_t flags, int res_count, return rv; } +/* + * convert from struct sanlk_resource to string with format: + * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<lver> + */ + +int 
sanlock_res_to_str(struct sanlk_resource *res, char **str_ret) +{ + char *str; + int ret, len, pos, d; + + str = malloc(SANLK_MAX_RES_STR + 1); + if (!str) + return -ENOMEM; + memset(str, 0, SANLK_MAX_RES_STR + 1); + + len = SANLK_MAX_RES_STR; + pos = 0; + + ret = snprintf(str + pos, len - pos, "%s:%s", + res->lockspace_name, res->name); + + if (ret >= len - pos) + goto fail; + pos += ret; + + for (d = 0; d < res->num_disks; d++) { + ret = snprintf(str + pos, len - pos, ":%s:%llu", + res->disks[d].path, + (unsigned long long)res->disks[d].offset); + + if (ret >= len - pos) + goto fail; + pos += ret; + } + + ret = snprintf(str + pos, len - pos, ":%llu", + (unsigned long long)res->lver); + + if (ret > len - pos) + goto fail; + pos += ret; + + if (pos > len) + goto fail; + + *str_ret = str; + return 0; + + fail: + free(str); + return -EINVAL; +} + +/* + * convert to struct sanlk_resource from string with format: + * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...][:<lver>] + */ + +int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret) +{ + struct sanlk_resource *res; + char sub[SANLK_PATH_LEN + 1]; + int i, j, d, rv, len, sub_count, colons, num_disks, have_lver; + + if (strlen(str) < 3) + return -ENXIO; + + colons = 0; + for (i = 0; i < strlen(str); i++) { + if (str[i] == '\\') { + i++; + continue; + } + + if (str[i] == ':') + colons++; + } + if (!colons || (colons == 2)) { + return -1; + } + + num_disks = (colons - 1) / 2; + have_lver = (colons - 1) % 2; + + if (num_disks > MAX_DISKS) + return -2; + + len = sizeof(struct sanlk_resource) + num_disks * sizeof(struct sanlk_disk); + + res = malloc(len); + if (!res) + return -ENOMEM; + memset(res, 0, len); + + res->num_disks = num_disks; + + d = 0; + sub_count = 0; + j = 0; + memset(sub, 0, sizeof(sub)); + + len = strlen(str); + + for (i = 0; i < len + 1; i++) { + if (str[i] == '\\') { + if (i == (len - 1)) + goto fail; + + i++; + sub[j++] = str[i]; + continue; + } + if (i < len && str[i] 
!= ':') { + if (j >= SANLK_PATH_LEN) + goto fail; + sub[j++] = str[i]; + continue; + } + + /* do something with sub when we hit ':' or end of str, + first and second subs are lockspace and resource names, + then even sub is path, odd sub is offset */ + + if (sub_count < 2 && strlen(sub) > NAME_ID_SIZE) + goto fail; + if (sub_count >= 2 && (strlen(sub) > SANLK_PATH_LEN-1 || strlen(sub) < 1)) + goto fail; + + if (sub_count == 0) { + strncpy(res->lockspace_name, sub, NAME_ID_SIZE); + + } else if (sub_count == 1) { + strncpy(res->name, sub, NAME_ID_SIZE); + + } else if (!(sub_count % 2)) { + if (have_lver && (d == num_disks)) { + res->flags |= SANLK_RES_LVER; + res->lver = strtoull(sub, NULL, 0); + } else { + strncpy(res->disks[d].path, sub, SANLK_PATH_LEN - 1); + } + } else { + rv = sscanf(sub, "%llu", (unsigned long long *)&res->disks[d].offset); + if (rv != 1) + goto fail; + d++; + } + + sub_count++; + j = 0; + memset(sub, 0, sizeof(sub)); + } + + *res_ret = res; + return 0; + + fail: + free(res); + return -1; +} + +/* + * convert from array of struct sanlk_resource * to state string with format: + * "RESOURCE1 RESOURCE2 RESOURCE3 ..." 
+ * RESOURCE format in sanlock_res_to_str() comment + */ + +int sanlock_args_to_state(int res_count, + struct sanlk_resource *res_args[], + char **res_state) +{ + char *str, *state; + int i, rv; + + state = malloc(res_count * (SANLK_MAX_RES_STR + 1)); + if (!state) + return -ENOMEM; + memset(state, 0, res_count * (SANLK_MAX_RES_STR + 1)); + + for (i = 0; i < res_count; i++) { + str = NULL; + + rv = sanlock_res_to_str(res_args[i], &str); + if (rv < 0 || !str) { + free(state); + return rv; + } + + if (strlen(str) > SANLK_MAX_RES_STR) { + free(str); + free(state); + return -EINVAL; + } + + /* space is str separator, so it's invalid within each str */ + + if (strstr(str, " ")) { + free(str); + free(state); + return -EINVAL; + } + + if (i) + strcat(state, " "); + strcat(state, str); + free(str); + } + + *res_state = state; + return 0; +} + +/* + * convert to array of struct sanlk_resource * from state string with format: + * "RESOURCE1 RESOURCE2 RESOURCE3 ..." + * RESOURCE format in sanlock_str_to_res() comment + */ + +int sanlock_state_to_args(char *res_state, + int *res_count, + struct sanlk_resource ***res_args) +{ + struct sanlk_resource **args; + struct sanlk_resource *res; + char str[SANLK_MAX_RES_STR + 1]; + int count = 1, arg_count = 0; + int i, j, len, rv; + + for (i = 0; i < strlen(res_state); i++) { + if (res_state[i] == ' ') + count++; + } + + *res_count = count; + + args = malloc(count * sizeof(*args)); + if (!args) + return -ENOMEM; + memset(args, 0, count * sizeof(*args)); + + j = 0; + memset(str, 0, sizeof(str)); + + len = strlen(res_state); + + for (i = 0; i < len + 1; i++) { + if (i < len && res_state[i] != ' ') { + str[j++] = res_state[i]; + continue; + } + + rv = sanlock_str_to_res(str, &res); + if (rv < 0 || !res) + goto fail_free; + + if (arg_count == count) + goto fail_free; + + args[arg_count++] = res; + + j = 0; + memset(str, 0, sizeof(str)); + } + + *res_count = arg_count; + *res_args = args; + return 0; + + fail_free: + for (i = 0; i < count; 
i++) { + if (args[i]) + free(args[i]); + } + free(args); + fail: + return rv; +} + diff --git a/src/diskio.c b/src/diskio.c index 16b615a..3c01650 100644 --- a/src/diskio.c +++ b/src/diskio.c @@ -107,23 +107,6 @@ int open_disks(struct sync_disk *disks, int num_disks) orig_offset = disk->offset; - switch (disk->units) { - case SANLK_UNITS_BYTES: - break; - case SANLK_UNITS_SECTORS: - disk->offset = orig_offset * ss; - break; - case SANLK_UNITS_KB: - disk->offset = orig_offset * 1024; - break; - case SANLK_UNITS_MB: - disk->offset = orig_offset * 1024 * 1024; - break; - default: - log_error("invalid offset units %d", disk->units); - goto fail; - } - if (disk->offset % disk->sector_size) { log_error("invalid offset %llu sector size %u %s", (unsigned long long)disk->offset, diff --git a/src/main.c b/src/main.c index ee4566e..9d38fc1 100644 --- a/src/main.c +++ b/src/main.c @@ -477,59 +477,6 @@ static void client_recv_all(int ci, struct sm_header *h_recv, int pos) log_debug("recv_all ci %d rem %d total %d", ci, rem, total); } -/* str format: "abc=123 def=456 ghi=780" */ - -static int parse_key_val(char *str, const char *key_arg, char *val_arg, int len) -{ - int copy_key, copy_val, i, kvi; - char key[64], val[64]; - - memset(val_arg, 0, len); - - copy_key = 1; - copy_val = 0; - kvi = 0; - - for (i = 0; i < strlen(str); i++) { - if (str[i] == ' ') { - if (!strcmp(key, key_arg)) { - strncpy(val_arg, val, len); - return 0; - } - memset(key, 0, sizeof(key)); - memset(val, 0, sizeof(val)); - copy_key = 1; - copy_val = 0; - kvi = 0; - continue; - } - - if (str[i] == '=') { - copy_key = 0; - copy_val = 1; - kvi = 0; - continue; - } - - if (copy_key) - key[kvi++] = str[i]; - else if (copy_val) - val[kvi++] = str[i]; - - if (kvi > 62) { - log_error("invalid timeout parameter"); - return -1; - } - } - - if (!strcmp(key, key_arg)) { - strncpy(val_arg, val, len); - return 0; - } - - return -1; -} - static void *cmd_acquire_thread(void *args_in) { struct cmd_args *ca = args_in; @@ 
-540,11 +487,10 @@ static void *cmd_acquire_thread(void *args_in) struct token *new_tokens[SANLK_MAX_RESOURCES]; struct sanlk_resource res; struct sanlk_options opt; + struct space space; char *opt_str; - char num_hosts_str[16]; uint64_t acquire_lver = 0; - struct space space; - int new_num_hosts = 0; + uint32_t new_num_hosts = 0; int fd, rv, i, j, disks_len, num_disks, empty_slots, opened; int alloc_count = 0, add_count = 0, open_count = 0, acquire_count = 0; int pos = 0, pid_dead = 0; @@ -721,21 +667,6 @@ static void *cmd_acquire_thread(void *args_in) * all command input has been received, start doing the acquire */ - if (opt.flags & SANLK_OPT_NUM_HOSTS) { - if (!opt_str) - goto fail_free; - - memset(num_hosts_str, 0, sizeof(num_hosts_str)); - - rv = parse_key_val(opt_str, "num_hosts", num_hosts_str, 15); - if (rv < 0) { - log_error("cmd_acquire num_hosts error"); - goto fail_free; - } - - new_num_hosts = atoi(num_hosts_str); - } - for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = get_space_info(token->space_name, &space); @@ -774,6 +705,8 @@ static void *cmd_acquire_thread(void *args_in) if (token->acquire_flags & SANLK_RES_LVER) acquire_lver = token->acquire_lver; + if (token->acquire_flags & SANLK_RES_NUM_HOSTS) + new_num_hosts = token->acquire_data32; rv = acquire_token(token, acquire_lver, new_num_hosts); if (rv < 0) { @@ -1006,13 +939,9 @@ static void *cmd_inquire_thread(void *args_in) total++; } - /* - * struct output [sanlk_resource][sanlk_disk...][sanlk_resource][sanlk_disk...]... - * string output "RESOURCE1 RESOURCE2 RESOURCE3 ..." 
- * RESOURCE = <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<lver> - */ + /* TODO: use sanlock_res_to_str() */ - reply_len = total * SANLK_STATE_MAXSTR; /* rough estimate */ + reply_len = total * SANLK_MAX_RES_STR; reply_str = malloc(reply_len); if (!reply_str) { result = -ENOMEM; @@ -1028,52 +957,36 @@ static void *cmd_inquire_thread(void *args_in) if (!token) continue; - if (ca->header.cmd_flags & SANLK_INQ_STRUCT) { - strcpy(res->lockspace_name, token->space_name); - strcpy(res->name, token->resource_name); - res->lver = token->leader.lver; - res->flags |= SANLK_RES_LVER; - res->num_disks = token->num_disks; - pos += sizeof(struct sanlk_resource); - - for (d = 0; d < token->num_disks; d++) { - strcpy(res->disks[d].path, token->disks[d].path); - res->disks[d].offset = token->disks[d].offset; - pos += sizeof(struct sanlk_disk); - } - res++; - } else { - ret = snprintf(reply_str + pos, reply_len - pos, "%s:%s", - token->space_name, token->resource_name); + ret = snprintf(reply_str + pos, reply_len - pos, "%s:%s", + token->space_name, token->resource_name); - if (ret >= reply_len - pos) - goto out; - pos += ret; - - for (d = 0; d < token->num_disks; d++) { - ret = snprintf(reply_str + pos, reply_len - pos, ":%s:%llu", - token->disks[d].path, - (unsigned long long)token->disks[d].offset); - - if (ret >= reply_len - pos) - goto out; - pos += ret; - } + if (ret >= reply_len - pos) + goto out; + pos += ret; - ret = snprintf(reply_str + pos, reply_len - pos, ":%llu ", - (unsigned long long)token->leader.lver); + for (d = 0; d < token->num_disks; d++) { + ret = snprintf(reply_str + pos, reply_len - pos, ":%s:%llu", + token->disks[d].path, + (unsigned long long)token->disks[d].offset); if (ret >= reply_len - pos) goto out; pos += ret; } - } - if (ca->header.cmd_flags & SANLK_INQ_STRING) { - reply_str_len = strlen(reply_str); - reply_str[pos++] = '\0'; + ret = snprintf(reply_str + pos, reply_len - pos, ":%llu ", + (unsigned long 
long)token->leader.lver); + + if (ret >= reply_len - pos) + goto out; + pos += ret; } + /* remove trailing space */ + pos--; + reply_str[pos++] = '\0'; + reply_str_len = strlen(reply_str); + ret = 0; out: if (ret) @@ -1778,25 +1691,6 @@ static int do_daemon(void) return rv; } -static int create_sanlk_resource(int num_disks, struct sanlk_resource **res_out) -{ - struct sanlk_resource *res; - int len; - - len = sizeof(struct sanlk_resource) + - num_disks * sizeof(struct sanlk_disk); - - res = malloc(sizeof(struct sanlk_resource) + - (num_disks * sizeof(struct sanlk_disk))); - if (!res) - return -ENOMEM; - memset(res, 0, len); - - res->num_disks = num_disks; - *res_out = res; - return 0; -} - /* arg = <lockspace_name>:<host_id>:<path>:<offset> */ static int parse_arg_lockspace(char *arg) @@ -1834,7 +1728,7 @@ static int parse_arg_lockspace(char *arg) if (offset) com.lockspace.host_id_disk.offset = atoll(offset); - log_debug("lockspace arg %s %llu %s %llu", + log_debug("lockspace %s host_id %llu path %s offset %llu", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, @@ -1843,150 +1737,33 @@ static int parse_arg_lockspace(char *arg) return 0; } -/* arg = <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...][:<lver>] */ - static int parse_arg_resource(char *arg) { struct sanlk_resource *res; - char sub[SANLK_PATH_LEN + 1]; - char unit[SANLK_PATH_LEN + 1]; - int sub_count; - int colons; - int num_disks; - int have_lver; - int rv, i, j, d; - int len = strlen(arg); + int rv, i; if (com.res_count >= SANLK_MAX_RESOURCES) { log_tool("resource args over max %d", SANLK_MAX_RESOURCES); return -1; } - colons = 0; - for (i = 0; i < strlen(arg); i++) { - if (arg[i] == '\\') { - i++; - continue; - } - - if (arg[i] == ':') - colons++; - } - if (!colons || (colons == 2)) { - log_tool("invalid resource arg"); - return -1; - } - - num_disks = (colons - 1) / 2; - have_lver = (colons - 1) % 2; - - if (num_disks > MAX_DISKS) { - 
log_tool("invalid resource arg num_disks %d", num_disks); - return -1; - } - - rv = create_sanlk_resource(num_disks, &res); + rv = sanlock_str_to_res(arg, &res); if (rv < 0) { - log_tool("resource arg create error %d num_disks %d", rv, num_disks); + log_tool("resource arg parse error %d\n", rv); return rv; } com.res_args[com.res_count] = res; com.res_count++; - d = 0; - sub_count = 0; - j = 0; - memset(sub, 0, sizeof(sub)); - - for (i = 0; i < len + 1; i++) { - if (arg[i] == '\\') { - if (i == (len - 1)) { - log_tool("invalid resource arg string"); - goto fail; - } - - i++; - sub[j++] = arg[i]; - continue; - } - if (i < len && arg[i] != ':') { - if (j >= SANLK_PATH_LEN) { - log_tool("resource arg length error"); - goto fail; - } - sub[j++] = arg[i]; - continue; - } - - /* do something with sub when we hit ':' or end of arg, - first and second subs are lockspace and resource names, - then even sub is path, odd sub is offset */ - - if (sub_count < 2 && strlen(sub) > NAME_ID_SIZE) { - log_tool("option arg component %s too long", sub); - goto fail; - } - if (sub_count >= 2 && (strlen(sub) > SANLK_PATH_LEN-1 || strlen(sub) < 1)) { - log_tool("option arg component %s too long", sub); - goto fail; - } - - if (sub_count == 0) { - strncpy(res->lockspace_name, sub, NAME_ID_SIZE); - - } else if (sub_count == 1) { - strncpy(res->name, sub, NAME_ID_SIZE); - - } else if (!(sub_count % 2)) { - if (have_lver && (d == num_disks)) { - res->flags |= SANLK_RES_LVER; - res->lver = strtoull(sub, NULL, 0); - } else { - strncpy(res->disks[d].path, sub, SANLK_PATH_LEN - 1); - } - } else { - memset(&unit, 0, sizeof(unit)); - rv = sscanf(sub, "%llu%s", (unsigned long long *)&res->disks[d].offset, unit); - if (!rv || rv > 2) { - log_tool("lease arg offset error"); - goto fail; - } - if (rv > 1) { - if (unit[0] == 's') - res->disks[d].units = SANLK_UNITS_SECTORS; - else if (unit[0] == 'K' && unit[1] == 'B') - res->disks[d].units = SANLK_UNITS_KB; - else if (unit[0] == 'M' && unit[1] == 'B') - 
res->disks[d].units = SANLK_UNITS_MB; - else { - log_tool("unit unknkown: %s", unit); - goto fail; - } - } - d++; - } - - sub_count++; - j = 0; - memset(sub, 0, sizeof(sub)); - } - - log_debug("resource arg %s %s num_disks %d flags %x", - res->lockspace_name, res->name, res->num_disks, res->flags); - if (res->flags & SANLK_RES_LVER) - log_debug("resource lver %llu", (unsigned long long)res->lver); + log_debug("resource %s %s num_disks %d flags %x lver %llu", + res->lockspace_name, res->name, res->num_disks, res->flags, + (unsigned long long)res->lver); for (i = 0; i < res->num_disks; i++) { - log_debug("resource arg disk %s %llu %u", - res->disks[i].path, - (unsigned long long)res->disks[i].offset, - res->disks[i].units); + log_debug("resource disk %s %llu", res->disks[i].path, + (unsigned long long)res->disks[i].offset); } return 0; - - fail: - free(res); - return -1; } static void set_timeout(char *key, char *val) @@ -2186,8 +1963,7 @@ static void print_usage(void) printf(" <lockspace_name> name of lockspace\n"); printf(" <resource_name> name of resource being leased\n"); printf(" <path> disk path where resource leases are written\n"); - printf(" <offset>[s|KB|MB] offset on disk, default unit bytes\n"); - printf(" [s = sectors, KB = 1024 bytes, MB = 1024 KB]\n"); + printf(" <offset> offset on disk in bytes\n"); printf(" <lver> optional disk leader version of resource for acquire\n"); printf("\n"); } @@ -2416,10 +2192,21 @@ static int read_command_line(int argc, char *argv[]) static int do_client(void) { - struct sanlk_options *opt = NULL; - char *res_str = NULL; + struct sanlk_resource **res_args = NULL; + struct sanlk_resource *res; + char *res_state = NULL; int i, fd, rv = 0; + if (com.action == ACT_COMMAND || com.action == ACT_ACQUIRE) { + if (com.num_hosts) { + for (i = 0; i < com.res_count; i++) { + res = com.res_args[i]; + res->flags |= SANLK_RES_NUM_HOSTS; + res->data32 = com.num_hosts; + } + } + } + switch (com.action) { case ACT_STATUS: rv = 
sanlock_status(options.debug); @@ -2443,22 +2230,12 @@ static int do_client(void) if (fd < 0) goto out; - if (com.num_hosts) { - opt = malloc(sizeof(struct sanlk_options) + 16); - memset(opt, 0, sizeof(struct sanlk_options) + 16); - snprintf(opt->str, 15, "num_hosts=%d", com.num_hosts); - opt->flags = SANLK_OPT_NUM_HOSTS; - opt->len = strlen(opt->str); - } - log_tool("acquire fd %d", fd); - rv = sanlock_acquire(fd, -1, 0, com.res_count, com.res_args, opt); + rv = sanlock_acquire(fd, -1, 0, com.res_count, com.res_args, NULL); log_tool("acquire done %d", rv); if (rv < 0) goto out; - if (opt) - free(opt); if (!command[0]) { while (1) @@ -2497,29 +2274,38 @@ static int do_client(void) case ACT_INQUIRE: log_tool("inquire pid %d", com.pid); - rv = sanlock_inquire(-1, com.pid, SANLK_INQ_STRING, &com.res_count, (void *)&res_str); + rv = sanlock_inquire(-1, com.pid, 0, &com.res_count, &res_state); log_tool("inquire done %d res_count %d", rv, com.res_count); - if (res_str) { - log_tool("%s", res_str); - free(res_str); - } + if (rv < 0) + break; + log_tool("\"%s\"", res_state); if (!options.debug) break; - log_tool("inquire pid %d STRUCT", com.pid); - rv = sanlock_inquire(-1, com.pid, SANLK_INQ_STRUCT, &com.res_count, (void *)&com.res_args); - log_tool("inquire done %d res_count %d", rv, com.res_count); - if (com.res_args[0]) { - for (i = 0; i < com.res_count; i++) { - struct sanlk_resource *res = com.res_args[i]; - log_tool("%s:%s:%s:%llu:%llu", - res->lockspace_name, res->name, res->disks[0].path, - (unsigned long long)res->disks[0].offset, - (unsigned long long)res->lver); - free(res); - } + com.res_count = 0; + + rv = sanlock_state_to_args(res_state, &com.res_count, &res_args); + log_tool("\nstate_to_args done %d res_count %d", rv, com.res_count); + if (rv < 0) + break; + + free(res_state); + res_state = NULL; + + for (i = 0; i < com.res_count; i++) { + res = res_args[i]; + log_tool("\"%s:%s:%s:%llu:%llu\"", + res->lockspace_name, res->name, res->disks[0].path, + 
(unsigned long long)res->disks[0].offset, + (unsigned long long)res->lver); } + + rv = sanlock_args_to_state(com.res_count, res_args, &res_state); + log_tool("\nargs_to_state done %d", rv); + if (rv < 0) + break; + log_tool("\"%s\"", res_state); break; default: diff --git a/src/sanlock.h b/src/sanlock.h index 063b199..c4510d2 100644 --- a/src/sanlock.h +++ b/src/sanlock.h @@ -21,22 +21,30 @@ #define SANLK_PATH_LEN 1024 -/* disk offset units */ +/* + * max length of a sanlk_resource in string format + * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<lver> + * 48 SANLK_NAME_LEN + * + 1 colon + * + 48 SANLK_NAME_LEN + * + 1 colon + * + 4184 (4 MAX_DISKS * (1024 SANLK_PATH_LEN + 1 colon + 20 offset + 1 colon)) + * + 20 lver + * ------ + * 4302 + */ -#define SANLK_UNITS_BYTES 0 -#define SANLK_UNITS_SECTORS 1 -#define SANLK_UNITS_KB 2 -#define SANLK_UNITS_MB 3 +#define SANLK_MAX_RES_STR 4400 struct sanlk_disk { char path[SANLK_PATH_LEN]; /* must include terminating \0 */ uint64_t offset; - uint32_t units; uint32_t pad1; uint32_t pad2; }; -#define SANLK_RES_LVER 0x1 +#define SANLK_RES_LVER 0x1 /* lver field is set */ +#define SANLK_RES_NUM_HOSTS 0x2 /* data32 field is new num_hosts */ struct sanlk_resource { char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ @@ -54,8 +62,6 @@ struct sanlk_resource { /* command-specific command options (can include per resource data, but that requires the extra work of segmenting it by resource name) */ -#define SANLK_OPT_NUM_HOSTS 0x1 - struct sanlk_options { char owner_name[SANLK_NAME_LEN]; /* optional user friendly name */ uint32_t flags; diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 71fb347..4a1a2f4 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -28,7 +28,7 @@ /* max disks in a single lease */ -#define MAX_DISKS 8 +#define MAX_DISKS 4 /* default max number of hosts supported */ @@ -53,15 +53,13 @@ points to 1 leader_record + 1 request_record + 
MAX_HOSTS paxos_dblock's = 256 blocks = 128KB, ref: lease_item_record */ +/* must mirror external sanlk_disk */ + struct sync_disk { - /* mirror external sanlk_disk */ char path[SANLK_PATH_LEN]; uint64_t offset; - uint32_t units; - - /* internal */ - uint32_t sector_size; - int fd; + uint32_t sector_size; /* sanlk_disk pad1 */ + int fd; /* sanlk_disk pad2 */ }; /* Once token and token->disks are initialized by the main loop, the only diff --git a/src/sanlock_resource.h b/src/sanlock_resource.h index 12ec934..fe822fc 100644 --- a/src/sanlock_resource.h +++ b/src/sanlock_resource.h @@ -30,21 +30,49 @@ int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, int sanlock_release(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[]); +int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, + char **res_state); + + +/* + * Functions to convert between string and struct resource formats. + * All allocate space for returned data that the caller must free. + */ + + +/* + * convert from struct sanlk_resource to string with format: + * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<lver> + */ + +int sanlock_res_to_str(struct sanlk_resource *res, char **str_ret); + /* - * SANLK_INQ_STRING - * allocates and returns a state string, caller frees. + * convert to struct sanlk_resource from string with format: + * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...][:<lver>] + */ + +int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret); + +/* + * convert from array of struct sanlk_resource * to state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." - * RESOURCE = <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<version> - * - * SANLK_INQ_STRUCT - * allocates and returns an array of sanlk_resource structs, caller frees. - * [sanlk_resource][sanlk_disk...][sanlk_resource][sanlk_disk...]... 
+ * RESOURCE format in sanlock_res_to_str() comment */ -#define SANLK_INQ_STRING 0x1 -#define SANLK_INQ_STRUCT 0x2 +int sanlock_args_to_state(int res_count, + struct sanlk_resource *res_args[], + char **res_state); + +/* + * convert to array of struct sanlk_resource * from state string with format: + * "RESOURCE1 RESOURCE2 RESOURCE3 ..." + * RESOURCE format in sanlock_str_to_res() comment + */ + +int sanlock_state_to_args(char *res_state, + int *res_count, + struct sanlk_resource ***res_args); -int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, - void **res_out); #endif diff --git a/tests/res_string.c b/tests/res_string.c new file mode 100644 index 0000000..d1508c8 --- /dev/null +++ b/tests/res_string.c @@ -0,0 +1,80 @@ +#include <inttypes.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> + +#include "sanlock.h" +#include "sanlock_resource.h" + +void print_res(struct sanlk_resource *res) +{ + int i; + + printf("\"%s:%s", res->lockspace_name, res->name); + + for (i = 0; i < res->num_disks; i++) { + printf(":%s:%llu", res->disks[i].path, + (unsigned long long)res->disks[i].offset); + } + printf(":%llu\"\n", (unsigned long long)res->lver); +} + +int main(int argc, char *argv[]) +{ + struct sanlk_resource *res; + struct sanlk_resource **res_args = NULL; + char *state; + int res_count; + int rv, i; + + state = malloc(1024 * 1024); + memset(state, 0, 1024 * 1024); + + printf("\n"); + printf("sanlock_str_to_res\n", rv); + printf("--------------------------------------------------------------------------------\n"); + + for (i = 1; i < argc; i++) { + rv = sanlock_str_to_res(argv[i], &res); + + print_res(res); + + free(res); + res = NULL; + + if (i > 1) + strcat(state, " "); + strcat(state, argv[i]); + } + + printf("\n"); + printf("combined state\n"); + printf("--------------------------------------------------------------------------------\n"); + printf("\"%s\"\n", state); + + 
rv = sanlock_state_to_args(state, &res_count, &res_args); + + printf("\n"); + printf("sanlock_state_to_args %d res_count %d\n", rv, res_count); + printf("--------------------------------------------------------------------------------\n"); + for (i = 0; i < res_count; i++) { + res = res_args[i]; + print_res(res); + } + + free(state); + state = NULL; + + rv = sanlock_args_to_state(res_count, res_args, &state); + + printf("\n"); + printf("sanlock_args_to_state %d\n", rv); + printf("--------------------------------------------------------------------------------\n"); + printf("\"%s\"\n", state); + + return 0; +} +

13 years, 1 month

1
0
0 / 0

src/client_admin.c src/client_resource.c src/main.c src/sanlock_resource.h

by David Teigland

src/client_admin.c | 12 +-------- src/client_resource.c | 43 +++++++++-------------------------- src/main.c | 59 ++++++++++++++++++++++++++++++++++--------------- src/sanlock_resource.h | 2 + 4 files changed, 57 insertions(+), 59 deletions(-) New commits: commit 4af02ff18b1c54c86cdb4008486a118c5560b838 Author: David Teigland <teigland(a)redhat.com> Date: Thu Mar 17 17:13:53 2011 -0500 sanlock: clean up command return data - consistently use h.data as single success/failure 0/-1 command result - consistently set h.data2 as number of items returned - don't return extra result data that's not used diff --git a/src/client_admin.c b/src/client_admin.c index f89fb1d..6490d36 100644 --- a/src/client_admin.c +++ b/src/client_admin.c @@ -35,22 +35,14 @@ int sanlock_shutdown(void) { struct sm_header h; - int fd, rv; + int fd; fd = send_command(SM_CMD_SHUTDOWN, 0); if (fd < 0) return fd; - memset(&h, 0, sizeof(h)); - - rv = recv(fd, &h, sizeof(h), MSG_WAITALL); - if (rv != sizeof(h)) - rv = -1; - else - rv = 0; - close(fd); - return rv; + return 0; } int sanlock_log_dump(void) diff --git a/src/client_resource.c b/src/client_resource.c index 087235b..57b9588 100644 --- a/src/client_resource.c +++ b/src/client_resource.c @@ -129,19 +129,16 @@ int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, } } + /* get result */ + memset(&h, 0, sizeof(h)); - rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); + rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; } - - if (h.data != res_count) { - rv = -1; - goto out; - } - rv = 0; + rv = (int)h.data; out: if (sock == -1) close(fd); @@ -180,9 +177,11 @@ int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, void **re if (rv < 0) return rv; + /* get result */ + memset(&h, 0, sizeof(h)); - rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); + rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; @@ -207,19 +206,13 @@ int 
sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, void **re goto out; } - if (h.data) { - free(reply_data); - rv = (int)h.data; - goto out; - } - if (res_out) *res_out = reply_data; else free(reply_data); *res_count = (int)h.data2; - rv = 0; + rv = (int)h.data; out: if (sock == -1) close(fd); @@ -234,7 +227,6 @@ int sanlock_release(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[]) { struct sm_header h; - int results[SANLK_MAX_RESOURCES]; int fd, rv, i, data2, datalen; if (sock == -1) { @@ -268,27 +260,16 @@ int sanlock_release(int sock, int pid, uint32_t flags, int res_count, } } + /* get result */ + memset(&h, 0, sizeof(h)); - memset(&results, 0, sizeof(results)); - rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); + rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; } - - rv = recv(fd, &results, sizeof(int) * res_count, MSG_WAITALL); - if (rv != sizeof(int) * res_count) { - rv = -1; - goto out; - } - - rv = 0; - for (i = 0; i < res_count; i++) { - if (results[i] != 1) { - rv = -1; - } - } + rv = (int)h.data; out: if (sock == -1) close(fd); diff --git a/src/main.c b/src/main.c index 386ad67..ee4566e 100644 --- a/src/main.c +++ b/src/main.c @@ -844,7 +844,8 @@ static void *cmd_acquire_thread(void *args_in) memcpy(&h, &ca->header, sizeof(struct sm_header)); h.length = sizeof(h); - h.data = new_tokens_count; + h.data = 0; + h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); client_back(ca->ci_in, fd); @@ -886,7 +887,8 @@ static void *cmd_acquire_thread(void *args_in) memcpy(&h, &ca->header, sizeof(struct sm_header)); h.length = sizeof(h); - h.data = rv; + h.data = -1; + h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (pid_dead) @@ -903,9 +905,8 @@ static void *cmd_release_thread(void *args_in) struct sm_header h; struct token *token; struct sanlk_resource res; - int results[SANLK_MAX_RESOURCES]; struct client *cl; - int fd, rv, i, j, found, rem_tokens_count; + int 
fd, rv, i, j, found, result = 0; cl = &client[ca->ci_target]; fd = client[ca->ci_in].fd; @@ -913,15 +914,31 @@ static void *cmd_release_thread(void *args_in) log_debug("cmd_release ci_in %d ci_target %d pid %d", ca->ci_in, ca->ci_target, cl->pid); - memset(results, 0, sizeof(results)); - rem_tokens_count = ca->header.data; + /* caller wants to release all resources */ - for (i = 0; i < rem_tokens_count; i++) { + if (ca->header.cmd_flags & SANLK_REL_ALL) { + for (j = 0; j < SANLK_MAX_RESOURCES; j++) { + token = cl->tokens[j]; + if (!token) + continue; + + rv = release_token(token); + if (rv < 0) + result = -1; + free_token(token); + cl->tokens[j] = NULL; + } + goto reply; + } + + /* caller is specifying specific resources to release */ + + for (i = 0; i < ca->header.data; i++) { rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_release recv fd %d %d %d", fd, rv, errno); - results[i] = -1; - break; + result = -1; + goto reply; } found = 0; @@ -937,9 +954,10 @@ static void *cmd_release_thread(void *args_in) continue; rv = release_token(token); + if (rv < 0) + result = -1; free_token(token); cl->tokens[j] = NULL; - results[i] = rv; found = 1; break; } @@ -947,18 +965,20 @@ static void *cmd_release_thread(void *args_in) if (!found) { log_error("cmd_release pid %d no resource %s", cl->pid, res.name); - results[i] = -ENOENT; + result = -1; } } + reply: set_cmd_active(ca->ci_target, 0); - log_debug("cmd_release done %d", rem_tokens_count); + log_debug("cmd_release done"); memcpy(&h, &ca->header, sizeof(struct sm_header)); - h.length = sizeof(h) + sizeof(int) * rem_tokens_count; - send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); - send(fd, &results, sizeof(int) * rem_tokens_count, MSG_NOSIGNAL); + h.length = sizeof(h); + h.data = result; + h.data2 = 0; + send(fd, &h, sizeof(h), MSG_NOSIGNAL); client_back(ca->ci_in, fd); free(ca); @@ -1066,7 +1086,6 @@ static void *cmd_inquire_thread(void *args_in) 
result, total, pos, reply_str_len); memcpy(&h, &ca->header, sizeof(struct sm_header)); - h.data = result; h.data2 = total; @@ -1143,6 +1162,7 @@ static void *cmd_add_lockspace_thread(void *args_in) memcpy(&h, &ca->header, sizeof(struct sm_header)); h.length = sizeof(h); h.data = result; + h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); client_back(ca->ci_in, fd); @@ -1190,6 +1210,7 @@ static void *cmd_rem_lockspace_thread(void *args_in) memcpy(&h, &ca->header, sizeof(struct sm_header)); h.length = sizeof(h); h.data = result; + h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); client_back(ca->ci_in, fd); @@ -1441,7 +1462,8 @@ static void process_cmd_thread_lockspace(int ci_in, struct sm_header *h_recv) memcpy(&h, h_recv, sizeof(struct sm_header)); h.length = sizeof(h); h.data = rv; - send(client[ci_in].fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); + h.data2 = 0; + send(client[ci_in].fd, &h, sizeof(h), MSG_NOSIGNAL); close(client[ci_in].fd); } @@ -1520,7 +1542,8 @@ static void process_cmd_thread_resource(int ci_in, struct sm_header *h_recv) memcpy(&h, h_recv, sizeof(struct sm_header)); h.length = sizeof(h); h.data = rv; - send(client[ci_in].fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); + h.data2 = 0; + send(client[ci_in].fd, &h, sizeof(h), MSG_NOSIGNAL); client_back(ci_in, client[ci_in].fd); } diff --git a/src/sanlock_resource.h b/src/sanlock_resource.h index 2d6d293..12ec934 100644 --- a/src/sanlock_resource.h +++ b/src/sanlock_resource.h @@ -25,6 +25,8 @@ int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[], struct sanlk_options *opt_in); +#define SANLK_REL_ALL 0x1 + int sanlock_release(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[]);

13 years, 1 month

1
0
0 / 0

src/main.c src/Makefile src/sanlock_internal.h src/watchdog.c src/watchdog.h

by David Teigland

src/Makefile | 3 src/main.c | 16 --- src/sanlock_internal.h | 3 src/watchdog.c | 208 ------------------------------------------------- src/watchdog.h | 2 5 files changed, 1 insertion(+), 231 deletions(-) New commits: commit ea7129df969f1f6eebcb6191f7e65d20d43bbdd9 Author: David Teigland <teigland(a)redhat.com> Date: Tue Mar 15 16:34:27 2011 -0500 sanlock: remove wdtest now that we're using libwdmd diff --git a/src/Makefile b/src/Makefile index 31df6d2..3cbd0e0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -51,8 +51,7 @@ CFLAGS += -D_GNU_SOURCE -g \ -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions \ -fasynchronous-unwind-tables \ - -fdiagnostics-show-option \ - -DUSE_WDMD + -fdiagnostics-show-option CMD_LDFLAGS = -lpthread -lrt -lblkid -lsanlock -lwdmd diff --git a/src/main.c b/src/main.c index 91c930c..9a15a9d 100644 --- a/src/main.c +++ b/src/main.c @@ -1805,10 +1805,6 @@ static int make_dirs(void) if (rv < 0 && errno != EEXIST) goto out; - rv = mkdir(SANLK_WDTEST_DIR, 0777); - if (rv < 0 && errno != EEXIST) - goto out; - rv = 0; out: umask(old_umask); @@ -2216,7 +2212,6 @@ static void print_usage(void) printf(" daemon start daemon\n"); printf(" client send request to daemon (default type if none given)\n"); printf(" direct access storage directly (no coordination with daemon)\n"); - printf(" wdtest watchdog test for expired host_id lease\n"); printf("\n"); printf("client actions: ask daemon to:\n"); printf(" status send internal state\n"); @@ -2363,10 +2358,6 @@ static int read_command_line(int argc, char *argv[]) com.type = COM_CLIENT; act = argv[2]; i = 3; - } else if (!strcmp(arg1, "wdtest")) { - com.type = COM_WDTEST; - act = argv[2]; - i = 3; } else { com.type = COM_CLIENT; act = argv[1]; @@ -2426,9 +2417,6 @@ static int read_command_line(int argc, char *argv[]) exit(EXIT_FAILURE); } break; - - case COM_WDTEST: - break; }; @@ -2747,10 +2735,6 @@ int main(int argc, char *argv[]) case COM_DIRECT: rv = do_direct(); break; - - case COM_WDTEST: - rv = do_wdtest(); - 
break; }; out: return rv; diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 493954f..bf47e3a 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -42,7 +42,6 @@ #define SANLK_RUN_DIR "/var/run/sanlock" #define SANLK_LOG_DIR "/var/log" -#define SANLK_WDTEST_DIR "/var/run/sanlock/wdtest" #define SANLK_SOCKET_NAME "sanlock_sock" #define SANLK_LOGFILE_NAME "sanlock.log" #define SANLK_LOCKFILE_NAME "sanlock.pid" @@ -126,7 +125,6 @@ struct space { pthread_cond_t cond; struct lease_status lease_status; int wd_fd; - char wdtest_path[PATH_MAX]; }; struct sm_header { @@ -306,7 +304,6 @@ struct command_line { #define COM_DAEMON 1 #define COM_CLIENT 2 #define COM_DIRECT 3 -#define COM_WDTEST 4 enum { ACT_STATUS = 1, diff --git a/src/watchdog.c b/src/watchdog.c index e92ad74..9f16fa7 100644 --- a/src/watchdog.c +++ b/src/watchdog.c @@ -37,8 +37,6 @@ * after the pid goes ahead. */ -#ifdef USE_WDMD - #include "../wdmd/wdmd.h" static int daemon_wdmd_con; @@ -193,209 +191,3 @@ int setup_watchdog(void) return -1; } -int do_wdtest(void) -{ - return -1; -} - -#else - -#define BUF_SIZE 128 - -static int do_write(int fd, void *buf, size_t count) -{ - int rv, off = 0; - - retry: - rv = write(fd, (char *)buf + off, count); - if (rv == -1 && errno == EINTR) - goto retry; - if (rv < 0) { - return -1; - } - - if (rv != count) { - count -= rv; - off += rv; - goto retry; - } - return 0; -} - -void update_watchdog_file(struct space *sp, uint64_t timestamp) -{ - char buf[BUF_SIZE]; - - if (!options.use_watchdog) - return; - - memset(buf, 0, sizeof(buf)); - snprintf(buf, sizeof(buf), "renewal %llu expire %llu\n", - (unsigned long long)timestamp, - (unsigned long long)timestamp + to.host_id_renewal_fail_seconds); - - lseek(sp->wd_fd, 0, SEEK_SET); - - do_write(sp->wd_fd, buf, sizeof(buf)); -} - -int create_watchdog_file(struct space *sp, uint64_t timestamp) -{ - char buf[BUF_SIZE]; - int rv, fd; - - if (!options.use_watchdog) - return 0; - - 
snprintf(sp->wdtest_path, PATH_MAX, "%s/%s_hostid%llu", - SANLK_WDTEST_DIR, sp->space_name, - (unsigned long long)sp->host_id); - - /* If this open fails with EEXIST I don't think it's safe to unlink - * watchdog_path and try again. If the daemon had failed while pid's - * remained running, then the daemon is restarted (before watchdog - * triggers) and we start renewing host_id again and get here. If we - * were to unlink the wd file right here, and then the daemon failed - * again, we'd possibly be left with pid's running that had been - * connected to the previous daemon instance, and the watchdog file - * unlinked, so the watchdog won't reset us. - * - * If the open fails with EEXIST we could open the existing file and go - * on, although there's currently no mechanism to reattach to any - * running pid's we're supposed to be supervising. */ - - fd = open(sp->wdtest_path, O_WRONLY|O_CREAT|O_EXCL|O_NONBLOCK, 0666); - if (fd < 0) { - log_erros(sp, "create_watchdog_file open %s error %d", - sp->wdtest_path, errno); - return fd; - } - - memset(buf, 0, sizeof(buf)); - snprintf(buf, sizeof(buf), "renewal %llu expire %llu\n", - (unsigned long long)timestamp, - (unsigned long long)timestamp + to.host_id_renewal_fail_seconds); - - rv = do_write(fd, buf, sizeof(buf)); - if (rv < 0) { - log_erros(sp, "create_watchdog_file write error %d", rv); - close(fd); - return rv; - } - - sp->wd_fd = fd; - return 0; -} - -void unlink_watchdog_file(struct space *sp) -{ - if (!options.use_watchdog) - return; - - unlink(sp->wdtest_path); -} - -void close_watchdog_file(struct space *sp) -{ - close(sp->wd_fd); -} - -void close_watchdog(void) -{ -} - -int setup_watchdog(void) -{ - DIR *d; - struct dirent *de; - int rv = 0; - - if (!options.use_watchdog) - return 0; - - d = opendir(SANLK_WDTEST_DIR); - if (!d) - return 0; - - while ((de = readdir(d))) { - if (de->d_name[0] == '.') - continue; - - log_error("stale wdtest file: %s/%s", - SANLK_WDTEST_DIR, de->d_name); - rv = -1; - } - 
closedir(d); - - return rv; -} - -int do_wdtest(void) -{ - DIR *d; - struct dirent *de; - char path[PATH_MAX]; - char buf[BUF_SIZE]; - unsigned long long renewal = 0, expire = 0; - time_t t; - int fail_count = 0; - int rv, fd; - - openlog("sanlock_wdtest", LOG_CONS | LOG_PID, LOG_USER); - - d = opendir(SANLK_WDTEST_DIR); - if (!d) - return 0; - - while ((de = readdir(d))) { - if (de->d_name[0] == '.') - continue; - - snprintf(path, PATH_MAX-1, "%s/%s", - SANLK_WDTEST_DIR, de->d_name); - - fd = open(path, O_RDONLY|O_NONBLOCK, 0666); - if (fd < 0) { - syslog(LOG_ERR, "open error %s", path); - continue; - } - - memset(buf, 0, sizeof(buf)); - rv = read(fd, buf, sizeof(buf)); - if (rv < 0) { - syslog(LOG_ERR, "read error %s", path); - close(fd); - continue; - } - - close(fd); - - sscanf(buf, "renewal %llu expire %llu", &renewal, &expire); - - t = time(NULL); - - /* TODO: remove this line, just for debugging */ - syslog(LOG_ERR, "%s renewal %llu expire %llu now %llu", - path, - (unsigned long long)renewal, - (unsigned long long)expire, - (unsigned long long)t); - - if (t < expire) - continue; - - syslog(LOG_CRIT, "%s test fail renewal %llu expire %llu now %llu", - path, - (unsigned long long)renewal, - (unsigned long long)expire, - (unsigned long long)t); - - fail_count++; - } - closedir(d); - - if (fail_count) - return -1; - return 0; -} -#endif diff --git a/src/watchdog.h b/src/watchdog.h index 4e05c55..7f8cb1f 100644 --- a/src/watchdog.h +++ b/src/watchdog.h @@ -17,6 +17,4 @@ void close_watchdog_file(struct space *sp); int setup_watchdog(void); void close_watchdog(void); -int do_wdtest(void); - #endif

13 years, 1 month

1
0
0 / 0

src/client_admin.c src/client_msg.c src/client_resource.c src/diskio.c src/host_id.c src/main.c src/paxos_lease.c src/watchdog.c

by David Teigland

src/client_admin.c | 16 ++++++++-------- src/client_msg.c | 14 +++++++------- src/client_resource.c | 24 ++++++++++++------------ src/diskio.c | 16 ++++++++-------- src/host_id.c | 16 ++++++++-------- src/main.c | 2 +- src/paxos_lease.c | 38 +++++++++++++++++--------------------- src/watchdog.c | 4 +++- 8 files changed, 64 insertions(+), 66 deletions(-) New commits: commit 67679d023967542150f66f8a2758810e05593cf6 Author: David Teigland <teigland(a)redhat.com> Date: Tue Mar 15 16:23:38 2011 -0500 sanlock: various fixes - when freeing a disk lease in paxos_lease_release, use the proper paxos_lease_leader_read so we don't try to free the lease if everything isn't ok (e.g. we were trying to free it even after failing to read it.) - quit doing "return -errno;" everywhere. There were some places where it resulted in returning 0 when we shouldn't, (e.g. when recv returns 0 after receiving nothing when the connection was closed), which can be very bad, (e.g. from sanlock_acquire(), indicating the lease is acquired when it isn't.) 
- I think finally properly synchronize unlink_watchdog_file and update_watchdog_file diff --git a/src/client_admin.c b/src/client_admin.c index acd9dba..e801694 100644 --- a/src/client_admin.c +++ b/src/client_admin.c @@ -45,7 +45,7 @@ int sanlock_shutdown(void) rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) - rv = -errno; + rv = -1; else rv = 0; @@ -67,7 +67,7 @@ int sanlock_log_dump(void) rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } @@ -82,7 +82,7 @@ int sanlock_log_dump(void) rv = recv(fd, buf, len, MSG_WAITALL); if (rv != len) { - rv = -errno; + rv = -1; goto out; } @@ -178,7 +178,7 @@ int sanlock_status(int debug) rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) - return -errno; + return -1; while (1) { @@ -186,12 +186,12 @@ int sanlock_status(int debug) if (!rv) break; if (rv != sizeof(st)) - return -errno; + return -1; if (st.str_len) { rv = recv(fd, str, st.str_len, MSG_WAITALL); if (rv != st.str_len) - return -errno; + return -1; } switch (st.type) { @@ -228,7 +228,7 @@ static int cmd_lockspace(int cmd, struct sanlk_lockspace *ls, uint32_t flags) rv = send(fd, (void *)ls, sizeof(struct sanlk_lockspace), 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } @@ -236,7 +236,7 @@ static int cmd_lockspace(int cmd, struct sanlk_lockspace *ls, uint32_t flags) rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } diff --git a/src/client_msg.c b/src/client_msg.c index 5dc6673..d752af6 100644 --- a/src/client_msg.c +++ b/src/client_msg.c @@ -50,7 +50,7 @@ int setup_listener_socket(int *listener_socket) s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) - return -errno; + return -1; rv = get_socket_address(&addr); if (rv < 0) @@ -59,21 +59,21 @@ int setup_listener_socket(int *listener_socket) unlink(addr.sun_path); rv = bind(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { - rv = -errno; + 
rv = -1; close(s); return rv; } rv = listen(s, 5); if (rv < 0) { - rv = -errno; + rv = -1; close(s); return rv; } rv = fchmod(s, 666); if (rv < 0) { - rv = -errno; + rv = -1; close(s); return rv; } @@ -88,7 +88,7 @@ int connect_socket(int *sock_fd) s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) - return -errno; + return -1; rv = get_socket_address(&addr); if (rv < 0) @@ -96,7 +96,7 @@ int connect_socket(int *sock_fd) rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { - rv = -errno; + rv = -1; close(s); return rv; } @@ -118,7 +118,7 @@ int send_header(int sock, int cmd, int datalen, uint32_t data, uint32_t data2) rv = send(sock, (void *) &header, sizeof(struct sm_header), 0); if (rv < 0) - return -errno; + return -1; return 0; } diff --git a/src/client_resource.c b/src/client_resource.c index ab17b7e..ef19326 100644 --- a/src/client_resource.c +++ b/src/client_resource.c @@ -104,27 +104,27 @@ int sanlock_acquire(int sock, int pid, int res_count, res = res_args[i]; rv = send(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } rv = send(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } } rv = send(fd, &opt, sizeof(struct sanlk_options), 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } if (opt.len) { rv = send(fd, opt_in->str, opt.len, 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } } @@ -133,7 +133,7 @@ int sanlock_acquire(int sock, int pid, int res_count, rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } @@ -177,7 +177,7 @@ int sanlock_migrate(int sock, int pid, uint64_t target_host_id, char **state) rv = send(fd, &target_host_id, sizeof(uint64_t), 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } @@ -185,7 +185,7 @@ int sanlock_migrate(int sock, int pid, uint64_t target_host_id, char **state) rv = recv(fd, &h, sizeof(struct 
sm_header), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } @@ -199,7 +199,7 @@ int sanlock_migrate(int sock, int pid, uint64_t target_host_id, char **state) rv = recv(fd, reply_str, len, MSG_WAITALL); if (rv != len) { free(reply_str); - rv = -errno; + rv = -1; goto out; } @@ -255,7 +255,7 @@ int sanlock_release(int sock, int pid, int res_count, for (i = 0; i < res_count; i++) { rv = send(fd, res_args[i], sizeof(struct sanlk_resource), 0); if (rv < 0) { - rv = -errno; + rv = -1; goto out; } } @@ -265,13 +265,13 @@ int sanlock_release(int sock, int pid, int res_count, rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } rv = recv(fd, &results, sizeof(int) * res_count, MSG_WAITALL); if (rv != sizeof(int) * res_count) { - rv = -errno; + rv = -1; goto out; } @@ -317,7 +317,7 @@ int sanlock_setowner(int sock, int pid) rv = recv(fd, &h, sizeof(struct sm_header), MSG_WAITALL); if (rv != sizeof(h)) { - rv = -errno; + rv = -1; goto out; } diff --git a/src/diskio.c b/src/diskio.c index a1bf92c..16b615a 100644 --- a/src/diskio.c +++ b/src/diskio.c @@ -149,14 +149,14 @@ static int do_write(int fd, uint64_t offset, const char *buf, int len) ret = lseek(fd, offset, SEEK_SET); if (ret != offset) - return -errno; + return -1; retry: rv = write(fd, buf + pos, len); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) - return -errno; + return -1; /* if (rv != len && len == sector_size) return error? 
partial sector writes should not happen AFAIK, and @@ -178,7 +178,7 @@ static int do_read(int fd, uint64_t offset, char *buf, int len) ret = lseek(fd, offset, SEEK_SET); if (ret != offset) - return -errno; + return -1; while (pos < len) { rv = read(fd, buf + pos, len - pos); @@ -187,7 +187,7 @@ static int do_read(int fd, uint64_t offset, char *buf, int len) if (rv == -1 && errno == EINTR) continue; if (rv < 0) - return -errno; + return -1; pos += rv; } @@ -215,7 +215,7 @@ static int do_write_aio(int fd, uint64_t offset, char *buf, int len, rv = aio_write(&cb); if (rv < 0) - return -errno; + return -1; rv = aio_suspend(&p_cb, 1, &ts); if (!rv) @@ -225,7 +225,7 @@ static int do_write_aio(int fd, uint64_t offset, char *buf, int len, rv = aio_cancel(fd, &cb); if (rv < 0) - return -errno; + return -1; if (rv == AIO_ALLDONE) return 0; @@ -264,7 +264,7 @@ static int do_read_aio(int fd, uint64_t offset, char *buf, int len, int io_timeo rv = aio_read(&cb); if (rv < 0) - return -errno; + return -1; rv = aio_suspend(&p_cb, 1, &ts); if (!rv) @@ -274,7 +274,7 @@ static int do_read_aio(int fd, uint64_t offset, char *buf, int len, int io_timeo rv = aio_cancel(fd, &cb); if (rv < 0) - return -errno; + return -1; if (rv == AIO_ALLDONE) return 0; diff --git a/src/host_id.c b/src/host_id.c index f6f7b83..b3dc51f 100644 --- a/src/host_id.c +++ b/src/host_id.c @@ -235,6 +235,7 @@ static void *host_id_thread(void *arg_in) result = delta_lease_renew(sp, &sp->host_id_disk, sp->space_name, our_host_id, sp->host_id, &leader); + dl_result = result; t = leader.timestamp; pthread_mutex_lock(&sp->mutex); @@ -251,20 +252,19 @@ static void *host_id_thread(void *arg_in) sp->lease_status.max_renewal_interval = good_diff; sp->lease_status.max_renewal_time = t; } - } - pthread_mutex_unlock(&sp->mutex); - if (result < 0) { - log_erros(sp, "host_id %llu renewal error %d last good %llu", - (unsigned long long)sp->host_id, result, - (unsigned long long)sp->lease_status.renewal_good_time); - } else { 
log_space(sp, "host_id %llu renewal %llu interval %d", (unsigned long long)sp->host_id, (unsigned long long)t, good_diff); - update_watchdog_file(sp, t); + if (!sp->thread_stop) + update_watchdog_file(sp, t); + } else { + log_erros(sp, "host_id %llu renewal error %d last good %llu", + (unsigned long long)sp->host_id, result, + (unsigned long long)sp->lease_status.renewal_good_time); } + pthread_mutex_unlock(&sp->mutex); } /* unlink called below to get it done ASAP */ diff --git a/src/main.c b/src/main.c index b80f87f..91c930c 100644 --- a/src/main.c +++ b/src/main.c @@ -397,9 +397,9 @@ static int main_loop(void) log_space(sp, "set thread_stop"); pthread_mutex_lock(&sp->mutex); sp->thread_stop = 1; + unlink_watchdog_file(sp); pthread_cond_broadcast(&sp->cond); pthread_mutex_unlock(&sp->mutex); - unlink_watchdog_file(sp); list_move(&sp->list, &spaces_remove); } else { kill_pids(sp); diff --git a/src/paxos_lease.c b/src/paxos_lease.c index f693f62..9f815d6 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -854,44 +854,40 @@ int paxos_lease_release(struct token *token, struct leader_record *leader_last, struct leader_record *leader_ret) { - struct leader_record new_leader; - int rv, d; + struct leader_record leader; int error; - for (d = 0; d < token->num_disks; d++) { - memset(&new_leader, 0, sizeof(struct leader_record)); - - rv = read_leader(&token->disks[d], &new_leader); - if (rv < 0) - continue; + error = paxos_lease_leader_read(token, &leader); + if (error < 0) { + log_errot(token, "release error cannot read leader"); + goto out; + } - if (memcmp(&new_leader, leader_last, - sizeof(struct leader_record))) { - log_errot(token, "release error leader changed"); - return DP_BAD_LEADER; - } + if (memcmp(&leader, leader_last, sizeof(struct leader_record))) { + log_errot(token, "release error leader changed"); + return DP_BAD_LEADER; } - if (new_leader.owner_id != token->host_id) { + if (leader.owner_id != token->host_id) { log_errot(token, "release error other 
owner_id %llu", - (unsigned long long)new_leader.owner_id); + (unsigned long long)leader.owner_id); return DP_OTHER_OWNER; } - if (new_leader.next_owner_id) { + if (leader.next_owner_id) { log_errot(token, "release error next_owner_id %llu", - (unsigned long long)new_leader.next_owner_id); + (unsigned long long)leader.next_owner_id); return DP_LEADER_MIGRATE; } - new_leader.timestamp = LEASE_FREE; - new_leader.checksum = leader_checksum(&new_leader); + leader.timestamp = LEASE_FREE; + leader.checksum = leader_checksum(&leader); - error = write_new_leader(token, &new_leader); + error = write_new_leader(token, &leader); if (error < 0) goto out; - memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); + memcpy(leader_ret, &leader, sizeof(struct leader_record)); out: return error; } diff --git a/src/watchdog.c b/src/watchdog.c index 563654e..e92ad74 100644 --- a/src/watchdog.c +++ b/src/watchdog.c @@ -102,6 +102,8 @@ void unlink_watchdog_file(struct space *sp) if (!options.use_watchdog) return; + log_space(sp, "wdmd_test_live 0 0 to disable"); + rv = wdmd_test_live(sp->wd_fd, 0, 0); if (rv < 0) log_erros(sp, "wdmd_test_live failed %d", rv); @@ -209,7 +211,7 @@ static int do_write(int fd, void *buf, size_t count) if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) { - return -errno; + return -1; } if (rv != count) {

13 years, 1 month

1
0
0 / 0

src/main.c

by David Teigland

src/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) New commits: commit 285ab4a63c6e4fee96983a7648c28081172ac479 Author: David Teigland <teigland(a)redhat.com> Date: Mon Mar 14 13:31:28 2011 -0500 sanlock: fix wdmd stop order Need to stop wdmd renewal thread before disabling wdmd for a given resource; otherwise, the wd test can be reenabled right after we disable it. diff --git a/src/main.c b/src/main.c index f90b6d9..b80f87f 100644 --- a/src/main.c +++ b/src/main.c @@ -394,12 +394,12 @@ static int main_loop(void) list_for_each_entry_safe(sp, safe, &spaces, list) { if (sp->killing_pids) { if (all_pids_dead(sp)) { - unlink_watchdog_file(sp); log_space(sp, "set thread_stop"); pthread_mutex_lock(&sp->mutex); sp->thread_stop = 1; pthread_cond_broadcast(&sp->cond); pthread_mutex_unlock(&sp->mutex); + unlink_watchdog_file(sp); list_move(&sp->list, &spaces_remove); } else { kill_pids(sp);

13 years, 1 month

1
0
0 / 0

src/token_manager.c tests/devcount.c

by David Teigland

src/token_manager.c | 19 ++++++++++++++++--- tests/devcount.c | 11 +++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) New commits: commit 3db854c867ae53aa08f1d9c1476dc5a7c8cd991c Author: David Teigland <teigland(a)redhat.com> Date: Thu Mar 10 14:21:34 2011 -0600 sanlock: migrate to target fix fix check when source gives target host_id to migrate add another validation check for setowner incoming have devcount also test specified migrate target diff --git a/src/token_manager.c b/src/token_manager.c index bc86b07..5b4ea53 100644 --- a/src/token_manager.c +++ b/src/token_manager.c @@ -364,7 +364,7 @@ static int check_incoming_state(struct token *token, char *opt_str, leader_src.lver == leader_ret.lver && leader_src.timestamp == leader_ret.timestamp) { log_token(token, "check_incoming_state all match"); - return DP_OK; + goto out_ok; } else { log_errot(token, "check_incoming_state mismatch " "next_owner %llu %llu %llu " @@ -399,6 +399,8 @@ static int check_incoming_state(struct token *token, char *opt_str, return -1; } + out_ok: + memcpy(&token->leader, &leader_ret, sizeof(struct leader_record)); return DP_OK; } @@ -495,7 +497,18 @@ static int setowner_token_incoming(struct token *token) if (rv < 0) return rv; - if (token->leader.next_owner_id != leader.next_owner_id) { + /* the owner should be the same (the source) as when we last read + the leader in incoming_token */ + + if (token->leader.owner_id != leader.owner_id) { + log_errot(token, "setowner incoming bad owner %llu %llu", + (unsigned long long)token->leader.owner_id, + (unsigned long long)leader.owner_id); + return DP_ERROR; + } + + if (token->leader.next_owner_id != leader.next_owner_id || + token->host_id != leader.next_owner_id) { log_errot(token, "setowner incoming bad next_owner %llu %llu", (unsigned long long)token->leader.next_owner_id, (unsigned long long)leader.next_owner_id); @@ -529,7 +542,7 @@ int setowner_token(struct token *token) { int rv; - log_token(token, "setowner incoming %d 
migrating %d", + log_token(token, "setowner migrating %d incoming %d", token->migrating, token->incoming); if (token->migrating) { diff --git a/tests/devcount.c b/tests/devcount.c index b83a49a..d60e4f9 100644 --- a/tests/devcount.c +++ b/tests/devcount.c @@ -659,7 +659,7 @@ static int do_migrate(int argc, char *argv[]) struct sanlk_lockspace lockspace; struct sanlk_resource *res; struct sanlk_options *opt; - int i, j, pid, rv, sock, len, status, init; + int i, j, pid, rv, sock, len, status, init, target; uint32_t parent_pid = getpid(); if (argc < MIGRATE_ARGS) @@ -776,7 +776,14 @@ static int do_migrate(int argc, char *argv[]) sleep(10); - rv = sanlock_migrate(-1, pid, 0, &state); + /* exercise both migrate options: giving target on host or not */ + + if (rand_int(1,3) == 1) + target = (our_hostid % max_hostid) + 1; + else + target = 0; + + rv = sanlock_migrate(-1, pid, target, &state); if (rv < 0 || !state) { printf("%d sanlock_migrate error %d\n", parent_pid, rv); goto fail;

13 years, 1 month

1
0
0 / 0

src/main.c src/sanlock_internal.h src/token_manager.c src/token_manager.h tests/devcount.c

by David Teigland

src/main.c | 70 +++----- src/sanlock_internal.h | 6 src/token_manager.c | 387 ++++++++++++++++++++++++++++++++++--------------- src/token_manager.h | 7 tests/devcount.c | 7 5 files changed, 310 insertions(+), 167 deletions(-) New commits: commit 6b26b667fdaca59e91c31522b03f88bce42aad18 Author: David Teigland <teigland(a)redhat.com> Date: Wed Mar 9 17:22:23 2011 -0600 sanlock: setowner improvements Allow setowner to be used on either migrate destination (migrate succeeded) or migrate source (migrate failed). Also improve the organization and structure of the migration code a bit to make it cleaner. diff --git a/src/main.c b/src/main.c index f0e549c..f90b6d9 100644 --- a/src/main.c +++ b/src/main.c @@ -192,7 +192,7 @@ static void client_pid_dead(int ci) int delay_release = 0; int i, pid; - log_debug("client_pid_dead ci %d pid %d", ci, cl->pid); + log_debug("client_pid_dead ci %d fd %d pid %d", ci, cl->fd, cl->pid); /* cmd_acquire_thread may still be waiting for the tokens to be acquired. if it is, tell it to release them when @@ -427,9 +427,6 @@ static int main_loop(void) return 0; } -/* FIXME: allow setowner on the source to "cancel" a migration by clearing - next_owner. 
This means allowing CMD_SETOWNER when cmd_active == CMD_MIGRATE */ - static int set_cmd_active(int ci_target, int cmd) { struct client *cl = &client[ci_target]; @@ -448,6 +445,12 @@ static int set_cmd_active(int ci_target, int cmd) if (cl->need_setowner && cmd == SM_CMD_SETOWNER) cl->need_setowner = 0; + if (cl->cmd_active == SM_CMD_MIGRATE && cmd == SM_CMD_SETOWNER) { + cl->cmd_active = SM_CMD_SETOWNER; + pthread_mutex_unlock(&cl->mutex); + return 0; + } + cmd_active = cl->cmd_active; if (!cmd) { @@ -636,7 +639,7 @@ static void *cmd_acquire_thread(void *args_in) int fd, rv, i, j, disks_len, num_disks, empty_slots, opened; int alloc_count = 0, add_count = 0, open_count = 0, acquire_count = 0; int pos = 0, need_setowner = 0, pid_dead = 0; - int new_tokens_count, migrate_result; + int new_tokens_count; cl = &client[ca->ci_target]; fd = client[ca->ci_in].fd; @@ -873,28 +876,19 @@ static void *cmd_acquire_thread(void *args_in) for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; - if (opt.flags & SANLK_FLG_INCOMING) { - rv = check_incoming_state(token, opt_str, &migrate_result); - if (rv < 0) { - log_errot(token, "cmd_acquire incoming state %d", rv); - goto fail_release; - } - - /* source set_next_owner_other() wasn't called or failed */ - if (migrate_result != DP_OK) - rv = set_next_owner_self(token); + if (opt.flags & SANLK_FLG_REACQUIRE) + reacquire_lver = token->prev_lver; + if (opt.flags & SANLK_FLG_INCOMING) { + rv = incoming_token(token, opt_str); } else { - if (opt.flags & SANLK_FLG_REACQUIRE) - reacquire_lver = token->prev_lver; - rv = acquire_token(token, reacquire_lver, new_num_hosts); } save_resource_leader(token); if (rv < 0) { - log_errot(token, "cmd_acquire lease %d flags %x", + log_errot(token, "cmd_acquire %d flags %x", rv, opt.flags); goto fail_release; } @@ -1131,23 +1125,7 @@ static void *cmd_migrate_thread(void *args_in) if (!token) continue; - /* the migrating flag causes the source to avoid freeing the lease - * if the pid exits before 
the dest has written itself as next_owner. - * i.e. we can't rely on paxos_lease_release seeing next_owner_id - * non-zero because the cmd_migrate can be called on the source, - * followed by the source pid exiting before the dest gets to write - * next_owner_id to itself */ - - token->migrating = 1; - - if (target_host_id) { - rv = set_next_owner_other(token, target_host_id); - token->migrate_result = rv; - } else { - /* acquire-incoming on the destination will call - set_next_owner_self() */ - token->migrate_result = 0; - } + migrate_token(token, target_host_id); ret = snprintf(reply_str + pos, reply_len - pos, "lockspace_name=%s " @@ -1178,6 +1156,8 @@ static void *cmd_migrate_thread(void *args_in) reply_str[reply_len-1] = '\0'; reply: + /* no set_cmd_active(0), only setowner is allowed after this */ + log_debug("cmd_migrate done result %d", result); memcpy(&h, &ca->header, sizeof(struct sm_header)); @@ -1199,10 +1179,6 @@ static void *cmd_migrate_thread(void *args_in) return NULL; } -/* become the full owner of leases that were migrated to us; - go through each of the pid's tokens, set next_owner_id for each, - then reply to client with result */ - static void *cmd_setowner_thread(void *args_in) { struct cmd_args *ca = args_in; @@ -1389,9 +1365,12 @@ static int print_token_state(struct token *t, char *str) snprintf(str, SANLK_STATE_MAXSTR-1, "token_id=%u " + "migrating=%d " + "incoming=%d " "acquire_result=%d " - "migrate_result=%d " "release_result=%d " + "migrate_result=%d " + "incoming_result=%d " "setowner_result=%d " "leader.lver=%llu " "leader.timestamp=%llu " @@ -1399,9 +1378,12 @@ static int print_token_state(struct token *t, char *str) "leader.owner_generation=%llu " "leader.next_owner_id=%llu", t->token_id, + t->migrating, + t->incoming, t->acquire_result, - t->migrate_result, t->release_result, + t->migrate_result, + t->incoming_result, t->setowner_result, (unsigned long long)t->leader.lver, (unsigned long long)t->leader.timestamp, @@ -1699,8 
+1681,10 @@ static void process_cmd_daemon(int ci, struct sm_header *h_recv) switch (h_recv->cmd) { case SM_CMD_REGISTER: rv = get_peer_pid(fd, &pid); - if (rv < 0) + if (rv < 0) { + log_error("cmd_register ci %d fd %d get pid failed", ci, fd); break; + } log_debug("cmd_register ci %d fd %d pid %d", ci, fd, pid); client[ci].pid = pid; client[ci].deadfn = client_pid_dead; diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 22dd21e..493954f 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -85,11 +85,13 @@ struct token { struct sync_disk *disks; /* internal */ - int migrating; int token_id; /* used to refer to this token instance in log messages */ + int migrating; + int incoming; int acquire_result; - int migrate_result; int release_result; + int migrate_result; + int incoming_result; int setowner_result; uint64_t prev_lver; /* just used to pass a value between functions */ struct leader_record leader; /* copy of last leader_record we wrote */ diff --git a/src/token_manager.c b/src/token_manager.c index aa537df..bc86b07 100644 --- a/src/token_manager.c +++ b/src/token_manager.c @@ -212,45 +212,6 @@ int acquire_token(struct token *token, uint64_t reacquire_lver, return rv; /* DP_OK */ } -/* return < 0 on error, 1 on success */ - -int setowner_token(struct token *token) -{ - struct leader_record leader_ret; - int rv; - - rv = paxos_lease_leader_read(token, &leader_ret); - if (rv < 0) - return rv; - - if (memcmp(&token->leader, &leader_ret, sizeof(struct leader_record))) { - log_errot(token, "setowner leader_read mismatch"); - return -1; - } - - /* we want the dblocks to reflect a full, proper ownership, so we - do the full acquire rather than just writing a new leader_record */ - - rv = paxos_lease_acquire(token, 1, &leader_ret, 0, 0); - - token->setowner_result = rv; - - /* we set acquire_result here for at least one reason: because release - will not release the token if acquire_result is not 1 */ - - token->acquire_result = rv; - - 
log_token(token, "setowner rv %d lver %llu at %llu", rv, - (unsigned long long)token->leader.lver, - (unsigned long long)token->leader.timestamp); - - if (rv < 0) - return rv; - - memcpy(&token->leader, &leader_ret, sizeof(struct leader_record)); - return rv; /* DP_OK */ -} - /* * migration creates special cases for release. if either the source or * the dest calls release_token and read leader shows next_owner_id is not @@ -263,17 +224,19 @@ int setowner_token(struct token *token) * * (A second mechanism is also needed to prevent release of migrating leases, * the token->migrating flag. This is because we need to block releases - * on the source effective immediately, before next_owner may be written.) + * on the source effective immediately, before next_owner may be written. + * The token->incoming flag is the same, although is probably unnecessary + * given the paxos_lease_release checking of next_owner.) * * setowner on the destination, in the case of migration success, moves a * disk lease from being in limbo (both owner and next_owner set), to having * just an owner (the dest). After this the owner (dest) can release it. * - * TODO: setowner on the source, in the case of migration failure, moves the + * setowner on the source, in the case of migration failure, moves the * disk lease from being in limbo with both owner and next_owner, to having - * just an owner (the source). This setowner call will need to ignore the - * fact that the leader block doesn't match its latest copy since it may - * have been the dest that wrote next_owner at the start of migration. + * just an owner (the source). This setowner needs to ignore the fact that + * the leader block doesn't match its latest copy since it may have been the + * dest that wrote next_owner at the start of migration. 
* * We don't have to worry that next_owner is ever running the vm; setowner * on dest is required to complete successfully (making dest the owner and @@ -283,9 +246,7 @@ int setowner_token(struct token *token) * the source/owner continues running and renewing its host_id, then no * other host will be able to take ownership of the lease, because they will * see that the owner is alive. The source/owner will be able to acquire - * the lease, though. So, the source/owner needs to either - * 1. call setowner to clear next_owner_id, then call release to free it (TODO above) - * 2. call acquire to acquire the lease, then call release to free it + * the lease, though. * * If migration fails, the source/owner does not free the paxos lease, and * the source/owner does not continue running or renewing its host_id, then @@ -304,9 +265,6 @@ int setowner_token(struct token *token) * - if migration fails because the dest host fails, * setowner on the source will clear next_owner, and allow source to * continue running and holding the lease, or release the lease. 
- * setowner on source would have to ignore the fact that the leader - * will have been changed since it last read or wrote it (by the dest - * writing itself as the next_owner_id) * * - if migration fails because the source host fails, * the paxos lease will be left on disk with owner and next_owner set, @@ -341,7 +299,12 @@ int release_token(struct token *token) int rv; if (token->migrating) { - log_errot(token, "release skip migrating"); + log_token(token, "release skip migrating"); + return DP_ERROR; + } + + if (token->incoming) { + log_token(token, "release skip incoming"); return DP_ERROR; } @@ -358,71 +321,9 @@ int release_token(struct token *token) return rv; /* DP_OK */ } -/* return < 0 on error, 1 on success */ - -int set_next_owner_other(struct token *token, uint64_t target_host_id) -{ - struct leader_record leader; - int rv; - - rv = paxos_lease_leader_read(token, &leader); - if (rv < 0) - return rv; - - if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) { - log_errot(token, "set_next_owner_other leader changed before migrate"); - return DP_BAD_LEADER; - } - - if (leader.num_hosts < target_host_id) { - log_errot(token, "set_next_owner_other num_hosts %llu " - "target_host_id %llu", - (unsigned long long)leader.num_hosts, - (unsigned long long)target_host_id); - return DP_BAD_NUMHOSTS; - } - - leader.next_owner_id = target_host_id; - - rv = paxos_lease_leader_write(token, &leader); - if (rv < 0) - return rv; - - memcpy(&token->leader, &leader, sizeof(struct leader_record)); - return rv; /* DP_OK */ -} - -/* return < 0 on error, 1 on success */ - -int set_next_owner_self(struct token *token) -{ - struct leader_record leader; - int rv; - - rv = paxos_lease_leader_read(token, &leader); - if (rv < 0) - return rv; - - if (leader.num_hosts < token->host_id) { - log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu", - (unsigned long long)leader.num_hosts, - (unsigned long long)token->host_id); - return DP_BAD_NUMHOSTS; - } - - 
leader.next_owner_id = token->host_id; - - rv = paxos_lease_leader_write(token, &leader); - if (rv < 0) - return rv; - - memcpy(&token->leader, &leader, sizeof(struct leader_record)); - return rv; /* DP_OK */ -} - /* * migration destination verifies the migrate state sent from source, - * which needs to be consisent with the source having successfully written + * which needs to be consistent with the source having successfully written * the next_owner itself, or having not tried or tried and failed. * * If we can't read the leader, return an error, and the migration @@ -434,7 +335,8 @@ int set_next_owner_self(struct token *token) int parse_incoming_state(struct token *token, char *str, int *migrate_result, struct leader_record *leader); -int check_incoming_state(struct token *token, char *opt_str, int *migrate_result_out) +static int check_incoming_state(struct token *token, char *opt_str, + int *migrate_result_out) { struct leader_record leader_ret; struct leader_record leader_src; @@ -500,6 +402,261 @@ int check_incoming_state(struct token *token, char *opt_str, int *migrate_result return DP_OK; } +/* + * Migration + * + * sanlock_migrate() called on source + * sanlock_acquire() called on destination with INCOMING flag + * + * migrate_token() called on source, output state str + * incoming_token() called on dest, input state str from source + * + * if migrate_token() sets next_owner in set_next_owner_other(), + * then incoming_token() will not set next_owner in set_next_owner_self() + * + * if migrate_token() does not set next_owner in set_next_owner_other(), + * then incoming_token() will set next_owner in set_net_owner_self() + * + * after migrate_token() on source and incoming_token() on destination + * both succeed, vm migration happens + * + * if vm migration succeeds + * sanlock_setowner() / setowner_token() is called on the destination, + * leases become owned by dest owner=D, next_owner cleared, + * token->incoming cleared, the leases will be freed when 
the dest vm pid exits + * + * if vm migration fails + * sanlock_setowner() / setowner_token() is called on the source, + * leases remain owned by source, next_owner cleared, + * token->migrating cleared, the leases will be freed when the source vm pid exits + * + * (the migrating flag on source and incoming flag on dest cause the host to + * skip freeing the lease if the pid exits before the next_owner has been + * written to the lease leader block. release_token() also looks at the + * next_owner field in the leader block and will not free the lease if it is + * set.) + * + * TODO: what if migration fails, and source pid exits prior to setowner_token + * being called on the source? The lease is still owned by the source host, + * but no pid on the source is holding/using it. A new vm/pid on the source + * host can probably acquire it, but not a vm/pid on a different host. + * + * TODO: if setowner_token() fails on the destination after a successful + * migration, will returning an error cause libvirt to then call + * setowner_token() on the source? 
+ */ + +static int setowner_token_migrating(struct token *token) +{ + struct leader_record leader; + int rv; + + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; + + /* the owner should still be us since migration failed */ + + if (token->leader.owner_id != leader.owner_id) { + log_errot(token, "setowner migrating bad owner %llu %llu", + (unsigned long long)token->leader.owner_id, + (unsigned long long)leader.owner_id); + return DP_ERROR; + } + + /* leader block we just read may not match our last saved copy + because next_owner_id may have been set by the destination */ + + /* next_owner_id was set either by us in set_next_owner_other + or by the destination in set_next_owner_self, so it should + not be zero */ + + if (leader.next_owner_id == 0) { + log_errot(token, "setowner migrating zero next_owner"); + return DP_ERROR; + } + + leader.next_owner_id = 0; + + rv = paxos_lease_leader_write(token, &leader); + if (rv < 0) + return rv; + + memcpy(&token->leader, &leader, sizeof(struct leader_record)); + return rv; /* DP_OK */ +} + +static int setowner_token_incoming(struct token *token) +{ + struct leader_record leader; + int rv; + + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; + + if (token->leader.next_owner_id != leader.next_owner_id) { + log_errot(token, "setowner incoming bad next_owner %llu %llu", + (unsigned long long)token->leader.next_owner_id, + (unsigned long long)leader.next_owner_id); + return DP_ERROR; + } + + /* should match what we last saved in incoming_token() */ + + if (memcmp(&token->leader, &leader, sizeof(struct leader_record))) { + log_errot(token, "setowner incoming leader_read mismatch"); + return -1; + } + + /* we want the dblocks to reflect a full, proper ownership, so we + do the full acquire rather than just writing a new leader_record */ + + rv = paxos_lease_acquire(token, 1, &leader, 0, 0); + if (rv < 0) + return rv; + + memcpy(&token->leader, &leader, sizeof(struct leader_record)); + 
return rv; /* DP_OK */ +} + +/* we set acquire_result for setowner_incoming for at least one reason, + because release will not free the token if acquire_result is not 1 */ + +/* return < 0 on error, 1 on success */ + +int setowner_token(struct token *token) +{ + int rv; + + log_token(token, "setowner incoming %d migrating %d", + token->migrating, token->incoming); + + if (token->migrating) { + rv = setowner_token_migrating(token); + + if (rv == DP_OK) + token->migrating = 0; + + token->setowner_result = rv; + + } else if (token->incoming) { + rv = setowner_token_incoming(token); + + if (rv == DP_OK) + token->incoming = 0; + + token->setowner_result = rv; + token->acquire_result = rv; + } else { + log_errot(token, "setowner ignoring"); + } + + return rv; +} + +static int set_next_owner_other(struct token *token, uint64_t target_host_id) +{ + struct leader_record leader; + int rv; + + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; + + if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) { + log_errot(token, "set_next_owner_other leader changed before migrate"); + return DP_BAD_LEADER; + } + + if (leader.num_hosts < target_host_id) { + log_errot(token, "set_next_owner_other num_hosts %llu " + "target_host_id %llu", + (unsigned long long)leader.num_hosts, + (unsigned long long)target_host_id); + return DP_BAD_NUMHOSTS; + } + + leader.next_owner_id = target_host_id; + + rv = paxos_lease_leader_write(token, &leader); + if (rv < 0) + return rv; + + memcpy(&token->leader, &leader, sizeof(struct leader_record)); + return rv; /* DP_OK */ +} + +int migrate_token(struct token *token, uint64_t target_host_id) +{ + int rv = 0; + + log_token(token, "migrate %llu", (unsigned long long)target_host_id); + + if (target_host_id) { + /* migrate_token() on the source calls set_next_owner_other() */ + rv = set_next_owner_other(token, target_host_id); + token->migrate_result = rv; + } else { + /* incoming_token() on the dest will call 
set_next_owner_self() */ + token->migrate_result = 0; + } + + token->migrating = 1; + + return rv; +} + +static int set_next_owner_self(struct token *token) +{ + struct leader_record leader; + int rv; + + /* would could skip this read by reusing the read from check_incoming_state */ + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; + + if (leader.num_hosts < token->host_id) { + log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu", + (unsigned long long)leader.num_hosts, + (unsigned long long)token->host_id); + return DP_BAD_NUMHOSTS; + } + + leader.next_owner_id = token->host_id; + + rv = paxos_lease_leader_write(token, &leader); + if (rv < 0) + return rv; + + memcpy(&token->leader, &leader, sizeof(struct leader_record)); + return rv; /* DP_OK */ +} + +int incoming_token(struct token *token, char *opt_str) +{ + int migrate_result; + int rv; + + log_token(token, "incoming"); + + rv = check_incoming_state(token, opt_str, &migrate_result); + if (rv < 0) { + log_errot(token, "incoming state error %d", rv); + goto out; + } + + if (migrate_result != DP_OK) + rv = set_next_owner_self(token); + out: + token->incoming_result = rv; + + token->incoming = 1; + + return rv; +} + int create_token(int num_disks, struct token **token_out) { struct token *token; diff --git a/src/token_manager.h b/src/token_manager.h index 34b3c5b..1bbc9ed 100644 --- a/src/token_manager.h +++ b/src/token_manager.h @@ -13,11 +13,8 @@ int acquire_token(struct token *token, uint64_t reacquire_lver, int new_num_hosts); int release_token(struct token *token); int setowner_token(struct token *token); - -int check_incoming_state(struct token *token, char *opt_str, - int *migrate_result_out); -int set_next_owner_other(struct token *token, uint64_t target_host_id); -int set_next_owner_self(struct token *token); +int migrate_token(struct token *token, uint64_t target_host_id); +int incoming_token(struct token *token, char *opt_str); int create_token(int num_disks, struct 
token **token_out); void free_token(struct token *token); diff --git a/tests/devcount.c b/tests/devcount.c index b8e8aa3..b83a49a 100644 --- a/tests/devcount.c +++ b/tests/devcount.c @@ -399,7 +399,7 @@ static int do_lock(int argc, char *argv[]) sock = sanlock_register(); if (sock < 0) { printf("%d sanlock_register error %d\n", - child_pid, rv); + child_pid, sock); exit(-1); } @@ -476,6 +476,7 @@ static void write_migrate(char *state, int offset) goto fail; } + close(fd); return; fail: @@ -572,6 +573,7 @@ static int wait_migrate_incoming(char *state_out) goto fail; } + close(fd); return 0; fail: @@ -639,6 +641,7 @@ static void wait_migrate_stopped(char *state_in) goto fail; } + close(fd); return; fail: @@ -735,7 +738,7 @@ static int do_migrate(int argc, char *argv[]) sock = sanlock_register(); if (sock < 0) { printf("%d sanlock_register error %d\n", - child_pid, rv); + child_pid, sock); exit(-1); }

13 years, 1 month

1
0
0 / 0

2 commits - src/direct.c src/leader.h src/main.c src/paxos_lease.c src/paxos_lease.h src/sanlock_internal.h src/token_manager.c src/token_manager.h tests/devcount.c

by David Teigland

src/direct.c | 6 src/leader.h | 2 src/main.c | 131 ++++------- src/paxos_lease.c | 75 ++---- src/paxos_lease.h | 1 src/sanlock_internal.h | 1 src/token_manager.c | 236 ++++++++++++++++---- src/token_manager.h | 8 tests/devcount.c | 558 +++++++++++++++++++++++++++++++++++++++++++------ 9 files changed, 784 insertions(+), 234 deletions(-) New commits: commit 90f87d6cfef9e7533ac5ec51451e6e1640e33e57 Author: David Teigland <teigland(a)redhat.com> Date: Tue Mar 8 16:45:08 2011 -0600 devcount: migrate test Added a test that exercises sanlock_migrate by migrating the lease protecting a standard devcount test from one host to the next in turn, using a separate disk block to coordinate and pass the migration state string between hosts. Use two disks, one for sanlock leases and the other for the devcount program to do its counting on. host1: devcount init /dev/vg/leases /dev/vg/count host1: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 1 3 host2: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 2 3 host3: devcount migrate /dev/vg/leases rw /dev/vg/count 5 300 3 3 host1 will acquire lease, start devcount rw, migrate lease to host2 host2 will acquire lease, start devcount rw, migrate lease to host3 host3 will acquire lease, start devcount rw, migrate lease to host1 etc If sanlock ever allows devcount rw to run on two hosts at the same time, it will immediately detect the collision, print an error and stop. 
For an example, manually start two overlapping devcounts: host1: devcount rw /dev/vg/count 5 300 1 host2: devcount rw /dev/vg/count 5 300 2 diff --git a/tests/devcount.c b/tests/devcount.c index c29be3f..b8e8aa3 100644 --- a/tests/devcount.c +++ b/tests/devcount.c @@ -13,6 +13,7 @@ #include <errno.h> #include <limits.h> #include <time.h> +#include <signal.h> #include "sanlock.h" #include "sanlock_admin.h" @@ -26,10 +27,7 @@ int count_offset; int lock_offset; int our_hostid; - -int seconds; -int verify; -int quiet; +int max_hostid; struct entry { uint32_t turn; @@ -52,14 +50,15 @@ static int rand_int(int a, int b) /* 64 byte entry: can fit up to 8 nodes in a 512 byte block */ -void print_entries(char *buf) +void print_entries(int pid, char *buf) { struct entry *e = (struct entry *)buf; int i; for (i = 0; i < (512 / sizeof(struct entry)); i++) { - printf("index %d turn %u time %llu %u:%llu:%llu " + printf("%d index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu\n", + pid, i, e->turn, (unsigned long long)e->time, @@ -75,10 +74,11 @@ void print_entries(char *buf) } } -void print_our_we(struct entry *our_we) +void print_our_we(int pid, struct entry *our_we) { - printf("w index %d turn %u time %llu %u:%llu:%llu " + printf("%d w index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu\n", + pid, our_hostid - 1, our_we->turn, (unsigned long long)our_we->time, @@ -94,9 +94,10 @@ void print_our_we(struct entry *our_we) #define COUNT_ARGS 6 #define LOCK_ARGS 8 +#define MIGRATE_ARGS 9 -/* - * devcount count <count_disk> <rsec> <wsec> <hostid> +/* + * devcount rw|wr <count_disk> <sec1> <sec2> <hostid> */ static int do_count(int argc, char *argv[]) @@ -107,18 +108,27 @@ static int do_count(int argc, char *argv[]) time_t start; uint32_t our_pid = getpid(); uint32_t max_turn; + int sec1, sec2; int read_seconds, write_seconds; if (argc < COUNT_ARGS) return -1; strcpy(count_path, argv[2]); - read_seconds = atoi(argv[3]); - write_seconds = atoi(argv[4]); + 
sec1 = atoi(argv[3]); + sec2 = atoi(argv[4]); our_hostid = atoi(argv[5]); - printf("%d count count_disk %s rsec %d wsec %d our_hostid %d\n", - our_pid, count_path, read_seconds, write_seconds, our_hostid); + if (!strcmp(argv[1], "rw")) { + read_seconds = sec1; + write_seconds = sec2; + } else { + write_seconds = sec1; + read_seconds = sec2; + } + + printf("%d %s count_disk %s sec1 %d sec2 %d our_hostid %d\n", + our_pid, argv[1], count_path, sec1, sec2, our_hostid); fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { @@ -154,11 +164,6 @@ static int do_count(int argc, char *argv[]) goto fail; } - /* - * Quickly read, then reread for a few seconds to see if - * the previous writer does another write. - */ - lseek(fd, count_offset, SEEK_SET); rv = read(fd, rbuf, 512); @@ -167,30 +172,36 @@ static int do_count(int argc, char *argv[]) goto fail; } - /* print_entries(rbuf); */ + /* print_entries(our_pid, rbuf); */ - for (i = 0; i < read_seconds; i++) { - sleep(1); + /* + * reading for "rw" + */ - lseek(fd, count_offset, SEEK_SET); + if (!strcmp(argv[1], "rw")) { + for (i = 0; i < read_seconds; i++) { + sleep(1); - rv = read(fd, vbuf, 512); - if (rv != 512) { - perror("read failed"); - goto fail; - } + lseek(fd, count_offset, SEEK_SET); - if (memcmp(rbuf, vbuf, 512)) { - printf("rbuf:\n"); - print_entries(rbuf); - printf("vbuf:\n"); - print_entries(vbuf); - goto fail; + rv = read(fd, vbuf, 512); + if (rv != 512) { + perror("read failed"); + goto fail; + } + + if (memcmp(rbuf, vbuf, 512)) { + printf("%d rbuf:\n", our_pid); + print_entries(our_pid, rbuf); + printf("%d vbuf:\n", our_pid); + print_entries(our_pid, vbuf); + goto fail; + } } } /* - * Now start writing + * writing */ re = (struct entry *)rbuf; @@ -209,14 +220,14 @@ static int do_count(int argc, char *argv[]) } if (max_turn != max_re->turn) { - printf("max_turn %d max_re->turn %d\n", max_turn, - max_re->turn); + printf("%d max_turn %d max_re->turn %d\n", our_pid, + max_turn, max_re->turn); goto 
fail; } /* - printf("max index %d turn %d count %llu\n", max_i, max_turn, - (unsigned long long)max_re->count); + printf("%d max index %d turn %d count %llu\n", our_pid, + max_i, max_turn, (unsigned long long)max_re->count); */ memcpy(wbuf, rbuf, 512); @@ -245,7 +256,7 @@ static int do_count(int argc, char *argv[]) } printf("%d first write\n", our_pid); - print_our_we(our_we); + print_our_we(our_pid, our_we); start = time(NULL); @@ -266,7 +277,7 @@ static int do_count(int argc, char *argv[]) } printf("%d last write\n", our_pid); - print_our_we(our_we); + print_our_we(our_pid, our_we); if (turn_file) { fprintf(turn_file, "turn %03u start %llu end %llu host %u pid %u\n", @@ -278,6 +289,34 @@ static int do_count(int argc, char *argv[]) fclose(turn_file); } + /* + * reading for "wr" + */ + + if (!strcmp(argv[1], "wr")) { + memcpy(rbuf, wbuf, 512); + + for (i = 0; i < read_seconds; i++) { + sleep(1); + + lseek(fd, count_offset, SEEK_SET); + + rv = read(fd, vbuf, 512); + if (rv != 512) { + perror("read failed"); + goto fail; + } + + if (memcmp(rbuf, vbuf, 512)) { + printf("%d rbuf:\n", our_pid); + print_entries(our_pid, rbuf); + printf("%d vbuf:\n", our_pid); + print_entries(our_pid, vbuf); + goto fail; + } + } + } + return 0; fail: printf("sleeping...\n"); @@ -286,9 +325,9 @@ static int do_count(int argc, char *argv[]) } /* - * devcount lock <lock_disk> count <count_disk> <rsec> <wsec> <hostid> + * devcount lock <lock_disk> rw <count_disk> <sec1> <sec2> <hostid> * sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0 - * devcount count <count_disk> <rsec> <wsec> <hostid> + * devcount rw <count_disk> <sec1> <sec2> <hostid> */ static int do_lock(int argc, char *argv[]) @@ -297,11 +336,13 @@ static int do_lock(int argc, char *argv[]) struct sanlk_lockspace lockspace; struct sanlk_resource *res; int i, j, pid, rv, sock, len, status; - uint32_t our_pid = getpid(); + uint32_t parent_pid = getpid(); if (argc < LOCK_ARGS) return -1; + count_offset = 0; + strcpy(lock_path, 
argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); @@ -318,7 +359,7 @@ static int do_lock(int argc, char *argv[]) res->disks[0].offset = 1024000; printf("%d lock_disk %s count_disk %s our_hostid %d\n", - our_pid, lock_path, count_path, our_hostid); + parent_pid, lock_path, count_path, our_hostid); memset(&lockspace, 0, sizeof(lockspace)); strcpy(lockspace.name, "devcount"); @@ -328,16 +369,16 @@ static int do_lock(int argc, char *argv[]) rv = sanlock_add_lockspace(&lockspace, 0); if (rv < 0) { - printf("sanlock_add_lockspace error %d\n", rv); + printf("%d sanlock_add_lockspace error %d\n", parent_pid, rv); exit(EXIT_FAILURE); } - printf("sanlock_add_lockspace done\n"); + printf("%d sanlock_add_lockspace done\n", parent_pid); /* * argv[0] = devcount * argv[1] = lock * argv[2] = <lock_disk> - * argv[3] = count + * argv[3] = rw * start copying at argv[3] */ @@ -351,18 +392,27 @@ static int do_lock(int argc, char *argv[]) while (1) { pid = fork(); if (!pid) { - int our_pid = getpid(); + int child_pid = getpid(); printf("\n"); sock = sanlock_register(); + if (sock < 0) { + printf("%d sanlock_register error %d\n", + child_pid, rv); + exit(-1); + } + rv = sanlock_acquire(sock, -1, 1, &res, NULL); if (rv < 0) { printf("%d sanlock_acquire error %d\n", - our_pid, rv); + child_pid, rv); + /* all hosts are trying to acquire so we + expect this to acquire only sometimes; + TODO: exit with an error for some rv's */ exit(0); } - printf("%d sanlock_acquire done\n", our_pid); + printf("%d sanlock_acquire done\n", child_pid); execv(av[0], av); perror("execv devcount problem"); @@ -370,14 +420,390 @@ static int do_lock(int argc, char *argv[]) } waitpid(pid, &status, 0); + + /* TODO: goto fail if exit status is an error */ + sleep(rand_int(0, 1)); } + + fail: + printf("test failed...\n"); + sleep(1000000); +} + +/* counting block: count_path offset 0 + * incoming block: count_path offset 4K + * stopped block: count_path offset 8K */ + +static void write_migrate(char 
*state, int offset) +{ + char *wbuf, **p_wbuf; + int fd, rv; + + if (strlen(state) > 512) { + printf("state string too long\n"); + goto fail; + } + + fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); + if (fd < 0) { + perror("open failed"); + goto fail; + } + + rv = ioctl(fd, BLKFLSBUF); + if (rv) { + perror("BLKFLSBUF failed"); + goto fail; + } + + p_wbuf = &wbuf; + + rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); + if (rv) { + perror("posix_memalign failed"); + goto fail; + } + + memset(wbuf, 0, 512); + memcpy(wbuf, state, strlen(state)); + + lseek(fd, offset, SEEK_SET); + + rv = write(fd, wbuf, 512); + if (rv != 512) { + perror("write failed"); + goto fail; + } + + return; + + fail: + printf("write_migrate %d failed %s\n", offset, state); + sleep(10000000); +} + +static void write_migrate_incoming(char *state) +{ + write_migrate(state, 4096); +} + +static void write_migrate_stopped(char *state) +{ + write_migrate(state, 4096*2); +} + +/* read incoming block until it's set and our_hostid is next */ + +static int wait_migrate_incoming(char *state_out) +{ + char *rbuf, **p_rbuf, *wbuf, **p_wbuf; + char *owner_id, *val_str; + int fd, rv, val; + int offset = 4096; + + fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); + if (fd < 0) { + perror("open failed"); + goto fail; + } + + rv = ioctl(fd, BLKFLSBUF); + if (rv) { + perror("BLKFLSBUF failed"); + goto fail; + } + + p_rbuf = &rbuf; + p_wbuf = &wbuf; + + rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); + if (rv) { + perror("posix_memalign failed"); + goto fail; + } + + rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); + if (rv) { + perror("posix_memalign failed"); + goto fail; + } + + retry: + lseek(fd, offset, SEEK_SET); + + rv = read(fd, rbuf, 512); + if (rv != 512) { + perror("read failed"); + goto fail; + } + rbuf[511] = '\0'; + + /* init case to get things going */ + if (!rbuf[0] && our_hostid == 1) { + return 1; + } + + owner_id = strstr(rbuf, "leader.owner_id="); + if 
(!owner_id) { + goto retry; + } + + val_str = strstr(owner_id, "=") + 1; + if (!val_str) { + goto retry; + } + + val = atoi(val_str); + if ((val % max_hostid)+1 != our_hostid) { + goto retry; + } + + strcpy(state_out, rbuf); + + memset(wbuf, 0, 512); + strcpy(wbuf, "empty"); + + lseek(fd, offset, SEEK_SET); + + rv = write(fd, wbuf, 512); + if (rv != 512) { + perror("write failed"); + goto fail; + } + + return 0; + + fail: + printf("wait_migrate_incoming failed %s\n", state_out); + sleep(10000000); +} + +/* read stopped block until it matches state_in */ + +static void wait_migrate_stopped(char *state_in) +{ + char *rbuf, **p_rbuf, *wbuf, **p_wbuf; + int fd, rv; + int offset = 4096 * 2; + + fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); + if (fd < 0) { + perror("open failed"); + goto fail; + } + + rv = ioctl(fd, BLKFLSBUF); + if (rv) { + perror("BLKFLSBUF failed"); + goto fail; + } + + p_rbuf = &rbuf; + p_wbuf = &wbuf; + + rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); + if (rv) { + perror("posix_memalign failed"); + goto fail; + } + + rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); + if (rv) { + perror("posix_memalign failed"); + goto fail; + } + + retry: + lseek(fd, offset, SEEK_SET); + + rv = read(fd, rbuf, 512); + if (rv != 512) { + perror("read failed"); + goto fail; + } + rbuf[511] = '\0'; + + if (strcmp(rbuf, state_in)) { + sleep(1); + goto retry; + } + + memset(wbuf, 0, 512); + + lseek(fd, offset, SEEK_SET); + + rv = write(fd, wbuf, 512); + if (rv != 512) { + perror("write failed"); + goto fail; + } + + return; + + fail: + printf("wait_migrate_stopped failed %s\n", state_in); + sleep(10000000); +} + +#define MAX_MIGRATE_STATE 512 /* keep in one block for simplicity */ + +static int do_migrate(int argc, char *argv[]) +{ + char incoming[MAX_MIGRATE_STATE]; + char *av[MIGRATE_ARGS+1]; + char *state; + struct sanlk_lockspace lockspace; + struct sanlk_resource *res; + struct sanlk_options *opt; + int i, j, pid, rv, sock, len, status, 
init; + uint32_t parent_pid = getpid(); + + if (argc < MIGRATE_ARGS) + return -1; + + count_offset = 0; + + strcpy(lock_path, argv[2]); + strcpy(count_path, argv[4]); + our_hostid = atoi(argv[7]); + max_hostid = atoi(argv[8]); + + len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); + res = malloc(len); + memset(res, 0, len); + strcpy(res->lockspace_name, "devcount"); + snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); + res->name[SANLK_NAME_LEN-1] = '\0'; + res->num_disks = 1; + strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); + res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; + res->disks[0].offset = 1024000; + + len = sizeof(struct sanlk_options) + MAX_MIGRATE_STATE; + opt = malloc(len); + memset(opt, 0, len); + + printf("%d lock_disk %s count_disk %s our_hostid %d max_hostid\n", + parent_pid, lock_path, count_path, our_hostid, max_hostid); + + memset(&lockspace, 0, sizeof(lockspace)); + strcpy(lockspace.name, "devcount"); + strcpy(lockspace.host_id_disk.path, lock_path); + lockspace.host_id_disk.offset = lock_offset; + lockspace.host_id = our_hostid; + + rv = sanlock_add_lockspace(&lockspace, 0); + if (rv < 0) { + printf("%d sanlock_add_lockspace error %d\n", parent_pid, rv); + exit(EXIT_FAILURE); + } + printf("%d sanlock_add_lockspace done\n", parent_pid); + + /* + * argv[0] = devcount + * argv[1] = migrate + * argv[2] = <lock_disk> + * argv[3] = rw + * start copying at argv[3] + */ + + j = 0; + memset(av, 0, sizeof(char *) * MIGRATE_ARGS+1); + + av[j++] = strdup(argv[0]); + for (i = 3; i < MIGRATE_ARGS; i++) + av[j++] = strdup(argv[i]); + + memset(incoming, 0, sizeof(incoming)); + + while (1) { + init = wait_migrate_incoming(incoming); + + pid = fork(); + if (!pid) { + int child_pid = getpid(); + + printf("\n"); + + if (!init) { + opt->flags = SANLK_FLG_INCOMING; + opt->len = strlen(incoming); + strncpy(opt->str, incoming, MAX_MIGRATE_STATE); + } + + sock = sanlock_register(); + if (sock < 0) { + printf("%d sanlock_register error 
%d\n", + child_pid, rv); + exit(-1); + } + + rv = sanlock_acquire(sock, -1, 1, &res, opt); + if (rv < 0) { + printf("%d sanlock_acquire error %d in %s\n", + child_pid, rv, opt->str); + /* only one host should be trying to acquire + so this should always succeed */ + exit(-1); + } + printf("%d sanlock_acquire done\n", child_pid); + + if (init) + goto skip_setowner; + + wait_migrate_stopped(incoming); + + rv = sanlock_setowner(sock, -1); + if (rv < 0) { + printf("%d sanlock_setowner error %d\n", + child_pid, rv); + exit(-1); + } + printf("%d sanlock_setowner done\n", child_pid); + skip_setowner: + execv(av[0], av); + perror("execv devcount problem"); + exit(EXIT_FAILURE); + } + + /* let the child run for 10 seconds before stopping it; + if the child exits before the 10 seconds, the sanlock_migrate + call should return an error */ + + sleep(10); + + rv = sanlock_migrate(-1, pid, 0, &state); + if (rv < 0 || !state) { + printf("%d sanlock_migrate error %d\n", parent_pid, rv); + goto fail; + } + + write_migrate_incoming(state); + + kill(pid, SIGSTOP); + + write_migrate_stopped(state); + + kill(pid, SIGKILL); + + waitpid(pid, &status, 0); + + free(state); + + /* TODO: goto fail if exit status is an error */ + } + + fail: + printf("test failed...\n"); + sleep(10000000); } /* * devcount init <lock_disk> <count_disk> * sanlock direct init -n 8 -s devcount:0:<lock_disk>:0 * sanlock direct init -n 8 -r devcount:resource<count_disk>:<lock_disk>:1024000 + * dd if=/dev/zero of=<count_disk> bs=512 count=24 */ int do_init(int argc, char *argv[]) @@ -414,6 +840,16 @@ int do_init(int argc, char *argv[]) printf("%s\n", command); system(command); + + memset(command, 0, sizeof(command)); + + snprintf(command, sizeof(command), + "dd if=/dev/zero of=%s bs=512 count=24", + count_path); + + printf("%s\n", command); + + system(command); } int main(int argc, char *argv[]) @@ -426,12 +862,15 @@ int main(int argc, char *argv[]) if (!strcmp(argv[1], "init")) rv = do_init(argc, argv); - else if 
(!strcmp(argv[1], "count")) + else if (!strcmp(argv[1], "rw") || !strcmp(argv[1], "wr")) rv = do_count(argc, argv); else if (!strcmp(argv[1], "lock")) rv = do_lock(argc, argv); + else if (!strcmp(argv[1], "migrate")) + rv = do_migrate(argc, argv); + if (!rv) return 0; @@ -447,14 +886,19 @@ int main(int argc, char *argv[]) printf("devcount init <lock_disk> <count_disk>\n"); printf(" sanlock direct init -n 8 -s devcount:0:<lock_disk>:0\n"); printf(" sanlock direct init -n 8 -r devcount:resource<count_disk>:<lock_disk>:1024000\n"); + printf(" dd if=/dev/zero of=<count_disk> bs=512 count=24\n"); + printf("\n"); + printf("devcount migrate <lock_disk> rw <count_disk> <sec1> <sec2> <hostid> <max_hostid>\n"); + printf(" sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0\n"); + printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); - printf("devcount lock <lock_disk> count <count_disk> <rsec> <wsec> <hostid>\n"); + printf("devcount lock <lock_disk> rw <count_disk> <sec1> <sec2> <hostid>\n"); printf(" sanlock add_lockspace -s devcount:<hostid>:<lock_disk>:0\n"); - printf(" loop around fork, sanlock_acquire, exec devcount count\n"); + printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); - printf("devcount count <count_disk> <rsec> <wsec> <hostid>\n"); - printf(" read disk count for rsec seconds, looking for any writes\n"); - printf(" write disk count for wsec seconds, (wsec 0 indefinite)\n"); + printf("devcount rw <count_disk> <sec1> <sec2> <hostid>\n"); + printf(" rw: read count for sec1, looking for writes, then write for sec2\n"); + printf(" wr: write count for sec1, then read for sec2, looking for writes\n"); printf("\n"); return -1; } commit 3eb2a167a531bbd49f30e1a24eec2e5b28c1cb6b Author: David Teigland <teigland(a)redhat.com> Date: Tue Mar 8 16:35:29 2011 -0600 sanlock: migration working A lot of various changes and fixes to the migration code paths to make them work. 
This adds some comments and TODO's related to incomplete handling of migration failure cases. diff --git a/src/direct.c b/src/direct.c index 3b39ea3..3bdfbf8 100644 --- a/src/direct.c +++ b/src/direct.c @@ -116,9 +116,11 @@ static int do_paxos_action(void) return -1; } - rv = paxos_lease_migrate(token, &leader_read, &leader_ret, com.target_host_id); + leader_read.next_owner_id = com.target_host_id; + + rv = paxos_lease_leader_write(token, &leader_read); if (rv < 0) { - log_tool("cannot migrate lease on %s", + log_tool("cannot write lease on %s", token->resource_name); return -1; } diff --git a/src/leader.h b/src/leader.h index c76da4a..46c0b5a 100644 --- a/src/leader.h +++ b/src/leader.h @@ -40,6 +40,8 @@ enum { DP_BAD_SECTORSIZE = -26, DP_REACQUIRE_LVER = -27, DP_BAD_LOCKSPACE = -28, + DP_LEADER_MIGRATE = -29, + DP_OTHER_OWNER = -30, }; /* does not include terminating null byte */ diff --git a/src/main.c b/src/main.c index 6402f19..f0e549c 100644 --- a/src/main.c +++ b/src/main.c @@ -427,6 +427,9 @@ static int main_loop(void) return 0; } +/* FIXME: allow setowner on the source to "cancel" a migration by clearing + next_owner. This means allowing CMD_SETOWNER when cmd_active == CMD_MIGRATE */ + static int set_cmd_active(int ci_target, int cmd) { struct client *cl = &client[ci_target]; @@ -442,6 +445,9 @@ static int set_cmd_active(int ci_target, int cmd) return -EBUSY; } + if (cl->need_setowner && cmd == SM_CMD_SETOWNER) + cl->need_setowner = 0; + cmd_active = cl->cmd_active; if (!cmd) { @@ -552,9 +558,8 @@ static int parse_key_val(char *str, const char *key_arg, char *val_arg, int len) * "lockspace_name=.... resource_name=.... 
" */ -static int parse_migrate_state(struct token *token, char *str, - int *migrate_result, - struct leader_record *leader) +int parse_incoming_state(struct token *token, char *str, int *migrate_result, + struct leader_record *leader) { char state[SANLK_STATE_MAXSTR]; char name[128]; @@ -571,7 +576,7 @@ static int parse_migrate_state(struct token *token, char *str, if (!begin) return -1; - end = strstr(begin, "lockspace_name="); + end = strstr(begin+strlen(name), "lockspace_name="); if (!end) end = str + strlen(str) + 1; @@ -587,27 +592,27 @@ static int parse_migrate_state(struct token *token, char *str, rv = parse_key_val(state, "migrate_result", val_str, sizeof(val_str)); if (rv < 0) - return rv; + return -2; *migrate_result = atoi(val_str); rv = parse_key_val(state, "leader.lver", val_str, sizeof(val_str)); if (rv < 0) - return rv; + return -3; leader->lver = strtoull(val_str, NULL, 0); rv = parse_key_val(state, "leader.timestamp", val_str, sizeof(val_str)); if (rv < 0) - return rv; + return -4; leader->timestamp = strtoull(val_str, NULL, 0); rv = parse_key_val(state, "leader.owner_id", val_str, sizeof(val_str)); if (rv < 0) - return rv; + return -5; leader->owner_id = strtoull(val_str, NULL, 0); rv = parse_key_val(state, "leader.next_owner_id", val_str, sizeof(val_str)); if (rv < 0) - return rv; + return -6; leader->next_owner_id = strtoull(val_str, NULL, 0); return 0; @@ -623,7 +628,6 @@ static void *cmd_acquire_thread(void *args_in) struct token *new_tokens[SANLK_MAX_RESOURCES]; struct sanlk_resource res; struct sanlk_options opt; - struct leader_record leader; char *opt_str; char num_hosts_str[16]; uint64_t reacquire_lver = 0; @@ -870,23 +874,22 @@ static void *cmd_acquire_thread(void *args_in) token = new_tokens[i]; if (opt.flags & SANLK_FLG_INCOMING) { - migrate_result = 0; - memset(&leader, 0, sizeof(leader)); - rv = parse_migrate_state(token, opt_str, &migrate_result, &leader); - if (rv < 0 || !migrate_result) { - log_errot(token, "cmd_acquire migrate 
state " - "bad %d len %zd", migrate_result, - strlen(opt_str)); + rv = check_incoming_state(token, opt_str, &migrate_result); + if (rv < 0) { + log_errot(token, "cmd_acquire incoming state %d", rv); goto fail_release; } - } else if (opt.flags & SANLK_FLG_REACQUIRE) { - reacquire_lver = token->prev_lver; - } - if (opt.flags & SANLK_FLG_INCOMING) - rv = receive_token(token, migrate_result, &leader); - else + /* source set_next_owner_other() wasn't called or failed */ + if (migrate_result != DP_OK) + rv = set_next_owner_self(token); + + } else { + if (opt.flags & SANLK_FLG_REACQUIRE) + reacquire_lver = token->prev_lver; + rv = acquire_token(token, reacquire_lver, new_num_hosts); + } save_resource_leader(token); @@ -1109,11 +1112,6 @@ static void *cmd_migrate_thread(void *args_in) goto reply; } - if (!target_host_id) { - result = -EINVAL; - goto reply; - } - for (i = 0; i < SANLK_MAX_RESOURCES; i++) { if (cl->tokens[i]) total++; @@ -1123,7 +1121,6 @@ static void *cmd_migrate_thread(void *args_in) reply_str = malloc(reply_len); if (!reply_str) { result = -ENOMEM; - total = 0; goto reply; } memset(reply_str, 0, reply_len); @@ -1134,11 +1131,23 @@ static void *cmd_migrate_thread(void *args_in) if (!token) continue; - /* if migrate_token() fails it is not fatal, we can still - procede with the migration; receive_token() will attempt - to set next_owner_id */ + /* the migrating flag causes the source to avoid freeing the lease + * if the pid exits before the dest has written itself as next_owner. + * i.e. 
we can't rely on paxos_lease_release seeing next_owner_id + * non-zero because the cmd_migrate can be called on the source, + * followed by the source pid exiting before the dest gets to write + * next_owner_id to itself */ + + token->migrating = 1; - migrate_token(token, target_host_id); + if (target_host_id) { + rv = set_next_owner_other(token, target_host_id); + token->migrate_result = rv; + } else { + /* acquire-incoming on the destination will call + set_next_owner_self() */ + token->migrate_result = 0; + } ret = snprintf(reply_str + pos, reply_len - pos, "lockspace_name=%s " @@ -1169,24 +1178,21 @@ static void *cmd_migrate_thread(void *args_in) reply_str[reply_len-1] = '\0'; reply: - /* TODO: for success I don't think we want to clear cmd_active - here. We probably want to wait until the migrate is done - and then do set_cmd_active(0)? */ - - if (result < 0) - set_cmd_active(ca->ci_target, 0); - log_debug("cmd_migrate done result %d", result); memcpy(&h, &ca->header, sizeof(struct sm_header)); - h.length = sizeof(h) + strlen(reply_str)+1; - h.data = result; - send(fd, &h, sizeof(h), MSG_NOSIGNAL); - if (total) - send(fd, reply_str, strlen(reply_str)+1, MSG_NOSIGNAL); - if (reply_str) + if (reply_str) { + h.length = sizeof(h) + strlen(reply_str)+1; + h.data = result; + send(fd, &h, sizeof(h), MSG_NOSIGNAL); + send(fd, reply_str, strlen(reply_str)+1, MSG_NOSIGNAL); free(reply_str); + } else { + h.length = sizeof(h); + h.data = result; + send(fd, &h, sizeof(h), MSG_NOSIGNAL); + } client_back(ca->ci_in, fd); free(ca); @@ -1202,10 +1208,9 @@ static void *cmd_setowner_thread(void *args_in) struct cmd_args *ca = args_in; struct sm_header h; struct token *token; - struct token *tokens_reply; struct client *cl; - int result = 0, total = 0, total2 = 0; - int fd, rv, i, tokens_len; + int result = 0; + int fd, rv, i; cl = &client[ca->ci_target]; fd = client[ca->ci_in].fd; @@ -1214,20 +1219,6 @@ static void *cmd_setowner_thread(void *args_in) ca->ci_in, ca->ci_target, 
cl->pid); for (i = 0; i < SANLK_MAX_RESOURCES; i++) { - if (cl->tokens[i]) - total++; - } - - tokens_len = total * sizeof(struct token); - tokens_reply = malloc(tokens_len); - if (!tokens_reply) { - result = -ENOMEM; - total = 0; - goto reply; - } - memset(tokens_reply, 0, tokens_len); - - for (i = 0; i < SANLK_MAX_RESOURCES; i++) { token = cl->tokens[i]; if (!token) continue; @@ -1235,26 +1226,16 @@ static void *cmd_setowner_thread(void *args_in) rv = setowner_token(token); if (rv < 0) result = -1; - - if (total2 == total) { - log_error("cmd_setowner total %d changed", total); - continue; - } - - memcpy(&tokens_reply[total2++], token, sizeof(struct token)); } - reply: set_cmd_active(ca->ci_target, 0); - log_debug("cmd_setowner done %d", total); + log_debug("cmd_setowner done"); memcpy(&h, &ca->header, sizeof(struct sm_header)); - h.length = sizeof(h) + tokens_len; + h.length = sizeof(h); h.data = result; send(fd, &h, sizeof(h), MSG_NOSIGNAL); - if (total) - send(fd, tokens_reply, tokens_len, MSG_NOSIGNAL); client_back(ca->ci_in, fd); free(ca); diff --git a/src/paxos_lease.c b/src/paxos_lease.c index ee1fb75..f693f62 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -551,8 +551,6 @@ int paxos_lease_leader_read(struct token *token, struct leader_record *leader_re goto fail; } - log_token(token, "leader_read d %u reps %u", d, leader_reps[d]); - log_token(token, "leader_read owner %llu lver %llu hosts %llu " "time %llu res %s", (unsigned long long)prev_leader.owner_id, @@ -597,7 +595,7 @@ static int write_new_leader(struct token *token, struct leader_record *nl) * ref: obtain() */ -int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, +int paxos_lease_acquire(struct token *token, int force, struct leader_record *leader_ret, uint64_t reacquire_lver, int new_num_hosts) @@ -610,19 +608,25 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, uint64_t last_timestamp = 0; int error; - log_token(token, "paxos_acquire begin"); + 
log_token(token, "paxos_acquire begin force %d", force); error = paxos_lease_leader_read(token, &prev_leader); if (error < 0) goto out; + if (force) + goto run; + if (prev_leader.timestamp == LEASE_FREE) { log_token(token, "paxos_acquire lease free"); goto run; } - if (prev_leader.owner_id == token->host_id) { - log_token(token, "paxos_acquire already owner"); + if (prev_leader.owner_id == token->host_id && + prev_leader.owner_generation == token->host_generation) { + log_token(token, "paxos_acquire already owner id %llu gen %llu", + (unsigned long long)token->host_id, + (unsigned long long)token->host_generation); goto run; } @@ -795,51 +799,18 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, return error; } -int paxos_lease_migrate(struct token *token, - struct leader_record *leader_last, - struct leader_record *leader_ret, - uint64_t target_host_id) +int paxos_lease_leader_write(struct token *token, + struct leader_record *leader_new) { - struct leader_record new_leader; - int rv, d; int error; - log_token(token, "paxos_migrate begin"); + log_token(token, "paxos_lease_leader_write begin"); - for (d = 0; d < token->num_disks; d++) { - memset(&new_leader, 0, sizeof(struct leader_record)); + leader_new->checksum = leader_checksum(leader_new); - rv = read_leader(&token->disks[d], &new_leader); - if (rv < 0) - continue; + error = write_new_leader(token, leader_new); - if (memcmp(&new_leader, leader_last, - sizeof(struct leader_record))) { - log_errot(token, "paxos_migrate leader changed before migrate"); - error = DP_BAD_LEADER; - goto out; - } - } - - if (new_leader.num_hosts < target_host_id) { - log_errot(token, "paxos_migrate num_hosts %llu target_host_id %llu", - (unsigned long long)new_leader.num_hosts, - (unsigned long long)target_host_id); - error = DP_BAD_NUMHOSTS; - goto out; - } - - new_leader.next_owner_id = target_host_id; - new_leader.timestamp = time(NULL); - new_leader.checksum = leader_checksum(&new_leader); - - error = 
write_new_leader(token, &new_leader); - if (error < 0) - goto out; - - memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); - out: - log_token(token, "paxos_migrate done %d", error); + log_token(token, "paxos_lease_leader_write done %d", error); return error; } @@ -896,11 +867,23 @@ int paxos_lease_release(struct token *token, if (memcmp(&new_leader, leader_last, sizeof(struct leader_record))) { - log_errot(token, "leader changed before release"); + log_errot(token, "release error leader changed"); return DP_BAD_LEADER; } } + if (new_leader.owner_id != token->host_id) { + log_errot(token, "release error other owner_id %llu", + (unsigned long long)new_leader.owner_id); + return DP_OTHER_OWNER; + } + + if (new_leader.next_owner_id) { + log_errot(token, "release error next_owner_id %llu", + (unsigned long long)new_leader.next_owner_id); + return DP_LEADER_MIGRATE; + } + new_leader.timestamp = LEASE_FREE; new_leader.checksum = leader_checksum(&new_leader); diff --git a/src/paxos_lease.h b/src/paxos_lease.h index bf8e600..8981ed7 100644 --- a/src/paxos_lease.h +++ b/src/paxos_lease.h @@ -12,6 +12,7 @@ uint32_t leader_checksum(struct leader_record *lr); int majority_disks(struct token *token, int num); int paxos_lease_leader_read(struct token *token, struct leader_record *leader_ret); +int paxos_lease_leader_write(struct token *token, struct leader_record *leader_new); int paxos_lease_acquire(struct token *token, int force, struct leader_record *leader_ret, uint64_t reacquire_lver, diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 3f1c1f6..22dd21e 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -85,6 +85,7 @@ struct token { struct sync_disk *disks; /* internal */ + int migrating; int token_id; /* used to refer to this token instance in log messages */ int acquire_result; int migrate_result; diff --git a/src/token_manager.c b/src/token_manager.c index 5d09ef0..aa537df 100644 --- a/src/token_manager.c +++ b/src/token_manager.c 
@@ -231,10 +231,15 @@ int setowner_token(struct token *token) /* we want the dblocks to reflect a full, proper ownership, so we do the full acquire rather than just writing a new leader_record */ - rv = paxos_lease_acquire(token, 0, &leader_ret, 0, 0); + rv = paxos_lease_acquire(token, 1, &leader_ret, 0, 0); token->setowner_result = rv; + /* we set acquire_result here for at least one reason: because release + will not release the token if acquire_result is not 1 */ + + token->acquire_result = rv; + log_token(token, "setowner rv %d lver %llu at %llu", rv, (unsigned long long)token->leader.lver, (unsigned long long)token->leader.timestamp); @@ -246,6 +251,88 @@ int setowner_token(struct token *token) return rv; /* DP_OK */ } +/* + * migration creates special cases for release. if either the source or + * the dest calls release_token and read leader shows next_owner_id is not + * zero, it means migration is in progress, and they should not free the + * lease. + * + * In other words, paxos release should only be done (lease freed) on a + * "fully owned", clean lease, i.e. next_owner_id is zero, and current + * leader matches our last leader. + * + * (A second mechanism is also needed to prevent release of migrating leases, + * the token->migrating flag. This is because we need to block releases + * on the source effective immediately, before next_owner may be written.) + * + * setowner on the destination, in the case of migration success, moves a + * disk lease from being in limbo (both owner and next_owner set), to having + * just an owner (the dest). After this the owner (dest) can release it. + * + * TODO: setowner on the source, in the case of migration failure, moves the + * disk lease from being in limbo with both owner and next_owner, to having + * just an owner (the source). This setowner call will need to ignore the + * fact that the leader block doesn't match its latest copy since it may + * have been the dest that wrote next_owner at the start of migration. 
+ * + * We don't have to worry that next_owner is ever running the vm; setowner + * on dest is required to complete successfully (making dest the owner and + * clearing next_owner) before vm is resumed on the dest. + * + * If migration fails, the source/owner does not free the paxos lease, but + * the source/owner continues running and renewing its host_id, then no + * other host will be able to take ownership of the lease, because they will + * see that the owner is alive. The source/owner will be able to acquire + * the lease, though. So, the source/owner needs to either + * 1. call setowner to clear next_owner_id, then call release to free it (TODO above) + * 2. call acquire to acquire the lease, then call release to free it + * + * If migration fails, the source/owner does not free the paxos lease, and + * the source/owner does not continue running or renewing its host_id, then + * another host will be able to take ownership of the lease, because they + * will see that the owner is not alive (or comes back with a different + * generation). + * + * For migration, the paxos lease is in limbo: both owner and next_owner + * are set, and in this state neither the source nor the dest can free the + * paxos lease. The limbo state needs to be cleared (next_owner cleared) + * before the lease can be freed. + * + * - if migration succeeds, the dest will call setowner to clear next_owner + * and bring the lease out of limbo + * + * - if migration fails because the dest host fails, + * setowner on the source will clear next_owner, and allow source to + * continue running and holding the lease, or release the lease. 
+ * setowner on source would have to ignore the fact that the leader + * will have been changed since it last read or wrote it (by the dest + * writing itself as the next_owner_id) + * + * - if migration fails because the source host fails, + * the paxos lease will be left on disk with owner and next_owner set, + * and neither source nor dest owning the lease to free it. + * The lease can be acquired because someone will see that the owner's + * host_id is not renewed (or a different generation). This acquire + * will clear next_owner. + * + * - if migration fails and both source and dest fail, the lease can be + * acquired because someone will see that the owner's host_id is not + * renewed (or a different generation). This acquire will clear next_owner. + * + * - if migration fails because the dest qemu fails but dest host still ok + * setowner on the source will clear next_owner (same as if dest host fails) + * + * - if migration fails because the source qemu fails but source host still ok + * sanlock will not free the lease in release_token because next_owner is set. + * no other host can acquire the lease because it's owned by a live host_id. + * the source host can acquire the lease again, and then free it. what causes + * the source host to try to acquire the lease again? trying to start the vm + * on the source again... 
+ * + * - if migration fails because the the source and dest qemu fails but hosts ok + * same as prev + */ + /* return < 0 on error, 1 on success */ int release_token(struct token *token) @@ -253,12 +340,9 @@ int release_token(struct token *token) struct leader_record leader_ret; int rv; - if (token->leader.owner_id != token->host_id) { - /* this case occurs on the receiving side of migration, when - the local host hasn't become the lease owner (just next_owner), - and the pid fails, causing sm to clean up the pid's tokens */ - log_token(token, "release we are not owner"); - return 1; + if (token->migrating) { + log_errot(token, "release skip migrating"); + return DP_ERROR; } rv = paxos_lease_release(token, &token->leader, &leader_ret); @@ -274,96 +358,146 @@ int release_token(struct token *token) return rv; /* DP_OK */ } -/* migration source: writes leader_record.next_owner_id = target_host_id */ /* return < 0 on error, 1 on success */ -int migrate_token(struct token *token, uint64_t target_host_id) +int set_next_owner_other(struct token *token, uint64_t target_host_id) { - struct leader_record leader_ret; + struct leader_record leader; int rv; - rv = paxos_lease_migrate(token, &token->leader, &leader_ret, target_host_id); + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; - token->migrate_result = rv; + if (memcmp(&leader, &token->leader, sizeof(struct leader_record))) { + log_errot(token, "set_next_owner_other leader changed before migrate"); + return DP_BAD_LEADER; + } - log_token(token, "migrate rv %d", rv); + if (leader.num_hosts < target_host_id) { + log_errot(token, "set_next_owner_other num_hosts %llu " + "target_host_id %llu", + (unsigned long long)leader.num_hosts, + (unsigned long long)target_host_id); + return DP_BAD_NUMHOSTS; + } + + leader.next_owner_id = target_host_id; + rv = paxos_lease_leader_write(token, &leader); if (rv < 0) return rv; - memcpy(&token->leader, &leader_ret, sizeof(struct leader_record)); + 
memcpy(&token->leader, &leader, sizeof(struct leader_record)); return rv; /* DP_OK */ } +/* return < 0 on error, 1 on success */ + +int set_next_owner_self(struct token *token) +{ + struct leader_record leader; + int rv; + + rv = paxos_lease_leader_read(token, &leader); + if (rv < 0) + return rv; + + if (leader.num_hosts < token->host_id) { + log_errot(token, "set_next_owner_self num_hosts %llu host_id %llu", + (unsigned long long)leader.num_hosts, + (unsigned long long)token->host_id); + return DP_BAD_NUMHOSTS; + } + + leader.next_owner_id = token->host_id; + + rv = paxos_lease_leader_write(token, &leader); + if (rv < 0) + return rv; + + memcpy(&token->leader, &leader, sizeof(struct leader_record)); + return rv; /* DP_OK */ +} /* - * migration target: verifies that the source wrote us as the next_owner_id - * - * When everything is working correctly, we just verify here that - * fields in leader_ret match what we see in leader_src - * (created from opt_str which was returned by sanlock_migrate() - * on the source). + * migration destination verifies the migrate state sent from source, + * which needs to be consistent with the source having successfully written + * the next_owner itself, or having not tried or tried and failed. * * If we can't read the leader, return an error, and the migration * needs to be aborted. 
*/ + /* return < 0 on error, 1 on success */ -int receive_token(struct token *token, int migrate_result, - struct leader_record *leader_src) +int parse_incoming_state(struct token *token, char *str, int *migrate_result, + struct leader_record *leader); + +int check_incoming_state(struct token *token, char *opt_str, int *migrate_result_out) { - struct leader_record leader_read; + struct leader_record leader_ret; + struct leader_record leader_src; + int migrate_result; int rv; - rv = paxos_lease_leader_read(token, &leader_read); + rv = paxos_lease_leader_read(token, &leader_ret); if (rv < 0) return rv; + rv = parse_incoming_state(token, opt_str, &migrate_result, &leader_src); + if (rv < 0) { + log_errot(token, "check_incoming_state parse error %d result %d len %zd", + rv, migrate_result, strlen(opt_str)); + return rv; + } + + *migrate_result_out = migrate_result; + + /* source successfully wrote next_owner */ + if (migrate_result == DP_OK) { - if (leader_src->next_owner_id == token->host_id && - leader_read.next_owner_id == token->host_id && - leader_src->lver == leader_read.lver && - leader_src->timestamp == leader_read.timestamp) { - log_token(token, "receive_token all match"); + if (leader_src.next_owner_id == token->host_id && + leader_ret.next_owner_id == token->host_id && + leader_src.lver == leader_ret.lver && + leader_src.timestamp == leader_ret.timestamp) { + log_token(token, "check_incoming_state all match"); return DP_OK; } else { - log_errot(token, "receive_token mismatch " + log_errot(token, "check_incoming_state mismatch " "next_owner %llu %llu %llu " "lver %llu %llu " "timestamp %llu %llu", (unsigned long long)token->host_id, - (unsigned long long)leader_src->next_owner_id, - (unsigned long long)leader_read.next_owner_id, - (unsigned long long)leader_src->lver, - (unsigned long long)leader_read.lver, - (unsigned long long)leader_src->timestamp, - (unsigned long long)leader_read.timestamp); + (unsigned long long)leader_src.next_owner_id, + (unsigned 
long long)leader_ret.next_owner_id, + (unsigned long long)leader_src.lver, + (unsigned long long)leader_ret.lver, + (unsigned long long)leader_src.timestamp, + (unsigned long long)leader_ret.timestamp); return -1; } } - /* migrate_result < 0, source could not write next_owner_id, so it - should still be 0 */ + /* migrate_result <= 0, source could not (or did not) write next_owner_id, + so it should still be 0 */ - if (leader_src->owner_id != leader_read.owner_id || - leader_src->timestamp != leader_read.timestamp || - leader_read.next_owner_id != 0) { + if (leader_src.owner_id != leader_ret.owner_id || + leader_src.timestamp != leader_ret.timestamp || + leader_ret.next_owner_id != 0) { - log_errot(token, "receive_token mismatch migrate_result %d " + log_errot(token, "check_incoming_state mismatch migrate_result %d " "next_owner %llu owner %llu %llu timestamp %llu %llu", migrate_result, - (unsigned long long)leader_read.next_owner_id, - (unsigned long long)leader_src->owner_id, - (unsigned long long)leader_read.owner_id, - (unsigned long long)leader_src->timestamp, - (unsigned long long)leader_read.timestamp); + (unsigned long long)leader_ret.next_owner_id, + (unsigned long long)leader_src.owner_id, + (unsigned long long)leader_ret.owner_id, + (unsigned long long)leader_src.timestamp, + (unsigned long long)leader_ret.timestamp); return -1; } - /* since the source failed to write next_owner_id to be us, we do it - instead */ - - return migrate_token(token, token->host_id); + return DP_OK; } int create_token(int num_disks, struct token **token_out) diff --git a/src/token_manager.h b/src/token_manager.h index 7119936..34b3c5b 100644 --- a/src/token_manager.h +++ b/src/token_manager.h @@ -12,11 +12,13 @@ int acquire_token(struct token *token, uint64_t reacquire_lver, int new_num_hosts); int release_token(struct token *token); -int migrate_token(struct token *token, uint64_t target_host_id); -int receive_token(struct token *token, int migrate_result, - struct 
leader_record *leader_src); int setowner_token(struct token *token); +int check_incoming_state(struct token *token, char *opt_str, + int *migrate_result_out); +int set_next_owner_other(struct token *token, uint64_t target_host_id); +int set_next_owner_self(struct token *token); + int create_token(int num_disks, struct token **token_out); void free_token(struct token *token); void release_token_async(struct token *token);

13 years, 1 month

1
0
0 / 0

src/paxos_lease.c

by David Teigland

src/paxos_lease.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) New commits: commit 5be245a0c186d319b7572ca812a0067a48b420dc Author: David Teigland <teigland(a)redhat.com> Date: Tue Mar 1 16:12:31 2011 -0600 sanlock: fix host_id expiration check When checking if another hostid has expired we cannot base it on the last disk renewal time from the other host, because the other host may not have time in sync. diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 96e77cc..ee1fb75 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -606,6 +606,7 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, struct leader_record new_leader; struct leader_record host_id_leader; struct paxos_dblock dblock; + time_t start; uint64_t last_timestamp = 0; int error; @@ -634,6 +635,8 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, log_token(token, "paxos_acquire check owner_id %llu", (unsigned long long)prev_leader.owner_id); + start = time(NULL); + while (1) { error = host_id_leader_read(prev_leader.space_name, prev_leader.owner_id, @@ -682,14 +685,31 @@ int paxos_lease_acquire(struct token *token, int force GNUC_UNUSED, /* if the owner hasn't renewed its host_id lease for host_id_timeout_seconds then its watchdog should have fired - by now */ + by now + + if we trust that the clocks are in sync among hosts, then this + check could be: if (time(NULL) - host_id_leader.timestamp > + to.host_id_timeout_seconds), but if the clocks are out of sync, + this check would easily give two hosts the lease. + + N.B. we need to be careful about ever comparing local time(NULL) + to a time value we read off disk from another node that may + have different time. 
*/ + if (time(NULL) - start > to.host_id_timeout_seconds) { + log_token(token, "paxos_acquire host_id %llu expired %llu", + (unsigned long long)prev_leader.owner_id, + (unsigned long long)host_id_leader.timestamp); + goto run; + } +#if 0 if (time(NULL) - host_id_leader.timestamp > to.host_id_timeout_seconds) { log_token(token, "paxos_acquire host_id %llu expired %llu", (unsigned long long)prev_leader.owner_id, (unsigned long long)host_id_leader.timestamp); goto run; } +#endif /* the owner is renewing its host_id so it's alive */

13 years, 2 months

1
0
0 / 0

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

sanlock-devel March 2011