The first patch adds a new 'sanlock client spawn' command, which runs a series of commands with arguments under lease, and then explicitly releases the lease.
It is recommended to use -P 1, to handle the case where sanlock client spawn dies while a child is still running, where it would otherwise be unsafe for the daemon to automatically release the lease.
The second patch introduces the release flag SANLK_REL_NO_DISK, so that the series of commands may also include removal of the resource lease file or disk itself, as part of a resource de-provisioning operation.
In the command line tool, a new option -d 1 is made available to ACT_SPAWN, and for consistency also to the ACT_RELEASE action.
Claudio Fontana (2): sanlock: add sanlock client spawn action sanlock: add SANLK_REL_NO_DISK
src/cmd.c | 6 +- src/main.c | 207 ++++++++++++++++++++++++++++++++++++++--- src/resource.c | 2 +- src/resource.h | 3 + src/sanlock.8 | 85 +++++++++++++++++ src/sanlock_internal.h | 9 ++ src/sanlock_resource.h | 8 ++ 7 files changed, 306 insertions(+), 14 deletions(-)
this new action overcomes some limitations of action
sanlock client command
by allowing to run multiple commands in succession, all under the same lease, as long as they complete successfully (exit status 0), all spawned as child processes using fork() and exec().
After all processes are successfully executed, or at the first failure, the lease is released explicitly with sanlock_release() before exiting,
removing the time window in which the resource remains busy, just waiting for the POLLHUP in the daemon to be processed.
sanlock_internal.h: add new ACT_SPAWN action, and its options
main.c: implement command line parsing and the ACT_SPAWN action
sanlock.8: document the new action and its options
Signed-off-by: Claudio Fontana cfontana@suse.de --- src/main.c | 196 ++++++++++++++++++++++++++++++++++++++--- src/sanlock.8 | 65 ++++++++++++++ src/sanlock_internal.h | 8 ++ 3 files changed, 258 insertions(+), 11 deletions(-)
diff --git a/src/main.c b/src/main.c index 8317509..919cc7d 100644 --- a/src/main.c +++ b/src/main.c @@ -2319,6 +2319,7 @@ static void print_usage(void) printf("sanlock client inq_lockspace -s LOCKSPACE\n"); printf("sanlock client rem_lockspace -s LOCKSPACE\n"); printf("sanlock client command -r RESOURCE [-h 0|1] -c <path> <args>\n"); + printf("sanlock client spawn -r RESOURCE [-h 0|1] [-O 0|1] [-P 0|1] -c COUNT CMD [ARG...] [-c COUNT CMD [ARG...]]\n"); printf("sanlock client acquire -r RESOURCE -p|-C <id>\n"); printf("sanlock client convert -r RESOURCE -p|-C <id>\n"); printf("sanlock client release -r RESOURCE -p|-C <id>\n"); @@ -2369,6 +2370,52 @@ static void print_usage(void) printf("\n"); }
+/* + * read_spawn_command - parse a single "-c COUNT CMD [ARG...]" option group + * + * Called with I pointing at the COUNT argument (the option argument + * that follows -c). Parses COUNT and the subsequent CMD [ARG...] words, + * appends a new entry to com.spawn_args, and returns the updated index + * pointing at the next option. The caller can then check argv[i] to decide + * whether another -c group follows. + * + * Returns the updated index (>= 0) on success, or a negative error code. + */ +static int read_spawn_command(int i, int argc, char *argv[]) +{ + int count, k; + + if (i >= argc) { + log_tool("spawn -c requires COUNT and CMD"); + return -EINVAL; + } + count = strtol(argv[i], NULL, 0); + if (i + count >= argc) { + log_tool("spawn -c COUNT past the end of arguments"); + return -EINVAL; + } + + com.spawn_args = realloc(com.spawn_args, + (com.spawn_count + 1) * sizeof(struct spawn_cmd)); + if (!com.spawn_args) + return -ENOMEM; + + com.spawn_args[com.spawn_count].argc = count; + com.spawn_args[com.spawn_count].argv = malloc((count + 1) * sizeof(char *)); + if (!com.spawn_args[com.spawn_count].argv) + return -ENOMEM; + + for (k = 0; k < count; k++) { + i += 1; + com.spawn_args[com.spawn_count].argv[k] = strdup(argv[i]); + if (!com.spawn_args[com.spawn_count].argv[k]) + return -ENOMEM; + } + com.spawn_args[com.spawn_count].argv[count] = NULL; + com.spawn_count += 1; + return i + 1; +} + static int read_command_line(int argc, char *argv[]) { char optchar; @@ -2447,6 +2494,8 @@ static int read_command_line(int argc, char *argv[]) com.action = ACT_REM_LOCKSPACE; else if (!strcmp(act, "command")) com.action = ACT_COMMAND; + else if (!strcmp(act, "spawn")) + com.action = ACT_SPAWN; else if (!strcmp(act, "acquire")) com.action = ACT_ACQUIRE; else if (!strcmp(act, "convert")) @@ -2622,7 +2671,8 @@ static int read_command_line(int argc, char *argv[]) com.wait = atoi(optionarg); break; case 'h': - if (com.action == ACT_GETS || com.action == ACT_CLIENT_READ || com.action == ACT_COMMAND) + if (com.action == ACT_GETS || com.action == ACT_CLIENT_READ || + com.action == ACT_COMMAND || com.action == ACT_SPAWN) com.get_hosts = atoi(optionarg); else com.high_priority = atoi(optionarg); @@ -2720,7 +2770,22 @@ static int read_command_line(int argc, char *argv[]) break;
case 'c': - begin_command = 1; + if (com.action == ACT_SPAWN) { + /* + * repeated -c COUNT CMD [ARG...] where COUNT includes the + * CMD itself. This lets the parser consume arguments with no + * ambiguity regardless of their content. + */ + do { + i = read_spawn_command(i, argc, argv); + if (i < 0) + return i; + } while (i < argc && !strcmp(argv[i], "-c") && ++i); + /* i already points at the next option, so continue the loop */ + continue; + } else { + begin_command = 1; + } break;
case 'A': @@ -3533,6 +3598,116 @@ static void do_client_version(void) (proto & 0x0000FFFF)); }
+/* + * log_owner - for ACT_COMMAND and ACT_SPAWN, log the owner when failing to acquire + * + * Only logs when option "-h 1" is used + */ +static void log_owner(struct sanlk_host *owner, char *owner_name) +{ + if (com.get_hosts && (owner->host_id || owner_name)) { + log_tool("owner: host_id %llu generation %llu timestamp %llu state %s name %s", + (unsigned long long)owner->host_id, + (unsigned long long)owner->generation, + (unsigned long long)owner->timestamp, + host_state_str(owner->flags), + owner_name ?: "none"); + } +} + +/* + * do_spawn - implementation of "sanlock client spawn" + * + * Acquires the resource lease, runs a sequence of commands specified via + * repeated "-c COUNT CMD [ARG...]" options, then releases the lease explicitly + * and synchronously before returning. + */ +static int do_spawn(void) +{ + struct sanlk_host owner = { 0 }; + char *owner_name = NULL; + uint32_t flags = 0; + pid_t pid; + int status, exit_code; + int i, fd, rv; + + if (!com.spawn_count) { + log_tool("spawn requires at least one -c command"); + return -EINVAL; + } + + log_tool("register"); + fd = sanlock_register(); + log_tool("register done %d", fd); + if (fd < 0) + return fd; + /* + * Do not allow the daemon to use SIGTERM or SIGKILL on this process + * for recovery. If the lockspace lease cannot be renewed, the daemon + * must fence the node via the watchdog instead. This prevents the + * unsafe scenario of a killed supervisor with a still-running writer. + */ + sanlock_restrict(fd, SANLK_RESTRICT_SIGKILL | SANLK_RESTRICT_SIGTERM); + flags |= com.orphan ? SANLK_ACQUIRE_ORPHAN : 0; + + log_tool("acquire fd %d", fd); + if (com.get_hosts) + rv = sanlock_acquire2(fd, -1, flags, com.res_args[0], NULL, &owner, &owner_name); + else + rv = sanlock_acquire(fd, -1, flags, com.res_count, com.res_args, NULL); + log_tool("acquire done %d", rv); + + if (rv < 0) { + log_owner(&owner, owner_name); + if (owner_name) + free(owner_name); + close(fd); + return rv; + } + /* + * Run each command sequentially under the held lease, stop on failure. + */ + exit_code = 0; + for (i = 0; i < com.spawn_count; i++) { + log_tool("spawn: %s", com.spawn_args[i].argv[0]); + pid = fork(); + if (pid < 0) { + log_tool("spawn fork failed: %s", strerror(errno)); + exit_code = -errno; + break; + } else if (pid == 0) { + /* child: close the lease fd so it is not inherited */ + close(fd); + execv(com.spawn_args[i].argv[0], com.spawn_args[i].argv); + perror("spawn execv failed"); + exit(1); + } + /* parent: wait for child to complete before continuing */ + if (waitpid(pid, &status, 0) < 0) { + log_tool("spawn waitpid failed: %s", strerror(errno)); + exit_code = -errno; + break; + } + exit_code = WIFEXITED(status) ? WEXITSTATUS(status) : 1; + log_tool("spawn done %d", exit_code); + if (exit_code != 0) + break; + } + /* + * Explicit synchronous release. When sanlock_release() returns the + * daemon has already processed the release, unlike ACT_COMMAND which + * relies on POLLHUP as an asynchronous method to release the lease. + */ + log_tool("release"); + rv = sanlock_release(fd, -1, SANLK_REL_ALL, 0, NULL); + log_tool("release done %d", rv); + close(fd); + if (rv < 0) { + return rv; + } + return exit_code; +} + static int do_client(void) { struct sanlk_host_event he; @@ -3550,7 +3725,7 @@ static int do_client(void) int i, fd; int rv = 0;
- if (com.action == ACT_COMMAND || com.action == ACT_ACQUIRE) { + if (com.action == ACT_COMMAND || com.action == ACT_SPAWN || com.action == ACT_ACQUIRE) { for (i = 0; i < com.res_count; i++) { res = com.res_args[i];
@@ -3592,6 +3767,10 @@ static int do_client(void) log_tool("shutdown done %d", rv); break;
+ case ACT_SPAWN: + rv = do_spawn(); + break; + case ACT_COMMAND: log_tool("register"); fd = sanlock_register(); @@ -3611,14 +3790,9 @@ static int do_client(void) log_tool("acquire done %d", rv);
if (rv < 0) { - if (com.get_hosts && (owner.host_id || owner_name)) { - log_tool("owner: host_id %llu generation %llu timestamp %llu state %s name %s", - (unsigned long long)owner.host_id, (unsigned long long)owner.generation, - (unsigned long long)owner.timestamp, host_state_str(owner.flags), - owner_name ?: "none"); - if (owner_name) - free(owner_name); - } + log_owner(&owner, owner_name); + if (owner_name) + free(owner_name); goto out; }
diff --git a/src/sanlock.8 b/src/sanlock.8 index e591060..5f11348 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -765,6 +765,71 @@ Register with the sanlock daemon, acquire the specified resource lease, and exec the command at path with args. When the command exits, the sanlock daemon will release the lease. -c must be the final option.
+.BR "sanlock client spawn -r" " RESOURCE " \ +\fB-c\fP " " \fICOUNT\fP " " \fICMD\fP " [" \fIARG\fP "...] " \ +\fR[\fP\fB-c\fP " " \fICOUNT\fP " " \fICMD\fP " [" \fIARG\fP "...]\fR]\fP..." + +Register with the sanlock daemon, acquire the specified resource lease, +fork and exec each command specified by +.B -c +sequentially as separate processes, checking the exit status of each +process, and only moving to the next process on success. +After all processes are successfully executed, or at the first failure, +the lease is released explicitly before exiting. + +.BR "sanlock client spawn" + +differs substantially from + +.BR "sanlock client command" + +which execs a single command, and relies +on the daemon detecting process exit via POLLHUP to release the lease +asynchronously. + +Instead, this action uses fork and waitpid so that the process survives +to call +.B sanlock_release() +synchronously. This means that on exit, the sanlock daemon has already +processed the release, and a valid subsequent request to acquire the +same resource will not fail with EAGAIN. + +sanlock_restrict(SANLK_RESTRICT_SIGKILL | SANLK_RESTRICT_SIGTERM) is called +after sanlock_register() so that if the lockspace lease cannot be renewed, +the daemon fences the node via the watchdog rather than sending signals to +this process. A killed supervisor with a still-running writer child would +be unsafe. + +For each command to spawn, a separate +.B -c +option is used, followed by the count of the arguments and the arguments +themselves (mirroring the argc, argv convention). +This allows +.I COUNT +arguments to be consumed unambiguously regardless of their content, +including arguments that would otherwise look like options. +Commands are run in order and execution stops on the first failure, +mirroring "command1 && command2" shell behavior. + +.B -P 1 +The PERSISTENT flag is recommended, so that if this process dies while a +child is still running, the lease becomes an orphan. This protects against +the daemon releasing the lease while a child is still writing to storage. + +To release an orphan lease left by a crashed spawn invocation: + +sanlock client release -r LOCKSPACE:RESOURCE:PATH:OFFSET -O 1 + +Use +.B -h 1 +to log the current owner information (host_id, generation, timestamp, +and owner name) when the acquire fails because another host holds the +lease. + +Use +.B -O 1 +to acquire an existing orphan lease + .BR "sanlock client acquire -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP .br diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index b58144b..ab74382 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -366,6 +366,11 @@ EXTERN struct client *client; #define DEFAULT_MAX_SECTORS_KB_ALIGN 0 /* set it to align size */ #define DEFAULT_MAX_SECTORS_KB_NUM 1024 /* set it to num KB for all lockspaces */
+struct spawn_cmd { + int argc; + char **argv; /* argv[argc] is NULL */ +}; + struct command_line { int type; /* COM_ */ int action; /* ACT_ */ @@ -437,6 +442,8 @@ struct command_line { struct sanlk_rindex rindex; /* -x RINDEX */ struct sanlk_lockspace lockspace; /* -s LOCKSPACE */ struct sanlk_resource *res_args[SANLK_MAX_RESOURCES]; /* -r RESOURCE */ + struct spawn_cmd *spawn_args; /* -c COUNT CMD [ARG...], for ACT_SPAWN */ + int spawn_count; };
EXTERN struct command_line com; @@ -498,6 +505,7 @@ enum { ACT_CLIENT_INIT_HOST, ACT_READ, ACT_SET_HOST, + ACT_SPAWN, };
EXTERN int external_shutdown;
this flag skips the on-disk portion of the release.
resource.h, resource.c: export release_token_nodisk() cmd.c: cmd_release(): handle the new flag by calling release_token_nodisk()
sanlock_internal.h: add com.no_disk to struct command_line main.c: new -d 1 option: set the new flag for client ACT_RELEASE and ACT_SPAWN sanlock.8: document the new -d 1 option for sanlock client
Signed-off-by: Claudio Fontana cfontana@suse.de --- src/cmd.c | 6 +++++- src/main.c | 15 ++++++++++++--- src/resource.c | 2 +- src/resource.h | 3 +++ src/sanlock.8 | 20 ++++++++++++++++++++ src/sanlock_internal.h | 1 + src/sanlock_resource.h | 8 ++++++++ 7 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/src/cmd.c b/src/cmd.c index d405ddf..aac1bfa 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -767,7 +767,11 @@ static void cmd_release(struct task *task, struct cmd_args *ca, uint32_t cmd)
for (i = 0; i < rem_tokens_count; i++) { token = rem_tokens[i]; - rv = release_token(task, token, resrename); + if (ca->header.cmd_flags & SANLK_REL_NO_DISK) { + rv = release_token_nodisk(task, token); + } else { + rv = release_token(task, token, resrename); + } if (rv < 0) result = rv; free(token); diff --git a/src/main.c b/src/main.c index 919cc7d..e5f9f20 100644 --- a/src/main.c +++ b/src/main.c @@ -2319,7 +2319,7 @@ static void print_usage(void) printf("sanlock client inq_lockspace -s LOCKSPACE\n"); printf("sanlock client rem_lockspace -s LOCKSPACE\n"); printf("sanlock client command -r RESOURCE [-h 0|1] -c <path> <args>\n"); - printf("sanlock client spawn -r RESOURCE [-h 0|1] [-O 0|1] [-P 0|1] -c COUNT CMD [ARG...] [-c COUNT CMD [ARG...]]\n"); + printf("sanlock client spawn -r RESOURCE [-h 0|1] [-O 0|1] [-P 0|1] [-d 0|1] -c COUNT CMD [ARG...] [-c COUNT CMD [ARG...]]\n"); printf("sanlock client acquire -r RESOURCE -p|-C <id>\n"); printf("sanlock client convert -r RESOURCE -p|-C <id>\n"); printf("sanlock client release -r RESOURCE -p|-C <id>\n"); @@ -2710,7 +2710,11 @@ static int read_command_line(int argc, char *argv[]) com.cid = atoi(optionarg); break; case 'd': - com.he_data = strtoull(optionarg, NULL, 0); + if (com.action == ACT_SET_EVENT) { + com.he_data = strtoull(optionarg, NULL, 0); + } else { + com.no_disk = atoi(optionarg); + } break; case 'e': if (com.rindex_op) { @@ -3699,7 +3703,11 @@ static int do_spawn(void) * relies on POLLHUP as an asynchronous method to release the lease. */ log_tool("release"); - rv = sanlock_release(fd, -1, SANLK_REL_ALL, 0, NULL); + flags = SANLK_REL_ALL; + if (exit_code == 0) { + flags |= com.no_disk ? SANLK_REL_NO_DISK : 0; + } + rv = sanlock_release(fd, -1, flags, 0, NULL); log_tool("release done %d", rv); close(fd); if (rv < 0) { @@ -3902,6 +3910,7 @@ static int do_client(void) } flags |= com.orphan ? SANLK_REL_ORPHAN : 0; flags |= com.all ? SANLK_REL_ALL: 0; + flags |= com.no_disk ? SANLK_REL_NO_DISK : 0; rv = sanlock_release(-1, id, flags, com.res_count, com.res_args); log_tool("release done %d", rv); break; diff --git a/src/resource.c b/src/resource.c index de71962..d1ac8fc 100644 --- a/src/resource.c +++ b/src/resource.c @@ -1330,7 +1330,7 @@ static int release_token_nodisk_opened(struct task *task, struct token *token) return _release_token(task, token, NULL, 1, 1); }
-static int release_token_nodisk(struct task *task, struct token *token) +int release_token_nodisk(struct task *task, struct token *token) { return _release_token(task, token, NULL, 0, 1); } diff --git a/src/resource.h b/src/resource.h index f01fa6a..d1fb7a0 100644 --- a/src/resource.h +++ b/src/resource.h @@ -38,6 +38,9 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, int release_token(struct task *task, struct token *token, struct sanlk_resource *resrename);
+/* locks resource_mutex */ +int release_token_nodisk(struct task *task, struct token *token); + /* locks resource_mutex */ void release_token_async(struct token *token);
diff --git a/src/sanlock.8 b/src/sanlock.8 index 5f11348..d57c078 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -820,6 +820,12 @@ To release an orphan lease left by a crashed spawn invocation:
sanlock client release -r LOCKSPACE:RESOURCE:PATH:OFFSET -O 1
+.B -d 1 +When the NODISK option is enabled, the on-disk part of the release is +skipped on a successful command sequence. +This makes it possible to create a complete resource de-provisioning +path, where the lease file or disk itself is removed. + Use .B -h 1 to log the current owner information (host_id, generation, timestamp, @@ -1172,6 +1178,20 @@ name (-s lockspace_name) with no resource name.
.P
+.SS Releasing resource leases without on-disk changes + +A resource lease can be released with the NODISK flag (-d 1). This instructs +the daemon not to perform the on-disk portion of the release. +This affects commands that explicitly release the leases, ie: + +.B sanlock client release + +and + +.B sanlock client spawn + +.P + .SS Renewal history
sanlock saves a limited history of lease renewal information in each lockspace. diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index ab74382..7cc1306 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -444,6 +444,7 @@ struct command_line { struct sanlk_resource *res_args[SANLK_MAX_RESOURCES]; /* -r RESOURCE */ struct spawn_cmd *spawn_args; /* -c COUNT CMD [ARG...], for ACT_SPAWN */ int spawn_count; + int no_disk; };
EXTERN struct command_line com; diff --git a/src/sanlock_resource.h b/src/sanlock_resource.h index 7f0f6b7..b5ce78b 100644 --- a/src/sanlock_resource.h +++ b/src/sanlock_resource.h @@ -85,11 +85,19 @@ * If the resource name is set, then an * orphan with the matching resource name is * released. + * + * SANLK_REL_NO_DISK + * Skip the on-disk portion of release. + * + * Introduced to be able to remove the + * resource file or disk itself while holding + * the lease, to avoid ENOENT on release. */
#define SANLK_REL_ALL 0x00000001 #define SANLK_REL_RENAME 0x00000002 #define SANLK_REL_ORPHAN 0x00000004 +#define SANLK_REL_NO_DISK 0x00000008 /* SANLK_FOR_CLIENT_ID 0x00000010 defined above */
/*
Thanks, it worked nicely for me, I pushed it out with some small adjustments to the man page.
Dave
On 3/13/26 18:34, David Teigland wrote:
Thanks, it worked nicely for me, I pushed it out with some small adjustments to the man page.
Dave
Thank you!
This is the platform I am building (the experimental sanlock branch), and specifically the file where sanlock is used,
I know there are many things to test and improve (I haven't consider the watchdog yet for example),
just in case you have some comments or suggestions for improvement:
https://github.com/openSUSE/virtx/tree/sanlock
https://github.com/openSUSE/virtx/blob/sanlock/pkg/lockman/lockman.go
---
Claudio
sanlock-devel@lists.fedorahosted.org