This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch master
in repository sanlock.
The following commit(s) were added to refs/heads/master by this push:
new 05cb313 sanlock: allow setting max_sectors_kb
05cb313 is described below
commit 05cb3135589742cec05e41563eae54fb26079656
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Nov 7 14:17:31 2018 -0600
sanlock: allow setting max_sectors_kb
Allow sanlock to set this sysfs file:
/sys/dev/block/<major>:<minor>/queue/max_sectors_kb
to optimize read io's sent to a storage device. By default
this value is 512K which means each 1M read is split into
two 512K reads that are sent to storage. If set to 1024,
then each 1M read is sent to storage without being split.
The sanlock behavior is controlled by the max_sectors_kb
setting in sanlock.conf:
max_sectors_kb = ignore
sanlock doesn't set it (the current default).
max_sectors_kb = align
sanlock will set it to the lockspace align size (1M/2M/4M/8M)
when adding the lockspace.
max_sectors_kb = <num_kb>
sanlock will set it to <num_kb> for each lockspace.
---
src/diskio.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++
src/diskio.h | 4 ++
src/lockspace.c | 76 ++++++++++++++++++++++++++++++
src/main.c | 25 ++++++++++
src/sanlock.8 | 9 ++++
src/sanlock.conf | 3 ++
src/sanlock_internal.h | 7 +++
7 files changed, 247 insertions(+)
diff --git a/src/diskio.c b/src/diskio.c
index 8aa654d..83b0b1e 100644
--- a/src/diskio.c
+++ b/src/diskio.c
@@ -21,16 +21,138 @@
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
+#include <sys/sysmacros.h>
#include <blkid/blkid.h>
#include <libaio.h> /* linux aio */
#include <aio.h> /* posix aio */
+#include <aio.h> /* posix aio */
#include "sanlock_internal.h"
#include "diskio.h"
#include "direct.h"
#include "log.h"
+/*
+ * Read an unsigned number from the sysfs queue attribute
+ * /sys/dev/block/<major>:<minor>/queue/<name>, where major:minor is taken
+ * from st_rdev of disk_path.
+ *
+ * Returns 0 and stores the number in *val on success.  Returns -1 and
+ * leaves *val untouched if disk_path cannot be stat'ed, the sysfs file
+ * cannot be opened or read, it is empty, or no number that fits in an
+ * unsigned int can be parsed from it.
+ */
+int read_sysfs_size(const char *disk_path, const char *name, unsigned int *val)
+{
+	char path[PATH_MAX];
+	char buf[32];
+	char *end;
+	struct stat st;
+	unsigned long num;
+	ssize_t len;
+	int fd;
+
+	if (stat(disk_path, &st) < 0)
+		return -1;
+
+	snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/queue/%s",
+		 (int)major(st.st_rdev), (int)minor(st.st_rdev), name);
+
+	fd = open(path, O_RDONLY, 0);
+	if (fd < 0)
+		return -1;
+
+	/* read() does not terminate the buffer, so keep one byte for '\0' */
+	len = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (len <= 0)
+		return -1;
+	buf[len] = '\0';
+
+	/* parsing stops at the trailing newline that sysfs appends */
+	errno = 0;
+	num = strtoul(buf, &end, 10);
+	if (end == buf || errno == ERANGE || (unsigned long)(unsigned int)num != num)
+		return -1;
+
+	*val = (unsigned int)num;
+	return 0;
+}
+
+/*
+ * Write val, formatted as an unsigned decimal string, to
+ * /sys/dev/block/<major>:<minor>/queue/<name>, where major:minor is taken
+ * from st_rdev of disk_path.
+ *
+ * Returns 0 when the write does not fail, or -1 (after a debug log entry)
+ * when stat, open or write fails.
+ */
+static int write_sysfs_size(const char *disk_path, const char *name, unsigned int val)
+{
+	char sysfs_path[PATH_MAX];
+	char text[32] = { 0 };
+	struct stat sb;
+	int sysfs_fd;
+	int result = 0;
+
+	if (stat(disk_path, &sb) < 0) {
+		log_debug("write_sysfs_size stat error %d %s", errno, disk_path);
+		return -1;
+	}
+
+	snprintf(sysfs_path, sizeof(sysfs_path), "/sys/dev/block/%d:%d/queue/%s",
+		 (int)major(sb.st_rdev), (int)minor(sb.st_rdev), name);
+	snprintf(text, sizeof(text), "%u", val);
+
+	sysfs_fd = open(sysfs_path, O_RDWR, 0);
+	if (sysfs_fd < 0) {
+		log_debug("write_sysfs_size open error %d %s", errno, sysfs_path);
+		return -1;
+	}
+
+	/* log before close() so errno still belongs to the failed write */
+	if (write(sysfs_fd, text, strlen(text)) < 0) {
+		log_debug("write_sysfs_size write %s error %d %s", text, errno, sysfs_path);
+		result = -1;
+	}
+
+	close(sysfs_fd);
+	return result;
+}
+
+/*
+ * The default max_sectors_kb is 512 (KB), so a 1MB read is split into two
+ * 512KB reads.  Adjust this to at least do 1MB io's.
+ *
+ * Sets queue/max_sectors_kb of disk->path to set_kb.  The sysfs file is
+ * only written when the value currently read from it differs from set_kb.
+ * Returns 0 if the value already matched or was written, otherwise the
+ * negative result of the failed sysfs read or write.
+ */
+
+int set_max_sectors_kb(struct sync_disk *disk, uint32_t set_kb)
+{
+	unsigned int cur_kb = 0;
+	int err;
+
+	err = read_sysfs_size(disk->path, "max_sectors_kb", &cur_kb);
+	if (err < 0) {
+		log_debug("set_max_sectors_kb read error %d %s", err, disk->path);
+		return err;
+	}
+
+	if (cur_kb != set_kb) {
+		err = write_sysfs_size(disk->path, "max_sectors_kb", set_kb);
+		if (err < 0) {
+			log_debug("set_max_sectors_kb write %u error %d %s", set_kb, err, disk->path);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Fetch the current queue/max_sectors_kb value for disk->path.
+ * *max_sectors_kb is written only when read_sysfs_size() returns 0; that
+ * return value is handed back to the caller unchanged.
+ */
+int get_max_sectors_kb(struct sync_disk *disk, uint32_t *max_sectors_kb)
+{
+	unsigned int kb = 0;
+	int rv = read_sysfs_size(disk->path, "max_sectors_kb", &kb);
+
+	if (rv == 0)
+		*max_sectors_kb = kb;
+
+	return rv;
+}
+
static int set_disk_properties(struct sync_disk *disk)
{
blkid_probe probe;
@@ -47,6 +169,7 @@ static int set_disk_properties(struct sync_disk *disk)
blkid_free_probe(probe);
disk->sector_size = sector_size;
+
return 0;
}
diff --git a/src/diskio.h b/src/diskio.h
index 772f743..1ed65d2 100644
--- a/src/diskio.h
+++ b/src/diskio.h
@@ -17,6 +17,10 @@ int open_disks(struct sync_disk *disks, int num_disks);
int open_disks_fd(struct sync_disk *disks, int num_disks);
int majority_disks(int num_disks, int num);
+int read_sysfs_size(const char *path, const char *name, unsigned int *val);
+int set_max_sectors_kb(struct sync_disk *disk, uint32_t max_sectors_kb);
+int get_max_sectors_kb(struct sync_disk *disk, uint32_t *max_sectors_kb);
+
/*
* iobuf functions require the caller to allocate iobuf using posix_memalign
* and pass it into the function
diff --git a/src/lockspace.c b/src/lockspace.c
index dbc999e..41f96cb 100644
--- a/src/lockspace.c
+++ b/src/lockspace.c
@@ -23,6 +23,7 @@
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/un.h>
+#include <sys/stat.h>
#include "sanlock_internal.h"
#include "sanlock_admin.h"
@@ -580,6 +581,79 @@ static void save_renewal_history(struct space *sp, int delta_result,
}
}
+#define ONE_MB_IN_BYTES 1048576
+#define ONE_MB_IN_KB 1024
+
+/*
+ * Apply the configured max_sectors_kb policy to the lockspace's host_id
+ * disk.  Nothing is done when the disk fd refers to a regular file, when
+ * com.max_sectors_kb_ignore is set, or when neither the align nor the num
+ * setting is enabled.  Otherwise the requested size is align_size in KB
+ * (align) or com.max_sectors_kb_num.
+ *
+ * The request is checked against the device's max_hw_sectors_kb: when the
+ * hardware limit is below the request, 1MB is used instead if the hardware
+ * limit is at least 1MB, else the setting is left alone.  Errors are only
+ * logged; nothing is returned.
+ *
+ * sector_size is currently unused.  align_size is in bytes.
+ */
+static void set_lockspace_max_sectors_kb(struct space *sp, int sector_size, int align_size)
+{
+	struct stat st;
+	unsigned int hw_kb = 0;
+	unsigned int set_kb;
+	int rv;
+
+	if (fstat(sp->host_id_disk.fd, &st) < 0) {
+		log_erros(sp, "set_lockspace_max_sectors_kb fstat error %d", errno);
+		return;
+	}
+
+	/* file not device */
+	if (S_ISREG(st.st_mode))
+		return;
+
+	if (com.max_sectors_kb_ignore)
+		return;
+
+	if (com.max_sectors_kb_align)
+		set_kb = align_size / 1024; /* bytes to KB */
+	else if (com.max_sectors_kb_num)
+		set_kb = com.max_sectors_kb_num;
+	else
+		return;
+
+	rv = read_sysfs_size(sp->host_id_disk.path, "max_hw_sectors_kb", &hw_kb);
+	if (rv < 0 || !hw_kb) {
+		log_space(sp, "set_lockspace_max_sectors_kb max_hw_sectors_kb unknown %d %u", rv, hw_kb);
+		return;
+	}
+
+	if (hw_kb >= set_kb) {
+		/*
+		 * Tell the kernel to send hardware io's as large as the
+		 * requested size.
+		 */
+		log_space(sp, "set_lockspace_max_sectors_kb hw_kb %u setting %u", hw_kb, set_kb);
+
+		rv = set_max_sectors_kb(&sp->host_id_disk, set_kb);
+		if (rv < 0)
+			log_space(sp, "set_lockspace_max_sectors_kb hw_kb %u set %u error %d", hw_kb, set_kb, rv);
+		return;
+	}
+
+	/*
+	 * The hardware won't support the requested size, try setting 1MB.
+	 */
+	if (hw_kb < ONE_MB_IN_KB) {
+		log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u req_kb %u", hw_kb, set_kb);
+		return;
+	}
+
+	/*
+	 * Here set_kb > hw_kb >= ONE_MB_IN_KB, so a request below 1MB cannot
+	 * reach this point and needs no separate check.
+	 */
+	log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u using 1024", hw_kb);
+
+	rv = set_max_sectors_kb(&sp->host_id_disk, ONE_MB_IN_KB);
+	if (rv < 0)
+		log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u set 1024 error %d", hw_kb, rv);
+}
+
/*
* This thread must not be stopped unless all pids that may be using any
* resources in it are dead/gone. (The USED flag in the lockspace represents
@@ -665,6 +739,8 @@ static void *lockspace_thread(void *arg_in)
sp->align_size = align_size;
sp->max_hosts = max_hosts;
+ set_lockspace_max_sectors_kb(sp, sector_size, align_size);
+
sp->lease_status.renewal_read_buf = malloc(sp->align_size);
if (!sp->lease_status.renewal_read_buf) {
acquire_result = -ENOMEM;
diff --git a/src/main.c b/src/main.c
index 1767328..e4514e7 100644
--- a/src/main.c
+++ b/src/main.c
@@ -23,6 +23,7 @@
#include <sched.h>
#include <pwd.h>
#include <grp.h>
+#include <ctype.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/wait.h>
@@ -2651,6 +2652,30 @@ static void read_config_file(void)
com.debug_io_submit = 1;
if (strstr(str, "complete"))
com.debug_io_complete = 1;
+
+ } else if (!strcmp(str, "max_sectors_kb")) {
+ memset(str, 0, sizeof(str));
+ get_val_str(line, str);
+ if (strstr(str, "ignore")) {
+ com.max_sectors_kb_ignore = 1;
+ com.max_sectors_kb_align = 0;
+ com.max_sectors_kb_num = 0;
+ } else if (strstr(str, "align")) {
+ com.max_sectors_kb_ignore = 0;
+ com.max_sectors_kb_align = 1;
+ com.max_sectors_kb_num = 0;
+ } else if (isdigit(str[0])) {
+ int num = atoi(str);
+ if (!num || (num % 2) || (num > 8192)) {
+ log_error("ignore invalid num max_sectors_kb %s", str);
+ } else {
+ com.max_sectors_kb_ignore = 0;
+ com.max_sectors_kb_align = 0;
+ com.max_sectors_kb_num = num;
+ }
+ } else {
+ log_error("ignore unknown max_sectors_kb %s", str);
+ }
}
}
diff --git a/src/sanlock.8 b/src/sanlock.8
index 7f5f4b7..0bc38ea 100644
--- a/src/sanlock.8
+++ b/src/sanlock.8
@@ -1333,6 +1333,15 @@ Add debug logging for each i/o. "submit" (no quotes) produces debug
output at submission time, "complete" produces debug output at completion
time, and "submit,complete" (no space) produces both.
+.IP \[bu] 2
+max_sectors_kb = <str>|<num>
+.br
+Set to "ignore" (no quotes) to prevent sanlock from checking or
+changing max_sectors_kb for the lockspace disk when starting a lockspace.
+Set to "align" (no quotes) to set max_sectors_kb for the lockspace disk
+to the align size of the lockspace.
+Set to a number to set a specific number of KB for all lockspace disks.
+
.SH SEE ALSO
.BR wdmd (8)
diff --git a/src/sanlock.conf b/src/sanlock.conf
index 9cd867f..7deecd2 100644
--- a/src/sanlock.conf
+++ b/src/sanlock.conf
@@ -52,3 +52,6 @@
#
# debug_io = <str>
# command line: n/a
+#
+# max_sectors_kb = <str>|<num>
+# command line: n/a
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 01b0123..ebf9946 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -330,6 +330,10 @@ EXTERN struct client *client;
#define DEFAULT_QUIET_FAIL 1
#define DEFAULT_RENEWAL_HISTORY_SIZE 180 /* about 1 hour with 20 sec renewal interval */
+#define DEFAULT_MAX_SECTORS_KB_IGNORE 1 /* don't change it */
+#define DEFAULT_MAX_SECTORS_KB_ALIGN 0 /* set it to align size */
+#define DEFAULT_MAX_SECTORS_KB_NUM 0 /* set it to num KB for all lockspaces */
+
struct command_line {
int type; /* COM_ */
int action; /* ACT_ */
@@ -338,6 +342,9 @@ struct command_line {
int debug_io_submit;
int debug_io_complete;
int paxos_debug_all;
+ int max_sectors_kb_ignore;
+ int max_sectors_kb_align;
+ int max_sectors_kb_num;
int quiet_fail;
int wait;
int use_watchdog;
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.