This is an automated email from the git hooks/post-receive script.
andyp pushed a commit to branch andyp-dio-test in repository gfs2-utils.
commit f757540559a9402fa839a96728123f2a1d6f9380 Author: Andrew Price <anprice@redhat.com> AuthorDate: Wed Aug 15 02:09:24 2018 +0100
fsck.gfs2: Use O_DIRECT
This required changes in libgfs2 and fsck.gfs2 to make sure buffers are appropriately aligned. posix_fadvise()-based readahead is also removed.
This adds significant overhead in cases where buffered I/O was previously beneficial. Future improvements should account for this.
Signed-off-by: Andrew Price <anprice@redhat.com> --- gfs2/fsck/initialize.c | 42 +++++++++++-------- gfs2/fsck/metawalk.c | 101 ---------------------------------------------- gfs2/fsck/pass1.c | 10 ----- gfs2/libgfs2/buf.c | 10 +++-- gfs2/libgfs2/fs_ops.c | 4 +- gfs2/libgfs2/structures.c | 20 +++++---- 6 files changed, 45 insertions(+), 142 deletions(-)
diff --git a/gfs2/fsck/initialize.c b/gfs2/fsck/initialize.c index d1c620a..f6eea30 100644 --- a/gfs2/fsck/initialize.c +++ b/gfs2/fsck/initialize.c @@ -1,5 +1,6 @@ #include "clusterautoconfig.h"
+#define _GNU_SOURCE #include <stdio.h> #include <stdint.h> #include <inttypes.h> @@ -135,10 +136,12 @@ static int set_block_ranges(struct gfs2_sbd *sdp) struct osi_node *n, *next = NULL; struct rgrp_tree *rgd; struct gfs2_rindex *ri; - char buf[sdp->sd_sb.sb_bsize]; uint64_t rmax = 0; uint64_t rmin = 0; - int error; + ssize_t bytes; + off_t off; + void *buf; + int err;
log_info( _("Setting block ranges..."));
@@ -165,28 +168,32 @@ static int set_block_ranges(struct gfs2_sbd *sdp) last_data_block = rmax; first_data_block = rmin;
- if (fsck_lseek(sdp->device_fd, (last_fs_block * sdp->sd_sb.sb_bsize))){ - log_crit( _("Can't seek to last block in file system: %llu" - " (0x%llx)\n"), (unsigned long long)last_fs_block, - (unsigned long long)last_fs_block); + err = posix_memalign(&buf, getpagesize(), sdp->sd_sb.sb_bsize); + if (err != 0) { + log_crit(_("Failed to allocate memory for buffer: %s\n"), + strerror(err)); goto fail; } - - memset(buf, 0, sdp->sd_sb.sb_bsize); - error = read(sdp->device_fd, buf, sdp->sd_sb.sb_bsize); - if (error != sdp->sd_sb.sb_bsize){ - log_crit( _("Can't read last block in file system (error %u), " - "last_fs_block: %llu (0x%llx)\n"), error, - (unsigned long long)last_fs_block, - (unsigned long long)last_fs_block); - goto fail; + off = last_fs_block * sdp->sd_sb.sb_bsize; + bytes = pread(sdp->device_fd, buf, sdp->sd_sb.sb_bsize, off); + if (bytes == -1) { + log_crit(_("Error reading last filesystem block (%"PRIu64"): %s\n"), + last_fs_block, strerror(errno)); + goto fail_free; + } + if (bytes != sdp->sd_sb.sb_bsize) { + log_crit(_("Bad read of %ldB for last filesystem block (%"PRIu64")\n"), + (long int)bytes, last_fs_block); + goto fail_free; }
log_info(_("0x%llx to 0x%llx\n"), (unsigned long long)first_data_block, (unsigned long long)last_data_block); + free(buf); return 0; - - fail: +fail_free: + free(buf); +fail: log_info( _("Error\n")); return -1; } @@ -1569,6 +1576,7 @@ int initialize(struct gfs2_sbd *sdp, int force_check, int preen, open_flag = O_RDONLY; else open_flag = O_RDWR | O_EXCL; + open_flag |= O_DIRECT|O_CLOEXEC|O_NOATIME;
sdp->device_fd = open(opts.device, open_flag); if (sdp->device_fd < 0) { diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c index a7780d7..027587a 100644 --- a/gfs2/fsck/metawalk.c +++ b/gfs2/fsck/metawalk.c @@ -683,37 +683,6 @@ bad_leaf: return 1; }
-static int u64cmp(const void *p1, const void *p2) -{ - uint64_t a = *(uint64_t *)p1; - uint64_t b = *(uint64_t *)p2; - - if (a > b) - return 1; - if (a < b) - return -1; - - return 0; -} - -static void dir_leaf_reada(struct gfs2_inode *ip, uint64_t *tbl, unsigned hsize) -{ - uint64_t *t = alloca(hsize * sizeof(uint64_t)); - uint64_t leaf_no; - struct gfs2_sbd *sdp = ip->i_sbd; - unsigned n = 0; - unsigned i; - - for (i = 0; i < hsize; i++) { - leaf_no = be64_to_cpu(tbl[i]); - if (valid_block_ip(ip, leaf_no)) - t[n++] = leaf_no * sdp->bsize; - } - qsort(t, n, sizeof(uint64_t), u64cmp); - for (i = 0; i < n; i++) - posix_fadvise(sdp->device_fd, t[i], sdp->bsize, POSIX_FADV_WILLNEED); -} - /* Checks exhash directory entries */ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) { @@ -738,17 +707,10 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) orig_di_height = ip->i_di.di_height; orig_di_blocks = ip->i_di.di_blocks;
- /* Turn off system readahead */ - posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM); - - /* Readahead */ - dir_leaf_reada(ip, tbl, hsize); - if (pass->check_hash_tbl) { error = pass->check_hash_tbl(ip, tbl, hsize, pass->private); if (error < 0) { free(tbl); - posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); return error; } /* If hash table changes were made, read it in again. */ @@ -785,7 +747,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) (unsigned long long)ip->i_di.di_num.no_addr, (unsigned long long)ip->i_di.di_num.no_addr); free(tbl); - posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); return 1; } lindex = 0; @@ -830,7 +791,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) struct gfs2_leaf leaf; if (fsck_abort) { free(tbl); - posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); return 0; } error = check_leaf(ip, lindex, pass, &leaf_no, &leaf, @@ -879,7 +839,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) lindex += ref_count; } /* for every leaf block */ free(tbl); - posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); return 0; }
@@ -1157,59 +1116,6 @@ static void free_metalist(struct gfs2_inode *ip, osi_list_t *mlp) } }
-static void file_ra(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, - int head_size, int maxptrs, int h) -{ - struct gfs2_sbd *sdp = ip->i_sbd; - uint64_t *p, sblock = 0, block; - int extlen = 0; - - if (h + 2 == ip->i_di.di_height) { - p = (uint64_t *)(bh->b_data + head_size); - if (*p && *(p + 1)) { - sblock = be64_to_cpu(*p); - p++; - block = be64_to_cpu(*p); - extlen = block - sblock; - if (extlen > 1 && extlen <= maxptrs) { - posix_fadvise(sdp->device_fd, - sblock * sdp->bsize, - (extlen + 1) * sdp->bsize, - POSIX_FADV_WILLNEED); - return; - } - } - extlen = 0; - } - for (p = (uint64_t *)(bh->b_data + head_size); - p < (uint64_t *)(bh->b_data + sdp->bsize); p++) { - if (*p) { - if (!sblock) { - sblock = be64_to_cpu(*p); - extlen = 1; - continue; - } - block = be64_to_cpu(*p); - if (block == sblock + extlen) { - extlen++; - continue; - } - } - if (extlen && sblock) { - if (extlen > 1) - extlen--; - posix_fadvise(sdp->device_fd, sblock * sdp->bsize, - extlen * sdp->bsize, - POSIX_FADV_WILLNEED); - extlen = 0; - p--; - } - } - if (extlen) - posix_fadvise(sdp->device_fd, sblock * sdp->bsize, - extlen * sdp->bsize, POSIX_FADV_WILLNEED); -} - /** * build_and_check_metalist - check a bunch of indirect blocks * This includes hash table blocks for directories @@ -1230,7 +1136,6 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp, int h, head_size, iblk_type; uint64_t *ptr, block, *undoptr; int error, was_duplicate, is_valid; - int maxptrs;
osi_list_add(&metabh->b_altlist, &mlp[0]);
@@ -1254,16 +1159,12 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp, iblk_type = GFS2_METATYPE_IN; if (ip->i_sbd->gfs1) { head_size = sizeof(struct gfs_indirect); - maxptrs = (ip->i_sbd->bsize - head_size) / - sizeof(uint64_t); } else { head_size = sizeof(struct gfs2_meta_header); - maxptrs = ip->i_sbd->sd_inptrs; } } else { iblk_type = GFS2_METATYPE_DI; head_size = sizeof(struct gfs2_dinode); - maxptrs = ip->i_sbd->sd_diptrs; } prev_list = &mlp[h - 1]; cur_list = &mlp[h]; @@ -1278,8 +1179,6 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp, continue; }
- if (pass->readahead) - file_ra(ip, bh, head_size, maxptrs, h); /* Now check the metadata itself */ for (ptr = (uint64_t *)(bh->b_data + head_size); (char *)ptr < (bh->b_data + ip->i_sbd->bsize); diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c index 3c2f74f..2788b1b 100644 --- a/gfs2/fsck/pass1.c +++ b/gfs2/fsck/pass1.c @@ -1889,22 +1889,12 @@ static int pass1_process_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, uin uint64_t block; struct gfs2_inode *ip; int q; - /* Readahead numbers arrived at by experiment */ - unsigned rawin = 50; - unsigned ralen = 100 * sdp->bsize; - unsigned r = 0;
for (i = 0; i < n; i++) { int is_inode; uint32_t check_magic;
block = ibuf[i]; - - if (r++ == rawin) { - posix_fadvise(sdp->device_fd, block * sdp->bsize, ralen, POSIX_FADV_WILLNEED); - r = 0; - } - /* skip gfs1 rindex indirect blocks */ if (sdp->gfs1 && blockfind(&gfs1_rindex_blks, block)) { log_debug(_("Skipping rindex indir block " diff --git a/gfs2/libgfs2/buf.c b/gfs2/libgfs2/buf.c index 92cd393..a2af4ff 100644 --- a/gfs2/libgfs2/buf.c +++ b/gfs2/libgfs2/buf.c @@ -26,14 +26,16 @@ struct gfs2_buffer_head *bget(struct gfs2_sbd *sdp, uint64_t num) { struct gfs2_buffer_head *bh; + void *buf;
- bh = calloc(1, sizeof(struct gfs2_buffer_head) + sdp->bsize); - if (bh == NULL) + if (posix_memalign(&buf, getpagesize(), sizeof(*bh) + sdp->bsize)) return NULL; + memset(buf, 0, sizeof(*bh) + sdp->bsize);
+ bh = (struct gfs2_buffer_head *)((char *)buf + sdp->bsize); bh->b_blocknr = num; bh->sdp = sdp; - bh->iov.iov_base = (char *)bh + sizeof(struct gfs2_buffer_head); + bh->iov.iov_base = buf; bh->iov.iov_len = sdp->bsize;
return bh; @@ -106,7 +108,7 @@ int brelse(struct gfs2_buffer_head *bh) bh->b_blocknr = -1; if (bh->b_altlist.next && !osi_list_empty(&bh->b_altlist)) osi_list_del(&bh->b_altlist); - free(bh); + free(bh->iov.iov_base); return error; }
diff --git a/gfs2/libgfs2/fs_ops.c b/gfs2/libgfs2/fs_ops.c index 7e87e43..2c1b1d5 100644 --- a/gfs2/libgfs2/fs_ops.c +++ b/gfs2/libgfs2/fs_ops.c @@ -1500,13 +1500,13 @@ int lgfs2_write_filemeta(struct gfs2_inode *ip) } lgfs2_fill_indir(start, bh->b_data + sdp->bsize, ptr0, ptrs, &p); if (bwrite(bh)) { - free(bh); + free(bh->iov.iov_base); return 1; } } ptr0 += ptrs; } - free(bh); + free(bh->iov.iov_base); return 0; }
diff --git a/gfs2/libgfs2/structures.c b/gfs2/libgfs2/structures.c index c84701d..b857489 100644 --- a/gfs2/libgfs2/structures.c +++ b/gfs2/libgfs2/structures.c @@ -74,12 +74,15 @@ int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize) struct iovec *iov; const size_t sb_addr = GFS2_SB_ADDR * GFS2_BASIC_BLOCK / bsize; const size_t len = sb_addr + 1; + ssize_t bytes; + void *buf;
/* We only need 2 blocks: one for zeroing and a second for the superblock */ - char *buf = calloc(2, bsize); - if (buf == NULL) + if ((err = posix_memalign(&buf, getpagesize(), 2 * bsize))) { + errno = err; return -1; - + } + memset(buf, 0, 2 * bsize); iov = malloc(len * sizeof(*iov)); if (iov == NULL) goto out_buf; @@ -89,10 +92,11 @@ int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize) iov[i].iov_len = bsize; }
- gfs2_sb_out(sb, buf + bsize); - iov[sb_addr].iov_base = buf + bsize; + iov[sb_addr].iov_base = (char *)buf + bsize; + gfs2_sb_out(sb, iov[sb_addr].iov_base);
- if (pwritev(fd, iov, len, 0) < (len * bsize)) + bytes = pwritev(fd, iov, len, 0); + if (bytes < (long)(len * bsize)) goto out_iov;
err = 0; @@ -181,7 +185,7 @@ int lgfs2_write_journal_data(struct gfs2_inode *ip) #endif
if (bwrite(bh)) { - free(bh); + free(bh->iov.iov_base); return -1; }
@@ -190,7 +194,7 @@ int lgfs2_write_journal_data(struct gfs2_inode *ip)
} while (++bh->b_blocknr < jext0 + blocks);
- free(bh); + free(bh->iov.iov_base); return 0; }
cluster-commits@lists.fedorahosted.org