rpms/kernel/devel kernel.spec, 1.1463, 1.1464 linux-2.6-btrfs-experimental-branch.patch, 1.1, 1.2
Josef Bacik
josef at fedoraproject.org
Tue Mar 24 15:52:29 UTC 2009
Author: josef
Update of /cvs/pkgs/rpms/kernel/devel
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv12994
Modified Files:
kernel.spec linux-2.6-btrfs-experimental-branch.patch
Log Message:
added fsync replay fixes to btrfs updates patch
Index: kernel.spec
===================================================================
RCS file: /cvs/pkgs/rpms/kernel/devel/kernel.spec,v
retrieving revision 1.1463
retrieving revision 1.1464
diff -u -r1.1463 -r1.1464
--- kernel.spec 24 Mar 2009 01:31:28 -0000 1.1463
+++ kernel.spec 24 Mar 2009 15:51:57 -0000 1.1464
@@ -1810,6 +1810,9 @@
# and build.
%changelog
+* Tue Mar 24 2009 Josef Bacik <josef at toxicpanda.com>
+- fsync replay fixes for btrfs
+
* Mon Mar 23 2009 Dave Jones <davej at redhat.com>
- 2.6.29
linux-2.6-btrfs-experimental-branch.patch:
Index: linux-2.6-btrfs-experimental-branch.patch
===================================================================
RCS file: /cvs/pkgs/rpms/kernel/devel/linux-2.6-btrfs-experimental-branch.patch,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- linux-2.6-btrfs-experimental-branch.patch 16 Mar 2009 17:30:23 -0000 1.1
+++ linux-2.6-btrfs-experimental-branch.patch 24 Mar 2009 15:51:57 -0000 1.2
@@ -5205,3 +5205,946 @@
out:
if (path)
btrfs_free_path(path);
+diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
+index 72677ce..3af4cfb 100644
+--- a/fs/btrfs/btrfs_inode.h
++++ b/fs/btrfs/btrfs_inode.h
+@@ -86,12 +86,6 @@ struct btrfs_inode {
+ */
+ u64 logged_trans;
+
+- /*
+- * trans that last made a change that should be fully fsync'd. This
+- * gets reset to zero each time the inode is logged
+- */
+- u64 log_dirty_trans;
+-
+ /* total number of bytes pending delalloc, used by stat to calc the
+ * real block usage of the file
+ */
+@@ -121,6 +115,13 @@ struct btrfs_inode {
+ /* the start of block group preferred for allocations. */
+ u64 block_group;
+
++ /* the fsync log has some corner cases that mean we have to check
++ * directories to see if any unlinks have been done before
++ * the directory was logged. See tree-log.c for all the
++ * details
++ */
++ u64 last_unlink_trans;
++
+ struct inode vfs_inode;
+ };
+
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 4ddce91..2737fac 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -695,7 +695,12 @@ struct btrfs_fs_info {
+
+ u64 generation;
+ u64 last_trans_committed;
+- u64 last_trans_new_blockgroup;
++
++ /*
++ * this is updated to the current trans every time a full commit
++ * is required instead of the faster short fsync log commits
++ */
++ u64 last_trans_log_full_commit;
+ u64 open_ioctl_trans;
+ unsigned long mount_opt;
+ u64 max_extent;
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 8933d15..0c482e0 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+
+ extent_root = root->fs_info->extent_root;
+
+- root->fs_info->last_trans_new_blockgroup = trans->transid;
++ root->fs_info->last_trans_log_full_commit = trans->transid;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index f06c275..32d10a6 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1173,8 +1173,11 @@ out_nolock:
+ ret = btrfs_log_dentry_safe(trans, root,
+ file->f_dentry);
+ if (ret == 0) {
+- btrfs_sync_log(trans, root);
+- btrfs_end_transaction(trans, root);
++ ret = btrfs_sync_log(trans, root);
++ if (ret == 0)
++ btrfs_end_transaction(trans, root);
++ else
++ btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_commit_transaction(trans, root);
+ }
+@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ if (ret > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+- btrfs_sync_log(trans, root);
+- ret = btrfs_end_transaction(trans, root);
++ ret = btrfs_sync_log(trans, root);
++ if (ret == 0)
++ ret = btrfs_end_transaction(trans, root);
++ else
++ ret = btrfs_commit_transaction(trans, root);
+ }
+ mutex_lock(&dentry->d_inode->i_mutex);
+ out:
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 9b4faac..bffd79f 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+ inode, dir->i_ino);
+ BUG_ON(ret != 0 && ret != -ENOENT);
+- if (ret != -ENOENT)
+- BTRFS_I(dir)->log_dirty_trans = trans->transid;
+
+ ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+ dir, index);
+@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ trans = btrfs_start_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, dir);
++
++ btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
++
+ ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+
+@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
+ bi->disk_i_size = 0;
+ bi->flags = 0;
+ bi->index_cnt = (u64)-1;
+- bi->log_dirty_trans = 0;
++ bi->last_unlink_trans = 0;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree,
+ inode->i_mapping, GFP_NOFS);
+@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ drop_inode = 1;
+
+ nr = trans->blocks_used;
++
++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ btrfs_end_transaction_throttle(trans, root);
+ fail:
+ if (drop_inode) {
+@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+
+ trans = btrfs_start_transaction(root, 1);
+
++ /*
++ * this is an ugly little race, but the rename is required to make
++ * sure that if we crash, the inode is either at the old name
++ * or the new one. pinning the log transaction lets us make sure
++ * we don't allow a log commit to come in after we unlink the
++ * name but before we add the new name back in.
++ */
++ btrfs_pin_log_trans(root);
++
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ btrfs_inc_nlink(old_dentry->d_inode);
+@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+
++ if (old_dentry->d_parent != new_dentry->d_parent)
++ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
++
+ ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ if (ret)
+ goto out_fail;
+
++ btrfs_log_new_name(trans, old_inode, old_dir,
++ new_dentry->d_parent);
+ out_fail:
++
++ /* this btrfs_end_log_trans just allows the current
++ * log-sub transaction to complete
++ */
++ btrfs_end_log_trans(root);
+ btrfs_end_transaction_throttle(trans, root);
+ out_unlock:
+ return ret;
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 9c462fb..fc9b87a 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -35,6 +35,49 @@
+ #define LOG_INODE_EXISTS 1
+
+ /*
++ * directory trouble cases
++ *
++ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
++ * log, we must force a full commit before doing an fsync of the directory
++ * where the unlink was done.
++ * ---> record transid of last unlink/rename per directory
++ *
++ * mkdir foo/some_dir
++ * normal commit
++ * rename foo/some_dir foo2/some_dir
++ * mkdir foo/some_dir
++ * fsync foo/some_dir/some_file
++ *
++ * The fsync above will unlink the original some_dir without recording
++ * it in its new location (foo2). After a crash, some_dir will be gone
++ * unless the fsync of some_file forces a full commit
++ *
++ * 2) we must log any new names for any file or dir that is in the fsync
++ * log. ---> check inode while renaming/linking.
++ *
++ * 2a) we must log any new names for any file or dir during rename
++ * when the directory they are being removed from was logged.
++ * ---> check inode and old parent dir during rename
++ *
+ * 2a is actually the more important variant. Without the extra logging
+ * a crash might unlink the old name without recreating the new one
++ *
++ * 3) after a crash, we must go through any directories with a link count
++ * of zero and redo the rm -rf
++ *
++ * mkdir f1/foo
++ * normal commit
++ * rm -rf f1/foo
++ * fsync(f1)
++ *
+ * The directory f1/foo was fully removed from the FS, but fsync was never
+ * called on f1/foo, only its parent dir f1. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
++ */
++
++/*
+ * stages for the tree walking. The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+@@ -47,12 +90,17 @@
+ #define LOG_WALK_REPLAY_INODES 1
+ #define LOG_WALK_REPLAY_ALL 2
+
+-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
++static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only);
+ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
++static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_root *log,
++ struct btrfs_path *path,
++ u64 dirid, int del_all);
+
+ /*
+ * tree logging is a special write ahead log used to make sure that
+@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
+ }
+
+ /*
++ * This either makes the current running log transaction wait
++ * until you call btrfs_end_log_trans() or it makes any future
++ * log transactions wait until you call btrfs_end_log_trans()
++ */
++int btrfs_pin_log_trans(struct btrfs_root *root)
++{
++ int ret = -ENOENT;
++
++ mutex_lock(&root->log_mutex);
++ atomic_inc(&root->log_writers);
++ mutex_unlock(&root->log_mutex);
++ return ret;
++}
++
++/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+-static int end_log_trans(struct btrfs_root *root)
++int btrfs_end_log_trans(struct btrfs_root *root)
+ {
+ if (atomic_dec_and_test(&root->log_writers)) {
+ smp_mb();
+@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
+ mutex_lock(&log->fs_info->pinned_mutex);
+ btrfs_update_pinned_extents(log->fs_info->extent_root,
+ eb->start, eb->len, 1);
+- mutex_unlock(&log->fs_info->pinned_mutex);
+ }
+
+ if (btrfs_buffer_uptodate(eb, gen)) {
+@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ BUG_ON(ret);
++
+ ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ BUG_ON(ret);
+ kfree(name);
+@@ -804,6 +867,7 @@ conflict_again:
+ victim_name_len)) {
+ btrfs_inc_nlink(inode);
+ btrfs_release_path(root, path);
++
+ ret = btrfs_unlink_inode(trans, root, dir,
+ inode, victim_name,
+ victim_name_len);
+@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ key.offset--;
+ btrfs_release_path(root, path);
+ }
+- btrfs_free_path(path);
++ btrfs_release_path(root, path);
+ if (nlink != inode->i_nlink) {
+ inode->i_nlink = nlink;
+ btrfs_update_inode(trans, root, inode);
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
++ if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
++ ret = replay_dir_deletes(trans, root, NULL, path,
++ inode->i_ino, 1);
++ BUG_ON(ret);
++ }
++ btrfs_free_path(path);
++
+ return 0;
+ }
+
+@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+
+ iput(inode);
+
+- if (key.offset == 0)
+- break;
+- key.offset--;
++ /*
++ * fixup on a directory may create new entries,
++ * make sure we always look for the highest possible
++ * offset
++ */
++ key.offset = (u64)-1;
+ }
+ btrfs_release_path(root, path);
+ return 0;
+@@ -1313,11 +1387,11 @@ again:
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+ log_di = NULL;
+- if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
++ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ log_di = btrfs_lookup_dir_item(trans, log, log_path,
+ dir_key->objectid,
+ name, name_len, 0);
+- } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
++ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ log_di = btrfs_lookup_dir_index_item(trans, log,
+ log_path,
+ dir_key->objectid,
+@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+- u64 dirid)
++ u64 dirid, int del_all)
+ {
+ u64 range_start;
+ u64 range_end;
+@@ -1408,10 +1482,14 @@ again:
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+- ret = find_dir_range(log, path, dirid, key_type,
+- &range_start, &range_end);
+- if (ret != 0)
+- break;
++ if (del_all)
++ range_end = (u64)-1;
++ else {
++ ret = find_dir_range(log, path, dirid, key_type,
++ &range_start, &range_end);
++ if (ret != 0)
++ break;
++ }
+
+ dir_key.offset = range_start;
+ while (1) {
+@@ -1437,7 +1515,8 @@ again:
+ break;
+
+ ret = check_item_in_log(trans, root, log, path,
+- log_path, dir, &found_key);
++ log_path, dir,
++ &found_key);
+ BUG_ON(ret);
+ if (found_key.offset == (u64)-1)
+ break;
+@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ mode = btrfs_inode_mode(eb, inode_item);
+ if (S_ISDIR(mode)) {
+ ret = replay_dir_deletes(wc->trans,
+- root, log, path, key.objectid);
++ root, log, path, key.objectid, 0);
+ BUG_ON(ret);
+ }
+ ret = overwrite_item(wc->trans, root, path,
+@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ root, inode, inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ BUG_ON(ret);
++
++ /* if the nlink count is zero here, the iput
++ * will free the inode. We bump it to make
++ * sure it doesn't get freed until the link
++ * count fixup is done
++ */
++ if (inode->i_nlink == 0) {
++ btrfs_inc_nlink(inode);
++ btrfs_update_inode(wc->trans,
++ root, inode);
++ }
+ iput(inode);
+ }
+ ret = link_to_fixup_dir(wc->trans, root,
+@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
+ return ret;
+ }
+
+-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
++static int wait_log_commit(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root, unsigned long transid)
+ {
+ DEFINE_WAIT(wait);
+ int index = transid % 2;
+@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+ prepare_to_wait(&root->log_commit_wait[index],
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+- if (root->log_transid < transid + 2 &&
++
++ if (root->fs_info->last_trans_log_full_commit !=
++ trans->transid && root->log_transid < transid + 2 &&
+ atomic_read(&root->log_commit[index]))
+ schedule();
++
+ finish_wait(&root->log_commit_wait[index], &wait);
+ mutex_lock(&root->log_mutex);
+ } while (root->log_transid < transid + 2 &&
+@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+ return 0;
+ }
+
+-static int wait_for_writer(struct btrfs_root *root)
++static int wait_for_writer(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root)
+ {
+ DEFINE_WAIT(wait);
+ while (atomic_read(&root->log_writers)) {
+ prepare_to_wait(&root->log_writer_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+- if (atomic_read(&root->log_writers))
++ if (root->fs_info->last_trans_log_full_commit !=
++ trans->transid && atomic_read(&root->log_writers))
+ schedule();
+ mutex_lock(&root->log_mutex);
+ finish_wait(&root->log_writer_wait, &wait);
+@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
+ /*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it. When this call is done,
+- * you know that any inodes previously logged are safely on disk
++ * you know that any inodes previously logged are safely on disk only
++ * if it returns 0.
++ *
++ * Any other return value means you need to call btrfs_commit_transaction.
++ * Some of the edge cases for fsyncing directories that have had unlinks
++ * or renames done in the past mean that sometimes the only safe
++ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
++ * that has happened.
+ */
+ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ mutex_lock(&root->log_mutex);
+ index1 = root->log_transid % 2;
+ if (atomic_read(&root->log_commit[index1])) {
+- wait_log_commit(root, root->log_transid);
++ wait_log_commit(trans, root, root->log_transid);
+ mutex_unlock(&root->log_mutex);
+ return 0;
+ }
+@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+
+ /* wait for previous tree log sync to complete */
+ if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+- wait_log_commit(root, root->log_transid - 1);
++ wait_log_commit(trans, root, root->log_transid - 1);
+
+ while (1) {
+ unsigned long batch = root->log_batch;
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+- wait_for_writer(root);
++
++ wait_for_writer(trans, root);
+ if (batch == root->log_batch)
+ break;
+ }
+
++ /* bail out if we need to do a full commit */
++ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
++ ret = -EAGAIN;
++ mutex_unlock(&root->log_mutex);
++ goto out;
++ }
++
+ ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+ BUG_ON(ret);
+
+@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+
+ index2 = log_root_tree->log_transid % 2;
+ if (atomic_read(&log_root_tree->log_commit[index2])) {
+- wait_log_commit(log_root_tree, log_root_tree->log_transid);
++ wait_log_commit(trans, log_root_tree,
++ log_root_tree->log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+ atomic_set(&log_root_tree->log_commit[index2], 1);
+
+- if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
+- wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
++ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
++ wait_log_commit(trans, log_root_tree,
++ log_root_tree->log_transid - 1);
++ }
++
++ wait_for_writer(trans, log_root_tree);
+
+- wait_for_writer(log_root_tree);
++ /*
++ * now that we've moved on to the tree of log tree roots,
++ * check the full commit flag again
++ */
++ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
++ mutex_unlock(&log_root_tree->log_mutex);
++ ret = -EAGAIN;
++ goto out_wake_log_root;
++ }
+
+ ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages);
+@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ * in and cause problems either.
+ */
+ write_ctree_super(trans, root->fs_info->tree_root, 2);
++ ret = 0;
+
++out_wake_log_root:
+ atomic_set(&log_root_tree->log_commit[index2], 0);
+ smp_mb();
+ if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+@@ -1998,7 +2124,8 @@ out:
+ return 0;
+ }
+
+-/* * free all the extents used by the tree log. This should be called
++/*
++ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+
+ btrfs_free_path(path);
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+- end_log_trans(root);
++ btrfs_end_log_trans(root);
+
+ return 0;
+ }
+@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+ dirid, &index);
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+- end_log_trans(root);
++ btrfs_end_log_trans(root);
+
+ return ret;
+ }
+@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
+ *
+ * This handles both files and directories.
+ */
+-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
++static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only)
+ {
+@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+ min_key.offset = 0;
+
+ max_key.objectid = inode->i_ino;
++
++ /* today the code can only do partial logging of directories */
++ if (!S_ISDIR(inode->i_mode))
++ inode_only = LOG_INODE_ALL;
++
+ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ else
+ max_key.type = (u8)-1;
+ max_key.offset = (u64)-1;
+
+- /*
+- * if this inode has already been logged and we're in inode_only
+- * mode, we don't want to delete the things that have already
+- * been written to the log.
+- *
+- * But, if the inode has been through an inode_only log,
+- * the logged_trans field is not set. This allows us to catch
+- * any new names for this inode in the backrefs by logging it
+- * again
+- */
+- if (inode_only == LOG_INODE_EXISTS &&
+- BTRFS_I(inode)->logged_trans == trans->transid) {
+- btrfs_free_path(path);
+- btrfs_free_path(dst_path);
+- goto out;
+- }
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ /*
+@@ -2693,7 +2809,6 @@ next_slot:
+ if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, dst_path);
+- BTRFS_I(inode)->log_dirty_trans = 0;
+ ret = log_directory_changes(trans, root, inode, path, dst_path);
+ BUG_ON(ret);
+ }
+@@ -2702,19 +2817,69 @@ next_slot:
+
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+-out:
+ return 0;
+ }
+
+-int btrfs_log_inode(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, struct inode *inode,
+- int inode_only)
++/*
++ * follow the dentry parent pointers up the chain and see if any
++ * of the directories in it require a full commit before they can
++ * be logged. Returns zero if nothing special needs to be done or 1 if
++ * a full commit is required.
++ */
++static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
++ struct inode *inode,
++ struct dentry *parent,
++ struct super_block *sb,
++ u64 last_committed)
+ {
+- int ret;
++ int ret = 0;
++ struct btrfs_root *root;
+
+- start_log_trans(trans, root);
+- ret = __btrfs_log_inode(trans, root, inode, inode_only);
+- end_log_trans(root);
++ /*
++ * for regular files, if its inode is already on disk, we don't
++ * have to worry about the parents at all. This is because
++ * we can use the last_unlink_trans field to record renames
++ * and other fun in this file.
++ */
++ if (S_ISREG(inode->i_mode) &&
++ BTRFS_I(inode)->generation <= last_committed &&
++ BTRFS_I(inode)->last_unlink_trans <= last_committed)
++ goto out;
++
++ if (!S_ISDIR(inode->i_mode)) {
++ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
++ goto out;
++ inode = parent->d_inode;
++ }
++
++ while (1) {
++ BTRFS_I(inode)->logged_trans = trans->transid;
++ smp_mb();
++
++ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
++ root = BTRFS_I(inode)->root;
++
++ /*
++ * make sure any commits to the log are forced
++ * to be full commits
++ */
++ root->fs_info->last_trans_log_full_commit =
++ trans->transid;
++ ret = 1;
++ break;
++ }
++
++ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
++ break;
++
++ if (parent == sb->s_root)
++ break;
++
++ parent = parent->d_parent;
++ inode = parent->d_inode;
++
++ }
++out:
+ return ret;
+ }
+
+@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, struct dentry *dentry)
++int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root, struct inode *inode,
++ struct dentry *parent, int exists_only)
+ {
+- int inode_only = LOG_INODE_ALL;
++ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
+ struct super_block *sb;
+- int ret;
++ int ret = 0;
++ u64 last_committed = root->fs_info->last_trans_committed;
++
++ sb = inode->i_sb;
++
++ if (root->fs_info->last_trans_log_full_commit >
++ root->fs_info->last_trans_committed) {
++ ret = 1;
++ goto end_no_trans;
++ }
++
++ ret = check_parent_dirs_for_sync(trans, inode, parent,
++ sb, last_committed);
++ if (ret)
++ goto end_no_trans;
+
+ start_log_trans(trans, root);
+- sb = dentry->d_inode->i_sb;
+- while (1) {
+- ret = __btrfs_log_inode(trans, root, dentry->d_inode,
+- inode_only);
+- BUG_ON(ret);
+- inode_only = LOG_INODE_EXISTS;
+
+- dentry = dentry->d_parent;
+- if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
++ ret = btrfs_log_inode(trans, root, inode, inode_only);
++ BUG_ON(ret);
++
++ /*
++ * for regular files, if its inode is already on disk, we don't
++ * have to worry about the parents at all. This is because
++ * we can use the last_unlink_trans field to record renames
++ * and other fun in this file.
++ */
++ if (S_ISREG(inode->i_mode) &&
++ BTRFS_I(inode)->generation <= last_committed &&
++ BTRFS_I(inode)->last_unlink_trans <= last_committed)
++ goto no_parent;
++
++ inode_only = LOG_INODE_EXISTS;
++ while (1) {
++ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+- if (BTRFS_I(dentry->d_inode)->generation <=
+- root->fs_info->last_trans_committed)
++ inode = parent->d_inode;
++ if (BTRFS_I(inode)->generation >
++ root->fs_info->last_trans_committed) {
++ ret = btrfs_log_inode(trans, root, inode, inode_only);
++ BUG_ON(ret);
++ }
++ if (parent == sb->s_root)
+ break;
++
++ parent = parent->d_parent;
+ }
+- end_log_trans(root);
+- return 0;
++no_parent:
++ ret = 0;
++ btrfs_end_log_trans(root);
++end_no_trans:
++ return ret;
+ }
+
+ /*
+@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry)
+ {
+- u64 gen;
+- gen = root->fs_info->last_trans_new_blockgroup;
+- if (gen > root->fs_info->last_trans_committed)
+- return 1;
+- else
+- return btrfs_log_dentry(trans, root, dentry);
++ return btrfs_log_inode_parent(trans, root, dentry->d_inode,
++ dentry->d_parent, 0);
+ }
+
+ /*
+@@ -2884,3 +3079,94 @@ again:
+ kfree(log_root_tree);
+ return 0;
+ }
++
++/*
++ * there are some corner cases where we want to force a full
++ * commit instead of allowing a directory to be logged.
++ *
++ * They revolve around files that were unlinked from the directory, and
++ * this function updates the parent directory so that a full commit is
++ * properly done if it is fsync'd later after the unlinks are done.
++ */
++void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
++ struct inode *dir, struct inode *inode,
++ int for_rename)
++{
++ /*
++ * when we're logging a file, if it hasn't been renamed
++ * or unlinked, and its inode is fully committed on disk,
++ * we don't have to worry about walking up the directory chain
++ * to log its parents.
++ *
++ * So, we use the last_unlink_trans field to put this transid
++ * into the file. When the file is logged we check it and
++ * don't log the parents if the file is fully on disk.
++ */
++ if (S_ISREG(inode->i_mode))
++ BTRFS_I(inode)->last_unlink_trans = trans->transid;
++
++ /*
++ * if this directory was already logged any new
++ * names for this file/dir will get recorded
++ */
++ smp_mb();
++ if (BTRFS_I(dir)->logged_trans == trans->transid)
++ return;
++
++ /*
++ * if the inode we're about to unlink was logged,
++ * the log will be properly updated for any new names
++ */
++ if (BTRFS_I(inode)->logged_trans == trans->transid)
++ return;
++
++ /*
++ * when renaming files across directories, if the directory
++ * we're unlinking from gets fsync'd later on, there's
++ * no way to find the destination directory later and fsync it
++ * properly. So, we have to be conservative and force commits
++ * so the new name gets discovered.
++ */
++ if (for_rename)
++ goto record;
++
++ /* we can safely do the unlink without any special recording */
++ return;
++
++record:
++ BTRFS_I(dir)->last_unlink_trans = trans->transid;
++}
++
++/*
++ * Call this after adding a new name for a file and it will properly
++ * update the log to reflect the new name.
++ *
++ * It will return zero if all goes well, and it will return 1 if a
++ * full transaction commit is required.
++ */
++int btrfs_log_new_name(struct btrfs_trans_handle *trans,
++ struct inode *inode, struct inode *old_dir,
++ struct dentry *parent)
++{
++ struct btrfs_root * root = BTRFS_I(inode)->root;
++
++ /*
++ * this will force the logging code to walk the dentry chain
++ * up for the file
++ */
++ if (S_ISREG(inode->i_mode))
++ BTRFS_I(inode)->last_unlink_trans = trans->transid;
++
++ /*
++ * if this inode hasn't been logged and the directory we're renaming it
++ * from hasn't been logged, we don't need to log it
++ */
++ if (BTRFS_I(inode)->logged_trans <=
++ root->fs_info->last_trans_committed &&
++ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
++ root->fs_info->last_trans_committed))
++ return 0;
++
++ return btrfs_log_inode_parent(trans, root, inode, parent, 1);
++}
++
+diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
+index b9409b3..d09c760 100644
+--- a/fs/btrfs/tree-log.h
++++ b/fs/btrfs/tree-log.h
+@@ -22,14 +22,9 @@
+ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, struct dentry *dentry);
+ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry);
+-int btrfs_log_inode(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, struct inode *inode,
+- int inode_only);
+ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid);
++int btrfs_join_running_log_trans(struct btrfs_root *root);
++int btrfs_end_log_trans(struct btrfs_root *root);
++int btrfs_pin_log_trans(struct btrfs_root *root);
++int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root, struct inode *inode,
++ struct dentry *parent, int exists_only);
++void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
++ struct inode *dir, struct inode *inode,
++ int for_rename);
++int btrfs_log_new_name(struct btrfs_trans_handle *trans,
++ struct inode *inode, struct inode *old_dir,
++ struct dentry *parent);
+ #endif
More information about the scm-commits
mailing list