Btrfs: fix extent pinning bugs in the tree log
authorChris Mason <chris.mason@oracle.com>
Tue, 1 Nov 2011 00:52:39 +0000 (20:52 -0400)
committerChris Mason <chris.mason@oracle.com>
Sun, 6 Nov 2011 08:03:48 +0000 (03:03 -0500)
The tree log had two important bugs that could cause corruptions after a
crash.  Sometimes we were allowing tree log blocks to be reused after
the tree log was committed but before the transaction commit was done.

This allowed a future metadata write to overwrite the tree log data.  It
is fixed by adding a new variant of freeing reserved extents that always
pins them.  Credit goes to Stefan Behrens and Arne Jansen for many many
hours spent tracking this bug down.

During tree log replay, we do a pass through the tree log and pin all
the extents we find.  This makes sure the replay code won't go in and
use any of those blocks for new allocations during replay.  The problem
is the free space cache isn't honoring these pinned extents.  So the
allocator can end up handing them out, leading to all kinds of problems
during replay.

The fix here is to force any free space cache to load while we pin the
extents, and then to make sure we remove the pinned extents from the
free space rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Stefan Behrens <sbehrens@giantdisaster.de>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/tree-log.c

index 2276209..f63c9b3 100644 (file)
@@ -2156,6 +2156,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 objectid, u64 offset, u64 bytenr);
@@ -2206,6 +2209,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+                                      u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
index 28c4809..cb76266 100644 (file)
@@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root,
        return 0;
 }
 
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   u64 bytenr, u64 num_bytes)
+{
+       struct btrfs_block_group_cache *cache;
+
+       cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+       BUG_ON(!cache);
+
+       /*
+        * pull in the free space cache (if any) so that our pin
+        * removes the free space from the cache.  We have load_only set
+        * to one because the slow code to read in the free extents does check
+        * the pinned extents.
+        */
+       cache_block_group(cache, trans, root, 1);
+
+       pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+       /* remove us from the free space cache (if we're there at all) */
+       btrfs_remove_free_space(cache, bytenr, num_bytes);
+       btrfs_put_block_group(cache);
+       return 0;
+}
+
 /**
  * btrfs_update_reserved_bytes - update the block_group and space info counters
  * @cache:     The cache we are manipulating
@@ -5487,7 +5515,8 @@ again:
        return ret;
 }
 
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+                                       u64 start, u64 len, int pin)
 {
        struct btrfs_block_group_cache *cache;
        int ret = 0;
@@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
        if (btrfs_test_opt(root, DISCARD))
                ret = btrfs_discard_extent(root, start, len, NULL);
 
-       btrfs_add_free_space(cache, start, len);
-       btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+       if (pin)
+               pin_down_extent(root, cache, start, len, 1);
+       else {
+               btrfs_add_free_space(cache, start, len);
+               btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+       }
        btrfs_put_block_group(cache);
 
        trace_btrfs_reserved_extent_free(root, start, len);
@@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
        return ret;
 }
 
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+                                       u64 start, u64 len)
+{
+       return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+                                      u64 start, u64 len)
+{
+       return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
index 310ab22..8ca1b6b 100644 (file)
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
                              struct walk_control *wc, u64 gen)
 {
        if (wc->pin)
-               btrfs_pin_extent(log->fs_info->extent_root,
-                                eb->start, eb->len, 0);
+               btrfs_pin_extent_for_log_replay(wc->trans,
+                                               log->fs_info->extent_root,
+                                               eb->start, eb->len);
 
        if (btrfs_buffer_uptodate(eb, gen)) {
                if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
                                WARN_ON(root_owner !=
                                        BTRFS_TREE_LOG_OBJECTID);
-                               ret = btrfs_free_reserved_extent(root,
+                               ret = btrfs_free_and_pin_reserved_extent(root,
                                                         bytenr, blocksize);
                                BUG_ON(ret);
                        }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
                                btrfs_tree_unlock(next);
 
                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-                               ret = btrfs_free_reserved_extent(root,
+                               ret = btrfs_free_and_pin_reserved_extent(root,
                                                path->nodes[*level]->start,
                                                path->nodes[*level]->len);
                                BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
                        WARN_ON(log->root_key.objectid !=
                                BTRFS_TREE_LOG_OBJECTID);
-                       ret = btrfs_free_reserved_extent(log, next->start,
+                       ret = btrfs_free_and_pin_reserved_extent(log, next->start,
                                                         next->len);
                        BUG_ON(ret);
                }