btrfs: use readahead API for scrub
author Arne Jansen <sensille@gmx.net>
Fri, 10 Jun 2011 10:39:23 +0000 (12:39 +0200)
committer Arne Jansen <sensille@gmx.net>
Sun, 2 Oct 2011 06:48:45 +0000 (08:48 +0200)
Scrub uses a simple tree-enumeration to bring the relevant portions
of the extent- and csum-tree into the page cache before starting the
scrub-I/O. This is now replaced by using the new readahead-API.
During readahead the scrub is being accounted as paused, so it won't
hold off transaction commits.

This change raises the average disk bandwidth utilisation on my test
volume from 70% to 90%. On another volume, the time for a test run
went down from 89s to 43s.

Changes v5:
 - reada1/2 are now of type struct reada_control *

Signed-off-by: Arne Jansen <sensille@gmx.net>
fs/btrfs/scrub.c

index a8d03d5..f930f27 100644 (file)
  * any can be found.
  *
  * Future enhancements:
- *  - To enhance the performance, better read-ahead strategies for the
- *    extent-tree can be employed.
  *  - In case an unrepairable extent is encountered, track which files are
  *    affected and report them
  *  - In case of a read error on files with nodatasum, map the file and read
  *    the extent to trigger a writeback of the good copy
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
- *  - make the prefetch cancellable
  */
 
 struct scrub_bio;
@@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        int slot;
        int i;
        u64 nstripes;
-       int start_stripe;
        struct extent_buffer *l;
        struct btrfs_key key;
        u64 physical;
        u64 logical;
        u64 generation;
        u64 mirror_num;
+       struct reada_control *reada1;
+       struct reada_control *reada2;
+       struct btrfs_key key_start;
+       struct btrfs_key key_end;
 
        u64 increment = map->stripe_len;
        u64 offset;
@@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        if (!path)
                return -ENOMEM;
 
-       path->reada = 2;
        path->search_commit_root = 1;
        path->skip_locking = 1;
 
        /*
-        * find all extents for each stripe and just read them to get
-        * them into the page cache
-        * FIXME: we can do better. build a more intelligent prefetching
+        * trigger the readahead for extent tree and csum tree and wait for
+        * completion. During readahead, the scrub is officially paused
+        * to not hold off transaction commits
         */
        logical = base + offset;
-       physical = map->stripes[num].physical;
-       ret = 0;
-       for (i = 0; i < nstripes; ++i) {
-               key.objectid = logical;
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = (u64)0;
-
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0)
-                       goto out_noplug;
-
-               /*
-                * we might miss half an extent here, but that doesn't matter,
-                * as it's only the prefetch
-                */
-               while (1) {
-                       l = path->nodes[0];
-                       slot = path->slots[0];
-                       if (slot >= btrfs_header_nritems(l)) {
-                               ret = btrfs_next_leaf(root, path);
-                               if (ret == 0)
-                                       continue;
-                               if (ret < 0)
-                                       goto out_noplug;
 
-                               break;
-                       }
-                       btrfs_item_key_to_cpu(l, &key, slot);
+       wait_event(sdev->list_wait,
+                  atomic_read(&sdev->in_flight) == 0);
+       atomic_inc(&fs_info->scrubs_paused);
+       wake_up(&fs_info->scrub_pause_wait);
 
-                       if (key.objectid >= logical + map->stripe_len)
-                               break;
+       /* FIXME it might be better to start readahead at commit root */
+       key_start.objectid = logical;
+       key_start.type = BTRFS_EXTENT_ITEM_KEY;
+       key_start.offset = (u64)0;
+       key_end.objectid = base + offset + nstripes * increment;
+       key_end.type = BTRFS_EXTENT_ITEM_KEY;
+       key_end.offset = (u64)0;
+       reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+       key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key_start.type = BTRFS_EXTENT_CSUM_KEY;
+       key_start.offset = logical;
+       key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key_end.type = BTRFS_EXTENT_CSUM_KEY;
+       key_end.offset = base + offset + nstripes * increment;
+       reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+       if (!IS_ERR(reada1))
+               btrfs_reada_wait(reada1);
+       if (!IS_ERR(reada2))
+               btrfs_reada_wait(reada2);
 
-                       path->slots[0]++;
-               }
-               btrfs_release_path(path);
-               logical += increment;
-               physical += map->stripe_len;
-               cond_resched();
+       mutex_lock(&fs_info->scrub_lock);
+       while (atomic_read(&fs_info->scrub_pause_req)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               wait_event(fs_info->scrub_pause_wait,
+                  atomic_read(&fs_info->scrub_pause_req) == 0);
+               mutex_lock(&fs_info->scrub_lock);
        }
+       atomic_dec(&fs_info->scrubs_paused);
+       mutex_unlock(&fs_info->scrub_lock);
+       wake_up(&fs_info->scrub_pause_wait);
 
        /*
         * collect all data csums for the stripe to avoid seeking during
         * the scrub. This might currently (crc32) end up to be about 1MB
         */
-       start_stripe = 0;
        blk_start_plug(&plug);
-again:
-       logical = base + offset + start_stripe * increment;
-       for (i = start_stripe; i < nstripes; ++i) {
-               ret = btrfs_lookup_csums_range(csum_root, logical,
-                                              logical + map->stripe_len - 1,
-                                              &sdev->csum_list, 1);
-               if (ret)
-                       goto out;
 
-               logical += increment;
-               cond_resched();
-       }
        /*
         * now find all extents for each stripe and scrub them
         */
-       logical = base + offset + start_stripe * increment;
-       physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+       logical = base + offset;
+       physical = map->stripes[num].physical;
        ret = 0;
-       for (i = start_stripe; i < nstripes; ++i) {
+       for (i = 0; i < nstripes; ++i) {
                /*
                 * canceled?
                 */
@@ -882,11 +868,14 @@ again:
                        atomic_dec(&fs_info->scrubs_paused);
                        mutex_unlock(&fs_info->scrub_lock);
                        wake_up(&fs_info->scrub_pause_wait);
-                       scrub_free_csums(sdev);
-                       start_stripe = i;
-                       goto again;
                }
 
+               ret = btrfs_lookup_csums_range(csum_root, logical,
+                                              logical + map->stripe_len - 1,
+                                              &sdev->csum_list, 1);
+               if (ret)
+                       goto out;
+
                key.objectid = logical;
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = (u64)0;
@@ -982,7 +971,6 @@ next:
 
 out:
        blk_finish_plug(&plug);
-out_noplug:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
 }