btrfs scrub: add fixup code for errors on nodatasum files
[pandora-kernel.git] / fs / btrfs / scrub.c
1 /*
2  * Copyright (C) 2011 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27
28 /*
29  * This is only the first step towards a full-features scrub. It reads all
30  * extent and super block and verifies the checksums. In case a bad checksum
31  * is found or the extent cannot be read, good data will be written back if
32  * any can be found.
33  *
34  * Future enhancements:
35  *  - To enhance the performance, better read-ahead strategies for the
36  *    extent-tree can be employed.
37  *  - In case an unrepairable extent is encountered, track which files are
38  *    affected and report them
39  *  - In case of a read error on files with nodatasum, map the file and read
40  *    the extent to trigger a writeback of the good copy
41  *  - track and record media errors, throw out bad devices
42  *  - add a mode to also read unallocated space
43  *  - make the prefetch cancellable
44  */
45
46 struct scrub_bio;
47 struct scrub_page;
48 struct scrub_dev;
49 static void scrub_bio_end_io(struct bio *bio, int err);
50 static void scrub_checksum(struct btrfs_work *work);
51 static int scrub_checksum_data(struct scrub_dev *sdev,
52                                struct scrub_page *spag, void *buffer);
53 static int scrub_checksum_tree_block(struct scrub_dev *sdev,
54                                      struct scrub_page *spag, u64 logical,
55                                      void *buffer);
56 static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
57 static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
58 static void scrub_fixup_end_io(struct bio *bio, int err);
59 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
60                           struct page *page);
61 static void scrub_fixup(struct scrub_bio *sbio, int ix);
62
63 #define SCRUB_PAGES_PER_BIO     16      /* 64k per bio */
64 #define SCRUB_BIOS_PER_DEV      16      /* 1 MB per device in flight */
65
66 struct scrub_page {
67         u64                     flags;  /* extent flags */
68         u64                     generation;
69         int                     mirror_num;
70         int                     have_csum;
71         u8                      csum[BTRFS_CSUM_SIZE];
72 };
73
74 struct scrub_bio {
75         int                     index;
76         struct scrub_dev        *sdev;
77         struct bio              *bio;
78         int                     err;
79         u64                     logical;
80         u64                     physical;
81         struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
82         u64                     count;
83         int                     next_free;
84         struct btrfs_work       work;
85 };
86
87 struct scrub_dev {
88         struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
89         struct btrfs_device     *dev;
90         int                     first_free;
91         int                     curr;
92         atomic_t                in_flight;
93         atomic_t                fixup_cnt;
94         spinlock_t              list_lock;
95         wait_queue_head_t       list_wait;
96         u16                     csum_size;
97         struct list_head        csum_list;
98         atomic_t                cancel_req;
99         int                     readonly;
100         /*
101          * statistics
102          */
103         struct btrfs_scrub_progress stat;
104         spinlock_t              stat_lock;
105 };
106
107 struct scrub_fixup_nodatasum {
108         struct scrub_dev        *sdev;
109         u64                     logical;
110         struct btrfs_root       *root;
111         struct btrfs_work       work;
112         int                     mirror_num;
113 };
114
115 struct scrub_warning {
116         struct btrfs_path       *path;
117         u64                     extent_item_size;
118         char                    *scratch_buf;
119         char                    *msg_buf;
120         const char              *errstr;
121         sector_t                sector;
122         u64                     logical;
123         struct btrfs_device     *dev;
124         int                     msg_bufsize;
125         int                     scratch_bufsize;
126 };
127
128 static void scrub_free_csums(struct scrub_dev *sdev)
129 {
130         while (!list_empty(&sdev->csum_list)) {
131                 struct btrfs_ordered_sum *sum;
132                 sum = list_first_entry(&sdev->csum_list,
133                                        struct btrfs_ordered_sum, list);
134                 list_del(&sum->list);
135                 kfree(sum);
136         }
137 }
138
139 static void scrub_free_bio(struct bio *bio)
140 {
141         int i;
142         struct page *last_page = NULL;
143
144         if (!bio)
145                 return;
146
147         for (i = 0; i < bio->bi_vcnt; ++i) {
148                 if (bio->bi_io_vec[i].bv_page == last_page)
149                         continue;
150                 last_page = bio->bi_io_vec[i].bv_page;
151                 __free_page(last_page);
152         }
153         bio_put(bio);
154 }
155
156 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
157 {
158         int i;
159
160         if (!sdev)
161                 return;
162
163         for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
164                 struct scrub_bio *sbio = sdev->bios[i];
165
166                 if (!sbio)
167                         break;
168
169                 scrub_free_bio(sbio->bio);
170                 kfree(sbio);
171         }
172
173         scrub_free_csums(sdev);
174         kfree(sdev);
175 }
176
177 static noinline_for_stack
178 struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
179 {
180         struct scrub_dev *sdev;
181         int             i;
182         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
183
184         sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
185         if (!sdev)
186                 goto nomem;
187         sdev->dev = dev;
188         for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
189                 struct scrub_bio *sbio;
190
191                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
192                 if (!sbio)
193                         goto nomem;
194                 sdev->bios[i] = sbio;
195
196                 sbio->index = i;
197                 sbio->sdev = sdev;
198                 sbio->count = 0;
199                 sbio->work.func = scrub_checksum;
200
201                 if (i != SCRUB_BIOS_PER_DEV-1)
202                         sdev->bios[i]->next_free = i + 1;
203                 else
204                         sdev->bios[i]->next_free = -1;
205         }
206         sdev->first_free = 0;
207         sdev->curr = -1;
208         atomic_set(&sdev->in_flight, 0);
209         atomic_set(&sdev->fixup_cnt, 0);
210         atomic_set(&sdev->cancel_req, 0);
211         sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
212         INIT_LIST_HEAD(&sdev->csum_list);
213
214         spin_lock_init(&sdev->list_lock);
215         spin_lock_init(&sdev->stat_lock);
216         init_waitqueue_head(&sdev->list_wait);
217         return sdev;
218
219 nomem:
220         scrub_free_dev(sdev);
221         return ERR_PTR(-ENOMEM);
222 }
223
224 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
225 {
226         u64 isize;
227         u32 nlink;
228         int ret;
229         int i;
230         struct extent_buffer *eb;
231         struct btrfs_inode_item *inode_item;
232         struct scrub_warning *swarn = ctx;
233         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
234         struct inode_fs_paths *ipath = NULL;
235         struct btrfs_root *local_root;
236         struct btrfs_key root_key;
237
238         root_key.objectid = root;
239         root_key.type = BTRFS_ROOT_ITEM_KEY;
240         root_key.offset = (u64)-1;
241         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
242         if (IS_ERR(local_root)) {
243                 ret = PTR_ERR(local_root);
244                 goto err;
245         }
246
247         ret = inode_item_info(inum, 0, local_root, swarn->path);
248         if (ret) {
249                 btrfs_release_path(swarn->path);
250                 goto err;
251         }
252
253         eb = swarn->path->nodes[0];
254         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
255                                         struct btrfs_inode_item);
256         isize = btrfs_inode_size(eb, inode_item);
257         nlink = btrfs_inode_nlink(eb, inode_item);
258         btrfs_release_path(swarn->path);
259
260         ipath = init_ipath(4096, local_root, swarn->path);
261         ret = paths_from_inode(inum, ipath);
262
263         if (ret < 0)
264                 goto err;
265
266         /*
267          * we deliberately ignore the bit ipath might have been too small to
268          * hold all of the paths here
269          */
270         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
271                 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
272                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
273                         "length %llu, links %u (path: %s)\n", swarn->errstr,
274                         swarn->logical, swarn->dev->name,
275                         (unsigned long long)swarn->sector, root, inum, offset,
276                         min(isize - offset, (u64)PAGE_SIZE), nlink,
277                         ipath->fspath->str[i]);
278
279         free_ipath(ipath);
280         return 0;
281
282 err:
283         printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
284                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
285                 "resolving failed with ret=%d\n", swarn->errstr,
286                 swarn->logical, swarn->dev->name,
287                 (unsigned long long)swarn->sector, root, inum, offset, ret);
288
289         free_ipath(ipath);
290         return 0;
291 }
292
293 static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
294                                 int ix)
295 {
296         struct btrfs_device *dev = sbio->sdev->dev;
297         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
298         struct btrfs_path *path;
299         struct btrfs_key found_key;
300         struct extent_buffer *eb;
301         struct btrfs_extent_item *ei;
302         struct scrub_warning swarn;
303         u32 item_size;
304         int ret;
305         u64 ref_root;
306         u8 ref_level;
307         unsigned long ptr = 0;
308         const int bufsize = 4096;
309         u64 extent_offset;
310
311         path = btrfs_alloc_path();
312
313         swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
314         swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
315         swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
316         swarn.logical = sbio->logical + ix * PAGE_SIZE;
317         swarn.errstr = errstr;
318         swarn.dev = dev;
319         swarn.msg_bufsize = bufsize;
320         swarn.scratch_bufsize = bufsize;
321
322         if (!path || !swarn.scratch_buf || !swarn.msg_buf)
323                 goto out;
324
325         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
326         if (ret < 0)
327                 goto out;
328
329         extent_offset = swarn.logical - found_key.objectid;
330         swarn.extent_item_size = found_key.offset;
331
332         eb = path->nodes[0];
333         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
334         item_size = btrfs_item_size_nr(eb, path->slots[0]);
335
336         if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
337                 do {
338                         ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
339                                                         &ref_root, &ref_level);
340                         printk(KERN_WARNING "%s at logical %llu on dev %s, "
341                                 "sector %llu: metadata %s (level %d) in tree "
342                                 "%llu\n", errstr, swarn.logical, dev->name,
343                                 (unsigned long long)swarn.sector,
344                                 ref_level ? "node" : "leaf",
345                                 ret < 0 ? -1 : ref_level,
346                                 ret < 0 ? -1 : ref_root);
347                 } while (ret != 1);
348         } else {
349                 swarn.path = path;
350                 iterate_extent_inodes(fs_info, path, found_key.objectid,
351                                         extent_offset,
352                                         scrub_print_warning_inode, &swarn);
353         }
354
355 out:
356         btrfs_free_path(path);
357         kfree(swarn.scratch_buf);
358         kfree(swarn.msg_buf);
359 }
360
361 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
362 {
363         struct page *page;
364         unsigned long index;
365         struct scrub_fixup_nodatasum *fixup = ctx;
366         int ret;
367         int corrected;
368         struct btrfs_key key;
369         struct inode *inode;
370         u64 end = offset + PAGE_SIZE - 1;
371         struct btrfs_root *local_root;
372
373         key.objectid = root;
374         key.type = BTRFS_ROOT_ITEM_KEY;
375         key.offset = (u64)-1;
376         local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
377         if (IS_ERR(local_root))
378                 return PTR_ERR(local_root);
379
380         key.type = BTRFS_INODE_ITEM_KEY;
381         key.objectid = inum;
382         key.offset = 0;
383         inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
384         if (IS_ERR(inode))
385                 return PTR_ERR(inode);
386
387         ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
388                                 EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS);
389
390         /* set_extent_bit should either succeed or give proper error */
391         WARN_ON(ret > 0);
392         if (ret)
393                 return ret < 0 ? ret : -EFAULT;
394
395         index = offset >> PAGE_CACHE_SHIFT;
396
397         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
398         if (!page)
399                 return -ENOMEM;
400
401         ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
402                                         btrfs_get_extent, fixup->mirror_num);
403         wait_on_page_locked(page);
404         corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end,
405                                         EXTENT_DAMAGED, 0, NULL);
406
407         if (corrected)
408                 WARN_ON(!PageUptodate(page));
409         else
410                 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
411                                         EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS);
412
413         put_page(page);
414         iput(inode);
415
416         if (ret < 0)
417                 return ret;
418
419         if (ret == 0 && corrected) {
420                 /*
421                  * we only need to call readpage for one of the inodes belonging
422                  * to this extent. so make iterate_extent_inodes stop
423                  */
424                 return 1;
425         }
426
427         return -EIO;
428 }
429
430 static void scrub_fixup_nodatasum(struct btrfs_work *work)
431 {
432         int ret;
433         struct scrub_fixup_nodatasum *fixup;
434         struct scrub_dev *sdev;
435         struct btrfs_trans_handle *trans = NULL;
436         struct btrfs_fs_info *fs_info;
437         struct btrfs_path *path;
438         int uncorrectable = 0;
439
440         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
441         sdev = fixup->sdev;
442         fs_info = fixup->root->fs_info;
443
444         path = btrfs_alloc_path();
445         if (!path) {
446                 spin_lock(&sdev->stat_lock);
447                 ++sdev->stat.malloc_errors;
448                 spin_unlock(&sdev->stat_lock);
449                 uncorrectable = 1;
450                 goto out;
451         }
452
453         trans = btrfs_join_transaction(fixup->root);
454         if (IS_ERR(trans)) {
455                 uncorrectable = 1;
456                 goto out;
457         }
458
459         /*
460          * the idea is to trigger a regular read through the standard path. we
461          * read a page from the (failed) logical address by specifying the
462          * corresponding copynum of the failed sector. thus, that readpage is
463          * expected to fail.
464          * that is the point where on-the-fly error correction will kick in
465          * (once it's finished) and rewrite the failed sector if a good copy
466          * can be found.
467          */
468         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
469                                                 path, scrub_fixup_readpage,
470                                                 fixup);
471         if (ret < 0) {
472                 uncorrectable = 1;
473                 goto out;
474         }
475         WARN_ON(ret != 1);
476
477         spin_lock(&sdev->stat_lock);
478         ++sdev->stat.corrected_errors;
479         spin_unlock(&sdev->stat_lock);
480
481 out:
482         if (trans && !IS_ERR(trans))
483                 btrfs_end_transaction(trans, fixup->root);
484         if (uncorrectable) {
485                 spin_lock(&sdev->stat_lock);
486                 ++sdev->stat.uncorrectable_errors;
487                 spin_unlock(&sdev->stat_lock);
488                 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
489                                         "(nodatasum) error at logical %llu\n",
490                                         fixup->logical);
491         }
492
493         btrfs_free_path(path);
494         kfree(fixup);
495
496         /* see caller why we're pretending to be paused in the scrub counters */
497         mutex_lock(&fs_info->scrub_lock);
498         atomic_dec(&fs_info->scrubs_running);
499         atomic_dec(&fs_info->scrubs_paused);
500         mutex_unlock(&fs_info->scrub_lock);
501         atomic_dec(&sdev->fixup_cnt);
502         wake_up(&fs_info->scrub_pause_wait);
503         wake_up(&sdev->list_wait);
504 }
505
506 /*
507  * scrub_recheck_error gets called when either verification of the page
508  * failed or the bio failed to read, e.g. with EIO. In the latter case,
509  * recheck_error gets called for every page in the bio, even though only
510  * one may be bad
511  */
512 static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
513 {
514         struct scrub_dev *sdev = sbio->sdev;
515         u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
516         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
517                                         DEFAULT_RATELIMIT_BURST);
518
519         if (sbio->err) {
520                 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
521                                    sbio->bio->bi_io_vec[ix].bv_page) == 0) {
522                         if (scrub_fixup_check(sbio, ix) == 0)
523                                 return 0;
524                 }
525                 if (__ratelimit(&_rs))
526                         scrub_print_warning("i/o error", sbio, ix);
527         } else {
528                 if (__ratelimit(&_rs))
529                         scrub_print_warning("checksum error", sbio, ix);
530         }
531
532         spin_lock(&sdev->stat_lock);
533         ++sdev->stat.read_errors;
534         spin_unlock(&sdev->stat_lock);
535
536         scrub_fixup(sbio, ix);
537         return 1;
538 }
539
540 static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
541 {
542         int ret = 1;
543         struct page *page;
544         void *buffer;
545         u64 flags = sbio->spag[ix].flags;
546
547         page = sbio->bio->bi_io_vec[ix].bv_page;
548         buffer = kmap_atomic(page, KM_USER0);
549         if (flags & BTRFS_EXTENT_FLAG_DATA) {
550                 ret = scrub_checksum_data(sbio->sdev,
551                                           sbio->spag + ix, buffer);
552         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
553                 ret = scrub_checksum_tree_block(sbio->sdev,
554                                                 sbio->spag + ix,
555                                                 sbio->logical + ix * PAGE_SIZE,
556                                                 buffer);
557         } else {
558                 WARN_ON(1);
559         }
560         kunmap_atomic(buffer, KM_USER0);
561
562         return ret;
563 }
564
565 static void scrub_fixup_end_io(struct bio *bio, int err)
566 {
567         complete((struct completion *)bio->bi_private);
568 }
569
570 static void scrub_fixup(struct scrub_bio *sbio, int ix)
571 {
572         struct scrub_dev *sdev = sbio->sdev;
573         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
574         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
575         struct btrfs_multi_bio *multi = NULL;
576         struct scrub_fixup_nodatasum *fixup;
577         u64 logical = sbio->logical + ix * PAGE_SIZE;
578         u64 length;
579         int i;
580         int ret;
581         DECLARE_COMPLETION_ONSTACK(complete);
582
583         if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
584             (sbio->spag[ix].have_csum == 0)) {
585                 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
586                 if (!fixup)
587                         goto uncorrectable;
588                 fixup->sdev = sdev;
589                 fixup->logical = logical;
590                 fixup->root = fs_info->extent_root;
591                 fixup->mirror_num = sbio->spag[ix].mirror_num;
592                 /*
593                  * increment scrubs_running to prevent cancel requests from
594                  * completing as long as a fixup worker is running. we must also
595                  * increment scrubs_paused to prevent deadlocking on pause
596                  * requests used for transactions commits (as the worker uses a
597                  * transaction context). it is safe to regard the fixup worker
598                  * as paused for all matters practical. effectively, we only
599                  * avoid cancellation requests from completing.
600                  */
601                 mutex_lock(&fs_info->scrub_lock);
602                 atomic_inc(&fs_info->scrubs_running);
603                 atomic_inc(&fs_info->scrubs_paused);
604                 mutex_unlock(&fs_info->scrub_lock);
605                 atomic_inc(&sdev->fixup_cnt);
606                 fixup->work.func = scrub_fixup_nodatasum;
607                 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
608                 return;
609         }
610
611         length = PAGE_SIZE;
612         ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
613                               &multi, 0);
614         if (ret || !multi || length < PAGE_SIZE) {
615                 printk(KERN_ERR
616                        "scrub_fixup: btrfs_map_block failed us for %llu\n",
617                        (unsigned long long)logical);
618                 WARN_ON(1);
619                 return;
620         }
621
622         if (multi->num_stripes == 1)
623                 /* there aren't any replicas */
624                 goto uncorrectable;
625
626         /*
627          * first find a good copy
628          */
629         for (i = 0; i < multi->num_stripes; ++i) {
630                 if (i + 1 == sbio->spag[ix].mirror_num)
631                         continue;
632
633                 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
634                                    multi->stripes[i].physical >> 9,
635                                    sbio->bio->bi_io_vec[ix].bv_page)) {
636                         /* I/O-error, this is not a good copy */
637                         continue;
638                 }
639
640                 if (scrub_fixup_check(sbio, ix) == 0)
641                         break;
642         }
643         if (i == multi->num_stripes)
644                 goto uncorrectable;
645
646         if (!sdev->readonly) {
647                 /*
648                  * bi_io_vec[ix].bv_page now contains good data, write it back
649                  */
650                 if (scrub_fixup_io(WRITE, sdev->dev->bdev,
651                                    (sbio->physical + ix * PAGE_SIZE) >> 9,
652                                    sbio->bio->bi_io_vec[ix].bv_page)) {
653                         /* I/O-error, writeback failed, give up */
654                         goto uncorrectable;
655                 }
656         }
657
658         kfree(multi);
659         spin_lock(&sdev->stat_lock);
660         ++sdev->stat.corrected_errors;
661         spin_unlock(&sdev->stat_lock);
662
663         printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
664                                (unsigned long long)logical);
665         return;
666
667 uncorrectable:
668         kfree(multi);
669         spin_lock(&sdev->stat_lock);
670         ++sdev->stat.uncorrectable_errors;
671         spin_unlock(&sdev->stat_lock);
672
673         printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
674                                 "logical %llu\n", (unsigned long long)logical);
675 }
676
677 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
678                          struct page *page)
679 {
680         struct bio *bio = NULL;
681         int ret;
682         DECLARE_COMPLETION_ONSTACK(complete);
683
684         bio = bio_alloc(GFP_NOFS, 1);
685         bio->bi_bdev = bdev;
686         bio->bi_sector = sector;
687         bio_add_page(bio, page, PAGE_SIZE, 0);
688         bio->bi_end_io = scrub_fixup_end_io;
689         bio->bi_private = &complete;
690         submit_bio(rw, bio);
691
692         /* this will also unplug the queue */
693         wait_for_completion(&complete);
694
695         ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
696         bio_put(bio);
697         return ret;
698 }
699
700 static void scrub_bio_end_io(struct bio *bio, int err)
701 {
702         struct scrub_bio *sbio = bio->bi_private;
703         struct scrub_dev *sdev = sbio->sdev;
704         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
705
706         sbio->err = err;
707         sbio->bio = bio;
708
709         btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
710 }
711
712 static void scrub_checksum(struct btrfs_work *work)
713 {
714         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
715         struct scrub_dev *sdev = sbio->sdev;
716         struct page *page;
717         void *buffer;
718         int i;
719         u64 flags;
720         u64 logical;
721         int ret;
722
723         if (sbio->err) {
724                 ret = 0;
725                 for (i = 0; i < sbio->count; ++i)
726                         ret |= scrub_recheck_error(sbio, i);
727                 if (!ret) {
728                         spin_lock(&sdev->stat_lock);
729                         ++sdev->stat.unverified_errors;
730                         spin_unlock(&sdev->stat_lock);
731                 }
732
733                 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
734                 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
735                 sbio->bio->bi_phys_segments = 0;
736                 sbio->bio->bi_idx = 0;
737
738                 for (i = 0; i < sbio->count; i++) {
739                         struct bio_vec *bi;
740                         bi = &sbio->bio->bi_io_vec[i];
741                         bi->bv_offset = 0;
742                         bi->bv_len = PAGE_SIZE;
743                 }
744                 goto out;
745         }
746         for (i = 0; i < sbio->count; ++i) {
747                 page = sbio->bio->bi_io_vec[i].bv_page;
748                 buffer = kmap_atomic(page, KM_USER0);
749                 flags = sbio->spag[i].flags;
750                 logical = sbio->logical + i * PAGE_SIZE;
751                 ret = 0;
752                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
753                         ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
754                 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
755                         ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
756                                                         logical, buffer);
757                 } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
758                         BUG_ON(i);
759                         (void)scrub_checksum_super(sbio, buffer);
760                 } else {
761                         WARN_ON(1);
762                 }
763                 kunmap_atomic(buffer, KM_USER0);
764                 if (ret) {
765                         ret = scrub_recheck_error(sbio, i);
766                         if (!ret) {
767                                 spin_lock(&sdev->stat_lock);
768                                 ++sdev->stat.unverified_errors;
769                                 spin_unlock(&sdev->stat_lock);
770                         }
771                 }
772         }
773
774 out:
775         scrub_free_bio(sbio->bio);
776         sbio->bio = NULL;
777         spin_lock(&sdev->list_lock);
778         sbio->next_free = sdev->first_free;
779         sdev->first_free = sbio->index;
780         spin_unlock(&sdev->list_lock);
781         atomic_dec(&sdev->in_flight);
782         wake_up(&sdev->list_wait);
783 }
784
785 static int scrub_checksum_data(struct scrub_dev *sdev,
786                                struct scrub_page *spag, void *buffer)
787 {
788         u8 csum[BTRFS_CSUM_SIZE];
789         u32 crc = ~(u32)0;
790         int fail = 0;
791         struct btrfs_root *root = sdev->dev->dev_root;
792
793         if (!spag->have_csum)
794                 return 0;
795
796         crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
797         btrfs_csum_final(crc, csum);
798         if (memcmp(csum, spag->csum, sdev->csum_size))
799                 fail = 1;
800
801         spin_lock(&sdev->stat_lock);
802         ++sdev->stat.data_extents_scrubbed;
803         sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
804         if (fail)
805                 ++sdev->stat.csum_errors;
806         spin_unlock(&sdev->stat_lock);
807
808         return fail;
809 }
810
811 static int scrub_checksum_tree_block(struct scrub_dev *sdev,
812                                      struct scrub_page *spag, u64 logical,
813                                      void *buffer)
814 {
815         struct btrfs_header *h;
816         struct btrfs_root *root = sdev->dev->dev_root;
817         struct btrfs_fs_info *fs_info = root->fs_info;
818         u8 csum[BTRFS_CSUM_SIZE];
819         u32 crc = ~(u32)0;
820         int fail = 0;
821         int crc_fail = 0;
822
823         /*
824          * we don't use the getter functions here, as we
825          * a) don't have an extent buffer and
826          * b) the page is already kmapped
827          */
828         h = (struct btrfs_header *)buffer;
829
830         if (logical != le64_to_cpu(h->bytenr))
831                 ++fail;
832
833         if (spag->generation != le64_to_cpu(h->generation))
834                 ++fail;
835
836         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
837                 ++fail;
838
839         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
840                    BTRFS_UUID_SIZE))
841                 ++fail;
842
843         crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
844                               PAGE_SIZE - BTRFS_CSUM_SIZE);
845         btrfs_csum_final(crc, csum);
846         if (memcmp(csum, h->csum, sdev->csum_size))
847                 ++crc_fail;
848
849         spin_lock(&sdev->stat_lock);
850         ++sdev->stat.tree_extents_scrubbed;
851         sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
852         if (crc_fail)
853                 ++sdev->stat.csum_errors;
854         if (fail)
855                 ++sdev->stat.verify_errors;
856         spin_unlock(&sdev->stat_lock);
857
858         return fail || crc_fail;
859 }
860
861 static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
862 {
863         struct btrfs_super_block *s;
864         u64 logical;
865         struct scrub_dev *sdev = sbio->sdev;
866         struct btrfs_root *root = sdev->dev->dev_root;
867         struct btrfs_fs_info *fs_info = root->fs_info;
868         u8 csum[BTRFS_CSUM_SIZE];
869         u32 crc = ~(u32)0;
870         int fail = 0;
871
872         s = (struct btrfs_super_block *)buffer;
873         logical = sbio->logical;
874
875         if (logical != le64_to_cpu(s->bytenr))
876                 ++fail;
877
878         if (sbio->spag[0].generation != le64_to_cpu(s->generation))
879                 ++fail;
880
881         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
882                 ++fail;
883
884         crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
885                               PAGE_SIZE - BTRFS_CSUM_SIZE);
886         btrfs_csum_final(crc, csum);
887         if (memcmp(csum, s->csum, sbio->sdev->csum_size))
888                 ++fail;
889
890         if (fail) {
891                 /*
892                  * if we find an error in a super block, we just report it.
893                  * They will get written with the next transaction commit
894                  * anyway
895                  */
896                 spin_lock(&sdev->stat_lock);
897                 ++sdev->stat.super_errors;
898                 spin_unlock(&sdev->stat_lock);
899         }
900
901         return fail;
902 }
903
904 static int scrub_submit(struct scrub_dev *sdev)
905 {
906         struct scrub_bio *sbio;
907         struct bio *bio;
908         int i;
909
910         if (sdev->curr == -1)
911                 return 0;
912
913         sbio = sdev->bios[sdev->curr];
914
915         bio = bio_alloc(GFP_NOFS, sbio->count);
916         if (!bio)
917                 goto nomem;
918
919         bio->bi_private = sbio;
920         bio->bi_end_io = scrub_bio_end_io;
921         bio->bi_bdev = sdev->dev->bdev;
922         bio->bi_sector = sbio->physical >> 9;
923
924         for (i = 0; i < sbio->count; ++i) {
925                 struct page *page;
926                 int ret;
927
928                 page = alloc_page(GFP_NOFS);
929                 if (!page)
930                         goto nomem;
931
932                 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
933                 if (!ret) {
934                         __free_page(page);
935                         goto nomem;
936                 }
937         }
938
939         sbio->err = 0;
940         sdev->curr = -1;
941         atomic_inc(&sdev->in_flight);
942
943         submit_bio(READ, bio);
944
945         return 0;
946
947 nomem:
948         scrub_free_bio(bio);
949
950         return -ENOMEM;
951 }
952
953 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
954                       u64 physical, u64 flags, u64 gen, int mirror_num,
955                       u8 *csum, int force)
956 {
957         struct scrub_bio *sbio;
958
959 again:
960         /*
961          * grab a fresh bio or wait for one to become available
962          */
963         while (sdev->curr == -1) {
964                 spin_lock(&sdev->list_lock);
965                 sdev->curr = sdev->first_free;
966                 if (sdev->curr != -1) {
967                         sdev->first_free = sdev->bios[sdev->curr]->next_free;
968                         sdev->bios[sdev->curr]->next_free = -1;
969                         sdev->bios[sdev->curr]->count = 0;
970                         spin_unlock(&sdev->list_lock);
971                 } else {
972                         spin_unlock(&sdev->list_lock);
973                         wait_event(sdev->list_wait, sdev->first_free != -1);
974                 }
975         }
976         sbio = sdev->bios[sdev->curr];
977         if (sbio->count == 0) {
978                 sbio->physical = physical;
979                 sbio->logical = logical;
980         } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
981                    sbio->logical + sbio->count * PAGE_SIZE != logical) {
982                 int ret;
983
984                 ret = scrub_submit(sdev);
985                 if (ret)
986                         return ret;
987                 goto again;
988         }
989         sbio->spag[sbio->count].flags = flags;
990         sbio->spag[sbio->count].generation = gen;
991         sbio->spag[sbio->count].have_csum = 0;
992         sbio->spag[sbio->count].mirror_num = mirror_num;
993         if (csum) {
994                 sbio->spag[sbio->count].have_csum = 1;
995                 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
996         }
997         ++sbio->count;
998         if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
999                 int ret;
1000
1001                 ret = scrub_submit(sdev);
1002                 if (ret)
1003                         return ret;
1004         }
1005
1006         return 0;
1007 }
1008
1009 static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1010                            u8 *csum)
1011 {
1012         struct btrfs_ordered_sum *sum = NULL;
1013         int ret = 0;
1014         unsigned long i;
1015         unsigned long num_sectors;
1016         u32 sectorsize = sdev->dev->dev_root->sectorsize;
1017
1018         while (!list_empty(&sdev->csum_list)) {
1019                 sum = list_first_entry(&sdev->csum_list,
1020                                        struct btrfs_ordered_sum, list);
1021                 if (sum->bytenr > logical)
1022                         return 0;
1023                 if (sum->bytenr + sum->len > logical)
1024                         break;
1025
1026                 ++sdev->stat.csum_discards;
1027                 list_del(&sum->list);
1028                 kfree(sum);
1029                 sum = NULL;
1030         }
1031         if (!sum)
1032                 return 0;
1033
1034         num_sectors = sum->len / sectorsize;
1035         for (i = 0; i < num_sectors; ++i) {
1036                 if (sum->sums[i].bytenr == logical) {
1037                         memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
1038                         ret = 1;
1039                         break;
1040                 }
1041         }
1042         if (ret && i == num_sectors - 1) {
1043                 list_del(&sum->list);
1044                 kfree(sum);
1045         }
1046         return ret;
1047 }
1048
1049 /* scrub extent tries to collect up to 64 kB for each bio */
1050 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1051                         u64 physical, u64 flags, u64 gen, int mirror_num)
1052 {
1053         int ret;
1054         u8 csum[BTRFS_CSUM_SIZE];
1055
1056         while (len) {
1057                 u64 l = min_t(u64, len, PAGE_SIZE);
1058                 int have_csum = 0;
1059
1060                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1061                         /* push csums to sbio */
1062                         have_csum = scrub_find_csum(sdev, logical, l, csum);
1063                         if (have_csum == 0)
1064                                 ++sdev->stat.no_csum;
1065                 }
1066                 ret = scrub_page(sdev, logical, l, physical, flags, gen,
1067                                  mirror_num, have_csum ? csum : NULL, 0);
1068                 if (ret)
1069                         return ret;
1070                 len -= l;
1071                 logical += l;
1072                 physical += l;
1073         }
1074         return 0;
1075 }
1076
1077 static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1078         struct map_lookup *map, int num, u64 base, u64 length)
1079 {
1080         struct btrfs_path *path;
1081         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1082         struct btrfs_root *root = fs_info->extent_root;
1083         struct btrfs_root *csum_root = fs_info->csum_root;
1084         struct btrfs_extent_item *extent;
1085         struct blk_plug plug;
1086         u64 flags;
1087         int ret;
1088         int slot;
1089         int i;
1090         u64 nstripes;
1091         int start_stripe;
1092         struct extent_buffer *l;
1093         struct btrfs_key key;
1094         u64 physical;
1095         u64 logical;
1096         u64 generation;
1097         int mirror_num;
1098
1099         u64 increment = map->stripe_len;
1100         u64 offset;
1101
1102         nstripes = length;
1103         offset = 0;
1104         do_div(nstripes, map->stripe_len);
1105         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1106                 offset = map->stripe_len * num;
1107                 increment = map->stripe_len * map->num_stripes;
1108                 mirror_num = 1;
1109         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1110                 int factor = map->num_stripes / map->sub_stripes;
1111                 offset = map->stripe_len * (num / map->sub_stripes);
1112                 increment = map->stripe_len * factor;
1113                 mirror_num = num % map->sub_stripes + 1;
1114         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1115                 increment = map->stripe_len;
1116                 mirror_num = num % map->num_stripes + 1;
1117         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
1118                 increment = map->stripe_len;
1119                 mirror_num = num % map->num_stripes + 1;
1120         } else {
1121                 increment = map->stripe_len;
1122                 mirror_num = 1;
1123         }
1124
1125         path = btrfs_alloc_path();
1126         if (!path)
1127                 return -ENOMEM;
1128
1129         path->reada = 2;
1130         path->search_commit_root = 1;
1131         path->skip_locking = 1;
1132
1133         /*
1134          * find all extents for each stripe and just read them to get
1135          * them into the page cache
1136          * FIXME: we can do better. build a more intelligent prefetching
1137          */
1138         logical = base + offset;
1139         physical = map->stripes[num].physical;
1140         ret = 0;
1141         for (i = 0; i < nstripes; ++i) {
1142                 key.objectid = logical;
1143                 key.type = BTRFS_EXTENT_ITEM_KEY;
1144                 key.offset = (u64)0;
1145
1146                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1147                 if (ret < 0)
1148                         goto out_noplug;
1149
1150                 /*
1151                  * we might miss half an extent here, but that doesn't matter,
1152                  * as it's only the prefetch
1153                  */
1154                 while (1) {
1155                         l = path->nodes[0];
1156                         slot = path->slots[0];
1157                         if (slot >= btrfs_header_nritems(l)) {
1158                                 ret = btrfs_next_leaf(root, path);
1159                                 if (ret == 0)
1160                                         continue;
1161                                 if (ret < 0)
1162                                         goto out_noplug;
1163
1164                                 break;
1165                         }
1166                         btrfs_item_key_to_cpu(l, &key, slot);
1167
1168                         if (key.objectid >= logical + map->stripe_len)
1169                                 break;
1170
1171                         path->slots[0]++;
1172                 }
1173                 btrfs_release_path(path);
1174                 logical += increment;
1175                 physical += map->stripe_len;
1176                 cond_resched();
1177         }
1178
1179         /*
1180          * collect all data csums for the stripe to avoid seeking during
1181          * the scrub. This might currently (crc32) end up to be about 1MB
1182          */
1183         start_stripe = 0;
1184         blk_start_plug(&plug);
1185 again:
1186         logical = base + offset + start_stripe * increment;
1187         for (i = start_stripe; i < nstripes; ++i) {
1188                 ret = btrfs_lookup_csums_range(csum_root, logical,
1189                                                logical + map->stripe_len - 1,
1190                                                &sdev->csum_list, 1);
1191                 if (ret)
1192                         goto out;
1193
1194                 logical += increment;
1195                 cond_resched();
1196         }
1197         /*
1198          * now find all extents for each stripe and scrub them
1199          */
1200         logical = base + offset + start_stripe * increment;
1201         physical = map->stripes[num].physical + start_stripe * map->stripe_len;
1202         ret = 0;
1203         for (i = start_stripe; i < nstripes; ++i) {
1204                 /*
1205                  * canceled?
1206                  */
1207                 if (atomic_read(&fs_info->scrub_cancel_req) ||
1208                     atomic_read(&sdev->cancel_req)) {
1209                         ret = -ECANCELED;
1210                         goto out;
1211                 }
1212                 /*
1213                  * check to see if we have to pause
1214                  */
1215                 if (atomic_read(&fs_info->scrub_pause_req)) {
1216                         /* push queued extents */
1217                         scrub_submit(sdev);
1218                         wait_event(sdev->list_wait,
1219                                    atomic_read(&sdev->in_flight) == 0);
1220                         atomic_inc(&fs_info->scrubs_paused);
1221                         wake_up(&fs_info->scrub_pause_wait);
1222                         mutex_lock(&fs_info->scrub_lock);
1223                         while (atomic_read(&fs_info->scrub_pause_req)) {
1224                                 mutex_unlock(&fs_info->scrub_lock);
1225                                 wait_event(fs_info->scrub_pause_wait,
1226                                    atomic_read(&fs_info->scrub_pause_req) == 0);
1227                                 mutex_lock(&fs_info->scrub_lock);
1228                         }
1229                         atomic_dec(&fs_info->scrubs_paused);
1230                         mutex_unlock(&fs_info->scrub_lock);
1231                         wake_up(&fs_info->scrub_pause_wait);
1232                         scrub_free_csums(sdev);
1233                         start_stripe = i;
1234                         goto again;
1235                 }
1236
1237                 key.objectid = logical;
1238                 key.type = BTRFS_EXTENT_ITEM_KEY;
1239                 key.offset = (u64)0;
1240
1241                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1242                 if (ret < 0)
1243                         goto out;
1244                 if (ret > 0) {
1245                         ret = btrfs_previous_item(root, path, 0,
1246                                                   BTRFS_EXTENT_ITEM_KEY);
1247                         if (ret < 0)
1248                                 goto out;
1249                         if (ret > 0) {
1250                                 /* there's no smaller item, so stick with the
1251                                  * larger one */
1252                                 btrfs_release_path(path);
1253                                 ret = btrfs_search_slot(NULL, root, &key,
1254                                                         path, 0, 0);
1255                                 if (ret < 0)
1256                                         goto out;
1257                         }
1258                 }
1259
1260                 while (1) {
1261                         l = path->nodes[0];
1262                         slot = path->slots[0];
1263                         if (slot >= btrfs_header_nritems(l)) {
1264                                 ret = btrfs_next_leaf(root, path);
1265                                 if (ret == 0)
1266                                         continue;
1267                                 if (ret < 0)
1268                                         goto out;
1269
1270                                 break;
1271                         }
1272                         btrfs_item_key_to_cpu(l, &key, slot);
1273
1274                         if (key.objectid + key.offset <= logical)
1275                                 goto next;
1276
1277                         if (key.objectid >= logical + map->stripe_len)
1278                                 break;
1279
1280                         if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
1281                                 goto next;
1282
1283                         extent = btrfs_item_ptr(l, slot,
1284                                                 struct btrfs_extent_item);
1285                         flags = btrfs_extent_flags(l, extent);
1286                         generation = btrfs_extent_generation(l, extent);
1287
1288                         if (key.objectid < logical &&
1289                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
1290                                 printk(KERN_ERR
1291                                        "btrfs scrub: tree block %llu spanning "
1292                                        "stripes, ignored. logical=%llu\n",
1293                                        (unsigned long long)key.objectid,
1294                                        (unsigned long long)logical);
1295                                 goto next;
1296                         }
1297
1298                         /*
1299                          * trim extent to this stripe
1300                          */
1301                         if (key.objectid < logical) {
1302                                 key.offset -= logical - key.objectid;
1303                                 key.objectid = logical;
1304                         }
1305                         if (key.objectid + key.offset >
1306                             logical + map->stripe_len) {
1307                                 key.offset = logical + map->stripe_len -
1308                                              key.objectid;
1309                         }
1310
1311                         ret = scrub_extent(sdev, key.objectid, key.offset,
1312                                            key.objectid - logical + physical,
1313                                            flags, generation, mirror_num);
1314                         if (ret)
1315                                 goto out;
1316
1317 next:
1318                         path->slots[0]++;
1319                 }
1320                 btrfs_release_path(path);
1321                 logical += increment;
1322                 physical += map->stripe_len;
1323                 spin_lock(&sdev->stat_lock);
1324                 sdev->stat.last_physical = physical;
1325                 spin_unlock(&sdev->stat_lock);
1326         }
1327         /* push queued extents */
1328         scrub_submit(sdev);
1329
1330 out:
1331         blk_finish_plug(&plug);
1332 out_noplug:
1333         btrfs_free_path(path);
1334         return ret < 0 ? ret : 0;
1335 }
1336
1337 static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
1338         u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
1339 {
1340         struct btrfs_mapping_tree *map_tree =
1341                 &sdev->dev->dev_root->fs_info->mapping_tree;
1342         struct map_lookup *map;
1343         struct extent_map *em;
1344         int i;
1345         int ret = -EINVAL;
1346
1347         read_lock(&map_tree->map_tree.lock);
1348         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
1349         read_unlock(&map_tree->map_tree.lock);
1350
1351         if (!em)
1352                 return -EINVAL;
1353
1354         map = (struct map_lookup *)em->bdev;
1355         if (em->start != chunk_offset)
1356                 goto out;
1357
1358         if (em->len < length)
1359                 goto out;
1360
1361         for (i = 0; i < map->num_stripes; ++i) {
1362                 if (map->stripes[i].dev == sdev->dev) {
1363                         ret = scrub_stripe(sdev, map, i, chunk_offset, length);
1364                         if (ret)
1365                                 goto out;
1366                 }
1367         }
1368 out:
1369         free_extent_map(em);
1370
1371         return ret;
1372 }
1373
1374 static noinline_for_stack
1375 int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
1376 {
1377         struct btrfs_dev_extent *dev_extent = NULL;
1378         struct btrfs_path *path;
1379         struct btrfs_root *root = sdev->dev->dev_root;
1380         struct btrfs_fs_info *fs_info = root->fs_info;
1381         u64 length;
1382         u64 chunk_tree;
1383         u64 chunk_objectid;
1384         u64 chunk_offset;
1385         int ret;
1386         int slot;
1387         struct extent_buffer *l;
1388         struct btrfs_key key;
1389         struct btrfs_key found_key;
1390         struct btrfs_block_group_cache *cache;
1391
1392         path = btrfs_alloc_path();
1393         if (!path)
1394                 return -ENOMEM;
1395
1396         path->reada = 2;
1397         path->search_commit_root = 1;
1398         path->skip_locking = 1;
1399
1400         key.objectid = sdev->dev->devid;
1401         key.offset = 0ull;
1402         key.type = BTRFS_DEV_EXTENT_KEY;
1403
1404
1405         while (1) {
1406                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1407                 if (ret < 0)
1408                         break;
1409                 if (ret > 0) {
1410                         if (path->slots[0] >=
1411                             btrfs_header_nritems(path->nodes[0])) {
1412                                 ret = btrfs_next_leaf(root, path);
1413                                 if (ret)
1414                                         break;
1415                         }
1416                 }
1417
1418                 l = path->nodes[0];
1419                 slot = path->slots[0];
1420
1421                 btrfs_item_key_to_cpu(l, &found_key, slot);
1422
1423                 if (found_key.objectid != sdev->dev->devid)
1424                         break;
1425
1426                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
1427                         break;
1428
1429                 if (found_key.offset >= end)
1430                         break;
1431
1432                 if (found_key.offset < key.offset)
1433                         break;
1434
1435                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1436                 length = btrfs_dev_extent_length(l, dev_extent);
1437
1438                 if (found_key.offset + length <= start) {
1439                         key.offset = found_key.offset + length;
1440                         btrfs_release_path(path);
1441                         continue;
1442                 }
1443
1444                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1445                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1446                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1447
1448                 /*
1449                  * get a reference on the corresponding block group to prevent
1450                  * the chunk from going away while we scrub it
1451                  */
1452                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
1453                 if (!cache) {
1454                         ret = -ENOENT;
1455                         break;
1456                 }
1457                 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
1458                                   chunk_offset, length);
1459                 btrfs_put_block_group(cache);
1460                 if (ret)
1461                         break;
1462
1463                 key.offset = found_key.offset + length;
1464                 btrfs_release_path(path);
1465         }
1466
1467         btrfs_free_path(path);
1468
1469         /*
1470          * ret can still be 1 from search_slot or next_leaf,
1471          * that's not an error
1472          */
1473         return ret < 0 ? ret : 0;
1474 }
1475
1476 static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1477 {
1478         int     i;
1479         u64     bytenr;
1480         u64     gen;
1481         int     ret;
1482         struct btrfs_device *device = sdev->dev;
1483         struct btrfs_root *root = device->dev_root;
1484
1485         gen = root->fs_info->last_trans_committed;
1486
1487         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1488                 bytenr = btrfs_sb_offset(i);
1489                 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1490                         break;
1491
1492                 ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
1493                                  BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
1494                 if (ret)
1495                         return ret;
1496         }
1497         wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1498
1499         return 0;
1500 }
1501
1502 /*
1503  * get a reference count on fs_info->scrub_workers. start worker if necessary
1504  */
1505 static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1506 {
1507         struct btrfs_fs_info *fs_info = root->fs_info;
1508
1509         mutex_lock(&fs_info->scrub_lock);
1510         if (fs_info->scrub_workers_refcnt == 0) {
1511                 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1512                            fs_info->thread_pool_size, &fs_info->generic_worker);
1513                 fs_info->scrub_workers.idle_thresh = 4;
1514                 btrfs_start_workers(&fs_info->scrub_workers, 1);
1515         }
1516         ++fs_info->scrub_workers_refcnt;
1517         mutex_unlock(&fs_info->scrub_lock);
1518
1519         return 0;
1520 }
1521
1522 static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
1523 {
1524         struct btrfs_fs_info *fs_info = root->fs_info;
1525
1526         mutex_lock(&fs_info->scrub_lock);
1527         if (--fs_info->scrub_workers_refcnt == 0)
1528                 btrfs_stop_workers(&fs_info->scrub_workers);
1529         WARN_ON(fs_info->scrub_workers_refcnt < 0);
1530         mutex_unlock(&fs_info->scrub_lock);
1531 }
1532
1533
1534 int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1535                     struct btrfs_scrub_progress *progress, int readonly)
1536 {
1537         struct scrub_dev *sdev;
1538         struct btrfs_fs_info *fs_info = root->fs_info;
1539         int ret;
1540         struct btrfs_device *dev;
1541
1542         if (btrfs_fs_closing(root->fs_info))
1543                 return -EINVAL;
1544
1545         /*
1546          * check some assumptions
1547          */
1548         if (root->sectorsize != PAGE_SIZE ||
1549             root->sectorsize != root->leafsize ||
1550             root->sectorsize != root->nodesize) {
1551                 printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
1552                 return -EINVAL;
1553         }
1554
1555         ret = scrub_workers_get(root);
1556         if (ret)
1557                 return ret;
1558
1559         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1560         dev = btrfs_find_device(root, devid, NULL, NULL);
1561         if (!dev || dev->missing) {
1562                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1563                 scrub_workers_put(root);
1564                 return -ENODEV;
1565         }
1566         mutex_lock(&fs_info->scrub_lock);
1567
1568         if (!dev->in_fs_metadata) {
1569                 mutex_unlock(&fs_info->scrub_lock);
1570                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1571                 scrub_workers_put(root);
1572                 return -ENODEV;
1573         }
1574
1575         if (dev->scrub_device) {
1576                 mutex_unlock(&fs_info->scrub_lock);
1577                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1578                 scrub_workers_put(root);
1579                 return -EINPROGRESS;
1580         }
1581         sdev = scrub_setup_dev(dev);
1582         if (IS_ERR(sdev)) {
1583                 mutex_unlock(&fs_info->scrub_lock);
1584                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1585                 scrub_workers_put(root);
1586                 return PTR_ERR(sdev);
1587         }
1588         sdev->readonly = readonly;
1589         dev->scrub_device = sdev;
1590
1591         atomic_inc(&fs_info->scrubs_running);
1592         mutex_unlock(&fs_info->scrub_lock);
1593         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1594
1595         down_read(&fs_info->scrub_super_lock);
1596         ret = scrub_supers(sdev);
1597         up_read(&fs_info->scrub_super_lock);
1598
1599         if (!ret)
1600                 ret = scrub_enumerate_chunks(sdev, start, end);
1601
1602         wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1603         atomic_dec(&fs_info->scrubs_running);
1604         wake_up(&fs_info->scrub_pause_wait);
1605
1606         wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1607
1608         if (progress)
1609                 memcpy(progress, &sdev->stat, sizeof(*progress));
1610
1611         mutex_lock(&fs_info->scrub_lock);
1612         dev->scrub_device = NULL;
1613         mutex_unlock(&fs_info->scrub_lock);
1614
1615         scrub_free_dev(sdev);
1616         scrub_workers_put(root);
1617
1618         return ret;
1619 }
1620
1621 int btrfs_scrub_pause(struct btrfs_root *root)
1622 {
1623         struct btrfs_fs_info *fs_info = root->fs_info;
1624
1625         mutex_lock(&fs_info->scrub_lock);
1626         atomic_inc(&fs_info->scrub_pause_req);
1627         while (atomic_read(&fs_info->scrubs_paused) !=
1628                atomic_read(&fs_info->scrubs_running)) {
1629                 mutex_unlock(&fs_info->scrub_lock);
1630                 wait_event(fs_info->scrub_pause_wait,
1631                            atomic_read(&fs_info->scrubs_paused) ==
1632                            atomic_read(&fs_info->scrubs_running));
1633                 mutex_lock(&fs_info->scrub_lock);
1634         }
1635         mutex_unlock(&fs_info->scrub_lock);
1636
1637         return 0;
1638 }
1639
1640 int btrfs_scrub_continue(struct btrfs_root *root)
1641 {
1642         struct btrfs_fs_info *fs_info = root->fs_info;
1643
1644         atomic_dec(&fs_info->scrub_pause_req);
1645         wake_up(&fs_info->scrub_pause_wait);
1646         return 0;
1647 }
1648
1649 int btrfs_scrub_pause_super(struct btrfs_root *root)
1650 {
1651         down_write(&root->fs_info->scrub_super_lock);
1652         return 0;
1653 }
1654
1655 int btrfs_scrub_continue_super(struct btrfs_root *root)
1656 {
1657         up_write(&root->fs_info->scrub_super_lock);
1658         return 0;
1659 }
1660
1661 int btrfs_scrub_cancel(struct btrfs_root *root)
1662 {
1663         struct btrfs_fs_info *fs_info = root->fs_info;
1664
1665         mutex_lock(&fs_info->scrub_lock);
1666         if (!atomic_read(&fs_info->scrubs_running)) {
1667                 mutex_unlock(&fs_info->scrub_lock);
1668                 return -ENOTCONN;
1669         }
1670
1671         atomic_inc(&fs_info->scrub_cancel_req);
1672         while (atomic_read(&fs_info->scrubs_running)) {
1673                 mutex_unlock(&fs_info->scrub_lock);
1674                 wait_event(fs_info->scrub_pause_wait,
1675                            atomic_read(&fs_info->scrubs_running) == 0);
1676                 mutex_lock(&fs_info->scrub_lock);
1677         }
1678         atomic_dec(&fs_info->scrub_cancel_req);
1679         mutex_unlock(&fs_info->scrub_lock);
1680
1681         return 0;
1682 }
1683
1684 int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1685 {
1686         struct btrfs_fs_info *fs_info = root->fs_info;
1687         struct scrub_dev *sdev;
1688
1689         mutex_lock(&fs_info->scrub_lock);
1690         sdev = dev->scrub_device;
1691         if (!sdev) {
1692                 mutex_unlock(&fs_info->scrub_lock);
1693                 return -ENOTCONN;
1694         }
1695         atomic_inc(&sdev->cancel_req);
1696         while (dev->scrub_device) {
1697                 mutex_unlock(&fs_info->scrub_lock);
1698                 wait_event(fs_info->scrub_pause_wait,
1699                            dev->scrub_device == NULL);
1700                 mutex_lock(&fs_info->scrub_lock);
1701         }
1702         mutex_unlock(&fs_info->scrub_lock);
1703
1704         return 0;
1705 }
1706 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
1707 {
1708         struct btrfs_fs_info *fs_info = root->fs_info;
1709         struct btrfs_device *dev;
1710         int ret;
1711
1712         /*
1713          * we have to hold the device_list_mutex here so the device
1714          * does not go away in cancel_dev. FIXME: find a better solution
1715          */
1716         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1717         dev = btrfs_find_device(root, devid, NULL, NULL);
1718         if (!dev) {
1719                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1720                 return -ENODEV;
1721         }
1722         ret = btrfs_scrub_cancel_dev(root, dev);
1723         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1724
1725         return ret;
1726 }
1727
1728 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
1729                          struct btrfs_scrub_progress *progress)
1730 {
1731         struct btrfs_device *dev;
1732         struct scrub_dev *sdev = NULL;
1733
1734         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1735         dev = btrfs_find_device(root, devid, NULL, NULL);
1736         if (dev)
1737                 sdev = dev->scrub_device;
1738         if (sdev)
1739                 memcpy(progress, &sdev->stat, sizeof(*progress));
1740         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1741
1742         return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
1743 }