#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
+#include "volumes.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
*
* This should be called with the tree lock held.
*/
-static int merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
{
struct extent_state *other;
struct rb_node *other_node;
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
- return 0;
+ return;
other_node = rb_prev(&state->rb_node);
if (other_node) {
if (other->start == state->end + 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
- other->start = state->start;
- state->tree = NULL;
- rb_erase(&state->rb_node, &tree->state);
- free_extent_state(state);
- state = NULL;
+ state->end = other->end;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
}
}
-
- return 0;
}
-static int set_state_cb(struct extent_io_tree *tree,
+static void set_state_cb(struct extent_io_tree *tree,
struct extent_state *state, int *bits)
{
- if (tree->ops && tree->ops->set_bit_hook) {
- return tree->ops->set_bit_hook(tree->mapping->host,
- state, bits);
- }
-
- return 0;
+ if (tree->ops && tree->ops->set_bit_hook)
+ tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state, int *bits);
+
/*
* insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
int *bits)
{
struct rb_node *node;
- int bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
}
state->start = start;
state->end = end;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
- if (bits_to_set & EXTENT_DIRTY)
- tree->dirty_bytes += end - start + 1;
- state->state |= bits_to_set;
+ set_state_bits(tree, state, bits);
+
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
"%llu %llu\n", (unsigned long long)found->start,
(unsigned long long)found->end,
(unsigned long long)start, (unsigned long long)end);
- free_extent_state(state);
return -EEXIST;
}
state->tree = tree;
return 0;
}
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- return tree->ops->split_extent_hook(tree->mapping->host,
- orig, split);
- return 0;
+ tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}
/*
cached_state = NULL;
}
- if (cached && cached->tree && cached->start == start) {
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
if (start > end)
break;
- if (need_resched()) {
- spin_unlock(&tree->lock);
- cond_resched();
- spin_lock(&tree->lock);
- }
+ cond_resched_lock(&tree->lock);
}
out:
spin_unlock(&tree->lock);
return 0;
}
-static int set_state_bits(struct extent_io_tree *tree,
+static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int *bits)
{
- int ret;
int bits_to_set = *bits & ~EXTENT_CTLBITS;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
+ set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
state->state |= bits_to_set;
-
- return 0;
}
static void cache_state(struct extent_state *state,
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->start == start && state->tree) {
+ if (state->start <= start && state->end > start &&
+ state->tree) {
node = &state->rb_node;
goto hit_next;
}
goto out;
}
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
+ set_state_bits(tree, state, &bits);
- next_node = rb_next(node);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
if (next_node && start < end && prealloc && !need_resched()) {
state = rb_entry(next_node, struct extent_state,
rb_node);
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
+ set_state_bits(tree, state, &bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- atomic_inc(&prealloc->refs);
err = insert_state(tree, prealloc, start, this_end,
&bits);
BUG_ON(err == -EEXIST);
goto out;
}
cache_state(prealloc, cached_state);
- free_extent_state(prealloc);
prealloc = NULL;
start = this_end + 1;
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- err = set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+/**
+ * convert_extent - convert all bits in a given range from one bit to another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+ err = insert_state(tree, prealloc, start, end, &bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
+ if (next_node && start < end && prealloc && !need_resched()) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ &bits);
+ BUG_ON(err == -EEXIST);
if (err) {
+ free_extent_state(prealloc);
prealloc = NULL;
goto out;
}
- cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ set_state_bits(tree, prealloc, &bits);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
+
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
0, NULL, cached_state, mask);
}
return 0;
}
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
-{
- struct rb_node *node;
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits)) {
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
/* find the first state struct with 'bits' set after 'start', and
* return it. tree->lock must be held. NULL will returned if
* nothing was found after 'start'
return NULL;
}
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
/*
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start == start)
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start)
node = &cached->rb_node;
else
node = tree_search(tree, start);
return 0;
}
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+ int did_repair)
+{
+ int ret;
+ int err = 0;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+ set_state_private(failure_tree, rec->start, 0);
+ ret = clear_extent_bits(failure_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret)
+ err = ret;
+
+ if (did_repair) {
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
+ }
+
+ kfree(rec);
+ return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+ complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+ u64 length, u64 logical, struct page *page,
+ int mirror_num)
+{
+ struct bio *bio;
+ struct btrfs_device *dev;
+ DECLARE_COMPLETION_ONSTACK(compl);
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_bio *bbio = NULL;
+ int ret;
+
+ BUG_ON(!mirror_num);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_private = &compl;
+ bio->bi_end_io = repair_io_failure_callback;
+ bio->bi_size = 0;
+ map_length = length;
+
+ ret = btrfs_map_block(map_tree, WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ sector = bbio->stripes[mirror_num-1].physical >> 9;
+ bio->bi_sector = sector;
+ dev = bbio->stripes[mirror_num-1].dev;
+ kfree(bbio);
+ if (!dev || !dev->bdev || !dev->writeable) {
+ bio_put(bio);
+ return -EIO;
+ }
+ bio->bi_bdev = dev->bdev;
+ bio_add_page(bio, page, length, start-page_offset(page));
+ submit_bio(WRITE_SYNC, bio);
+ wait_for_completion(&compl);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ /* try to remap that extent elsewhere? */
+ bio_put(bio);
+ return -EIO;
+ }
+
+ printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+ "sector %llu)\n", page->mapping->host->i_ino, start,
+ dev->name, sector);
+
+ bio_put(bio);
+ return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failrec;
+ struct btrfs_mapping_tree *map_tree;
+ struct extent_state *state;
+ int num_copies;
+ int did_repair = 0;
+ int ret;
+ struct inode *inode = page->mapping->host;
+
+ private = 0;
+ ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY, 0);
+ if (!ret)
+ return 0;
+
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+ &private_failure);
+ if (ret)
+ return 0;
+
+ failrec = (struct io_failure_record *)(unsigned long) private_failure;
+ BUG_ON(!failrec->this_mirror);
+
+ if (failrec->in_validation) {
+ /* there was no real error, just free the record */
+ pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+ failrec->start);
+ did_repair = 1;
+ goto out;
+ }
+
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+ if (state && state->start == failrec->start) {
+ map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+ num_copies = btrfs_num_copies(map_tree, failrec->logical,
+ failrec->len);
+ if (num_copies > 1) {
+ ret = repair_io_failure(map_tree, start, failrec->len,
+ failrec->logical, page,
+ failrec->failed_mirror);
+ did_repair = !ret;
+ }
+ }
+
+out:
+ if (!ret)
+ ret = free_io_failure(inode, failrec, did_repair);
+
+ return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+ u64 start, u64 end, int failed_mirror,
+ struct extent_state *state)
+{
+ struct io_failure_record *failrec = NULL;
+ u64 private;
+ struct extent_map *em;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct bio *bio;
+ int num_copies;
+ int ret;
+ int read_mode;
+ u64 logical;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return -EIO;
+ }
+
+ if (em->start > start || em->start + em->len < start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+
+ if (!em || IS_ERR(em)) {
+ kfree(failrec);
+ return -EIO;
+ }
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
+ }
+ pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+ "len=%llu\n", logical, start, failrec->len);
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret >= 0)
+ ret = set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ /* set the bits in the inode's tree */
+ if (ret >= 0)
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+ GFP_NOFS);
+ if (ret < 0) {
+ kfree(failrec);
+ return ret;
+ }
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ pr_debug("bio_readpage_error: (found) logical=%llu, "
+ "start=%llu, len=%llu, validation=%d\n",
+ failrec->logical, failrec->start, failrec->len,
+ failrec->in_validation);
+ /*
+ * when data can be on disk more than twice, add to failrec here
+ * (e.g. with a list for failed_mirror) to make
+ * clean_io_failure() clean all those errors at once.
+ */
+ }
+ num_copies = btrfs_num_copies(
+ &BTRFS_I(inode)->root->fs_info->mapping_tree,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+ "state=%p, num_copies=%d, next_mirror %d, "
+ "failed_mirror %d\n", state, num_copies,
+ failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ if (!state) {
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, failrec->start,
+ EXTENT_LOCKED);
+ if (state && state->start != failrec->start)
+ state = NULL;
+ spin_unlock(&tree->lock);
+ }
+
+ /*
+ * there are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ */
+ if (failed_bio->bi_vcnt > 1) {
+ /*
+ * to fulfill b), we need to know the exact failing sectors, as
+ * we don't want to rewrite any more than the failed ones. thus,
+ * we need separate read requests for the failed bio
+ *
+ * if the following BUG_ON triggers, our validation request got
+ * merged. we need separate requests for our algorithm to work.
+ */
+ BUG_ON(failrec->in_validation);
+ failrec->in_validation = 1;
+ failrec->this_mirror = failed_mirror;
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ } else {
+ /*
+ * we're ready to fulfill a) and b) alongside. get a good copy
+ * of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ if (failrec->in_validation) {
+ BUG_ON(failrec->this_mirror != failed_mirror);
+ failrec->in_validation = 0;
+ failrec->this_mirror = 0;
+ }
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+ read_mode = READ_SYNC;
+ }
+
+ if (!state || failrec->this_mirror > num_copies) {
+ pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+ "next_mirror %d, failed_mirror %d\n", state,
+ num_copies, failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_private = state;
+ bio->bi_end_io = failed_bio->bi_end_io;
+ bio->bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bio->bi_size = 0;
+
+ bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+ pr_debug("bio_readpage_error: submitting new read[%#x] to "
+ "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+ failrec->this_mirror, num_copies, failrec->in_validation);
+
+ tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+ failrec->bio_flags, 0);
+ return 0;
+}
+
/* lots and lots of room for performance fixes in the end_bio funcs */
/*
struct extent_state *cached = NULL;
struct extent_state *state;
+ pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+ "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+ (long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
start = ((u64)page->index << PAGE_CACHE_SHIFT) +
state);
if (ret)
uptodate = 0;
+ else
+ clean_io_failure(start, page);
}
- if (!uptodate && tree->ops &&
- tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(bio, page,
- start, end, NULL);
+ if (!uptodate) {
+ u64 failed_mirror;
+ failed_mirror = (unsigned long)bio->bi_bdev;
+ if (tree->ops && tree->ops->readpage_io_failed_hook)
+ ret = tree->ops->readpage_io_failed_hook(
+ bio, page, start, end,
+ failed_mirror, state);
+ else
+ ret = bio_readpage_error(bio, page, start, end,
+ failed_mirror, NULL);
if (ret == 0) {
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
+
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
bio_put(bio);
}
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent)
+ get_extent_t *get_extent, int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
&bio_flags);
if (bio)
- ret = submit_one_bio(READ, bio, 0, bio_flags);
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
}
int compressed;
int write_flags;
unsigned long nr_written = 0;
+ bool fill_delalloc = true;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
set_page_extent_mapped(page);
+ if (!tree->ops || !tree->ops->fill_delalloc)
+ fill_delalloc = false;
+
delalloc_start = start;
delalloc_end = 0;
page_started = 0;
- if (!epd->extent_locked) {
+ if (!epd->extent_locked && fill_delalloc) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
end = wbc->range_end >> PAGE_CACHE_SHIFT;
scanned = 1;
}
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY, min(end - index,
- (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
scanned = 1;
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops && tree->ops->write_cache_pages_lock_hook)
- tree->ops->write_cache_pages_lock_hook(page);
- else
- lock_page(page);
+ if (tree->ops &&
+ tree->ops->write_cache_pages_lock_hook) {
+ tree->ops->write_cache_pages_lock_hook(page,
+ data, flush_fn);
+ } else {
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
+ }
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
struct writeback_control *wbc)
{
int ret;
- struct address_space *mapping = page->mapping;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
- struct writeback_control wbc_writepages = {
- .sync_mode = wbc->sync_mode,
- .older_than_this = NULL,
- .nr_to_write = 64,
- .range_start = page_offset(page) + PAGE_CACHE_SIZE,
- .range_end = (loff_t)-1,
- };
ret = __extent_writepage(page, wbc, &epd);
- extent_write_cache_pages(tree, mapping, &wbc_writepages,
- __extent_writepage, &epd, flush_write_bio);
flush_epd_write_bio(&epd);
return ret;
}
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
- .older_than_this = NULL,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
return ret;
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
struct page *p;
return p;
}
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
{
return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
(start >> PAGE_CACHE_SHIFT);
return NULL;
eb->start = start;
eb->len = len;
- spin_lock_init(&eb->lock);
- init_waitqueue_head(&eb->lock_wq);
+ rwlock_init(&eb->lock);
+ atomic_set(&eb->write_locks, 0);
+ atomic_set(&eb->read_locks, 0);
+ atomic_set(&eb->blocking_readers, 0);
+ atomic_set(&eb->blocking_writers, 0);
+ atomic_set(&eb->spinning_readers, 0);
+ atomic_set(&eb->spinning_writers, 0);
+ init_waitqueue_head(&eb->write_lock_wq);
+ init_waitqueue_head(&eb->read_lock_wq);
#if LEAK_DEBUG
spin_lock_irqsave(&leak_lock, flags);
i = 0;
}
for (; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
+ p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) {
WARN_ON(1);
goto free_eb;
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
unlock_page(page);
}
return 0;
return was_dirty;
}
+static int __eb_straddles_pages(u64 start, u64 len)
+{
+ if (len < PAGE_CACHE_SIZE)
+ return 1;
+ if (start & (PAGE_CACHE_SIZE - 1))
+ return 1;
+ if ((start + len) & (PAGE_CACHE_SIZE - 1))
+ return 1;
+ return 0;
+}
+
+static int eb_straddles_pages(struct extent_buffer *eb)
+{
+ return __eb_straddles_pages(eb->start, eb->len);
+}
+
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb,
struct extent_state **cached_state)
num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
+ if (eb_straddles_pages(eb)) {
+ clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ cached_state, GFP_NOFS);
+ }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
num_pages = num_extent_pages(eb->start, eb->len);
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- NULL, GFP_NOFS);
+ if (eb_straddles_pages(eb)) {
+ set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ NULL, GFP_NOFS);
+ }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
int uptodate;
unsigned long index;
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
- if (ret)
- return 1;
+ if (__eb_straddles_pages(start, end - start + 1)) {
+ ret = test_range_bit(tree, start, end,
+ EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return 1;
+ }
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
page = find_get_page(tree->mapping, index);
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 1;
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
+ if (eb_straddles_pages(eb)) {
+ ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, cached_state);
+ if (ret)
+ return ret;
+ }
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- u64 start, int wait,
+ struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
+ if (eb_straddles_pages(eb)) {
+ if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, NULL)) {
+ return 0;
+ }
}
if (start) {
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!wait) {
+ if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
- if (ret || !wait)
+ if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER1);
dst += cur;
len -= cur;
}
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len, char **token, char **map,
+ unsigned long min_len, char **map,
unsigned long *map_start,
- unsigned long *map_len, int km)
+ unsigned long *map_len)
{
size_t offset = start & (PAGE_CACHE_SIZE - 1);
char *kaddr;
}
p = extent_buffer_page(eb, i);
- kaddr = kmap_atomic(p, km);
- *token = kaddr;
+ kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
return 0;
}
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len,
- char **token, char **map,
- unsigned long *map_start,
- unsigned long *map_len, int km)
-{
- int err;
- int save = 0;
- if (eb->map_token) {
- unmap_extent_buffer(eb, eb->map_token, km);
- eb->map_token = NULL;
- save = 1;
- }
- err = map_private_extent_buffer(eb, start, min_len, token, map,
- map_start, map_len, km);
- if (!err && save) {
- eb->map_token = *token;
- eb->kaddr = *map;
- eb->map_start = *map_start;
- eb->map_len = *map_len;
- }
- return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
- kunmap_atomic(token, km);
-}
-
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
unsigned long start,
unsigned long len)
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
if (ret)
break;
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
- kunmap_atomic(kaddr, KM_USER1);
src += cur;
len -= cur;
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
memset(kaddr + offset, c, cur);
- kunmap_atomic(kaddr, KM_USER0);
len -= cur;
offset = 0;
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
src_offset += cur;
len -= cur;
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *dst_kaddr = page_address(dst_page);
if (dst_page == src_page) {
memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
} else {
- char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+ char *src_kaddr = page_address(src_page);
char *p = dst_kaddr + dst_off + len;
char *s = src_kaddr + src_off + len;
while (len--)
*--p = *--s;
-
- kunmap_atomic(src_kaddr, KM_USER1);
}
- kunmap_atomic(dst_kaddr, KM_USER0);
}
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
if (dst_page != src_page) {
- src_kaddr = kmap_atomic(src_page, KM_USER1);
+ src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
BUG_ON(areas_overlap(src_off, dst_off, len));
}
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
- kunmap_atomic(dst_kaddr, KM_USER0);
- if (dst_page != src_page)
- kunmap_atomic(src_kaddr, KM_USER1);
}
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,