btrfs: Fix up 32/64-bit compatibility for new ioctls
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11..47fdba7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -254,14 +255,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
  *
  * This should be called with the tree lock held.
  */
-static int merge_state(struct extent_io_tree *tree,
-                      struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+                       struct extent_state *state)
 {
        struct extent_state *other;
        struct rb_node *other_node;
 
        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
-               return 0;
+               return;
 
        other_node = rb_prev(&state->rb_node);
        if (other_node) {
@@ -281,26 +282,19 @@ static int merge_state(struct extent_io_tree *tree,
                if (other->start == state->end + 1 &&
                    other->state == state->state) {
                        merge_cb(tree, state, other);
-                       other->start = state->start;
-                       state->tree = NULL;
-                       rb_erase(&state->rb_node, &tree->state);
-                       free_extent_state(state);
-                       state = NULL;
+                       state->end = other->end;
+                       other->tree = NULL;
+                       rb_erase(&other->rb_node, &tree->state);
+                       free_extent_state(other);
                }
        }
-
-       return 0;
 }
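
Note: the rewritten merge_state() absorbs the neighbouring node into 'state'
instead of erasing 'state' itself, so the pointer a caller holds (and may be
about to cache_state()) stays valid across the merge. A minimal user-space
model of the forward half, with hypothetical types, just to make the merge
direction concrete:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_state {                      /* stand-in for struct extent_state */
            unsigned long long start, end;  /* inclusive byte range */
            struct toy_state *next;         /* stand-in for the rbtree ordering */
    };

    /* forward half of the new merge_state(): extend 'state', free 'other' */
    static void merge_forward(struct toy_state *state)
    {
            struct toy_state *other = state->next;

            if (other && other->start == state->end + 1) {
                    state->end = other->end;    /* 'state' grows ...         */
                    state->next = other->next;
                    free(other);                /* ... and 'other' goes away */
            }
    }

    int main(void)
    {
            struct toy_state *b = malloc(sizeof(*b));
            struct toy_state a = { 0, 4095, b };

            if (!b)
                    return 1;
            b->start = 4096; b->end = 8191; b->next = NULL;
            merge_forward(&a);
            printf("[%llu, %llu]\n", a.start, a.end);  /* [0, 8191], &a still valid */
            return 0;
    }
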
 
-static int set_state_cb(struct extent_io_tree *tree,
+static void set_state_cb(struct extent_io_tree *tree,
                         struct extent_state *state, int *bits)
 {
-       if (tree->ops && tree->ops->set_bit_hook) {
-               return tree->ops->set_bit_hook(tree->mapping->host,
-                                              state, bits);
-       }
-
-       return 0;
+       if (tree->ops && tree->ops->set_bit_hook)
+               tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
@@ -310,6 +304,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
+static void set_state_bits(struct extent_io_tree *tree,
+                          struct extent_state *state, int *bits);
+
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
  * struct before it is inserted.
@@ -325,8 +322,6 @@ static int insert_state(struct extent_io_tree *tree,
                        int *bits)
 {
        struct rb_node *node;
-       int bits_to_set = *bits & ~EXTENT_CTLBITS;
-       int ret;
 
        if (end < start) {
                printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +331,9 @@ static int insert_state(struct extent_io_tree *tree,
        }
        state->start = start;
        state->end = end;
-       ret = set_state_cb(tree, state, bits);
-       if (ret)
-               return ret;
 
-       if (bits_to_set & EXTENT_DIRTY)
-               tree->dirty_bytes += end - start + 1;
-       state->state |= bits_to_set;
+       set_state_bits(tree, state, bits);
+
        node = tree_insert(&tree->state, end, &state->rb_node);
        if (node) {
                struct extent_state *found;
@@ -351,7 +342,6 @@ static int insert_state(struct extent_io_tree *tree,
                       "%llu %llu\n", (unsigned long long)found->start,
                       (unsigned long long)found->end,
                       (unsigned long long)start, (unsigned long long)end);
-               free_extent_state(state);
                return -EEXIST;
        }
        state->tree = tree;
@@ -359,13 +349,11 @@ static int insert_state(struct extent_io_tree *tree,
        return 0;
 }
 
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
                     u64 split)
 {
        if (tree->ops && tree->ops->split_extent_hook)
-               return tree->ops->split_extent_hook(tree->mapping->host,
-                                                   orig, split);
-       return 0;
+               tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 }
 
 /*
@@ -500,7 +488,8 @@ again:
                        cached_state = NULL;
                }
 
-               if (cached && cached->tree && cached->start == start) {
+               if (cached && cached->tree && cached->start <= start &&
+                   cached->end > start) {
                        if (clear)
                                atomic_dec(&cached->refs);
                        state = cached;
@@ -660,34 +649,25 @@ again:
                if (start > end)
                        break;
 
-               if (need_resched()) {
-                       spin_unlock(&tree->lock);
-                       cond_resched();
-                       spin_lock(&tree->lock);
-               }
+               cond_resched_lock(&tree->lock);
        }
 out:
        spin_unlock(&tree->lock);
        return 0;
 }
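
Note: cond_resched_lock() is the stock helper for exactly the open-coded
pattern removed above. Roughly, simplified from the scheduler code (details
vary by kernel version and preemption config):

    /* drop the spinlock if a reschedule or lock break is due, then retake it */
    static int cond_resched_lock_sketch(spinlock_t *lock)
    {
            if (spin_needbreak(lock) || need_resched()) {
                    spin_unlock(lock);
                    cond_resched();  /* may be a plain cpu_relax() on preempt kernels */
                    spin_lock(lock);
                    return 1;
            }
            return 0;
    }
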
 
-static int set_state_bits(struct extent_io_tree *tree,
+static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
                           int *bits)
 {
-       int ret;
        int bits_to_set = *bits & ~EXTENT_CTLBITS;
 
-       ret = set_state_cb(tree, state, bits);
-       if (ret)
-               return ret;
+       set_state_cb(tree, state, bits);
        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
        state->state |= bits_to_set;
-
-       return 0;
 }
 
 static void cache_state(struct extent_state *state,
@@ -742,7 +722,8 @@ again:
        spin_lock(&tree->lock);
        if (cached_state && *cached_state) {
                state = *cached_state;
-               if (state->start == start && state->tree) {
+               if (state->start <= start && state->end > start &&
+                   state->tree) {
                        node = &state->rb_node;
                        goto hit_next;
                }
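
Note: same relaxation as in clear_extent_bit() above; the cached state only
has to contain 'start', it no longer has to begin exactly at it. A quick
user-space check of the new predicate, with hypothetical values:

    #include <assert.h>

    struct range { unsigned long long start, end; };  /* end is inclusive */

    /* the new cache-hit test used by set_extent_bit()/clear_extent_bit() */
    static int cache_hit(const struct range *cached, unsigned long long start)
    {
            return cached->start <= start && cached->end > start;
    }

    int main(void)
    {
            struct range cached = { 0, 16383 };

            assert(cache_hit(&cached, 4096));    /* inside the range: hit */
            assert(!cache_hit(&cached, 16384));  /* past the end: miss    */
            /* the old 'cached->start == start' test would have missed 4096 */
            return 0;
    }
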
@@ -779,17 +760,15 @@ hit_next:
                        goto out;
                }
 
-               err = set_state_bits(tree, state, &bits);
-               if (err)
-                       goto out;
+               set_state_bits(tree, state, &bits);
 
-               next_node = rb_next(node);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
 
                start = last_end + 1;
+               next_node = rb_next(&state->rb_node);
                if (next_node && start < end && prealloc && !need_resched()) {
                        state = rb_entry(next_node, struct extent_state,
                                         rb_node);
@@ -830,9 +809,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       err = set_state_bits(tree, state, &bits);
-                       if (err)
-                               goto out;
+                       set_state_bits(tree, state, &bits);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
@@ -862,7 +839,6 @@ hit_next:
                 * Avoid freeing 'prealloc' if it can be merged with
                 * the later extent.
                 */
-               atomic_inc(&prealloc->refs);
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
                BUG_ON(err == -EEXIST);
@@ -872,7 +848,6 @@ hit_next:
                        goto out;
                }
                cache_state(prealloc, cached_state);
-               free_extent_state(prealloc);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
@@ -895,12 +870,196 @@ hit_next:
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);
 
-               err = set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits);
+               cache_state(prealloc, cached_state);
+               merge_state(tree, prealloc);
+               prealloc = NULL;
+               goto out;
+       }
+
+       goto search_again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return err;
+
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       if (mask & __GFP_WAIT)
+               cond_resched();
+       goto again;
+}
+
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to another
+ * @tree:      the io tree to search
+ * @start:     the start offset in bytes
+ * @end:       the end offset in bytes (inclusive)
+ * @bits:      the bits to set in this range
+ * @clear_bits:        the bits to clear in this range
+ * @mask:      the allocation mask
+ *
+ * This will go through and set bits for the given range.  If any states exist
+ * already in this range, they are set with the given bits and cleared of the
+ * clear_bits.  This is only meant to be used by things that are mergeable, i.e.
+ * converting from say DELALLOC to DIRTY.  This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                      int bits, int clear_bits, gfp_t mask)
+{
+       struct extent_state *state;
+       struct extent_state *prealloc = NULL;
+       struct rb_node *node;
+       int err = 0;
+       u64 last_start;
+       u64 last_end;
+
+again:
+       if (!prealloc && (mask & __GFP_WAIT)) {
+               prealloc = alloc_extent_state(mask);
+               if (!prealloc)
+                       return -ENOMEM;
+       }
+
+       spin_lock(&tree->lock);
+       /*
+        * this search will find all the extents that end after
+        * our range starts.
+        */
+       node = tree_search(tree, start);
+       if (!node) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = insert_state(tree, prealloc, start, end, &bits);
+               prealloc = NULL;
+               BUG_ON(err == -EEXIST);
+               goto out;
+       }
+       state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+       last_start = state->start;
+       last_end = state->end;
+
+       /*
+        * | ---- desired range ---- |
+        * | state |
+        *
+        * Just lock what we found and keep going
+        */
+       if (state->start == start && state->end <= end) {
+               struct rb_node *next_node;
+
+               set_state_bits(tree, state, &bits);
+               clear_state_bit(tree, state, &clear_bits, 0);
+
+               merge_state(tree, state);
+               if (last_end == (u64)-1)
+                       goto out;
+
+               start = last_end + 1;
+               next_node = rb_next(&state->rb_node);
+               if (next_node && start < end && prealloc && !need_resched()) {
+                       state = rb_entry(next_node, struct extent_state,
+                                        rb_node);
+                       if (state->start == start)
+                               goto hit_next;
+               }
+               goto search_again;
+       }
+
+       /*
+        *     | ---- desired range ---- |
+        * | state |
+        *   or
+        * | ------------- state -------------- |
+        *
+        * We need to split the extent we found, and may flip bits on
+        * second half.
+        *
+        * If the extent we found extends past our
+        * range, we just split and search again.  It'll get split
+        * again the next time though.
+        *
+        * If the extent we found is inside our range, we set the
+        * desired bit on it.
+        */
+       if (state->start < start) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = split_state(tree, state, prealloc, start);
+               BUG_ON(err == -EEXIST);
+               prealloc = NULL;
+               if (err)
+                       goto out;
+               if (state->end <= end) {
+                       set_state_bits(tree, state, &bits);
+                       clear_state_bit(tree, state, &clear_bits, 0);
+                       merge_state(tree, state);
+                       if (last_end == (u64)-1)
+                               goto out;
+                       start = last_end + 1;
+               }
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *     | state | or               | state |
+        *
+        * There's a hole, we need to insert something in it and
+        * ignore the extent we found.
+        */
+       if (state->start > start) {
+               u64 this_end;
+               if (end < last_start)
+                       this_end = end;
+               else
+                       this_end = last_start - 1;
+
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               /*
+                * Avoid freeing 'prealloc' if it can be merged with
+                * the later extent.
+                */
+               err = insert_state(tree, prealloc, start, this_end,
+                                  &bits);
+               BUG_ON(err == -EEXIST);
                if (err) {
+                       free_extent_state(prealloc);
                        prealloc = NULL;
                        goto out;
                }
-               cache_state(prealloc, cached_state);
+               prealloc = NULL;
+               start = this_end + 1;
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *                        | state |
+        * We need to split the extent, and set the bit
+        * on the first half
+        */
+       if (state->start <= end && state->end > end) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = split_state(tree, state, prealloc, end + 1);
+               BUG_ON(err == -EEXIST);
+
+               set_state_bits(tree, prealloc, &bits);
+               clear_state_bit(tree, prealloc, &clear_bits, 0);
+
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
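
Note: a hypothetical call site for the new helper, using the DELALLOC to
DIRTY conversion its comment gives as the example (illustrative only, not a
caller added by this commit):

    /* flip accounting for [start, end] from delalloc to dirty in one tree
     * walk, instead of a clear_extent_bit() plus a set_extent_bit() */
    ret = convert_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
                             EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
    if (ret)    /* -ENOMEM when state preallocation fails */
            goto out;
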
@@ -949,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
 {
        return set_extent_bit(tree, start, end,
-                             EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+                             EXTENT_DELALLOC | EXTENT_UPTODATE,
                              0, NULL, cached_state, mask);
 }
 
@@ -1061,46 +1220,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
        return 0;
 }
 
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, int bits)
-{
-       struct rb_node *node;
-       struct extent_state *state;
-       int ret = 1;
-
-       spin_lock(&tree->lock);
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node)
-               goto out;
-
-       while (1) {
-               state = rb_entry(node, struct extent_state, rb_node);
-               if (state->end >= start && (state->state & bits)) {
-                       *start_ret = state->start;
-                       *end_ret = state->end;
-                       ret = 0;
-                       break;
-               }
-               node = rb_next(node);
-               if (!node)
-                       break;
-       }
-out:
-       spin_unlock(&tree->lock);
-       return ret;
-}
-
 /* find the first state struct with 'bits' set after 'start', and
  * return it.  tree->lock must be held.  NULL will be returned if
  * nothing was found after 'start'
@@ -1132,6 +1251,30 @@ out:
        return NULL;
 }
 
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+                         u64 *start_ret, u64 *end_ret, int bits)
+{
+       struct extent_state *state;
+       int ret = 1;
+
+       spin_lock(&tree->lock);
+       state = find_first_extent_bit_state(tree, start, bits);
+       if (state) {
+               *start_ret = state->start;
+               *end_ret = state->end;
+               ret = 0;
+       }
+       spin_unlock(&tree->lock);
+       return ret;
+}
+
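
Note: the usual calling pattern for find_first_extent_bit() is a walk that
restarts just past each hit; a sketch with hypothetical variables:

    u64 start = 0;
    u64 end;

    /* visit every range with EXTENT_DIRTY set, in offset order */
    while (!find_first_extent_bit(tree, start, &start, &end, EXTENT_DIRTY)) {
            /* ... process [start, end] ... */
            start = end + 1;
    }
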
 /*
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'.  start and end are used to return the range,
@@ -1564,7 +1707,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
        int bitset = 0;
 
        spin_lock(&tree->lock);
-       if (cached && cached->tree && cached->start == start)
+       if (cached && cached->tree && cached->start <= start &&
+           cached->end > start)
                node = &cached->rb_node;
        else
                node = tree_search(tree, start);
@@ -1644,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
        return 0;
 }
 
+/*
+ * When IO fails, either with EIO or because csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+       struct page *page;
+       u64 start;
+       u64 len;
+       u64 logical;
+       unsigned long bio_flags;
+       int this_mirror;
+       int failed_mirror;
+       int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+                               int did_repair)
+{
+       int ret;
+       int err = 0;
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+       set_state_private(failure_tree, rec->start, 0);
+       ret = clear_extent_bits(failure_tree, rec->start,
+                               rec->start + rec->len - 1,
+                               EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+       if (ret)
+               err = ret;
+
+       if (did_repair) {
+               ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+                                       rec->start + rec->len - 1,
+                                       EXTENT_DAMAGED, GFP_NOFS);
+               if (ret && !err)
+                       err = ret;
+       }
+
+       kfree(rec);
+       return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+       complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+                       u64 length, u64 logical, struct page *page,
+                       int mirror_num)
+{
+       struct bio *bio;
+       struct btrfs_device *dev;
+       DECLARE_COMPLETION_ONSTACK(compl);
+       u64 map_length = 0;
+       u64 sector;
+       struct btrfs_bio *bbio = NULL;
+       int ret;
+
+       BUG_ON(!mirror_num);
+
+       bio = bio_alloc(GFP_NOFS, 1);
+       if (!bio)
+               return -EIO;
+       bio->bi_private = &compl;
+       bio->bi_end_io = repair_io_failure_callback;
+       bio->bi_size = 0;
+       map_length = length;
+
+       ret = btrfs_map_block(map_tree, WRITE, logical,
+                             &map_length, &bbio, mirror_num);
+       if (ret) {
+               bio_put(bio);
+               return -EIO;
+       }
+       BUG_ON(mirror_num != bbio->mirror_num);
+       sector = bbio->stripes[mirror_num-1].physical >> 9;
+       bio->bi_sector = sector;
+       dev = bbio->stripes[mirror_num-1].dev;
+       kfree(bbio);
+       if (!dev || !dev->bdev || !dev->writeable) {
+               bio_put(bio);
+               return -EIO;
+       }
+       bio->bi_bdev = dev->bdev;
+       bio_add_page(bio, page, length, start-page_offset(page));
+       submit_bio(WRITE_SYNC, bio);
+       wait_for_completion(&compl);
+
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+               /* try to remap that extent elsewhere? */
+               bio_put(bio);
+               return -EIO;
+       }
+
+       printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+                       "sector %llu)\n", page->mapping->host->i_ino, start,
+                       dev->name, sector);
+
+       bio_put(bio);
+       return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+       u64 private;
+       u64 private_failure;
+       struct io_failure_record *failrec;
+       struct btrfs_mapping_tree *map_tree;
+       struct extent_state *state;
+       int num_copies;
+       int did_repair = 0;
+       int ret;
+       struct inode *inode = page->mapping->host;
+
+       private = 0;
+       ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+                               (u64)-1, 1, EXTENT_DIRTY, 0);
+       if (!ret)
+               return 0;
+
+       ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+                               &private_failure);
+       if (ret)
+               return 0;
+
+       failrec = (struct io_failure_record *)(unsigned long) private_failure;
+       BUG_ON(!failrec->this_mirror);
+
+       if (failrec->in_validation) {
+               /* there was no real error, just free the record */
+               pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+                        failrec->start);
+               did_repair = 1;
+               goto out;
+       }
+
+       spin_lock(&BTRFS_I(inode)->io_tree.lock);
+       state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+                                           failrec->start,
+                                           EXTENT_LOCKED);
+       spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+       if (state && state->start == failrec->start) {
+               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+               num_copies = btrfs_num_copies(map_tree, failrec->logical,
+                                               failrec->len);
+               if (num_copies > 1)  {
+                       ret = repair_io_failure(map_tree, start, failrec->len,
+                                               failrec->logical, page,
+                                               failrec->failed_mirror);
+                       did_repair = !ret;
+               }
+       }
+
+out:
+       if (!ret)
+               ret = free_io_failure(inode, failrec, did_repair);
+
+       return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. it does not attempt to remap the failed
+ * extent elsewhere, hoping instead that the device will be smart enough to do
+ * this as needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+                               u64 start, u64 end, int failed_mirror,
+                               struct extent_state *state)
+{
+       struct io_failure_record *failrec = NULL;
+       u64 private;
+       struct extent_map *em;
+       struct inode *inode = page->mapping->host;
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct bio *bio;
+       int num_copies;
+       int ret;
+       int read_mode;
+       u64 logical;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = get_state_private(failure_tree, start, &private);
+       if (ret) {
+               failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+               if (!failrec)
+                       return -ENOMEM;
+               failrec->start = start;
+               failrec->len = end - start + 1;
+               failrec->this_mirror = 0;
+               failrec->bio_flags = 0;
+               failrec->in_validation = 0;
+
+               read_lock(&em_tree->lock);
+               em = lookup_extent_mapping(em_tree, start, failrec->len);
+               if (!em) {
+                       read_unlock(&em_tree->lock);
+                       kfree(failrec);
+                       return -EIO;
+               }
+
+               if (em->start > start || em->start + em->len < start) {
+                       free_extent_map(em);
+                       em = NULL;
+               }
+               read_unlock(&em_tree->lock);
+
+               if (!em || IS_ERR(em)) {
+                       kfree(failrec);
+                       return -EIO;
+               }
+               logical = start - em->start;
+               logical = em->block_start + logical;
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+                       logical = em->block_start;
+                       failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&failrec->bio_flags,
+                                                em->compress_type);
+               }
+               pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+                        "len=%llu\n", logical, start, failrec->len);
+               failrec->logical = logical;
+               free_extent_map(em);
+
+               /* set the bits in the private failure tree */
+               ret = set_extent_bits(failure_tree, start, end,
+                                       EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+               if (ret >= 0)
+                       ret = set_state_private(failure_tree, start,
+                                               (u64)(unsigned long)failrec);
+               /* set the bits in the inode's tree */
+               if (ret >= 0)
+                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+                                               GFP_NOFS);
+               if (ret < 0) {
+                       kfree(failrec);
+                       return ret;
+               }
+       } else {
+               failrec = (struct io_failure_record *)(unsigned long)private;
+               pr_debug("bio_readpage_error: (found) logical=%llu, "
+                        "start=%llu, len=%llu, validation=%d\n",
+                        failrec->logical, failrec->start, failrec->len,
+                        failrec->in_validation);
+               /*
+                * when data can be on disk more than twice, add to failrec here
+                * (e.g. with a list for failed_mirror) to make
+                * clean_io_failure() clean all those errors at once.
+                */
+       }
+       num_copies = btrfs_num_copies(
+                             &BTRFS_I(inode)->root->fs_info->mapping_tree,
+                             failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+                        "state=%p, num_copies=%d, next_mirror %d, "
+                        "failed_mirror %d\n", state, num_copies,
+                        failrec->this_mirror, failed_mirror);
+               free_io_failure(inode, failrec, 0);
+               return -EIO;
+       }
+
+       if (!state) {
+               spin_lock(&tree->lock);
+               state = find_first_extent_bit_state(tree, failrec->start,
+                                                   EXTENT_LOCKED);
+               if (state && state->start != failrec->start)
+                       state = NULL;
+               spin_unlock(&tree->lock);
+       }
+
+       /*
+        * there are two goals here:
+        *      a) deliver good data to the caller
+        *      b) correct the bad sectors on disk
+        */
+       if (failed_bio->bi_vcnt > 1) {
+               /*
+                * to fulfill b), we need to know the exact failing sectors, as
+                * we don't want to rewrite any more than the failed ones. thus,
+                * we need separate read requests for the failed bio
+                *
+                * if the following BUG_ON triggers, our validation request got
+                * merged. we need separate requests for our algorithm to work.
+                */
+               BUG_ON(failrec->in_validation);
+               failrec->in_validation = 1;
+               failrec->this_mirror = failed_mirror;
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       } else {
+               /*
+                * we're ready to fulfill a) and b) alongside. get a good copy
+                * of the failed sector and if we succeed, we have setup
+                * everything for repair_io_failure to do the rest for us.
+                */
+               if (failrec->in_validation) {
+                       BUG_ON(failrec->this_mirror != failed_mirror);
+                       failrec->in_validation = 0;
+                       failrec->this_mirror = 0;
+               }
+               failrec->failed_mirror = failed_mirror;
+               failrec->this_mirror++;
+               if (failrec->this_mirror == failed_mirror)
+                       failrec->this_mirror++;
+               read_mode = READ_SYNC;
+       }
+
+       if (!state || failrec->this_mirror > num_copies) {
+               pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+                        "next_mirror %d, failed_mirror %d\n", state,
+                        num_copies, failrec->this_mirror, failed_mirror);
+               free_io_failure(inode, failrec, 0);
+               return -EIO;
+       }
+
+       bio = bio_alloc(GFP_NOFS, 1);
+       if (!bio) {
+               free_io_failure(inode, failrec, 0);
+               return -EIO;
+       }
+       bio->bi_private = state;
+       bio->bi_end_io = failed_bio->bi_end_io;
+       bio->bi_sector = failrec->logical >> 9;
+       bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+       bio->bi_size = 0;
+
+       bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+       pr_debug("bio_readpage_error: submitting new read[%#x] to "
+                "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+                failrec->this_mirror, num_copies, failrec->in_validation);
+
+       tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+                                       failrec->bio_flags, 0);
+       return 0;
+}
+
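
Note: the mirror bookkeeping above is easy to check by hand. A user-space
model of the rotation done in the single-bvec branch (this_mirror starts at
0, mirrors are numbered from 1):

    #include <assert.h>

    /* model of the this_mirror update in bio_readpage_error() */
    static int next_mirror(int this_mirror, int failed_mirror)
    {
            this_mirror++;
            if (this_mirror == failed_mirror)
                    this_mirror++;
            return this_mirror;
    }

    int main(void)
    {
            /* with two copies, the retry always lands on the other mirror */
            assert(next_mirror(0, 1) == 2);
            assert(next_mirror(0, 2) == 1);
            /* a further failure walks past num_copies, which ends the retries */
            assert(next_mirror(2, 1) == 3);
            return 0;
    }
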
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1742,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                struct extent_state *cached = NULL;
                struct extent_state *state;
 
+               pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+                        "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+                        (long int)bio->bi_bdev);
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1772,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                                                              state);
                        if (ret)
                                uptodate = 0;
+                       else
+                               clean_io_failure(start, page);
                }
-               if (!uptodate && tree->ops &&
-                   tree->ops->readpage_io_failed_hook) {
-                       ret = tree->ops->readpage_io_failed_hook(bio, page,
-                                                        start, end, NULL);
+               if (!uptodate) {
+                       u64 failed_mirror;
+                       failed_mirror = (unsigned long)bio->bi_bdev;
+                       if (tree->ops && tree->ops->readpage_io_failed_hook)
+                               ret = tree->ops->readpage_io_failed_hook(
+                                               bio, page, start, end,
+                                               failed_mirror, state);
+                       else
+                               ret = bio_readpage_error(bio, page, start, end,
+                                                        failed_mirror, NULL);
                        if (ret == 0) {
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
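
Note: the (unsigned long)bio->bi_bdev cast above is deliberate: for reads,
the completion path reuses the no-longer-needed bi_bdev field to hand the
mirror number back to the end_io callback. Roughly what the producer side in
volumes.c does (a sketch from memory of the matching change, not quoted from
this commit):

    /* in btrfs_end_bio(), once the last stripe of the bio completes */
    bio->bi_bdev = (struct block_device *)(unsigned long)bbio->mirror_num;
    bio_endio(bio, err);  /* end_bio_extent_readpage() casts it back */
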
@@ -1856,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
                                           mirror_num, bio_flags, start);
        else
                submit_bio(rw, bio);
+
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
        bio_put(bio);
@@ -2121,16 +2639,16 @@ out:
 }
 
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-                           get_extent_t *get_extent)
+                           get_extent_t *get_extent, int mirror_num)
 {
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
        int ret;
 
-       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+       ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
                                      &bio_flags);
        if (bio)
-               ret = submit_one_bio(READ, bio, 0, bio_flags);
+               ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
        return ret;
 }
 
@@ -2181,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        int compressed;
        int write_flags;
        unsigned long nr_written = 0;
+       bool fill_delalloc = true;
 
        if (wbc->sync_mode == WB_SYNC_ALL)
                write_flags = WRITE_SYNC;
@@ -2190,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        trace___extent_writepage(page, inode, wbc);
 
        WARN_ON(!PageLocked(page));
+
+       ClearPageError(page);
+
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
@@ -2211,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
        set_page_extent_mapped(page);
 
+       if (!tree->ops || !tree->ops->fill_delalloc)
+               fill_delalloc = false;
+
        delalloc_start = start;
        delalloc_end = 0;
        page_started = 0;
-       if (!epd->extent_locked) {
+       if (!epd->extent_locked && fill_delalloc) {
                u64 delalloc_to_write = 0;
                /*
                 * make sure the wbc mapping index is at least updated
@@ -2432,6 +2957,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        int scanned = 0;
+       int tag;
 
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
@@ -2442,11 +2968,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
                scanned = 1;
        }
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
 retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
        while (!done && !nr_to_write_done && (index <= end) &&
-              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY, min(end - index,
-                                 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
                unsigned i;
 
                scanned = 1;
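
Note: this mirrors write_cache_pages() in mm/page-writeback.c. For
data-integrity sync, tag_pages_for_writeback() moves the DIRTY tags in
[index, end] over to TOWRITE before the walk starts, so pages redirtied
while we write cannot re-enter the walk and livelock WB_SYNC_ALL. The
two-step shape, isolated:

    /* WB_SYNC_ALL: snapshot the work before starting on it */
    tag_pages_for_writeback(mapping, index, end);  /* DIRTY -> TOWRITE */
    nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                  PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE);
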
@@ -2460,10 +2991,16 @@ retry:
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
-                       if (tree->ops && tree->ops->write_cache_pages_lock_hook)
-                               tree->ops->write_cache_pages_lock_hook(page);
-                       else
-                               lock_page(page);
+                       if (tree->ops &&
+                           tree->ops->write_cache_pages_lock_hook) {
+                               tree->ops->write_cache_pages_lock_hook(page,
+                                                              data, flush_fn);
+                       } else {
+                               if (!trylock_page(page)) {
+                                       flush_fn(data);
+                                       lock_page(page);
+                               }
+                       }
 
                        if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
@@ -2541,7 +3078,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          struct writeback_control *wbc)
 {
        int ret;
-       struct address_space *mapping = page->mapping;
        struct extent_page_data epd = {
                .bio = NULL,
                .tree = tree,
@@ -2549,18 +3085,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .extent_locked = 0,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
-       struct writeback_control wbc_writepages = {
-               .sync_mode      = wbc->sync_mode,
-               .older_than_this = NULL,
-               .nr_to_write    = 64,
-               .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
-               .range_end      = (loff_t)-1,
-       };
 
        ret = __extent_writepage(page, wbc, &epd);
 
-       extent_write_cache_pages(tree, mapping, &wbc_writepages,
-                                __extent_writepage, &epd, flush_write_bio);
        flush_epd_write_bio(&epd);
        return ret;
 }
@@ -2584,7 +3111,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
        };
        struct writeback_control wbc_writepages = {
                .sync_mode      = mode,
-               .older_than_this = NULL,
                .nr_to_write    = nr_pages * 2,
                .range_start    = start,
                .range_end      = end + 1,
@@ -2976,7 +3502,7 @@ out:
        return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
 {
        struct page *p;
@@ -3001,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
        return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
        return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
                (start >> PAGE_CACHE_SHIFT);
@@ -3022,8 +3548,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                return NULL;
        eb->start = start;
        eb->len = len;
-       spin_lock_init(&eb->lock);
-       init_waitqueue_head(&eb->lock_wq);
+       rwlock_init(&eb->lock);
+       atomic_set(&eb->write_locks, 0);
+       atomic_set(&eb->read_locks, 0);
+       atomic_set(&eb->blocking_readers, 0);
+       atomic_set(&eb->blocking_writers, 0);
+       atomic_set(&eb->spinning_readers, 0);
+       atomic_set(&eb->spinning_writers, 0);
+       init_waitqueue_head(&eb->write_lock_wq);
+       init_waitqueue_head(&eb->read_lock_wq);
 
 #if LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3652,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                i = 0;
        }
        for (; i < num_pages; i++, index++) {
-               p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
+               p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
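
Note: dropping __GFP_HIGHMEM here is what makes the kmap_atomic() to
page_address() conversions further down safe: extent buffer pages now always
come from lowmem, which has a permanent kernel mapping. The difference in one
line each (sketch):

    kaddr = kmap_atomic(page, KM_USER0);  /* highmem: temporary mapping window,
                                             needs a matching kunmap_atomic() */
    kaddr = page_address(page);           /* lowmem: always mapped, no unmap  */
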
@@ -3247,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                                                PAGECACHE_TAG_DIRTY);
                }
                spin_unlock_irq(&page->mapping->tree_lock);
+               ClearPageError(page);
                unlock_page(page);
        }
        return 0;
@@ -3266,6 +3800,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
        return was_dirty;
 }
 
+static int __eb_straddles_pages(u64 start, u64 len)
+{
+       if (len < PAGE_CACHE_SIZE)
+               return 1;
+       if (start & (PAGE_CACHE_SIZE - 1))
+               return 1;
+       if ((start + len) & (PAGE_CACHE_SIZE - 1))
+               return 1;
+       return 0;
+}
+
+static int eb_straddles_pages(struct extent_buffer *eb)
+{
+       return __eb_straddles_pages(eb->start, eb->len);
+}
+
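
Note: a buffer "straddles" here when it does not own its pages outright
(sub-page size, or unaligned start/end), which is when per-range
EXTENT_UPTODATE tracking in the io tree is still needed; fully page-aligned
buffers can rely on the per-page uptodate bit alone. A quick user-space
check, assuming a 4096-byte PAGE_CACHE_SIZE:

    #include <assert.h>

    #define PAGE_CACHE_SIZE 4096ULL

    /* mirror of __eb_straddles_pages() */
    static int straddles(unsigned long long start, unsigned long long len)
    {
            if (len < PAGE_CACHE_SIZE)
                    return 1;
            if (start & (PAGE_CACHE_SIZE - 1))
                    return 1;
            if ((start + len) & (PAGE_CACHE_SIZE - 1))
                    return 1;
            return 0;
    }

    int main(void)
    {
            assert(straddles(4096, 4096) == 0);  /* exactly one whole page  */
            assert(straddles(4096, 2048) == 1);  /* sub-page buffer         */
            assert(straddles(6144, 4096) == 1);  /* unaligned start and end */
            return 0;
    }
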
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
                                struct extent_buffer *eb,
                                struct extent_state **cached_state)
@@ -3277,8 +3827,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
        num_pages = num_extent_pages(eb->start, eb->len);
        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
-       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                             cached_state, GFP_NOFS);
+       if (eb_straddles_pages(eb)) {
+               clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                                     cached_state, GFP_NOFS);
+       }
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (page)
@@ -3296,8 +3848,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 
        num_pages = num_extent_pages(eb->start, eb->len);
 
-       set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                           NULL, GFP_NOFS);
+       if (eb_straddles_pages(eb)) {
+               set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                                   NULL, GFP_NOFS);
+       }
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3874,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        int uptodate;
        unsigned long index;
 
-       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
-       if (ret)
-               return 1;
+       if (__eb_straddles_pages(start, end - start + 1)) {
+               ret = test_range_bit(tree, start, end,
+                                    EXTENT_UPTODATE, 1, NULL);
+               if (ret)
+                       return 1;
+       }
        while (start <= end) {
                index = start >> PAGE_CACHE_SHIFT;
                page = find_get_page(tree->mapping, index);
@@ -3350,10 +3907,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 1;
 
-       ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                          EXTENT_UPTODATE, 1, cached_state);
-       if (ret)
-               return ret;
+       if (eb_straddles_pages(eb)) {
+               ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+                                  EXTENT_UPTODATE, 1, cached_state);
+               if (ret)
+                       return ret;
+       }
 
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
@@ -3367,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-                            struct extent_buffer *eb,
-                            u64 start, int wait,
+                            struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num)
 {
        unsigned long i;
@@ -3386,9 +3944,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
 
-       if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                          EXTENT_UPTODATE, 1, NULL)) {
-               return 0;
+       if (eb_straddles_pages(eb)) {
+               if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+                                  EXTENT_UPTODATE, 1, NULL)) {
+                       return 0;
+               }
        }
 
        if (start) {
@@ -3402,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if (!wait) {
+               if (wait == WAIT_NONE) {
                        if (!trylock_page(page))
                                goto unlock_exit;
                } else {
@@ -3446,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        if (bio)
                submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-       if (ret || !wait)
+       if (ret || wait != WAIT_COMPLETE)
                return ret;
 
        for (i = start_i; i < num_pages; i++) {
@@ -3492,9 +4052,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
                page = extent_buffer_page(eb, i);
 
                cur = min(len, (PAGE_CACHE_SIZE - offset));
-               kaddr = kmap_atomic(page, KM_USER1);
+               kaddr = page_address(page);
                memcpy(dst, kaddr + offset, cur);
-               kunmap_atomic(kaddr, KM_USER1);
 
                dst += cur;
                len -= cur;
@@ -3504,9 +4063,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 }
 
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-                              unsigned long min_len, char **token, char **map,
+                              unsigned long min_len, char **map,
                               unsigned long *map_start,
-                              unsigned long *map_len, int km)
+                              unsigned long *map_len)
 {
        size_t offset = start & (PAGE_CACHE_SIZE - 1);
        char *kaddr;
@@ -3536,42 +4095,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
        }
 
        p = extent_buffer_page(eb, i);
-       kaddr = kmap_atomic(p, km);
-       *token = kaddr;
+       kaddr = page_address(p);
        *map = kaddr + offset;
        *map_len = PAGE_CACHE_SIZE - offset;
        return 0;
 }
 
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-                     unsigned long min_len,
-                     char **token, char **map,
-                     unsigned long *map_start,
-                     unsigned long *map_len, int km)
-{
-       int err;
-       int save = 0;
-       if (eb->map_token) {
-               unmap_extent_buffer(eb, eb->map_token, km);
-               eb->map_token = NULL;
-               save = 1;
-       }
-       err = map_private_extent_buffer(eb, start, min_len, token, map,
-                                      map_start, map_len, km);
-       if (!err && save) {
-               eb->map_token = *token;
-               eb->kaddr = *map;
-               eb->map_start = *map_start;
-               eb->map_len = *map_len;
-       }
-       return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
-       kunmap_atomic(token, km);
-}
-
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
                          unsigned long start,
                          unsigned long len)
@@ -3595,9 +4124,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
                cur = min(len, (PAGE_CACHE_SIZE - offset));
 
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = page_address(page);
                ret = memcmp(ptr, kaddr + offset, cur);
-               kunmap_atomic(kaddr, KM_USER0);
                if (ret)
                        break;
 
@@ -3630,9 +4158,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
                WARN_ON(!PageUptodate(page));
 
                cur = min(len, PAGE_CACHE_SIZE - offset);
-               kaddr = kmap_atomic(page, KM_USER1);
+               kaddr = page_address(page);
                memcpy(kaddr + offset, src, cur);
-               kunmap_atomic(kaddr, KM_USER1);
 
                src += cur;
                len -= cur;
@@ -3661,9 +4188,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
                WARN_ON(!PageUptodate(page));
 
                cur = min(len, PAGE_CACHE_SIZE - offset);
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = page_address(page);
                memset(kaddr + offset, c, cur);
-               kunmap_atomic(kaddr, KM_USER0);
 
                len -= cur;
                offset = 0;
@@ -3694,9 +4220,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
 
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = page_address(page);
                read_extent_buffer(src, kaddr + offset, src_offset, cur);
-               kunmap_atomic(kaddr, KM_USER0);
 
                src_offset += cur;
                len -= cur;
@@ -3709,20 +4234,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
 {
-       char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+       char *dst_kaddr = page_address(dst_page);
        if (dst_page == src_page) {
                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
        } else {
-               char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+               char *src_kaddr = page_address(src_page);
                char *p = dst_kaddr + dst_off + len;
                char *s = src_kaddr + src_off + len;
 
                while (len--)
                        *--p = *--s;
-
-               kunmap_atomic(src_kaddr, KM_USER1);
        }
-       kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +4257,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
 {
-       char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+       char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
 
        if (dst_page != src_page) {
-               src_kaddr = kmap_atomic(src_page, KM_USER1);
+               src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
                BUG_ON(areas_overlap(src_off, dst_off, len));
        }
 
        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-       kunmap_atomic(dst_kaddr, KM_USER0);
-       if (dst_page != src_page)
-               kunmap_atomic(src_kaddr, KM_USER1);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,