Btrfs end_bio_extent_readpage should look for locked bits
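
The read-side endio handler used to pass NULL into the readpage_end_io_hook and then search the extent tree a second time to unlock the range. It now looks up the EXTENT_LOCKED extent_state once under tree->lock, pins it with cache_state(), hands it to the hook, and lets unlock_extent_cached() consume the pinned reference; set_extent_uptodate() grows a cached_state argument so the uptodate and unlock steps can share the same state.

Also folded into this view: count_range_bits() gains a contig mode and now requires all requested bits to be set, extent_fiemap() is reworked around a new get_extent_skip_holes() helper so holes are skipped and reported ranges never move backwards or overlap, map_private_extent_buffer() returns -EINVAL after its WARN_ON instead of running on, the extent-buffer copy paths gain an areas_overlap() check, and __extent_writepage() gets a tracepoint.
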
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 92ac519..5ae0bff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
        }
 }
 
+static void uncache_state(struct extent_state **cached_ptr)
+{
+       if (cached_ptr && (*cached_ptr)) {
+               struct extent_state *state = *cached_ptr;
+               *cached_ptr = NULL;
+               free_extent_state(state);
+       }
+}
+
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
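
For readers new to the cached-state pattern used throughout this patch: cache_state() takes an extra reference on an extent_state and stores it in the caller's pointer so a later unlock_extent_cached() can skip the tree search, and the new uncache_state() is the early-exit counterpart that drops that reference when no unlock will run. A minimal userspace analogue of the ownership handoff (illustrative types only, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    struct state { int refs; };

    /* rough stand-in for cache_state(): pin an extra ref in *cached */
    static void cache_state(struct state *s, struct state **cached)
    {
        if (cached && !*cached) {
            s->refs++;
            *cached = s;
        }
    }

    /* mirrors the new uncache_state(): release the pinned ref */
    static void uncache_state(struct state **cached)
    {
        if (cached && *cached) {
            struct state *s = *cached;
            *cached = NULL;
            if (--s->refs == 0)
                free(s);
        }
    }

    int main(void)
    {
        struct state *s = calloc(1, sizeof(*s));
        struct state *cached = NULL;

        s->refs = 1;              /* ref held by the tree */
        cache_state(s, &cached);  /* refs == 2 */
        uncache_state(&cached);   /* early exit: refs back to 1 */
        printf("refs now %d\n", s->refs);
        free(s);
        return 0;
    }

On the normal path the reference is instead consumed by unlock_extent_cached(), which is why the error path in end_bio_extent_readpage() below has to call uncache_state() before its continue.
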
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-                       gfp_t mask)
+                       struct extent_state **cached_state, gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-                             NULL, mask);
+       return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
+                             NULL, cached_state, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
                                mask);
 }
 
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-                 gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 {
        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
                                mask);
@@ -1433,12 +1441,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
  */
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
-                    unsigned long bits)
+                    unsigned long bits, int contig)
 {
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 total_bytes = 0;
+       u64 last = 0;
        int found = 0;
 
        if (search_end <= cur_start) {
@@ -1463,7 +1472,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->start > search_end)
                        break;
-               if (state->end >= cur_start && (state->state & bits)) {
+               if (contig && found && state->start > last + 1)
+                       break;
+               if (state->end >= cur_start && (state->state & bits) == bits) {
                        total_bytes += min(search_end, state->end) + 1 -
                                       max(cur_start, state->start);
                        if (total_bytes >= max_bytes)
@@ -1472,6 +1483,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
                                *start = state->start;
                                found = 1;
                        }
+                       last = state->end;
+               } else if (contig && found) {
+                       break;
                }
                node = rb_next(node);
                if (!node)
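
Two behavioural changes ride along in this hunk: the bit test tightens from "any of the requested bits" to "all of the requested bits" ((state->state & bits) == bits), and the new contig flag makes the walk stop at the first gap (state->start > last + 1) or at the first non-matching state after a match, instead of summing everything up to search_end. A small self-contained model of the contiguous rule, assuming a flat array in place of the rbtree:

    #include <stdio.h>

    struct range { unsigned long long start, end; unsigned bits; };

    static unsigned long long count_bits(const struct range *r, int n,
                                         unsigned bits, int contig)
    {
        unsigned long long total = 0, last = 0;
        int found = 0;

        for (int i = 0; i < n; i++) {
            if (contig && found && r[i].start > last + 1)
                break;              /* gap: contiguous run ended */
            if ((r[i].bits & bits) == bits) {
                total += r[i].end + 1 - r[i].start;
                found = 1;
                last = r[i].end;
            } else if (contig && found) {
                break;              /* bits missing: run ended */
            }
        }
        return total;
    }

    int main(void)
    {
        /* two adjacent 4K ranges, then one more after an 8K hole */
        struct range r[] = {
            { 0,     4095,  1 },
            { 4096,  8191,  1 },
            { 16384, 20479, 1 },
        };
        printf("contig=0: %llu\n", count_bits(r, 3, 1, 0)); /* 12288 */
        printf("contig=1: %llu\n", count_bits(r, 3, 1, 1)); /* 8192 */
        return 0;
    }
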
@@ -1729,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
        do {
                struct page *page = bvec->bv_page;
+               struct extent_state *cached = NULL;
+               struct extent_state *state;
+
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1743,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                if (++bvec <= bvec_end)
                        prefetchw(&bvec->bv_page->flags);
 
+               spin_lock(&tree->lock);
+               state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
+               if (state && state->start == start) {
+                       /*
+                        * take a reference on the state; unlock will drop
+                        * the ref
+                        */
+                       cache_state(state, &cached);
+               }
+               spin_unlock(&tree->lock);
+
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
-                                                             NULL);
+                                                             state);
                        if (ret)
                                uptodate = 0;
                }
@@ -1758,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
                                        uptodate = 0;
+                               uncache_state(&cached);
                                continue;
                        }
                }
 
                if (uptodate) {
-                       set_extent_uptodate(tree, start, end,
+                       set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
-               unlock_extent(tree, start, end, GFP_ATOMIC);
+               unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
                if (whole_page) {
                        if (uptodate) {
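
This is the heart of the patch: one rbtree search under tree->lock now feeds both readpage_end_io_hook (which previously got NULL) and the final unlock. Taking the reference while the lock is held is what makes it safe to touch the state afterwards, since the tree may free it the moment the lock drops. A userspace sketch of that lookup-and-pin discipline, assuming a pthread mutex in place of the spinlock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int refs; int data; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *table;        /* stands in for the extent tree */

    static struct obj *lookup_and_pin(void)
    {
        struct obj *o;

        pthread_mutex_lock(&lock);
        o = table;                   /* find_first_extent_bit_state() */
        if (o)
            o->refs++;               /* cache_state(): pin under the lock */
        pthread_mutex_unlock(&lock);
        return o;                    /* usable: we own a reference */
    }

    static void unpin(struct obj *o) /* unlock_extent_cached() drops this */
    {
        pthread_mutex_lock(&lock);
        if (--o->refs == 0)
            free(o);
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        table = calloc(1, sizeof(*table));
        table->refs = 1;             /* reference held by the table */
        table->data = 42;

        struct obj *o = lookup_and_pin();
        printf("data %d, refs %d\n", o->data, o->refs);
        unpin(o);                    /* our pin; the table's ref remains */
        return 0;
    }
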
@@ -1805,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
 
        do {
                struct page *page = bvec->bv_page;
+               struct extent_state *cached = NULL;
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1815,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
                        prefetchw(&bvec->bv_page->flags);
 
                if (uptodate) {
-                       set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+                       set_extent_uptodate(tree, start, end, &cached,
+                                           GFP_ATOMIC);
                } else {
                        ClearPageUptodate(page);
                        SetPageError(page);
                }
 
-               unlock_extent(tree, start, end, GFP_ATOMIC);
+               unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
        } while (bvec >= bio->bi_io_vec);
 
@@ -2010,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        while (cur <= end) {
                if (cur >= last_byte) {
                        char *userpage;
+                       struct extent_state *cached = NULL;
+
                        iosize = PAGE_CACHE_SIZE - page_offset;
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + page_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
-                                           GFP_NOFS);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                                           &cached, GFP_NOFS);
+                       unlock_extent_cached(tree, cur, cur + iosize - 1,
+                                            &cached, GFP_NOFS);
                        break;
                }
                em = get_extent(inode, page, page_offset, cur,
@@ -2057,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
                        char *userpage;
+                       struct extent_state *cached = NULL;
+
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + page_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
-                                           GFP_NOFS);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                                           &cached, GFP_NOFS);
+                       unlock_extent_cached(tree, cur, cur + iosize - 1,
+                                            &cached, GFP_NOFS);
                        cur = cur + iosize;
                        page_offset += iosize;
                        continue;
@@ -2186,6 +2223,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        else
                write_flags = WRITE;
 
+       trace___extent_writepage(page, inode, wbc);
+
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
@@ -2781,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
                        iocount++;
                        block_start = block_start + iosize;
                } else {
-                       set_extent_uptodate(tree, block_start, cur_end,
+                       struct extent_state *cached = NULL;
+
+                       set_extent_uptodate(tree, block_start, cur_end, &cached,
                                            GFP_NOFS);
-                       unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+                       unlock_extent_cached(tree, block_start, cur_end,
+                                            &cached, GFP_NOFS);
                        block_start = cur_end + 1;
                }
                page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -2912,6 +2954,46 @@ out:
        return sector;
 }
 
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This keeps mapping until it finds a real extent or passes 'last'.
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+                                               u64 offset,
+                                               u64 last,
+                                               get_extent_t *get_extent)
+{
+       u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+       struct extent_map *em;
+       u64 len;
+
+       if (offset >= last)
+               return NULL;
+
+       while (1) {
+               len = last - offset;
+               if (len == 0)
+                       break;
+               len = (len + sectorsize - 1) & ~(sectorsize - 1);
+               em = get_extent(inode, NULL, 0, offset, len, 0);
+               if (!em || IS_ERR(em))
+                       return em;
+
+               /* if this isn't a hole return it */
+               if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+                   em->block_start != EXTENT_MAP_HOLE) {
+                       return em;
+               }
+
+               /* this is a hole, advance to the next extent */
+               offset = extent_map_end(em);
+               free_extent_map(em);
+               if (offset >= last)
+                       break;
+       }
+       return NULL;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
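
One detail in get_extent_skip_holes() above worth calling out: the length passed to get_extent is rounded up to a whole sector with the usual power-of-two trick, len = (len + sectorsize - 1) & ~(sectorsize - 1). Adding sectorsize - 1 pushes any partial sector over the boundary, and the mask clears the low bits back down to it. For instance (assuming the common 4096-byte sectorsize):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long sectorsize = 4096;  /* must be a power of two */
        unsigned long long len = 5000;

        len = (len + sectorsize - 1) & ~(sectorsize - 1);
        printf("%llu\n", len);                 /* prints 8192 */
        return 0;
    }
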
@@ -2921,16 +3003,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        u32 flags = 0;
        u32 found_type;
        u64 last;
+       u64 last_for_get_extent = 0;
        u64 disko = 0;
+       u64 isize = i_size_read(inode);
        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *item;
        int end = 0;
-       u64 em_start = 0, em_len = 0;
+       u64 em_start = 0;
+       u64 em_len = 0;
+       u64 em_end = 0;
        unsigned long emflags;
-       int hole = 0;
 
        if (len == 0)
                return -EINVAL;
@@ -2940,6 +3025,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                return -ENOMEM;
        path->leave_spinning = 1;
 
+       /*
+        * look up the last file extent.  We're not using i_size here
+        * because there might be preallocation past i_size
+        */
        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
                                       path, inode->i_ino, -1, 0);
        if (ret < 0) {
@@ -2953,18 +3042,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
        found_type = btrfs_key_type(&found_key);
 
-       /* No extents, just return */
+       /* No extents, but there might be delalloc bits */
        if (found_key.objectid != inode->i_ino ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
-               btrfs_free_path(path);
-               return 0;
+               /* have to trust i_size as the end */
+               last = (u64)-1;
+               last_for_get_extent = isize;
+       } else {
+               /*
+                * remember the start of the last extent.  There are a
+                * bunch of different factors that go into the length of the
+                * extent, so it's much less complex to remember where it started
+                */
+               last = found_key.offset;
+               last_for_get_extent = last + 1;
        }
-       last = found_key.offset;
        btrfs_free_path(path);
 
+       /*
+        * we might have some extents allocated but more delalloc past those
+        * extents.  So we trust isize unless the start of the last extent is
+        * beyond isize
+        */
+       if (last < isize) {
+               last = (u64)-1;
+               last_for_get_extent = isize;
+       }
+
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
-       em = get_extent(inode, NULL, 0, off, max - off, 0);
+
+       em = get_extent_skip_holes(inode, off, last_for_get_extent,
+                                  get_extent);
        if (!em)
                goto out;
        if (IS_ERR(em)) {
@@ -2973,22 +3082,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        }
 
        while (!end) {
-               hole = 0;
-               off = em->start + em->len;
-               if (off >= max)
-                       end = 1;
+               u64 offset_in_extent;
 
-               if (em->block_start == EXTENT_MAP_HOLE) {
-                       hole = 1;
-                       goto next;
-               }
+               /* break if the extent we found is outside the range */
+               if (em->start >= max || extent_map_end(em) < off)
+                       break;
 
-               em_start = em->start;
-               em_len = em->len;
+               /*
+                * get_extent may return an extent that starts before our
+                * requested range.  We have to make sure the ranges
+                * we return to fiemap always move forward and don't
+                * overlap, so adjust the offsets here
+                */
+               em_start = max(em->start, off);
 
+               /*
+                * record the offset from the start of the extent
+                * for adjusting the disk offset below
+                */
+               offset_in_extent = em_start - em->start;
+               em_end = extent_map_end(em);
+               em_len = em_end - em_start;
+               emflags = em->flags;
                disko = 0;
                flags = 0;
 
+               /*
+                * bump off for our next call to get_extent
+                */
+               off = extent_map_end(em);
+               if (off >= max)
+                       end = 1;
+
                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
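
The clamping above is what keeps fiemap output monotonic when get_extent hands back an extent that starts before the requested offset: the logical start moves forward to off, and offset_in_extent adds the same delta to the disk address so logical and physical stay aligned. Worked through with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical mapping: logical [0, 1M) stored on disk at 16M */
        unsigned long long em_logical  = 0;
        unsigned long long em_len      = 1ULL << 20;
        unsigned long long block_start = 16ULL << 20;
        unsigned long long off = 512ULL << 10;  /* caller asked from 512K */

        unsigned long long em_start = off > em_logical ? off : em_logical;
        unsigned long long offset_in_extent = em_start - em_logical;
        unsigned long long em_end = em_logical + em_len;

        /* prints: logical 524288 len 524288 disk 17301504 */
        printf("logical %llu len %llu disk %llu\n",
               em_start, em_end - em_start,
               block_start + offset_in_extent);
        return 0;
    }
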
@@ -2999,42 +3124,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                } else {
-                       disko = em->block_start;
+                       disko = em->block_start + offset_in_extent;
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
 
-next:
-               emflags = em->flags;
                free_extent_map(em);
                em = NULL;
-               if (!end) {
-                       em = get_extent(inode, NULL, 0, off, max - off, 0);
-                       if (!em)
-                               goto out;
-                       if (IS_ERR(em)) {
-                               ret = PTR_ERR(em);
-                               goto out;
-                       }
-                       emflags = em->flags;
-               }
-
-               if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+               if ((em_start >= last) || em_len == (u64)-1 ||
+                  (last == (u64)-1 && isize <= em_end)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
 
-               if (em_start == last) {
+               /* now scan forward to see if this is really the last extent. */
+               em = get_extent_skip_holes(inode, off, last_for_get_extent,
+                                          get_extent);
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
+                       goto out;
+               }
+               if (!em) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
-
-               if (!hole) {
-                       ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-                                               em_len, flags);
-                       if (ret)
-                               goto out_free;
-               }
+               ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                                             em_len, flags);
+               if (ret)
+                       goto out_free;
        }
 out_free:
        free_extent_map(em);
@@ -3374,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
        num_pages = num_extent_pages(eb->start, eb->len);
 
        set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                           GFP_NOFS);
+                           NULL, GFP_NOFS);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3609,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
                       "wanted %lu %lu\n", (unsigned long long)eb->start,
                       eb->len, start, min_len);
                WARN_ON(1);
+               return -EINVAL;
        }
 
        p = extent_buffer_page(eb, i);
@@ -3801,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
        kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+       unsigned long distance = (src > dst) ? src - dst : dst - src;
+       return distance < len;
+}
+
 static void copy_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
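
The helper collapses overlap detection for two equal-length regions into a single comparison: [src, src+len) and [dst, dst+len) intersect exactly when the distance between their starts is smaller than len. A quick check of the boundary cases:

    #include <stdbool.h>
    #include <stdio.h>

    static bool areas_overlap(unsigned long src, unsigned long dst,
                              unsigned long len)
    {
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
    }

    int main(void)
    {
        printf("%d\n", areas_overlap(0, 4, 5)); /* 1: [0,5) meets [4,9)    */
        printf("%d\n", areas_overlap(0, 5, 5)); /* 0: the ranges only touch */
        printf("%d\n", areas_overlap(8, 2, 4)); /* 0: symmetric in src/dst  */
        return 0;
    }
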
@@ -3808,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
        char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
        char *src_kaddr;
 
-       if (dst_page != src_page)
+       if (dst_page != src_page) {
                src_kaddr = kmap_atomic(src_page, KM_USER1);
-       else
+       } else {
                src_kaddr = dst_kaddr;
+               BUG_ON(areas_overlap(src_off, dst_off, len));
+       }
 
        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
        kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3886,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-       if (dst_offset < src_offset) {
+       if (!areas_overlap(src_offset, dst_offset, len)) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
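
With that predicate in place, memmove_extent_buffer() takes the fast memcpy path whenever the two ranges are disjoint rather than only when dst_offset precedes src_offset, and copy_pages() traps same-page overlapping copies with a BUG_ON, since plain memcpy gives undefined results on overlapping regions.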