Btrfs: don't release pages when we can't clear the uptodate bits
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af..e7aeba2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
                          struct address_space *mapping, gfp_t mask)
 {
        tree->state = RB_ROOT;
-       tree->buffer = RB_ROOT;
+       INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
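
Note on this hunk: the per-tree extent buffer index moves from an rbtree to a
radix tree keyed by page index. The GFP_ATOMIC passed to INIT_RADIX_TREE only
governs allocations attempted with the tree lock held; the insert path added
later in this patch sidesteps that by preloading outside the lock. A sketch of
the pattern used below in alloc_extent_buffer:

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;			/* preload can fail; bail early */
	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();
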
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
        return ret;
 }
 
-static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
-                                         u64 offset, struct rb_node *node)
-{
-       struct rb_root *root = &tree->buffer;
-       struct rb_node **p = &root->rb_node;
-       struct rb_node *parent = NULL;
-       struct extent_buffer *eb;
-
-       while (*p) {
-               parent = *p;
-               eb = rb_entry(parent, struct extent_buffer, rb_node);
-
-               if (offset < eb->start)
-                       p = &(*p)->rb_left;
-               else if (offset > eb->start)
-                       p = &(*p)->rb_right;
-               else
-                       return eb;
-       }
-
-       rb_link_node(node, parent, p);
-       rb_insert_color(node, root);
-       return NULL;
-}
-
-static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
-                                          u64 offset)
-{
-       struct rb_root *root = &tree->buffer;
-       struct rb_node *n = root->rb_node;
-       struct extent_buffer *eb;
-
-       while (n) {
-               eb = rb_entry(n, struct extent_buffer, rb_node);
-               if (offset < eb->start)
-                       n = n->rb_left;
-               else if (offset > eb->start)
-                       n = n->rb_right;
-               else
-                       return eb;
-       }
-       return NULL;
-}
-
 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
                     struct extent_state *other)
 {
@@ -1872,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
        bio_put(bio);
 }
 
-static struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-                gfp_t gfp_flags)
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+               gfp_t gfp_flags)
 {
        struct bio *bio;
 
@@ -1901,17 +1857,15 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
        u64 start;
-       u64 end;
 
        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
-       end = start + bvec->bv_len - 1;
 
        bio->bi_private = NULL;
 
        bio_get(bio);
 
        if (tree->ops && tree->ops->submit_bio_hook)
-               tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+               ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
                                           mirror_num, bio_flags, start);
        else
                submit_bio(rw, bio);
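
submit_one_bio always had a return value, but without this assignment the
hook's result was thrown away and ret stayed zero. Callers can now see the
failure; extent_read_full_page (changed below) does exactly this:

	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
				      &bio_flags);
	if (bio)
		ret = submit_one_bio(READ, bio, 0, bio_flags);
	return ret;	/* hook errors propagate instead of vanishing */
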
@@ -1965,7 +1919,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        else
                nr = bio_get_nr_vecs(bdev);
 
-       bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+       bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+       if (!bio)
+               return -ENOMEM;
 
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
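
Two related fixes here: the allocator loses its static and becomes
btrfs_bio_alloc so other btrfs code can share it, and its result is finally
checked. __GFP_HIGH only raises allocation priority; bio_alloc can still
return NULL under memory pressure, and without the check the NULL would have
gone straight into bio_add_page. Minimal caller-side shape:

	struct bio *bio = btrfs_bio_alloc(bdev, sector, nr_vecs,
					  GFP_NOFS | __GFP_HIGH);
	if (!bio)		/* __GFP_HIGH is a hint, not a guarantee */
		return -ENOMEM;
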
@@ -1990,6 +1946,7 @@ void set_page_extent_mapped(struct page *page)
 
 static void set_page_extent_head(struct page *page, unsigned long len)
 {
+       WARN_ON(!PagePrivate(page));
        set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
 
@@ -2074,8 +2031,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
 
-               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        this_bio_flag = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&this_bio_flag,
+                                                em->compress_type);
+               }
 
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
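
On its own, EXTENT_BIO_COMPRESSED says only that the extent is compressed;
once more than one algorithm exists, the bio flags must also carry which one.
The companion helpers live in extent_io.h and, per the matching compression
patchset, look roughly like this (a sketch, not part of this diff):

	#define EXTENT_BIO_COMPRESSED	1
	#define EXTENT_BIO_FLAG_SHIFT	16

	static inline void extent_set_compress_type(unsigned long *bio_flags,
						    int compress_type)
	{
		*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
	}

	static inline int extent_compress_type(unsigned long bio_flags)
	{
		return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
	}
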
@@ -2169,7 +2129,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
                                      &bio_flags);
        if (bio)
-               submit_one_bio(READ, bio, 0, bio_flags);
+               ret = submit_one_bio(READ, bio, 0, bio_flags);
        return ret;
 }
 
@@ -2204,7 +2164,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 iosize;
-       u64 unlock_start;
        sector_t sector;
        struct extent_state *cached_state = NULL;
        struct extent_map *em;
@@ -2329,7 +2288,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (tree->ops && tree->ops->writepage_end_io_hook)
                        tree->ops->writepage_end_io_hook(page, start,
                                                         page_end, NULL, 1);
-               unlock_start = page_end + 1;
                goto done;
        }
 
@@ -2340,7 +2298,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         page_end, NULL, 1);
-                       unlock_start = page_end + 1;
                        break;
                }
                em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2344,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
                        cur += iosize;
                        pg_offset += iosize;
-                       unlock_start = cur;
                        continue;
                }
                /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2429,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        int scanned = 0;
-       int range_whole = 0;
 
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
@@ -2482,8 +2437,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
-               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                       range_whole = 1;
                scanned = 1;
        }
 retry:
@@ -2823,6 +2776,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
                                         NULL, 1,
                                         end_bio_extent_preparewrite, 0,
                                         0, 0);
+                       if (ret && !err)
+                               err = ret;
                        iocount++;
                        block_start = block_start + iosize;
                } else {
@@ -2867,9 +2822,17 @@ int try_release_extent_state(struct extent_map_tree *map,
                 * at this point we can safely clear everything except the
                 * locked bit and the nodatasum bit
                 */
-               clear_extent_bit(tree, start, end,
+               ret = clear_extent_bit(tree, start, end,
                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
                                 0, 0, NULL, mask);
+
+               /* if clear_extent_bit failed for ENOMEM reasons,
+                * we can't allow the release to continue.
+                */
+               if (ret < 0)
+                       ret = 0;
+               else
+                       ret = 1;
        }
        return ret;
 }
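
This hunk is the change the subject line describes. clear_extent_bit may need
to allocate extent_state records to split a range, so it can fail with
-ENOMEM; if the uptodate and other bits were never actually cleared,
reporting success would let the VM free a page that the extent state still
references. The resulting contract for ->releasepage:

	ret = clear_extent_bit(tree, start, end,
			       ~(EXTENT_LOCKED | EXTENT_NODATASUM),
			       0, 0, NULL, mask);
	if (ret < 0)
		ret = 0;	/* clearing failed: keep the page */
	else
		ret = 1;	/* state cleared: safe to release */
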
@@ -2952,21 +2915,53 @@ out:
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
-       int ret;
+       int ret = 0;
        u64 off = start;
        u64 max = start + len;
        u32 flags = 0;
+       u32 found_type;
+       u64 last;
        u64 disko = 0;
+       struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
+       struct btrfs_path *path;
+       struct btrfs_file_extent_item *item;
        int end = 0;
        u64 em_start = 0, em_len = 0;
        unsigned long emflags;
-       ret = 0;
+       int hole = 0;
 
        if (len == 0)
                return -EINVAL;
 
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->leave_spinning = 1;
+
+       ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
+                                      path, inode->i_ino, -1, 0);
+       if (ret < 0) {
+               btrfs_free_path(path);
+               return ret;
+       }
+       WARN_ON(!ret);
+       path->slots[0]--;
+       item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                             struct btrfs_file_extent_item);
+       btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+       found_type = btrfs_key_type(&found_key);
+
+       /* No extents, just return */
+       if (found_key.objectid != inode->i_ino ||
+           found_type != BTRFS_EXTENT_DATA_KEY) {
+               btrfs_free_path(path);
+               return 0;
+       }
+       last = found_key.offset;
+       btrfs_free_path(path);
+
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
        em = get_extent(inode, NULL, 0, off, max - off, 0);
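
fiemap now starts with a btree search for the inode's last file extent:
looking up offset (u64)-1 lands one slot past the final BTRFS_EXTENT_DATA_KEY
(hence the WARN_ON(!ret) and the slots[0]-- to step back onto it). The key
ordering this relies on, as a sketch:

	/* items for one inode are sorted by (objectid, type, offset):
	 *   (i_ino, BTRFS_EXTENT_DATA_KEY, 0)
	 *   (i_ino, BTRFS_EXTENT_DATA_KEY, ...)
	 *   (i_ino, BTRFS_EXTENT_DATA_KEY, last)   <- found_key
	 * found_key.offset is the file offset of the last extent, saved in
	 * 'last' so the loop below can raise FIEMAP_EXTENT_LAST correctly.
	 */
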
@@ -2976,11 +2971,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                ret = PTR_ERR(em);
                goto out;
        }
+
        while (!end) {
+               hole = 0;
                off = em->start + em->len;
                if (off >= max)
                        end = 1;
 
+               if (em->block_start == EXTENT_MAP_HOLE) {
+                       hole = 1;
+                       goto next;
+               }
+
                em_start = em->start;
                em_len = em->len;
 
@@ -2990,8 +2992,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
-               } else if (em->block_start == EXTENT_MAP_HOLE) {
-                       flags |= FIEMAP_EXTENT_UNWRITTEN;
                } else if (em->block_start == EXTENT_MAP_INLINE) {
                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
                                  FIEMAP_EXTENT_NOT_ALIGNED);
@@ -3004,10 +3004,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
 
+next:
                emflags = em->flags;
                free_extent_map(em);
                em = NULL;
-
                if (!end) {
                        em = get_extent(inode, NULL, 0, off, max - off, 0);
                        if (!em)
@@ -3018,15 +3018,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        }
                        emflags = em->flags;
                }
+
                if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
 
-               ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-                                       em_len, flags);
-               if (ret)
-                       goto out_free;
+               if (em_start == last) {
+                       flags |= FIEMAP_EXTENT_LAST;
+                       end = 1;
+               }
+
+               if (!hole) {
+                       ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                                               em_len, flags);
+                       if (ret)
+                               goto out_free;
+               }
        }
 out_free:
        free_extent_map(em);
@@ -3078,10 +3086,13 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif
 
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+       if (eb == NULL)
+               return NULL;
        eb->start = start;
        eb->len = len;
        spin_lock_init(&eb->lock);
        init_waitqueue_head(&eb->lock_wq);
+       INIT_RCU_HEAD(&eb->rcu_head);
 
 #if LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
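
Two small but load-bearing additions in this hunk: kmem_cache_zalloc can
return NULL (previously the result was dereferenced unconditionally), and the
embedded rcu_head is initialized up front so the release path can defer the
free later in this patch:

	INIT_RCU_HEAD(&eb->rcu_head);	/* prepared at allocation ... */
	/* ... consumed in try_release_extent_buffer: */
	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
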
@@ -3104,6 +3115,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
        kmem_cache_free(extent_buffer_cache, eb);
 }
 
+/*
+ * Helper for releasing extent buffer pages.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+                                               unsigned long start_idx)
+{
+       unsigned long index;
+       struct page *page;
+
+       if (!eb->first_page)
+               return;
+
+       index = num_extent_pages(eb->start, eb->len);
+       if (start_idx >= index)
+               return;
+
+       do {
+               index--;
+               page = extent_buffer_page(eb, index);
+               if (page)
+                       page_cache_release(page);
+       } while (index != start_idx);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+       btrfs_release_extent_buffer_page(eb, 0);
+       __free_extent_buffer(eb);
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len,
                                          struct page *page0,
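
These helpers centralize the page reference drops that were open-coded in the
allocation error path (and are needed again by the new RCU callback). Walking
the pages back to front matches what the old code did by hand:

	/* the open-coded form this replaces, from the old free_eb path:
	 *	for (index = 1; index < i; index++)
	 *		page_cache_release(extent_buffer_page(eb, index));
	 *	page_cache_release(extent_buffer_page(eb, 0));
	 *	__free_extent_buffer(eb);
	 */
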
@@ -3117,16 +3161,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        struct page *p;
        struct address_space *mapping = tree->mapping;
        int uptodate = 1;
+       int ret;
 
-       spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
-       if (eb) {
-               atomic_inc(&eb->refs);
-               spin_unlock(&tree->buffer_lock);
+       rcu_read_lock();
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (eb && atomic_inc_not_zero(&eb->refs)) {
+               rcu_read_unlock();
                mark_page_accessed(eb->first_page);
                return eb;
        }
-       spin_unlock(&tree->buffer_lock);
+       rcu_read_unlock();
 
        eb = __alloc_extent_buffer(tree, start, len, mask);
        if (!eb)
@@ -3160,31 +3204,59 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                }
                if (!PageUptodate(p))
                        uptodate = 0;
-               unlock_page(p);
+
+               /*
+                * See below about how we avoid a nasty race with
+                * releasepage and why we unlock later.
+                */
+               if (i != 0)
+                       unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
+       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       if (ret)
+               goto free_eb;
+
        spin_lock(&tree->buffer_lock);
-       exists = buffer_tree_insert(tree, start, &eb->rb_node);
-       if (exists) {
+       ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+       if (ret == -EEXIST) {
+               exists = radix_tree_lookup(&tree->buffer,
+                                               start >> PAGE_CACHE_SHIFT);
                /* add one reference for the caller */
                atomic_inc(&exists->refs);
                spin_unlock(&tree->buffer_lock);
+               radix_tree_preload_end();
                goto free_eb;
        }
        /* add one reference for the tree */
        atomic_inc(&eb->refs);
        spin_unlock(&tree->buffer_lock);
+       radix_tree_preload_end();
+
+       /*
+        * There is a race where releasepage may have
+        * tried to find this extent buffer in the radix
+        * tree but failed.  It will tell the VM it is safe to
+        * reclaim the page, and it will clear the page private bit.
+        * We must make sure to set the page private bit properly
+        * after the extent buffer is in the radix tree so
+        * it doesn't get lost.
+        */
+       set_page_extent_mapped(eb->first_page);
+       set_page_extent_head(eb->first_page, eb->len);
+       if (!page0)
+               unlock_page(eb->first_page);
        return eb;
 
 free_eb:
+       if (eb->first_page && !page0)
+               unlock_page(eb->first_page);
+
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
-       for (index = 1; index < i; index++)
-               page_cache_release(extent_buffer_page(eb, index));
-       page_cache_release(extent_buffer_page(eb, 0));
-       __free_extent_buffer(eb);
+       btrfs_release_extent_buffer(eb);
        return exists;
 }
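
The deferred unlock of first_page is the heart of this hunk. A loose timeline
of the race being closed, following the comment above:

	/*
	 *   alloc_extent_buffer()          try_release_extent_buffer()
	 *   ---------------------          ---------------------------
	 *   pages added, first one locked
	 *                                  radix_tree_lookup() -> NULL
	 *                                  VM told page is reclaimable;
	 *                                  page private bit cleared
	 *   radix_tree_insert(eb)
	 *   set_page_extent_mapped()       <- private bit restored only
	 *   set_page_extent_head()            after eb is findable
	 *   unlock_page(eb->first_page)
	 */
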
 
@@ -3194,16 +3266,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 {
        struct extent_buffer *eb;
 
-       spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
-       if (eb)
-               atomic_inc(&eb->refs);
-       spin_unlock(&tree->buffer_lock);
-
-       if (eb)
+       rcu_read_lock();
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (eb && atomic_inc_not_zero(&eb->refs)) {
+               rcu_read_unlock();
                mark_page_accessed(eb->first_page);
+               return eb;
+       }
+       rcu_read_unlock();
 
-       return eb;
+       return NULL;
 }
 
 void free_extent_buffer(struct extent_buffer *eb)
@@ -3232,10 +3304,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                        continue;
 
                lock_page(page);
+               WARN_ON(!PagePrivate(page));
+
+               set_page_extent_mapped(page);
                if (i == 0)
                        set_page_extent_head(page, eb->len);
-               else
-                       set_page_private(page, EXTENT_PAGE_PRIVATE);
 
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
@@ -3425,6 +3498,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
+
+               WARN_ON(!PagePrivate(page));
+
+               set_page_extent_mapped(page);
+               if (i == 0)
+                       set_page_extent_head(page, eb->len);
+
                if (inc_all_pages)
                        page_cache_get(page);
                if (!PageUptodate(page)) {
@@ -3833,34 +3913,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+
+       btrfs_release_extent_buffer(eb);
+}
+
 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 {
        u64 start = page_offset(page);
        struct extent_buffer *eb;
        int ret = 1;
-       unsigned long i;
-       unsigned long num_pages;
 
        spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
-       if (!eb)
-               goto out;
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (!eb) {
+               spin_unlock(&tree->buffer_lock);
+               return ret;
+       }
 
-       if (atomic_read(&eb->refs) > 1) {
+       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                ret = 0;
                goto out;
        }
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+
+       /*
+        * Set @eb->refs to 0 if it is already 1, and then release the @eb.
+        * Otherwise someone else still holds a reference; back off.
+        */
+       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
                ret = 0;
                goto out;
        }
-       /* at this point we can safely release the extent buffer */
-       num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++)
-               page_cache_release(extent_buffer_page(eb, i));
-       rb_erase(&eb->rb_node, &tree->buffer);
-       __free_extent_buffer(eb);
+
+       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 out:
        spin_unlock(&tree->buffer_lock);
+
+       /* at this point we can safely release the extent buffer */
+       if (atomic_read(&eb->refs) == 0)
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
        return ret;
 }
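
Taken together, the release side is now: claim the last reference with a
cmpxchg under buffer_lock, unlink from the radix tree, then let RCU defer the
actual free past any readers still inside rcu_read_lock() with a stale
pointer. Condensed:

	spin_lock(&tree->buffer_lock);
	if (atomic_cmpxchg(&eb->refs, 1, 0) == 1)	/* we hold the only ref */
		radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	spin_unlock(&tree->buffer_lock);
	if (atomic_read(&eb->refs) == 0)
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
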