Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 30 Oct 2010 16:05:48 +0000 (09:05 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 30 Oct 2010 16:05:48 +0000 (09:05 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (39 commits)
  Btrfs: deal with errors from updating the tree log
  Btrfs: allow subvol deletion by unprivileged user with -o user_subvol_rm_allowed
  Btrfs: make SNAP_DESTROY async
  Btrfs: add SNAP_CREATE_ASYNC ioctl
  Btrfs: add START_SYNC, WAIT_SYNC ioctls
  Btrfs: async transaction commit
  Btrfs: fix deadlock in btrfs_commit_transaction
  Btrfs: fix lockdep warning on clone ioctl
  Btrfs: fix clone ioctl where range is adjacent to extent
  Btrfs: fix delalloc checks in clone ioctl
  Btrfs: drop unused variable in block_alloc_rsv
  Btrfs: cleanup warnings from gcc 4.6 (nonbugs)
  Btrfs: Fix variables set but not read (bugs found by gcc 4.6)
  Btrfs: Use ERR_CAST helpers
  Btrfs: use memdup_user helpers
  Btrfs: fix raid code for removing missing drives
  Btrfs: Switch the extent buffer rbtree into a radix tree
  Btrfs: restructure try_release_extent_buffer()
  Btrfs: use the flusher threads for delalloc throttling
  Btrfs: tune the chunk allocation to 5% of the FS as metadata
  ...

Fix up trivial conflicts in fs/btrfs/super.c and fs/fs-writeback.c, and
remove use of INIT_RCU_HEAD in fs/btrfs/extent_io.c (that init macro was
useless and removed in commit 5e8067adfdba: "rcu head remove init")
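Among the changes above, the new SNAP_CREATE_ASYNC, START_SYNC and WAIT_SYNC
ioctls expose the async transaction commit to user space.  As a rough
illustration only, here is a minimal user-space sketch of the
START_SYNC/WAIT_SYNC pair; the ioctl definitions are copied locally on the
assumption that they match fs/btrfs/ioctl.h as merged here (magic 0x94,
numbers 24 and 25), so verify them against the real header before relying
on this.

    /* sketch: start a btrfs transaction commit without blocking, then wait
     * for that specific transaction to hit disk */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ioctl.h>
    #include <linux/types.h>

    /* assumed to match fs/btrfs/ioctl.h at the time of this merge */
    #define BTRFS_IOCTL_MAGIC 0x94
    #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
    #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)

    int main(int argc, char **argv)
    {
            __u64 transid = 0;
            int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

            if (fd < 0)
                    return 1;

            /* kick off the commit; the kernel fills in the transaction id */
            if (ioctl(fd, BTRFS_IOC_START_SYNC, &transid) < 0)
                    return 1;

            /* ... other work can go here ... */

            /* block until that transaction is fully committed */
            if (ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid) < 0)
                    return 1;

            printf("transaction %llu committed\n", (unsigned long long)transid);
            close(fd);
            return 0;
    }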

fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/fs-writeback.c
include/linux/writeback.h

diff --combined fs/btrfs/disk-io.c
@@@ -338,7 -338,6 +338,6 @@@ static int csum_dirty_buffer(struct btr
        struct extent_io_tree *tree;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 found_start;
-       int found_level;
        unsigned long len;
        struct extent_buffer *eb;
        int ret;
                WARN_ON(1);
                goto err;
        }
-       found_level = btrfs_header_level(eb);
        csum_tree_block(root, eb, 0);
  err:
        free_extent_buffer(eb);
@@@ -481,9 -478,12 +478,12 @@@ static void end_workqueue_bio(struct bi
        end_io_wq->work.flags = 0;
  
        if (bio->bi_rw & REQ_WRITE) {
-               if (end_io_wq->metadata)
+               if (end_io_wq->metadata == 1)
                        btrfs_queue_worker(&fs_info->endio_meta_write_workers,
                                           &end_io_wq->work);
+               else if (end_io_wq->metadata == 2)
+                       btrfs_queue_worker(&fs_info->endio_freespace_worker,
+                                          &end_io_wq->work);
                else
                        btrfs_queue_worker(&fs_info->endio_write_workers,
                                           &end_io_wq->work);
        }
  }
  
+ /*
+  * For the metadata arg you want
+  *
+  * 0 - if data
+  * 1 - if normal metadata
+  * 2 - if writing to the free space cache area
+  */
  int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        int metadata)
  {
@@@ -533,11 -540,9 +540,9 @@@ int btrfs_congested_async(struct btrfs_
  
  static void run_one_async_start(struct btrfs_work *work)
  {
-       struct btrfs_fs_info *fs_info;
        struct async_submit_bio *async;
  
        async = container_of(work, struct  async_submit_bio, work);
-       fs_info = BTRFS_I(async->inode)->root->fs_info;
        async->submit_bio_start(async->inode, async->rw, async->bio,
                               async->mirror_num, async->bio_flags,
                               async->bio_offset);
@@@ -850,12 -855,8 +855,8 @@@ struct extent_buffer *read_tree_block(s
                                      u32 blocksize, u64 parent_transid)
  {
        struct extent_buffer *buf = NULL;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       struct extent_io_tree *io_tree;
        int ret;
  
-       io_tree = &BTRFS_I(btree_inode)->io_tree;
        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
        if (!buf)
                return NULL;
@@@ -1377,7 -1378,6 +1378,6 @@@ static int bio_ready_for_csum(struct bi
        u64 start = 0;
        struct page *page;
        struct extent_io_tree *io_tree = NULL;
-       struct btrfs_fs_info *info = NULL;
        struct bio_vec *bvec;
        int i;
        int ret;
                buf_len = page->private >> 2;
                start = page_offset(page) + bvec->bv_offset;
                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-               info = BTRFS_I(page->mapping->host)->root->fs_info;
        }
        /* are we fully contained in this bio? */
        if (buf_len <= length)
@@@ -1680,12 -1679,12 +1679,12 @@@ struct btrfs_root *open_ctree(struct su
  
        init_waitqueue_head(&fs_info->transaction_throttle);
        init_waitqueue_head(&fs_info->transaction_wait);
+       init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
  
        __setup_root(4096, 4096, 4096, 4096, tree_root,
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
  
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
        if (!bh)
                goto fail_iput;
        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
+       btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+                          1, &fs_info->generic_worker);
  
        /*
         * endios are largely parallel and should have a very
        btrfs_start_workers(&fs_info->endio_meta_workers, 1);
        btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
        btrfs_start_workers(&fs_info->endio_write_workers, 1);
+       btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
  
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
        if (!(sb->s_flags & MS_RDONLY)) {
                down_read(&fs_info->cleanup_work_sem);
                btrfs_orphan_cleanup(fs_info->fs_root);
+               btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
        }
  
@@@ -2035,6 -2038,7 +2038,7 @@@ fail_sb_buffer
        btrfs_stop_workers(&fs_info->endio_meta_workers);
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
+       btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
  fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@@ -2063,7 -2067,7 +2067,7 @@@ static void btrfs_end_buffer_write_sync
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
 -              if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 +              if (printk_ratelimit()) {
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
@@@ -2200,10 -2204,21 +2204,10 @@@ static int write_dev_supers(struct btrf
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
  
 -              if (i == last_barrier && do_barriers && device->barriers) {
 -                      ret = submit_bh(WRITE_BARRIER, bh);
 -                      if (ret == -EOPNOTSUPP) {
 -                              printk("btrfs: disabling barriers on dev %s\n",
 -                                     device->name);
 -                              set_buffer_uptodate(bh);
 -                              device->barriers = 0;
 -                              /* one reference for submit_bh */
 -                              get_bh(bh);
 -                              lock_buffer(bh);
 -                              ret = submit_bh(WRITE_SYNC, bh);
 -                      }
 -              } else {
 +              if (i == last_barrier && do_barriers)
 +                      ret = submit_bh(WRITE_FLUSH_FUA, bh);
 +              else
                        ret = submit_bh(WRITE_SYNC, bh);
 -              }
  
                if (ret)
                        errors++;
@@@ -2410,6 -2425,7 +2414,7 @@@ int close_ctree(struct btrfs_root *root
        fs_info->closing = 1;
        smp_mb();
  
+       btrfs_put_block_group_cache(fs_info);
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret =  btrfs_commit_super(root);
                if (ret)
        btrfs_stop_workers(&fs_info->endio_meta_workers);
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
+       btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
  
        btrfs_close_devices(fs_info->fs_devices);
diff --combined fs/btrfs/extent-tree.c
@@@ -242,6 -242,12 +242,12 @@@ get_caching_control(struct btrfs_block_
                return NULL;
        }
  
+       /* We're loading it the fast way, so we don't have a caching_ctl. */
+       if (!cache->caching_ctl) {
+               spin_unlock(&cache->lock);
+               return NULL;
+       }
        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
@@@ -421,7 -427,9 +427,9 @@@ err
        return 0;
  }
  
- static int cache_block_group(struct btrfs_block_group_cache *cache)
+ static int cache_block_group(struct btrfs_block_group_cache *cache,
+                            struct btrfs_trans_handle *trans,
+                            int load_cache_only)
  {
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;
  
+       /*
+        * We can't do the read from on-disk cache during a commit since we need
+        * to have the normal tree locking.
+        */
+       if (!trans->transaction->in_commit) {
+               spin_lock(&cache->lock);
+               if (cache->cached != BTRFS_CACHE_NO) {
+                       spin_unlock(&cache->lock);
+                       return 0;
+               }
+               cache->cached = BTRFS_CACHE_STARTED;
+               spin_unlock(&cache->lock);
+               ret = load_free_space_cache(fs_info, cache);
+               spin_lock(&cache->lock);
+               if (ret == 1) {
+                       cache->cached = BTRFS_CACHE_FINISHED;
+                       cache->last_byte_to_unpin = (u64)-1;
+               } else {
+                       cache->cached = BTRFS_CACHE_NO;
+               }
+               spin_unlock(&cache->lock);
+               if (ret == 1)
+                       return 0;
+       }
+       if (load_cache_only)
+               return 0;
        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);
  
@@@ -509,7 -547,7 +547,7 @@@ static struct btrfs_space_info *__find_
  
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-               if (found->flags == flags) {
+               if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
@@@ -542,6 -580,15 +580,15 @@@ static u64 div_factor(u64 num, int fact
        return num;
  }
  
+ static u64 div_factor_fine(u64 num, int factor)
+ {
+       if (factor == 100)
+               return num;
+       num *= factor;
+       do_div(num, 100);
+       return num;
+ }
  u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
  {
@@@ -1695,7 -1742,8 +1742,7 @@@ static int remove_extent_backref(struc
  static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
  {
 -      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 -                      BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 +      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
  }
  
  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@@ -2687,6 -2735,109 +2734,109 @@@ next_block_group(struct btrfs_root *roo
        return cache;
  }
  
+ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_trans_handle *trans,
+                           struct btrfs_path *path)
+ {
+       struct btrfs_root *root = block_group->fs_info->tree_root;
+       struct inode *inode = NULL;
+       u64 alloc_hint = 0;
+       int num_pages = 0;
+       int retries = 0;
+       int ret = 0;
+       /*
+        * If this block group is smaller than 100 megs don't bother caching the
+        * block group.
+        */
+       if (block_group->key.offset < (100 * 1024 * 1024)) {
+               spin_lock(&block_group->lock);
+               block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+               spin_unlock(&block_group->lock);
+               return 0;
+       }
+ again:
+       inode = lookup_free_space_inode(root, block_group, path);
+       if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+               ret = PTR_ERR(inode);
+               btrfs_release_path(root, path);
+               goto out;
+       }
+       if (IS_ERR(inode)) {
+               BUG_ON(retries);
+               retries++;
+               if (block_group->ro)
+                       goto out_free;
+               ret = create_free_space_inode(root, trans, block_group, path);
+               if (ret)
+                       goto out_free;
+               goto again;
+       }
+       /*
+        * We want to set the generation to 0, that way if anything goes wrong
+        * from here on out we know not to trust this cache when we load up next
+        * time.
+        */
+       BTRFS_I(inode)->generation = 0;
+       ret = btrfs_update_inode(trans, root, inode);
+       WARN_ON(ret);
+       if (i_size_read(inode) > 0) {
+               ret = btrfs_truncate_free_space_cache(root, trans, path,
+                                                     inode);
+               if (ret)
+                       goto out_put;
+       }
+       spin_lock(&block_group->lock);
+       if (block_group->cached != BTRFS_CACHE_FINISHED) {
+               spin_unlock(&block_group->lock);
+               goto out_put;
+       }
+       spin_unlock(&block_group->lock);
+       num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+       if (!num_pages)
+               num_pages = 1;
+       /*
+        * Just to make absolutely sure we have enough space, we're going to
+        * preallocate 12 pages worth of space for each block group.  In
+        * practice we ought to use at most 8, but we need extra space so we can
+        * add our header and have a terminator between the extents and the
+        * bitmaps.
+        */
+       num_pages *= 16;
+       num_pages *= PAGE_CACHE_SIZE;
+       ret = btrfs_check_data_free_space(inode, num_pages);
+       if (ret)
+               goto out_put;
+       ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+                                             num_pages, num_pages,
+                                             &alloc_hint);
+       btrfs_free_reserved_data_space(inode, num_pages);
+ out_put:
+       iput(inode);
+ out_free:
+       btrfs_release_path(root, path);
+ out:
+       spin_lock(&block_group->lock);
+       if (ret)
+               block_group->disk_cache_state = BTRFS_DC_ERROR;
+       else
+               block_group->disk_cache_state = BTRFS_DC_SETUP;
+       spin_unlock(&block_group->lock);
+       return ret;
+ }
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
  {
        if (!path)
                return -ENOMEM;
  
+ again:
+       while (1) {
+               cache = btrfs_lookup_first_block_group(root->fs_info, last);
+               while (cache) {
+                       if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                               break;
+                       cache = next_block_group(root, cache);
+               }
+               if (!cache) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
+               err = cache_save_setup(cache, trans, path);
+               last = cache->key.objectid + cache->key.offset;
+               btrfs_put_block_group(cache);
+       }
        while (1) {
                if (last == 0) {
                        err = btrfs_run_delayed_refs(trans, root,
  
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                while (cache) {
+                       if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+                               btrfs_put_block_group(cache);
+                               goto again;
+                       }
                        if (cache->dirty)
                                break;
                        cache = next_block_group(root, cache);
                        continue;
                }
  
+               if (cache->disk_cache_state == BTRFS_DC_SETUP)
+                       cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
                cache->dirty = 0;
                last = cache->key.objectid + cache->key.offset;
  
                btrfs_put_block_group(cache);
        }
  
+       while (1) {
+               /*
+                * I don't think this is needed since we're just marking our
+                * preallocated extent as written, but it can't hurt just in
+                * case.
+                */
+               if (last == 0) {
+                       err = btrfs_run_delayed_refs(trans, root,
+                                                    (unsigned long)-1);
+                       BUG_ON(err);
+               }
+               cache = btrfs_lookup_first_block_group(root->fs_info, last);
+               while (cache) {
+                       /*
+                        * Really this shouldn't happen, but it could if we
+                        * couldn't write the entire preallocated extent and
+                        * splitting the extent resulted in a new block.
+                        */
+                       if (cache->dirty) {
+                               btrfs_put_block_group(cache);
+                               goto again;
+                       }
+                       if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+                               break;
+                       cache = next_block_group(root, cache);
+               }
+               if (!cache) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
+               btrfs_write_out_cache(root, trans, cache, path);
+               /*
+                * If we didn't have an error then the cache state is still
+                * NEED_WRITE, so we can set it to WRITTEN.
+                */
+               if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+                       cache->disk_cache_state = BTRFS_DC_WRITTEN;
+               last = cache->key.objectid + cache->key.offset;
+               btrfs_put_block_group(cache);
+       }
        btrfs_free_path(path);
        return 0;
  }
@@@ -2762,6 -2985,7 +2984,7 @@@ static int update_space_info(struct btr
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
+               found->disk_total += total_bytes * factor;
                found->bytes_used += bytes_used;
                found->disk_used += bytes_used * factor;
                found->full = 0;
                                BTRFS_BLOCK_GROUP_SYSTEM |
                                BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
+       found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
        found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
@@@ -2882,11 -3107,16 +3106,16 @@@ int btrfs_check_data_free_space(struct 
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 used;
-       int ret = 0, committed = 0;
+       int ret = 0, committed = 0, alloc_chunk = 1;
  
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
+       if (root == root->fs_info->tree_root) {
+               alloc_chunk = 0;
+               committed = 1;
+       }
        data_sinfo = BTRFS_I(inode)->space_info;
        if (!data_sinfo)
                goto alloc;
@@@ -2905,7 -3135,7 +3134,7 @@@ again
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-               if (!data_sinfo->full) {
+               if (!data_sinfo->full && alloc_chunk) {
                        u64 alloc_target;
  
                        data_sinfo->force_alloc = 1;
@@@ -2997,10 -3227,11 +3226,11 @@@ static void force_metadata_allocation(s
        rcu_read_unlock();
  }
  
- static int should_alloc_chunk(struct btrfs_space_info *sinfo,
-                             u64 alloc_bytes)
+ static int should_alloc_chunk(struct btrfs_root *root,
+                             struct btrfs_space_info *sinfo, u64 alloc_bytes)
  {
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+       u64 thresh;
  
        if (sinfo->bytes_used + sinfo->bytes_reserved +
            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
            alloc_bytes < div_factor(num_bytes, 8))
                return 0;
  
+       thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+               return 0;
        return 1;
  }
  
@@@ -3041,12 -3278,20 +3277,20 @@@ static int do_chunk_alloc(struct btrfs_
                goto out;
        }
  
-       if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+       if (!force && !should_alloc_chunk(extent_root, space_info,
+                                         alloc_bytes)) {
                spin_unlock(&space_info->lock);
                goto out;
        }
        spin_unlock(&space_info->lock);
  
+       /*
+        * If we have mixed data/metadata chunks we want to make sure we keep
+        * allocating mixed chunks instead of individual chunks.
+        */
+       if (btrfs_mixed_space_info(space_info))
+               flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
        return ret;
  }
  
- static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_space_info *sinfo, u64 num_bytes)
- {
-       int ret;
-       int end_trans = 0;
-       if (sinfo->full)
-               return 0;
-       spin_lock(&sinfo->lock);
-       ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-       spin_unlock(&sinfo->lock);
-       if (!ret)
-               return 0;
-       if (!trans) {
-               trans = btrfs_join_transaction(root, 1);
-               BUG_ON(IS_ERR(trans));
-               end_trans = 1;
-       }
-       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                            num_bytes + 2 * 1024 * 1024,
-                            get_alloc_profile(root, sinfo->flags), 0);
-       if (end_trans)
-               btrfs_end_transaction(trans, root);
-       return ret == 1 ? 1 : 0;
- }
  /*
   * shrink metadata reservation for delalloc
   */
  static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, u64 to_reclaim)
+                          struct btrfs_root *root, u64 to_reclaim, int sync)
  {
        struct btrfs_block_rsv *block_rsv;
+       struct btrfs_space_info *space_info;
        u64 reserved;
        u64 max_reclaim;
        u64 reclaimed = 0;
        int pause = 1;
-       int ret;
+       int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
  
        block_rsv = &root->fs_info->delalloc_block_rsv;
-       spin_lock(&block_rsv->lock);
-       reserved = block_rsv->reserved;
-       spin_unlock(&block_rsv->lock);
+       space_info = block_rsv->space_info;
+       smp_mb();
+       reserved = space_info->bytes_reserved;
  
        if (reserved == 0)
                return 0;
        max_reclaim = min(reserved, to_reclaim);
  
        while (1) {
-               ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
-               if (!ret) {
-                       __set_current_state(TASK_INTERRUPTIBLE);
-                       schedule_timeout(pause);
-                       pause <<= 1;
-                       if (pause > HZ / 10)
-                               pause = HZ / 10;
-               } else {
-                       pause = 1;
-               }
+               /* have the flusher threads jump in and do some IO */
+               smp_mb();
+               nr_pages = min_t(unsigned long, nr_pages,
+                      root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
  
-               spin_lock(&block_rsv->lock);
-               if (reserved > block_rsv->reserved)
-                       reclaimed = reserved - block_rsv->reserved;
-               reserved = block_rsv->reserved;
-               spin_unlock(&block_rsv->lock);
+               spin_lock(&space_info->lock);
+               if (reserved > space_info->bytes_reserved)
+                       reclaimed += reserved - space_info->bytes_reserved;
+               reserved = space_info->bytes_reserved;
+               spin_unlock(&space_info->lock);
  
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
  
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
+               __set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(pause);
+               pause <<= 1;
+               if (pause > HZ / 10)
+                       pause = HZ / 10;
        }
        return reclaimed >= to_reclaim;
  }
  
- static int should_retry_reserve(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes, int *retries)
+ /*
+  * Retries tells us how many times we've called reserve_metadata_bytes.  The
+  * idea is if this is the first call (retries == 0) then we will add to our
+  * reserved count if we can't make the allocation in order to hold our place
+  * while we go and try and free up space.  That way for retries > 1 we don't try
+  * and add space, we just check to see if the amount of unused space is >= the
+  * total space, meaning that our reservation is valid.
+  *
+  * However if we don't intend to retry this reservation, pass -1 as retries so
+  * that it short circuits this logic.
+  */
+ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_block_rsv *block_rsv,
+                                 u64 orig_bytes, int flush)
  {
        struct btrfs_space_info *space_info = block_rsv->space_info;
-       int ret;
+       u64 unused;
+       u64 num_bytes = orig_bytes;
+       int retries = 0;
+       int ret = 0;
+       bool reserved = false;
+       bool committed = false;
  
-       if ((*retries) > 2)
-               return -ENOSPC;
+ again:
+       ret = -ENOSPC;
+       if (reserved)
+               num_bytes = 0;
  
-       ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-       if (ret)
-               return 1;
+       spin_lock(&space_info->lock);
+       unused = space_info->bytes_used + space_info->bytes_reserved +
+                space_info->bytes_pinned + space_info->bytes_readonly +
+                space_info->bytes_may_use;
  
-       if (trans && trans->transaction->in_commit)
-               return -ENOSPC;
+       /*
+        * The idea here is that if we haven't already over-reserved the
+        * block group, we can go ahead and save our reservation first and
+        * then start flushing if we need to.  Otherwise, if we've already
+        * overcommitted, let's start flushing stuff first and then come
+        * back and try to make our reservation.
+        */
+       if (unused <= space_info->total_bytes) {
+               unused -= space_info->total_bytes;
+               if (unused >= num_bytes) {
+                       if (!reserved)
+                               space_info->bytes_reserved += orig_bytes;
+                       ret = 0;
+               } else {
+                       /*
+                        * Ok set num_bytes to orig_bytes since we aren't
+                        * overcommitted, this way we only try and reclaim what
+                        * we need.
+                        */
+                       num_bytes = orig_bytes;
+               }
+       } else {
+               /*
+                * Ok we're over committed, set num_bytes to the overcommitted
+                * amount plus the amount of bytes that we need for this
+                * reservation.
+                */
+               num_bytes = unused - space_info->total_bytes +
+                       (orig_bytes * (retries + 1));
+       }
  
-       ret = shrink_delalloc(trans, root, num_bytes);
-       if (ret)
-               return ret;
+       /*
+        * Couldn't make our reservation, save our place so while we're trying
+        * to reclaim space we can actually use it instead of somebody else
+        * stealing it from us.
+        */
+       if (ret && !reserved) {
+               space_info->bytes_reserved += orig_bytes;
+               reserved = true;
+       }
  
-       spin_lock(&space_info->lock);
-       if (space_info->bytes_pinned < num_bytes)
-               ret = 1;
        spin_unlock(&space_info->lock);
-       if (ret)
-               return -ENOSPC;
-       (*retries)++;
  
-       if (trans)
-               return -EAGAIN;
+       if (!ret)
+               return 0;
  
-       trans = btrfs_join_transaction(root, 1);
-       BUG_ON(IS_ERR(trans));
-       ret = btrfs_commit_transaction(trans, root);
-       BUG_ON(ret);
+       if (!flush)
+               goto out;
  
-       return 1;
- }
+       /*
+        * We do synchronous shrinking since we don't actually unreserve
+        * metadata until after the IO is completed.
+        */
+       ret = shrink_delalloc(trans, root, num_bytes, 1);
+       if (ret > 0)
+               return 0;
+       else if (ret < 0)
+               goto out;
  
- static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes)
- {
-       struct btrfs_space_info *space_info = block_rsv->space_info;
-       u64 unused;
-       int ret = -ENOSPC;
+       /*
+        * So if we were overcommitted it's possible that somebody else flushed
+        * out enough space and we simply didn't have enough space to reclaim,
+        * so go back around and try again.
+        */
+       if (retries < 2) {
+               retries++;
+               goto again;
+       }
  
        spin_lock(&space_info->lock);
-       unused = space_info->bytes_used + space_info->bytes_reserved +
-                space_info->bytes_pinned + space_info->bytes_readonly;
+       /*
+        * Not enough space to be reclaimed, don't bother committing the
+        * transaction.
+        */
+       if (space_info->bytes_pinned < orig_bytes)
+               ret = -ENOSPC;
+       spin_unlock(&space_info->lock);
+       if (ret)
+               goto out;
  
-       if (unused < space_info->total_bytes)
-               unused = space_info->total_bytes - unused;
-       else
-               unused = 0;
+       ret = -EAGAIN;
+       if (trans || committed)
+               goto out;
  
-       if (unused >= num_bytes) {
-               if (block_rsv->priority >= 10) {
-                       space_info->bytes_reserved += num_bytes;
-                       ret = 0;
-               } else {
-                       if ((unused + block_rsv->reserved) *
-                           block_rsv->priority >=
-                           (num_bytes + block_rsv->reserved) * 10) {
-                               space_info->bytes_reserved += num_bytes;
-                               ret = 0;
-                       }
-               }
+       ret = -ENOSPC;
+       trans = btrfs_join_transaction(root, 1);
+       if (IS_ERR(trans))
+               goto out;
+       ret = btrfs_commit_transaction(trans, root);
+       if (!ret) {
+               trans = NULL;
+               committed = true;
+               goto again;
+       }
+ out:
+       if (reserved) {
+               spin_lock(&space_info->lock);
+               space_info->bytes_reserved -= orig_bytes;
+               spin_unlock(&space_info->lock);
        }
-       spin_unlock(&space_info->lock);
  
        return ret;
  }
@@@ -3327,18 -3607,14 +3606,14 @@@ struct btrfs_block_rsv *btrfs_alloc_blo
  {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u64 alloc_target;
  
        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
        if (!block_rsv)
                return NULL;
  
        btrfs_init_block_rsv(block_rsv);
-       alloc_target = btrfs_get_alloc_profile(root, 0);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
        return block_rsv;
  }
  
@@@ -3369,23 -3645,19 +3644,19 @@@ void btrfs_add_durable_block_rsv(struc
  int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes, int *retries)
+                       u64 num_bytes)
  {
        int ret;
  
        if (num_bytes == 0)
                return 0;
- again:
-       ret = reserve_metadata_bytes(block_rsv, num_bytes);
+       ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 1);
                return 0;
        }
  
-       ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-       if (ret > 0)
-               goto again;
        return ret;
  }
  
@@@ -3420,7 -3692,8 +3691,8 @@@ int btrfs_block_rsv_check(struct btrfs_
                return 0;
  
        if (block_rsv->refill_used) {
-               ret = reserve_metadata_bytes(block_rsv, num_bytes);
+               ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                            num_bytes, 0);
                if (!ret) {
                        block_rsv_add_bytes(block_rsv, num_bytes, 0);
                        return 0;
@@@ -3499,6 -3772,8 +3771,8 @@@ static u64 calc_global_metadata_size(st
  
        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        spin_lock(&sinfo->lock);
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+               data_used = 0;
        meta_used = sinfo->bytes_used;
        spin_unlock(&sinfo->lock);
  
@@@ -3526,7 -3801,8 +3800,8 @@@ static void update_global_block_rsv(str
        block_rsv->size = num_bytes;
  
        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-                   sinfo->bytes_reserved + sinfo->bytes_readonly;
+                   sinfo->bytes_reserved + sinfo->bytes_readonly +
+                   sinfo->bytes_may_use;
  
        if (sinfo->total_bytes > num_bytes) {
                num_bytes = sinfo->total_bytes - num_bytes;
@@@ -3597,7 -3873,7 +3872,7 @@@ static u64 calc_trans_metadata_size(str
  
  int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
-                                int num_items, int *retries)
+                                int num_items)
  {
        u64 num_bytes;
        int ret;
  
        num_bytes = calc_trans_metadata_size(root, num_items);
        ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-                                 num_bytes, retries);
+                                 num_bytes);
        if (!ret) {
                trans->bytes_reserved += num_bytes;
                trans->block_rsv = &root->fs_info->trans_block_rsv;
@@@ -3681,14 -3957,13 +3956,13 @@@ int btrfs_delalloc_reserve_metadata(str
        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
        u64 to_reserve;
        int nr_extents;
-       int retries = 0;
        int ret;
  
        if (btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
  
        num_bytes = ALIGN(num_bytes, root->sectorsize);
- again:
        spin_lock(&BTRFS_I(inode)->accounting_lock);
        nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
        if (nr_extents > BTRFS_I(inode)->reserved_extents) {
                nr_extents = 0;
                to_reserve = 0;
        }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
  
        to_reserve += calc_csum_metadata_size(inode, num_bytes);
-       ret = reserve_metadata_bytes(block_rsv, to_reserve);
-       if (ret) {
-               spin_unlock(&BTRFS_I(inode)->accounting_lock);
-               ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-                                          &retries);
-               if (ret > 0)
-                       goto again;
+       ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+       if (ret)
                return ret;
-       }
  
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
        BTRFS_I(inode)->reserved_extents += nr_extents;
        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        spin_unlock(&BTRFS_I(inode)->accounting_lock);
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
  
        if (block_rsv->size > 512 * 1024 * 1024)
-               shrink_delalloc(NULL, root, to_reserve);
+               shrink_delalloc(NULL, root, to_reserve, 0);
  
        return 0;
  }
@@@ -3776,12 -4047,12 +4046,12 @@@ static int update_block_group(struct bt
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc)
  {
-       struct btrfs_block_group_cache *cache;
+       struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_fs_info *info = root->fs_info;
-       int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
+       int factor;
  
        /* block accounting for super block */
        spin_lock(&info->delalloc_lock);
                        factor = 2;
                else
                        factor = 1;
+               /*
+                * If this block group has free space cache written out, we
+                * need to make sure to load it if we are removing space.  This
+                * is because we need the unpinning stage to actually add the
+                * space back to the block group, otherwise we will leak space.
+                */
+               if (!alloc && cache->cached == BTRFS_CACHE_NO)
+                       cache_block_group(cache, trans, 1);
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
  
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
+               if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+                   cache->disk_cache_state < BTRFS_DC_CLEAR)
+                       cache->disk_cache_state = BTRFS_DC_CLEAR;
                cache->dirty = 1;
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
@@@ -4554,6 -4839,7 +4838,7 @@@ static noinline int find_free_extent(st
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
+       bool use_cluster = true;
        u64 ideal_cache_percent = 0;
        u64 ideal_cache_offset = 0;
  
                return -ENOSPC;
        }
  
+       /*
+        * If the space info is for both data and metadata it means we have a
+        * small filesystem and we can't use the clustering stuff.
+        */
+       if (btrfs_mixed_space_info(space_info))
+               use_cluster = false;
        if (orig_root->ref_cows || empty_size)
                allowed_chunk_alloc = 1;
  
-       if (data & BTRFS_BLOCK_GROUP_METADATA) {
+       if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
                        empty_cluster = 64 * 1024;
        }
  
-       if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+       if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+           btrfs_test_opt(root, SSD)) {
                last_ptr = &root->fs_info->data_alloc_cluster;
        }
  
@@@ -4641,6 -4935,10 +4934,10 @@@ have_block_group
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
  
+                       ret = cache_block_group(block_group, trans, 1);
+                       if (block_group->cached == BTRFS_CACHE_FINISHED)
+                               goto have_block_group;
                        free_percent = btrfs_block_group_used(&block_group->item);
                        free_percent *= 100;
                        free_percent = div64_u64(free_percent,
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                               ret = cache_block_group(block_group);
+                               ret = cache_block_group(block_group, trans, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@@ -5218,7 -5516,7 +5515,7 @@@ int btrfs_alloc_logged_file_extent(stru
        u64 num_bytes = ins->offset;
  
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group);
+       cache_block_group(block_group, trans, 0);
        caching_ctl = get_caching_control(block_group);
  
        if (!caching_ctl) {
@@@ -5308,7 -5606,8 +5605,8 @@@ use_block_rsv(struct btrfs_trans_handl
        block_rsv = get_block_rsv(trans, root);
  
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(block_rsv, blocksize);
+               ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                            blocksize, 0);
                if (ret)
                        return ERR_PTR(ret);
                return block_rsv;
        if (!ret)
                return block_rsv;
  
-       WARN_ON(1);
-       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-               block_rsv->size, block_rsv->reserved,
-               block_rsv->freed[0], block_rsv->freed[1]);
        return ERR_PTR(-ENOSPC);
  }
  
@@@ -5421,7 -5715,6 +5714,6 @@@ static noinline void reada_walk_down(st
        u64 generation;
        u64 refs;
        u64 flags;
-       u64 last = 0;
        u32 nritems;
        u32 blocksize;
        struct btrfs_key key;
@@@ -5489,7 -5782,6 +5781,6 @@@ reada
                                           generation);
                if (ret)
                        break;
-               last = bytenr + blocksize;
                nread++;
        }
        wc->reada_slot = slot;
        return ret;
  }
  
+ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+ {
+       struct btrfs_block_group_cache *block_group;
+       u64 last = 0;
+       while (1) {
+               struct inode *inode;
+               block_group = btrfs_lookup_first_block_group(info, last);
+               while (block_group) {
+                       spin_lock(&block_group->lock);
+                       if (block_group->iref)
+                               break;
+                       spin_unlock(&block_group->lock);
+                       block_group = next_block_group(info->tree_root,
+                                                      block_group);
+               }
+               if (!block_group) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
+               inode = block_group->inode;
+               block_group->iref = 0;
+               block_group->inode = NULL;
+               spin_unlock(&block_group->lock);
+               iput(inode);
+               last = block_group->key.objectid + block_group->key.offset;
+               btrfs_put_block_group(block_group);
+       }
+ }
  int btrfs_free_block_groups(struct btrfs_fs_info *info)
  {
        struct btrfs_block_group_cache *block_group;
@@@ -7896,6 -8222,8 +8221,8 @@@ int btrfs_read_block_groups(struct btrf
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
+       int need_clear = 0;
+       u64 cache_gen;
  
        root = info->extent_root;
        key.objectid = 0;
        if (!path)
                return -ENOMEM;
  
+       cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+       if (cache_gen != 0 &&
+           btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+               need_clear = 1;
+       if (btrfs_test_opt(root, CLEAR_CACHE))
+               need_clear = 1;
+       if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+               printk(KERN_INFO "btrfs: disk space caching is enabled\n");
        while (1) {
                ret = find_first_block_group(root, path, &key);
                if (ret > 0)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
  
+               if (need_clear)
+                       cache->disk_cache_state = BTRFS_DC_CLEAR;
                /*
                 * we only want to have 32k of ram per block group for keeping
                 * track of free space, and if we pass 1/2 of that we want to
@@@ -8031,6 -8371,7 +8370,7 @@@ int btrfs_make_block_group(struct btrfs
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        cache->sectorsize = root->sectorsize;
+       cache->fs_info = root->fs_info;
  
        /*
         * we only want to have 32k of ram per block group for keeping track
@@@ -8087,8 -8428,11 +8427,11 @@@ int btrfs_remove_block_group(struct btr
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
        struct btrfs_free_cluster *cluster;
+       struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_key key;
+       struct inode *inode;
        int ret;
+       int factor;
  
        root = root->fs_info->extent_root;
  
        BUG_ON(!block_group->ro);
  
        memcpy(&key, &block_group->key, sizeof(key));
+       if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                 BTRFS_BLOCK_GROUP_RAID1 |
+                                 BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
  
        /* make sure this block group isn't part of an allocation cluster */
        cluster = &root->fs_info->data_alloc_cluster;
        path = btrfs_alloc_path();
        BUG_ON(!path);
  
+       inode = lookup_free_space_inode(root, block_group, path);
+       if (!IS_ERR(inode)) {
+               btrfs_orphan_add(trans, inode);
+               clear_nlink(inode);
+               /* One for the block groups ref */
+               spin_lock(&block_group->lock);
+               if (block_group->iref) {
+                       block_group->iref = 0;
+                       block_group->inode = NULL;
+                       spin_unlock(&block_group->lock);
+                       iput(inode);
+               } else {
+                       spin_unlock(&block_group->lock);
+               }
+               /* One for our lookup ref */
+               iput(inode);
+       }
+       key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+       key.offset = block_group->key.objectid;
+       key.type = 0;
+       ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       if (ret > 0)
+               btrfs_release_path(tree_root, path);
+       if (ret == 0) {
+               ret = btrfs_del_item(trans, tree_root, path);
+               if (ret)
+                       goto out;
+               btrfs_release_path(tree_root, path);
+       }
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
+       block_group->space_info->disk_total -= block_group->key.offset * factor;
        spin_unlock(&block_group->space_info->lock);
  
+       memcpy(&key, &block_group->key, sizeof(key));
        btrfs_clear_space_info_full(root->fs_info);
  
        btrfs_put_block_group(block_group);
diff --combined fs/btrfs/extent_io.c
@@@ -104,7 -104,7 +104,7 @@@ void extent_io_tree_init(struct extent_
                          struct address_space *mapping, gfp_t mask)
  {
        tree->state = RB_ROOT;
-       tree->buffer = RB_ROOT;
+       INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
@@@ -235,50 -235,6 +235,6 @@@ static inline struct rb_node *tree_sear
        return ret;
  }
  
- static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
-                                         u64 offset, struct rb_node *node)
- {
-       struct rb_root *root = &tree->buffer;
-       struct rb_node **p = &root->rb_node;
-       struct rb_node *parent = NULL;
-       struct extent_buffer *eb;
-       while (*p) {
-               parent = *p;
-               eb = rb_entry(parent, struct extent_buffer, rb_node);
-               if (offset < eb->start)
-                       p = &(*p)->rb_left;
-               else if (offset > eb->start)
-                       p = &(*p)->rb_right;
-               else
-                       return eb;
-       }
-       rb_link_node(node, parent, p);
-       rb_insert_color(node, root);
-       return NULL;
- }
- static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
-                                          u64 offset)
- {
-       struct rb_root *root = &tree->buffer;
-       struct rb_node *n = root->rb_node;
-       struct extent_buffer *eb;
-       while (n) {
-               eb = rb_entry(n, struct extent_buffer, rb_node);
-               if (offset < eb->start)
-                       n = n->rb_left;
-               else if (offset > eb->start)
-                       n = n->rb_right;
-               else
-                       return eb;
-       }
-       return NULL;
- }
  static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
                     struct extent_state *other)
  {
@@@ -1901,10 -1857,8 +1857,8 @@@ static int submit_one_bio(int rw, struc
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
        u64 start;
-       u64 end;
  
        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
-       end = start + bvec->bv_len - 1;
  
        bio->bi_private = NULL;
  
@@@ -2204,7 -2158,6 +2158,6 @@@ static int __extent_writepage(struct pa
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 iosize;
-       u64 unlock_start;
        sector_t sector;
        struct extent_state *cached_state = NULL;
        struct extent_map *em;
                if (tree->ops && tree->ops->writepage_end_io_hook)
                        tree->ops->writepage_end_io_hook(page, start,
                                                         page_end, NULL, 1);
-               unlock_start = page_end + 1;
                goto done;
        }
  
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         page_end, NULL, 1);
-                       unlock_start = page_end + 1;
                        break;
                }
                em = epd->get_extent(inode, page, pg_offset, cur,
  
                        cur += iosize;
                        pg_offset += iosize;
-                       unlock_start = cur;
                        continue;
                }
                /* leave this out until we have a page_mkwrite call */
@@@ -2473,7 -2423,6 +2423,6 @@@ static int extent_write_cache_pages(str
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        int scanned = 0;
-       int range_whole = 0;
  
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
-               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                       range_whole = 1;
                scanned = 1;
        }
  retry:
@@@ -2823,6 -2770,8 +2770,8 @@@ int extent_prepare_write(struct extent_
                                         NULL, 1,
                                         end_bio_extent_preparewrite, 0,
                                         0, 0);
+                       if (ret && !err)
+                               err = ret;
                        iocount++;
                        block_start = block_start + iosize;
                } else {
@@@ -3082,6 -3031,7 +3031,6 @@@ static struct extent_buffer *__alloc_ex
        eb->len = len;
        spin_lock_init(&eb->lock);
        init_waitqueue_head(&eb->lock_wq);
 -      INIT_RCU_HEAD(&eb->rcu_head);
  
  #if LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
@@@ -3104,6 -3054,39 +3053,39 @@@ static void __free_extent_buffer(struc
        kmem_cache_free(extent_buffer_cache, eb);
  }
  
+ /*
+  * Helper for releasing extent buffer page.
+  */
+ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+                                               unsigned long start_idx)
+ {
+       unsigned long index;
+       struct page *page;
+       if (!eb->first_page)
+               return;
+       index = num_extent_pages(eb->start, eb->len);
+       if (start_idx >= index)
+               return;
+       do {
+               index--;
+               page = extent_buffer_page(eb, index);
+               if (page)
+                       page_cache_release(page);
+       } while (index != start_idx);
+ }
+ /*
+  * Helper for releasing the extent buffer.
+  */
+ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+ {
+       btrfs_release_extent_buffer_page(eb, 0);
+       __free_extent_buffer(eb);
+ }
  struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len,
                                          struct page *page0,
        struct page *p;
        struct address_space *mapping = tree->mapping;
        int uptodate = 1;
+       int ret;
  
-       spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
-       if (eb) {
-               atomic_inc(&eb->refs);
-               spin_unlock(&tree->buffer_lock);
+       rcu_read_lock();
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (eb && atomic_inc_not_zero(&eb->refs)) {
+               rcu_read_unlock();
                mark_page_accessed(eb->first_page);
                return eb;
        }
-       spin_unlock(&tree->buffer_lock);
+       rcu_read_unlock();
  
        eb = __alloc_extent_buffer(tree, start, len, mask);
        if (!eb)
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
  
+       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       if (ret)
+               goto free_eb;
        spin_lock(&tree->buffer_lock);
-       exists = buffer_tree_insert(tree, start, &eb->rb_node);
-       if (exists) {
+       ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+       if (ret == -EEXIST) {
+               exists = radix_tree_lookup(&tree->buffer,
+                                               start >> PAGE_CACHE_SHIFT);
                /* add one reference for the caller */
                atomic_inc(&exists->refs);
                spin_unlock(&tree->buffer_lock);
+               radix_tree_preload_end();
                goto free_eb;
        }
        /* add one reference for the tree */
        atomic_inc(&eb->refs);
        spin_unlock(&tree->buffer_lock);
+       radix_tree_preload_end();
        return eb;
  
  free_eb:
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
-       for (index = 1; index < i; index++)
-               page_cache_release(extent_buffer_page(eb, index));
-       page_cache_release(extent_buffer_page(eb, 0));
-       __free_extent_buffer(eb);
+       btrfs_release_extent_buffer(eb);
        return exists;
  }
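
The insertion path above follows the usual preload-then-insert sequence: radix tree nodes are allocated by radix_tree_preload() before the spinlock is taken, the insert happens under the lock, and a racing inserter is detected via -EEXIST. A minimal sketch of the same sequence, with hypothetical names (cache, lock, index, item), assuming GFP_NOFS allocations as in the hunk above:

/*
 * Sketch of the preload/insert sequence used above; everything except the
 * radix tree API is a hypothetical placeholder.
 */
static int cache_insert(struct radix_tree_root *cache, spinlock_t *lock,
			unsigned long index, void *item)
{
	int ret;

	/* allocate radix tree nodes up front, while sleeping is still allowed */
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		return ret;

	spin_lock(lock);
	ret = radix_tree_insert(cache, index, item);
	spin_unlock(lock);

	/* drop the per-cpu preload whether or not the insert succeeded */
	radix_tree_preload_end();

	/* -EEXIST here means another thread beat us to the slot */
	return ret;
}

On -EEXIST the caller looks up the winning entry again and takes a reference on it instead, exactly as alloc_extent_buffer() does above.
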
  
@@@ -3194,16 -3182,16 +3181,16 @@@ struct extent_buffer *find_extent_buffe
  {
        struct extent_buffer *eb;
  
-       spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
-       if (eb)
-               atomic_inc(&eb->refs);
-       spin_unlock(&tree->buffer_lock);
-       if (eb)
+       rcu_read_lock();
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (eb && atomic_inc_not_zero(&eb->refs)) {
+               rcu_read_unlock();
                mark_page_accessed(eb->first_page);
+               return eb;
+       }
+       rcu_read_unlock();
  
-       return eb;
+       return NULL;
  }
  
  void free_extent_buffer(struct extent_buffer *eb)
@@@ -3833,34 -3821,45 +3820,45 @@@ void memmove_extent_buffer(struct exten
        }
  }
  
+ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+ {
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+       btrfs_release_extent_buffer(eb);
+ }
+
  int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
  {
        u64 start = page_offset(page);
        struct extent_buffer *eb;
        int ret = 1;
-       unsigned long i;
-       unsigned long num_pages;
  
        spin_lock(&tree->buffer_lock);
-       eb = buffer_search(tree, start);
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (!eb)
                goto out;
  
-       if (atomic_read(&eb->refs) > 1) {
+       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                ret = 0;
                goto out;
        }
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+       /*
+        * Set @eb->refs to 0 only if it is currently 1 (i.e. the tree holds
+        * the last reference), then release the @eb.  Otherwise someone else
+        * still holds a reference, so back off and leave it in the tree.
+        */
+       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
                ret = 0;
                goto out;
        }
-       /* at this point we can safely release the extent buffer */
-       num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++)
-               page_cache_release(extent_buffer_page(eb, i));
-       rb_erase(&eb->rb_node, &tree->buffer);
-       __free_extent_buffer(eb);
+       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
  out:
        spin_unlock(&tree->buffer_lock);
+       /* at this point we can safely release the extent buffer */
+       if (atomic_read(&eb->refs) == 0)
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
        return ret;
  }
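
The reworked lookup and release above combine three pieces: an RCU-protected radix tree lookup that only takes a reference via atomic_inc_not_zero(), a release that uses atomic_cmpxchg() to drop the tree's reference only when it is the last one, and a call_rcu() free so lockless readers never touch freed memory. A condensed sketch of the same idiom with hypothetical names (struct obj, obj_lookup, obj_try_release); only the pattern mirrors the code above:

struct obj {
	atomic_t refs;
	struct rcu_head rcu_head;
};

static struct obj *obj_lookup(struct radix_tree_root *cache, unsigned long index)
{
	struct obj *o;

	rcu_read_lock();
	o = radix_tree_lookup(cache, index);
	/* refuse to resurrect an object whose refcount already hit zero */
	if (o && atomic_inc_not_zero(&o->refs)) {
		rcu_read_unlock();
		return o;
	}
	rcu_read_unlock();
	return NULL;
}

static void obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct obj, rcu_head));
}

static int obj_try_release(struct radix_tree_root *cache, spinlock_t *lock,
			   unsigned long index)
{
	struct obj *o;

	spin_lock(lock);
	o = radix_tree_lookup(cache, index);
	/* drop the cache's reference only if it is the last one left */
	if (!o || atomic_cmpxchg(&o->refs, 1, 0) != 1) {
		spin_unlock(lock);
		return 0;
	}
	radix_tree_delete(cache, index);
	spin_unlock(lock);
	/* lockless readers may still hold the pointer; free after a grace period */
	call_rcu(&o->rcu_head, obj_free_rcu);
	return 1;
}
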
diff --combined fs/btrfs/inode.c
@@@ -319,8 -319,6 +319,6 @@@ static noinline int compress_file_range
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 num_bytes;
-       u64 orig_start;
-       u64 disk_num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int i;
        int will_compress;
  
-       orig_start = start;
        actual_end = min_t(u64, isize, end + 1);
  again:
        will_compress = 0;
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
-       disk_num_bytes = num_bytes;
        total_in = 0;
        ret = 0;
  
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
-                       disk_num_bytes = total_compressed;
                        num_bytes = total_in;
                }
        }
@@@ -757,20 -751,17 +751,17 @@@ static noinline int cow_file_range(stru
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
-       u64 actual_end;
-       u64 isize = i_size_read(inode);
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;
  
+       BUG_ON(root == root->fs_info->tree_root);
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
-       actual_end = min_t(u64, isize, end + 1);
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;
@@@ -1035,10 -1026,16 +1026,16 @@@ static noinline int run_delalloc_nocow(
        int type;
        int nocow;
        int check_prev = 1;
+       bool nolock = false;
  
        path = btrfs_alloc_path();
        BUG_ON(!path);
-       trans = btrfs_join_transaction(root, 1);
+       if (root == root->fs_info->tree_root) {
+               nolock = true;
+               trans = btrfs_join_transaction_nolock(root, 1);
+       } else {
+               trans = btrfs_join_transaction(root, 1);
+       }
        BUG_ON(!trans);
  
        cow_start = (u64)-1;
@@@ -1211,8 -1208,13 +1208,13 @@@ out_check
                BUG_ON(ret);
        }
  
-       ret = btrfs_end_transaction(trans, root);
-       BUG_ON(ret);
+       if (nolock) {
+               ret = btrfs_end_transaction_nolock(trans, root);
+               BUG_ON(ret);
+       } else {
+               ret = btrfs_end_transaction(trans, root);
+               BUG_ON(ret);
+       }
        btrfs_free_path(path);
        return 0;
  }
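
Because the free space cache inodes live in the tree root, writing them back from inside a transaction commit must not try to join that same transaction; the hunks in this file therefore pick the _nolock join/end variants whenever root == root->fs_info->tree_root. A hedged sketch of that selection as two helpers (the helper names are hypothetical, the btrfs_* calls are the ones used above):

/* Hypothetical wrappers; the pattern is the one repeated in the hunks above. */
static struct btrfs_trans_handle *join_trans_for_root(struct btrfs_root *root)
{
	if (root == root->fs_info->tree_root)
		return btrfs_join_transaction_nolock(root, 1);
	return btrfs_join_transaction(root, 1);
}

static int end_trans_for_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	if (root == root->fs_info->tree_root)
		return btrfs_end_transaction_nolock(trans, root);
	return btrfs_end_transaction(trans, root);
}
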
@@@ -1289,6 -1291,8 +1291,8 @@@ static int btrfs_set_bit_hook(struct in
        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
+               int do_list = (root->root_key.objectid !=
+                              BTRFS_ROOT_TREE_OBJECTID);
  
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
                spin_lock(&root->fs_info->delalloc_lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                root->fs_info->delalloc_bytes += len;
-               if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
                                      &root->fs_info->delalloc_inodes);
                }
@@@ -1321,6 -1325,8 +1325,8 @@@ static int btrfs_clear_bit_hook(struct 
        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
+               int do_list = (root->root_key.objectid !=
+                              BTRFS_ROOT_TREE_OBJECTID);
  
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
                if (*bits & EXTENT_DO_ACCOUNTING)
                        btrfs_delalloc_release_metadata(inode, len);
  
-               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+                   && do_list)
                        btrfs_free_reserved_data_space(inode, len);
  
                spin_lock(&root->fs_info->delalloc_lock);
                root->fs_info->delalloc_bytes -= len;
                BTRFS_I(inode)->delalloc_bytes -= len;
  
-               if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+               if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_del_init(&BTRFS_I(inode)->delalloc_inodes);
                }
@@@ -1372,7 -1379,7 +1379,7 @@@ int btrfs_merge_bio_hook(struct page *p
  
        if (map_length < length + size)
                return 1;
-       return 0;
+       return ret;
  }
  
  /*
@@@ -1426,7 -1433,10 +1433,10 @@@ static int btrfs_submit_bio_hook(struc
  
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
  
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       if (root == root->fs_info->tree_root)
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
+       else
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
        BUG_ON(ret);
  
        if (!(rw & REQ_WRITE)) {
@@@ -1662,6 -1672,7 +1672,7 @@@ static int btrfs_finish_ordered_io(stru
        struct extent_state *cached_state = NULL;
        int compressed = 0;
        int ret;
+       bool nolock = false;
  
        ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
                                             end - start + 1);
                return 0;
        BUG_ON(!ordered_extent);
  
+       nolock = (root == root->fs_info->tree_root);
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list));
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
-                       trans = btrfs_join_transaction(root, 1);
+                       if (nolock)
+                               trans = btrfs_join_transaction_nolock(root, 1);
+                       else
+                               trans = btrfs_join_transaction(root, 1);
+                       BUG_ON(!trans);
                        btrfs_set_trans_block_group(trans, inode);
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
                         ordered_extent->file_offset + ordered_extent->len - 1,
                         0, &cached_state, GFP_NOFS);
  
-       trans = btrfs_join_transaction(root, 1);
+       if (nolock)
+               trans = btrfs_join_transaction_nolock(root, 1);
+       else
+               trans = btrfs_join_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
                                                ordered_extent->len);
                BUG_ON(ret);
        } else {
+               BUG_ON(root == root->fs_info->tree_root);
                ret = insert_reserved_file_extent(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->start,
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
  out:
-       btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-       if (trans)
-               btrfs_end_transaction(trans, root);
+       if (nolock) {
+               if (trans)
+                       btrfs_end_transaction_nolock(trans, root);
+       } else {
+               btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+               if (trans)
+                       btrfs_end_transaction(trans, root);
+       }
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
@@@ -2237,7 -2264,6 +2264,6 @@@ void btrfs_orphan_cleanup(struct btrfs_
  {
        struct btrfs_path *path;
        struct extent_buffer *leaf;
-       struct btrfs_item *item;
        struct btrfs_key key, found_key;
        struct btrfs_trans_handle *trans;
        struct inode *inode;
  
                /* pull out the item */
                leaf = path->nodes[0];
-               item = btrfs_item_nr(leaf, path->slots[0]);
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  
                /* make sure the item matches what we want */
@@@ -2651,7 -2676,8 +2676,8 @@@ int btrfs_unlink_inode(struct btrfs_tra
  
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
                                           dir, index);
-       BUG_ON(ret);
+       if (ret == -ENOENT)
+               ret = 0;
  err:
        btrfs_free_path(path);
        if (ret)
@@@ -2672,8 -2698,8 +2698,8 @@@ static int check_path_shared(struct btr
  {
        struct extent_buffer *eb;
        int level;
-       int ret;
        u64 refs = 1;
+       int uninitialized_var(ret);
  
        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                if (refs > 1)
                        return 1;
        }
-       return 0;
+       return ret; /* XXX callers? */
  }
  
  /*
@@@ -3196,7 -3222,7 +3222,7 @@@ int btrfs_truncate_inode_items(struct b
  
        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
  
-       if (root->ref_cows)
+       if (root->ref_cows || root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
  
        path = btrfs_alloc_path();
@@@ -3344,7 -3370,8 +3370,8 @@@ delete
                } else {
                        break;
                }
-               if (found_extent && root->ref_cows) {
+               if (found_extent && (root->ref_cows ||
+                                    root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
@@@ -3675,7 -3702,8 +3702,8 @@@ void btrfs_evict_inode(struct inode *in
        int ret;
  
        truncate_inode_pages(&inode->i_data, 0);
-       if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+       if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+                              root == root->fs_info->tree_root))
                goto no_delete;
  
        if (is_bad_inode(inode)) {
@@@ -3849,7 -3877,7 +3877,7 @@@ again
        p = &root->inode_tree.rb_node;
        parent = NULL;
  
 -      if (hlist_unhashed(&inode->i_hash))
 +      if (inode_unhashed(inode))
                return;
  
        spin_lock(&root->inode_lock);
@@@ -3888,7 -3916,14 +3916,14 @@@ static void inode_tree_del(struct inod
        }
        spin_unlock(&root->inode_lock);
  
-       if (empty && btrfs_root_refs(&root->root_item) == 0) {
+       /*
+        * Free space cache has inodes in the tree root, but the tree root has a
+        * root_refs of 0, so this could end up dropping the tree root as a
+        * snapshot, so we need the extra root != fs_info->tree_root check to
+        * make sure we don't drop it.
+        */
+       if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+           root != root->fs_info->tree_root) {
                synchronize_srcu(&root->fs_info->subvol_srcu);
                spin_lock(&root->inode_lock);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
@@@ -4282,14 -4317,24 +4317,24 @@@ int btrfs_write_inode(struct inode *ino
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
+       bool nolock = false;
  
        if (BTRFS_I(inode)->dummy_inode)
                return 0;
  
+       smp_mb();
+       nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
        if (wbc->sync_mode == WB_SYNC_ALL) {
-               trans = btrfs_join_transaction(root, 1);
+               if (nolock)
+                       trans = btrfs_join_transaction_nolock(root, 1);
+               else
+                       trans = btrfs_join_transaction(root, 1);
                btrfs_set_trans_block_group(trans, inode);
-               ret = btrfs_commit_transaction(trans, root);
+               if (nolock)
+                       ret = btrfs_end_transaction_nolock(trans, root);
+               else
+                       ret = btrfs_commit_transaction(trans, root);
        }
        return ret;
  }
@@@ -4758,7 -4803,7 +4803,7 @@@ static int btrfs_link(struct dentry *ol
        }
  
        btrfs_set_trans_block_group(trans, dir);
 -      atomic_inc(&inode->i_count);
 +      ihold(inode);
  
        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
  
@@@ -5645,7 -5690,6 +5690,6 @@@ static void btrfs_submit_direct(int rw
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
        struct bio_vec *bvec = bio->bi_io_vec;
-       u64 start;
        int skip_sum;
        int write = rw & REQ_WRITE;
        int ret = 0;
        dip->inode = inode;
        dip->logical_offset = file_offset;
  
-       start = dip->logical_offset;
        dip->bytes = 0;
        do {
                dip->bytes += bvec->bv_len;
@@@ -6308,6 -6351,21 +6351,21 @@@ void btrfs_destroy_inode(struct inode *
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
  
+       if (root == root->fs_info->tree_root) {
+               struct btrfs_block_group_cache *block_group;
+               block_group = btrfs_lookup_block_group(root->fs_info,
+                                               BTRFS_I(inode)->block_group);
+               if (block_group && block_group->inode == inode) {
+                       spin_lock(&block_group->lock);
+                       block_group->inode = NULL;
+                       spin_unlock(&block_group->lock);
+                       btrfs_put_block_group(block_group);
+               } else if (block_group) {
+                       btrfs_put_block_group(block_group);
+               }
+       }
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@@ -6340,7 -6398,8 +6398,8 @@@ int btrfs_drop_inode(struct inode *inod
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
  
-       if (btrfs_root_refs(&root->root_item) == 0)
+       if (btrfs_root_refs(&root->root_item) == 0 &&
+           root != root->fs_info->tree_root)
                return 1;
        else
                return generic_drop_inode(inode);
@@@ -6609,7 -6668,8 +6668,8 @@@ int btrfs_start_delalloc_inodes(struct 
        return 0;
  }
  
- int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+                                  int sync)
  {
        struct btrfs_inode *binode;
        struct inode *inode = NULL;
        spin_unlock(&root->fs_info->delalloc_lock);
  
        if (inode) {
-               write_inode_now(inode, 0);
+               if (sync) {
+                       filemap_write_and_wait(inode->i_mapping);
+                       /*
+                        * We have to do this because compression doesn't
+                        * actually set PG_writeback until it submits the pages
+                        * for IO, which happens in an async thread, so we could
+                        * race and not actually wait for any writeback pages
+                        * because they've not been submitted yet.  Technically
+                        * this could still be the case for the ordered stuff
+                        * since the async thread may not have started to do its
+                        * work yet.  If this becomes the case then we need to
+                        * figure out a way to make sure that in writepage we
+                        * wait for any async pages to be submitted before
+                        * returning so that fdatawait does what its supposed to
+                        * returning so that fdatawait does what it's supposed to
+                        */
+                       btrfs_wait_ordered_range(inode, 0, (u64)-1);
+               } else {
+                       filemap_flush(inode->i_mapping);
+               }
                if (delay_iput)
                        btrfs_add_delayed_iput(inode);
                else
@@@ -6757,27 -6836,33 +6836,33 @@@ out_unlock
        return err;
  }
  
- int btrfs_prealloc_file_range(struct inode *inode, int mode,
-                             u64 start, u64 num_bytes, u64 min_size,
-                             loff_t actual_len, u64 *alloc_hint)
+ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+                                      u64 start, u64 num_bytes, u64 min_size,
+                                      loff_t actual_len, u64 *alloc_hint,
+                                      struct btrfs_trans_handle *trans)
  {
-       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
        int ret = 0;
+       bool own_trans = true;
  
+       if (trans)
+               own_trans = false;
        while (num_bytes > 0) {
-               trans = btrfs_start_transaction(root, 3);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-                       break;
+               if (own_trans) {
+                       trans = btrfs_start_transaction(root, 3);
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               break;
+                       }
                }
  
                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
                                           0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                       btrfs_end_transaction(trans, root);
+                       if (own_trans)
+                               btrfs_end_transaction(trans, root);
                        break;
                }
  
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
  
-               btrfs_end_transaction(trans, root);
+               if (own_trans)
+                       btrfs_end_transaction(trans, root);
        }
        return ret;
  }
  
+ int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                             u64 start, u64 num_bytes, u64 min_size,
+                             loff_t actual_len, u64 *alloc_hint)
+ {
+       return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                          min_size, actual_len, alloc_hint,
+                                          NULL);
+ }
+
+ int btrfs_prealloc_file_range_trans(struct inode *inode,
+                                   struct btrfs_trans_handle *trans, int mode,
+                                   u64 start, u64 num_bytes, u64 min_size,
+                                   loff_t actual_len, u64 *alloc_hint)
+ {
+       return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                          min_size, actual_len, alloc_hint, trans);
+ }
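
With the own_trans split, the same preallocation loop can either manage its own transactions per extent or run entirely inside a transaction the caller already holds (presumably what callers like the free space cache writer need). A hedged sketch of the two call styles; the helper names and the 1 MiB figure are purely illustrative:

/* Illustrative callers only; sizes and helper names are made up. */
static int prealloc_one_meg(struct inode *inode, u64 *alloc_hint)
{
	u64 len = 1024 * 1024;

	/* starts and ends its own transaction(s) internally */
	return btrfs_prealloc_file_range(inode, 0, 0, len, len, len, alloc_hint);
}

static int prealloc_one_meg_in_trans(struct btrfs_trans_handle *trans,
				     struct inode *inode, u64 *alloc_hint)
{
	u64 len = 1024 * 1024;

	/* reuses the caller's transaction; the caller still ends it */
	return btrfs_prealloc_file_range_trans(inode, trans, 0, 0, len, len,
					       len, alloc_hint);
}
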
  static long btrfs_fallocate(struct inode *inode, int mode,
                            loff_t offset, loff_t len)
  {
diff --combined fs/btrfs/super.c
@@@ -61,6 -61,8 +61,8 @@@ static void btrfs_put_super(struct supe
  
        ret = close_ctree(root);
        sb->s_fs_info = NULL;
+       (void)ret; /* FIXME: need to fix VFS to return error? */
  }
  
  enum {
@@@ -68,7 -70,8 +70,8 @@@
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-       Opt_discard, Opt_err,
+       Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+       Opt_user_subvol_rm_allowed,
  };
  
  static match_table_t tokens = {
@@@ -92,6 -95,9 +95,9 @@@
        {Opt_flushoncommit, "flushoncommit"},
        {Opt_ratio, "metadata_ratio=%d"},
        {Opt_discard, "discard"},
+       {Opt_space_cache, "space_cache"},
+       {Opt_clear_cache, "clear_cache"},
+       {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_err, NULL},
  };
  
@@@ -235,6 -241,16 +241,16 @@@ int btrfs_parse_options(struct btrfs_ro
                case Opt_discard:
                        btrfs_set_opt(info->mount_opt, DISCARD);
                        break;
+               case Opt_space_cache:
+                       printk(KERN_INFO "btrfs: enabling disk space caching\n");
+                       btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+                       break;
+               case Opt_clear_cache:
+                       printk(KERN_INFO "btrfs: force clearing of disk cache\n");
+                       btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+                       break;
+               case Opt_user_subvol_rm_allowed:
+                       btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+                       break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -380,7 -396,7 +396,7 @@@ static struct dentry *get_default_root(
  find_root:
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
        if (IS_ERR(new_root))
-               return ERR_PTR(PTR_ERR(new_root));
+               return ERR_CAST(new_root);
  
        if (btrfs_root_refs(&new_root->root_item) == 0)
                return ERR_PTR(-ENOENT);
@@@ -436,7 -452,6 +452,6 @@@ static int btrfs_fill_super(struct supe
  {
        struct inode *inode;
        struct dentry *root_dentry;
-       struct btrfs_super_block *disk_super;
        struct btrfs_root *tree_root;
        struct btrfs_key key;
        int err;
                return PTR_ERR(tree_root);
        }
        sb->s_fs_info = tree_root;
-       disk_super = &tree_root->fs_info->super_copy;
  
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
@@@ -560,8 -574,8 +574,8 @@@ static int btrfs_test_super(struct supe
   * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
   *      for multiple device setup.  Make sure to keep it in sync.
   */
 -static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 -              const char *dev_name, void *data, struct vfsmount *mnt)
 +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 +              const char *dev_name, void *data)
  {
        struct block_device *bdev = NULL;
        struct super_block *s;
        char *subvol_name = NULL;
        u64 subvol_objectid = 0;
        int error = 0;
-       int found = 0;
  
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
                                          &subvol_name, &subvol_objectid,
                                          &fs_devices);
        if (error)
 -              return error;
 +              return ERR_PTR(error);
  
        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
        if (error)
                        goto error_close_devices;
                }
  
-               found = 1;
                btrfs_close_devices(fs_devices);
        } else {
                char b[BDEVNAME_SIZE];
        if (IS_ERR(root)) {
                error = PTR_ERR(root);
                deactivate_locked_super(s);
-               goto error;
+               goto error_free_subvol_name;
        }
        /* if they gave us a subvolume name bind mount into that */
        if (strcmp(subvol_name, ".")) {
                        deactivate_locked_super(s);
                        error = PTR_ERR(new_root);
                        dput(root);
-                       goto error_close_devices;
+                       goto error_free_subvol_name;
                }
                if (!new_root->d_inode) {
                        dput(root);
                        dput(new_root);
                        deactivate_locked_super(s);
                        error = -ENXIO;
-                       goto error_close_devices;
+                       goto error_free_subvol_name;
                }
                dput(root);
                root = new_root;
        }
  
 -      mnt->mnt_sb = s;
 -      mnt->mnt_root = root;
 -
        kfree(subvol_name);
 -      return 0;
 +      return root;
  
  error_s:
        error = PTR_ERR(s);
@@@ -665,8 -680,7 +677,7 @@@ error_close_devices
        btrfs_close_devices(fs_devices);
  error_free_subvol_name:
        kfree(subvol_name);
- error:
 -      return error;
 +      return ERR_PTR(error);
  }
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@@ -713,18 -727,25 +724,25 @@@ static int btrfs_statfs(struct dentry *
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
+       u64 total_used_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
  
        rcu_read_lock();
-       list_for_each_entry_rcu(found, head, list)
+       list_for_each_entry_rcu(found, head, list) {
+               if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+                                   BTRFS_BLOCK_GROUP_SYSTEM))
+                       total_used_data += found->disk_total;
+               else
+                       total_used_data += found->disk_used;
                total_used += found->disk_used;
+       }
        rcu_read_unlock();
  
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-       buf->f_bavail = buf->f_bfree;
+       buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
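
With the statfs change above, f_bfree still reports blocks not consumed by data or metadata, while f_bavail now subtracts both data usage and the full size of metadata/system chunks, so it better reflects how much new file data can actually fit. A small userspace sketch (plain statvfs(), nothing btrfs-specific) showing where the two numbers surface:

#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (argc < 2 || statvfs(argv[1], &st) != 0) {
		perror("statvfs");
		return 1;
	}
	/* f_bfree:  blocks not currently used by data or metadata */
	printf("free : %llu blocks\n", (unsigned long long)st.f_bfree);
	/* f_bavail: blocks new file data could still consume, i.e. total minus
	 * data usage minus everything set aside for metadata/system chunks */
	printf("avail: %llu blocks\n", (unsigned long long)st.f_bavail);
	return 0;
}
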
  
  static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
 -      .get_sb         = btrfs_get_sb,
 +      .mount          = btrfs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_REQUIRES_DEV,
  };
@@@ -812,7 -833,6 +830,7 @@@ static const struct file_operations btr
        .unlocked_ioctl  = btrfs_control_ioctl,
        .compat_ioctl = btrfs_control_ioctl,
        .owner   = THIS_MODULE,
 +      .llseek = noop_llseek,
  };
  
  static struct miscdevice btrfs_misc = {
diff --combined fs/btrfs/volumes.c
@@@ -398,6 -398,7 +398,6 @@@ static noinline int device_list_add(con
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
@@@ -461,6 -462,7 +461,6 @@@ static struct btrfs_fs_devices *clone_f
                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);
@@@ -1487,6 -1489,7 +1487,6 @@@ int btrfs_init_new_device(struct btrfs_
        trans = btrfs_start_transaction(root, 0);
        lock_chunks(root);
  
 -      device->barriers = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
@@@ -1898,7 -1901,6 +1898,6 @@@ int btrfs_balance(struct btrfs_root *de
        u64 size_to_free;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct btrfs_chunk *chunk;
        struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
                if (found_key.objectid != key.objectid)
                        break;
  
-               chunk = btrfs_item_ptr(path->nodes[0],
-                                      path->slots[0],
-                                      struct btrfs_chunk);
                /* chunk zero is special */
                if (found_key.offset == 0)
                        break;
@@@ -3031,8 -3030,7 +3027,7 @@@ int btrfs_map_bio(struct btrfs_root *ro
                }
                bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
                dev = multi->stripes[dev_nr].dev;
-               BUG_ON(rw == WRITE && !dev->writeable);
-               if (dev && dev->bdev) {
+               if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
                        bio->bi_bdev = dev->bdev;
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
@@@ -3081,6 -3079,7 +3076,6 @@@ static struct btrfs_device *add_missing
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
 -      device->barriers = 1;
        device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
diff --combined fs/fs-writeback.c
@@@ -79,11 -79,6 +79,11 @@@ static inline struct backing_dev_info *
        return sb->s_bdi;
  }
  
 +static inline struct inode *wb_inode(struct list_head *head)
 +{
 +      return list_entry(head, struct inode, i_wb_list);
 +}
 +
  static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
  {
@@@ -177,11 -172,11 +177,11 @@@ static void redirty_tail(struct inode *
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
  
 -              tail = list_entry(wb->b_dirty.next, struct inode, i_list);
 +              tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
 -      list_move(&inode->i_list, &wb->b_dirty);
 +      list_move(&inode->i_wb_list, &wb->b_dirty);
  }
  
  /*
@@@ -191,7 -186,7 +191,7 @@@ static void requeue_io(struct inode *in
  {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
  
 -      list_move(&inode->i_list, &wb->b_more_io);
 +      list_move(&inode->i_wb_list, &wb->b_more_io);
  }
  
  static void inode_sync_complete(struct inode *inode)
@@@ -232,14 -227,14 +232,14 @@@ static void move_expired_inodes(struct 
        int do_sb_sort = 0;
  
        while (!list_empty(delaying_queue)) {
 -              inode = list_entry(delaying_queue->prev, struct inode, i_list);
 +              inode = wb_inode(delaying_queue->prev);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
 -              list_move(&inode->i_list, &tmp);
 +              list_move(&inode->i_wb_list, &tmp);
        }
  
        /* just one sb in list, splice to dispatch_queue and we're done */
  
        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
 -              inode = list_entry(tmp.prev, struct inode, i_list);
 -              sb = inode->i_sb;
 +              sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
 -                      inode = list_entry(pos, struct inode, i_list);
 +                      inode = wb_inode(pos);
                        if (inode->i_sb == sb)
 -                              list_move(&inode->i_list, dispatch_queue);
 +                              list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
  }
@@@ -412,13 -408,16 +412,13 @@@ writeback_single_inode(struct inode *in
                         * completion.
                         */
                        redirty_tail(inode);
 -              } else if (atomic_read(&inode->i_count)) {
 -                      /*
 -                       * The inode is clean, inuse
 -                       */
 -                      list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
 -                       * The inode is clean, unused
 +                       * The inode is clean.  At this point we either have
 +                       * a reference to the inode or it's on its way out.
 +                       * No need to add it back to the LRU.
                         */
 -                      list_move(&inode->i_list, &inode_unused);
 +                      list_del_init(&inode->i_wb_list);
                }
        }
        inode_sync_complete(inode);
@@@ -466,7 -465,8 +466,7 @@@ static int writeback_sb_inodes(struct s
  {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
 -              struct inode *inode = list_entry(wb->b_io.prev,
 -                                               struct inode, i_list);
 +              struct inode *inode = wb_inode(wb->b_io.prev);
  
                if (inode->i_sb != sb) {
                        if (only_this_sb) {
                        return 0;
                }
  
 -              if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 +              /*
 +               * Don't bother with new inodes or inodes being freed; the first
 +               * kind does not need periodic writeout yet, and for the latter
 +               * kind writeout is handled by the freer.
 +               */
 +              if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
 +
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
                if (inode_dirtied_after(inode, wbc->wb_start))
                        return 1;
  
 -              BUG_ON(inode->i_state & I_FREEING);
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
@@@ -541,7 -536,8 +541,7 @@@ void writeback_inodes_wb(struct bdi_wri
                queue_io(wb, wbc->older_than_this);
  
        while (!list_empty(&wb->b_io)) {
 -              struct inode *inode = list_entry(wb->b_io.prev,
 -                                               struct inode, i_list);
 +              struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
  
                if (!pin_sb_for_writeback(sb)) {
@@@ -586,7 -582,7 +586,7 @@@ static inline bool over_bground_thresh(
        global_dirty_limits(&background_thresh, &dirty_thresh);
  
        return (global_page_state(NR_FILE_DIRTY) +
 -              global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
 +              global_page_state(NR_UNSTABLE_NFS) > background_thresh);
  }
  
  /*
@@@ -679,7 -675,8 +679,7 @@@ static long wb_writeback(struct bdi_wri
                 */
                spin_lock(&inode_lock);
                if (!list_empty(&wb->b_more_io))  {
 -                      inode = list_entry(wb->b_more_io.prev,
 -                                              struct inode, i_list);
 +                      inode = wb_inode(wb->b_more_io.prev);
                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
@@@ -707,17 -704,6 +707,17 @@@ get_next_work_item(struct backing_dev_i
        return work;
  }
  
 +/*
 + * Add in the number of potentially dirty inodes, because each inode
 + * write can dirty pagecache in the underlying blockdev.
 + */
 +static unsigned long get_nr_dirty_pages(void)
 +{
 +      return global_page_state(NR_FILE_DIRTY) +
 +              global_page_state(NR_UNSTABLE_NFS) +
 +              get_nr_dirty_inodes();
 +}
 +
  static long wb_check_old_data_flush(struct bdi_writeback *wb)
  {
        unsigned long expired;
                return 0;
  
        wb->last_old_flush = jiffies;
 -      nr_pages = global_page_state(NR_FILE_DIRTY) +
 -                      global_page_state(NR_UNSTABLE_NFS) +
 -                      (inodes_stat.nr_inodes - inodes_stat.nr_unused);
 +      nr_pages = get_nr_dirty_pages();
  
        if (nr_pages) {
                struct wb_writeback_work work = {
@@@ -802,7 -790,7 +802,7 @@@ int bdi_writeback_thread(void *data
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
  
 -      current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 +      current->flags |= PF_SWAPWRITE;
        set_freezable();
        wb->last_active = jiffies;
  
@@@ -974,7 -962,7 +974,7 @@@ void __mark_inode_dirty(struct inode *i
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
 -                      if (hlist_unhashed(&inode->i_hash))
 +                      if (inode_unhashed(inode))
                                goto out;
                }
                if (inode->i_state & I_FREEING)
                        }
  
                        inode->dirtied_when = jiffies;
 -                      list_move(&inode->i_list, &bdi->wb.b_dirty);
 +                      list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                }
        }
  out:
@@@ -1081,30 -1069,44 +1081,42 @@@ static void wait_sb_inodes(struct super
  }
  
  /**
-  * writeback_inodes_sb        -       writeback dirty inodes from given super_block
+  * writeback_inodes_sb_nr -   writeback dirty inodes from given super_block
   * @sb: the superblock
+  * @nr: the number of pages to write
   *
   * Start writeback on some inodes on this super_block. No guarantees are made
   * on how many (if any) will be written, and this function does not wait
-  * for IO completion of submitted IO. The number of pages submitted is
-  * returned.
+  * for IO completion of submitted IO.
   */
- void writeback_inodes_sb(struct super_block *sb)
+ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
  {
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .done           = &done,
+               .nr_pages       = nr,
        };
  
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-       work.nr_pages = get_nr_dirty_pages();
        bdi_queue_work(sb->s_bdi, &work);
        wait_for_completion(&done);
  }
 -      return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) +
 -                            global_page_state(NR_UNSTABLE_NFS) +
 -                            (inodes_stat.nr_inodes - inodes_stat.nr_unused));
+ EXPORT_SYMBOL(writeback_inodes_sb_nr);
+ /**
+  * writeback_inodes_sb        -       writeback dirty inodes from given super_block
+  * @sb: the superblock
+  *
+  * Start writeback on some inodes on this super_block. No guarantees are made
+  * on how many (if any) will be written, and this function does not wait
+  * for IO completion of submitted IO.
+  */
+ void writeback_inodes_sb(struct super_block *sb)
+ {
++      return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+ }
  EXPORT_SYMBOL(writeback_inodes_sb);
  
  /**
@@@ -1126,6 -1128,27 +1138,27 @@@ int writeback_inodes_sb_if_idle(struct 
  }
  EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  
+ /**
+  * writeback_inodes_sb_if_idle        -       start writeback if none underway
+  * @sb: the superblock
+  * @nr: the number of pages to write
+  *
+  * Invoke writeback_inodes_sb if no writeback is currently underway.
+  * Returns 1 if writeback was started, 0 if not.
+  */
+ int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+                                  unsigned long nr)
+ {
+       if (!writeback_in_progress(sb->s_bdi)) {
+               down_read(&sb->s_umount);
+               writeback_inodes_sb_nr(sb, nr);
+               up_read(&sb->s_umount);
+               return 1;
+       } else
+               return 0;
+ }
+ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
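
The _nr variants exist so a filesystem can ask the flusher threads for a bounded amount of background writeback without blocking; the _if_idle form additionally backs off when writeback is already running on the bdi. A hedged sketch of a caller, assuming kernel context; the helper name and page count are illustrative:

/* Illustrative helper: nudge the flusher to clean up to nr_pages on sb,
 * but only if it is not already busy.  Returns without waiting for IO. */
static void kick_background_writeback(struct super_block *sb,
				      unsigned long nr_pages)
{
	if (writeback_inodes_sb_nr_if_idle(sb, nr_pages))
		pr_debug("queued up to %lu pages of writeback\n", nr_pages);
}
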
  /**
   * sync_inodes_sb     -       sync sb inode pages
   * @sb: the superblock
@@@ -1207,23 -1230,3 +1240,23 @@@ int sync_inode(struct inode *inode, str
        return ret;
  }
  EXPORT_SYMBOL(sync_inode);
 +
 +/**
 + * sync_inode_metadata - write an inode to disk
 + * @inode: the inode to sync
 + * @wait: wait for I/O to complete.
 + *
 + * Write an inode to disk and adjust its dirty state after completion.
 + *
 + * Note: only writes the actual inode, no associated data or other metadata.
 + */
 +int sync_inode_metadata(struct inode *inode, int wait)
 +{
 +      struct writeback_control wbc = {
 +              .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
 +              .nr_to_write = 0, /* metadata-only */
 +      };
 +
 +      return sync_inode(inode, &wbc);
 +}
 +EXPORT_SYMBOL(sync_inode_metadata);
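
sync_inode_metadata() writes just the inode itself (nr_to_write = 0 keeps data pages out of it), which suits callers that changed only timestamps, size or other inode fields. A hedged sketch of a caller, assuming kernel context; the helper name is hypothetical:

/* Hypothetical caller: persist an inode after a metadata-only change. */
static int flush_inode_after_touch(struct inode *inode)
{
	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	mark_inode_dirty_sync(inode);
	/* second argument: wait for the inode write to complete */
	return sync_inode_metadata(inode, 1);
}
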
diff --combined include/linux/writeback.h
@@@ -10,6 -10,8 +10,6 @@@
  struct backing_dev_info;
  
  extern spinlock_t inode_lock;
 -extern struct list_head inode_in_use;
 -extern struct list_head inode_unused;
  
  /*
   * fs/fs-writeback.c
@@@ -58,7 -60,9 +58,9 @@@ struct writeback_control 
  struct bdi_writeback;
  int inode_wait(void *);
  void writeback_inodes_sb(struct super_block *);
+ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
  int writeback_inodes_sb_if_idle(struct super_block *);
+ int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
  void sync_inodes_sb(struct super_block *);
  void writeback_inodes_wb(struct bdi_writeback *wb,
                struct writeback_control *wbc);
@@@ -141,16 -145,12 +143,16 @@@ typedef int (*writepage_t)(struct page 
  
  int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc);
 +void tag_pages_for_writeback(struct address_space *mapping,
 +                           pgoff_t start, pgoff_t end);
  int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
  int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
  void set_page_dirty_balance(struct page *page, int page_mkwrite);
  void writeback_set_ratelimit(void);
 +void tag_pages_for_writeback(struct address_space *mapping,
 +                           pgoff_t start, pgoff_t end);
  
  /* pdflush.c */
  extern int nr_pdflush_threads;        /* Global so it can be exported to sysctl