Btrfs: make sure reserve_metadata_bytes doesn't leak out strange errors

[pandora-kernel.git] / fs / btrfs / extent-tree.c
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 169bd62..4d08ed7 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
         return total_added;
  }
  
-static int caching_kthread(void *data)
+static noinline void caching_thread(struct btrfs_work *work)
  {
-       struct btrfs_block_group_cache *block_group = data;
-       struct btrfs_fs_info *fs_info = block_group->fs_info;
-       struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
-       struct btrfs_root *extent_root = fs_info->extent_root;
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_caching_control *caching_ctl;
+       struct btrfs_root *extent_root;
         struct btrfs_path *path;
         struct extent_buffer *leaf;
         struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
         u32 nritems;
         int ret = 0;
  
+       caching_ctl = container_of(work, struct btrfs_caching_control, work);
+       block_group = caching_ctl->block_group;
+       fs_info = block_group->fs_info;
+       extent_root = fs_info->extent_root;
+
         path = btrfs_alloc_path();
         if (!path)
-               return -ENOMEM;
+               goto out;
  
         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
  
@@ -348,7 +353,7 @@ static int caching_kthread(void *data)
          */
         path->skip_locking = 1;
         path->search_commit_root = 1;
-       path->reada = 2;
+       path->reada = 1;
  
         key.objectid = last;
         key.offset = 0;
@@ -366,8 +371,7 @@ again:
         nritems = btrfs_header_nritems(leaf);
  
         while (1) {
-               smp_mb();
-               if (fs_info->closing > 1) {
+               if (btrfs_fs_closing(fs_info) > 1) {
                         last = (u64)-1;
                         break;
                 }
@@ -379,15 +383,18 @@ again:
                         if (ret)
                                 break;
  
-                       caching_ctl->progress = last;
-                       btrfs_release_path(path);
-                       up_read(&fs_info->extent_commit_sem);
-                       mutex_unlock(&caching_ctl->mutex);
-                       if (btrfs_transaction_in_commit(fs_info))
-                               schedule_timeout(1);
-                       else
+                       if (need_resched() ||
+                           btrfs_next_leaf(extent_root, path)) {
+                               caching_ctl->progress = last;
+                               btrfs_release_path(path);
+                               up_read(&fs_info->extent_commit_sem);
+                               mutex_unlock(&caching_ctl->mutex);
                                 cond_resched();
-                       goto again;
+                               goto again;
+                       }
+                       leaf = path->nodes[0];
+                       nritems = btrfs_header_nritems(leaf);
+                       continue;
                 }
  
                 if (key.objectid < block_group->key.objectid) {
@@ -431,13 +438,11 @@ err:
         free_excluded_extents(extent_root, block_group);
  
         mutex_unlock(&caching_ctl->mutex);
+out:
         wake_up(&caching_ctl->wait);
  
         put_caching_control(caching_ctl);
-       atomic_dec(&block_group->space_info->caching_threads);
         btrfs_put_block_group(block_group);
-
-       return 0;
  }
  
  static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -447,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
  {
         struct btrfs_fs_info *fs_info = cache->fs_info;
         struct btrfs_caching_control *caching_ctl;
-       struct task_struct *tsk;
         int ret = 0;
  
         smp_mb();
@@ -499,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         caching_ctl->progress = cache->key.objectid;
         /* one for caching kthread, one for caching block group list */
         atomic_set(&caching_ctl->count, 2);
+       caching_ctl->work.func = caching_thread;
  
         spin_lock(&cache->lock);
         if (cache->cached != BTRFS_CACHE_NO) {
@@ -514,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
         up_write(&fs_info->extent_commit_sem);
  
-       atomic_inc(&cache->space_info->caching_threads);
         btrfs_get_block_group(cache);
  
-       tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
-                         cache->key.objectid);
-       if (IS_ERR(tsk)) {
-               ret = PTR_ERR(tsk);
-               printk(KERN_ERR "error running thread %d\n", ret);
-               BUG();
-       }
+       btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
  
         return ret;
  }
@@ -2930,9 +2928,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
         found->full = 0;
         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
         found->chunk_alloc = 0;
+       found->flush = 0;
+       init_waitqueue_head(&found->wait);
         *space_info = found;
         list_add_rcu(&found->list, &info->space_info);
-       atomic_set(&found->caching_threads, 0);
         return 0;
  }
  
@@ -3065,7 +3064,7 @@ again:
                         spin_unlock(&data_sinfo->lock);
  alloc:
                         alloc_target = btrfs_get_alloc_profile(root, 1);
-                       trans = btrfs_join_transaction(root, 1);
+                       trans = btrfs_join_transaction(root);
                         if (IS_ERR(trans))
                                 return PTR_ERR(trans);
  
@@ -3087,13 +3086,21 @@ alloc:
                         }
                         goto again;
                 }
+
+               /*
+                * If we have less pinned bytes than we want to allocate then
+                * don't bother committing the transaction, it won't help us.
+                */
+               if (data_sinfo->bytes_pinned < bytes)
+                       committed = 1;
                 spin_unlock(&data_sinfo->lock);
  
                 /* commit the current transaction and try again */
  commit_trans:
-               if (!committed && !root->fs_info->open_ioctl_trans) {
+               if (!committed &&
+                   !atomic_read(&root->fs_info->open_ioctl_trans)) {
                         committed = 1;
-                       trans = btrfs_join_transaction(root, 1);
+                       trans = btrfs_join_transaction(root);
                         if (IS_ERR(trans))
                                 return PTR_ERR(trans);
                         ret = btrfs_commit_transaction(trans, root);
@@ -3304,9 +3311,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
         if (reserved == 0)
                 return 0;
  
-       /* nothing to shrink - nothing to reclaim */
-       if (root->fs_info->delalloc_bytes == 0)
+       smp_mb();
+       if (root->fs_info->delalloc_bytes == 0) {
+               if (trans)
+                       return 0;
+               btrfs_wait_ordered_extents(root, 0, 0);
                 return 0;
+       }
  
         max_reclaim = min(reserved, to_reclaim);
  
@@ -3350,6 +3361,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                 }
  
         }
+       if (reclaimed >= to_reclaim && !trans)
+               btrfs_wait_ordered_extents(root, 0, 0);
         return reclaimed >= to_reclaim;
  }
  
@@ -3374,15 +3387,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
         u64 num_bytes = orig_bytes;
         int retries = 0;
         int ret = 0;
-       bool reserved = false;
         bool committed = false;
+       bool flushing = false;
  
  again:
-       ret = -ENOSPC;
-       if (reserved)
-               num_bytes = 0;
-
+       ret = 0;
         spin_lock(&space_info->lock);
+       /*
+        * We only want to wait if somebody other than us is flushing and we are
+        * actually allowed to flush.
+        */
+       while (flush && !flushing && space_info->flush) {
+               spin_unlock(&space_info->lock);
+               /*
+                * If we have a trans handle we can't wait because the flusher
+                * may have to commit the transaction, which would mean we would
+                * deadlock since we are waiting for the flusher to finish, but
+                * hold the current transaction open.
+                */
+               if (trans)
+                       return -EAGAIN;
+               ret = wait_event_interruptible(space_info->wait,
+                                              !space_info->flush);
+               /* Must have been interrupted, return */
+               if (ret)
+                       return -EINTR;
+
+               spin_lock(&space_info->lock);
+       }
+
+       ret = -ENOSPC;
         unused = space_info->bytes_used + space_info->bytes_reserved +
                  space_info->bytes_pinned + space_info->bytes_readonly +
                  space_info->bytes_may_use;
@@ -3397,8 +3431,7 @@ again:
         if (unused <= space_info->total_bytes) {
                 unused = space_info->total_bytes - unused;
                 if (unused >= num_bytes) {
-                       if (!reserved)
-                               space_info->bytes_reserved += orig_bytes;
+                       space_info->bytes_reserved += orig_bytes;
                         ret = 0;
                 } else {
                         /*
@@ -3423,17 +3456,14 @@ again:
          * to reclaim space we can actually use it instead of somebody else
          * stealing it from us.
          */
-       if (ret && !reserved) {
-               space_info->bytes_reserved += orig_bytes;
-               reserved = true;
+       if (ret && flush) {
+               flushing = true;
+               space_info->flush = 1;
         }
  
         spin_unlock(&space_info->lock);
  
-       if (!ret)
-               return 0;
-
-       if (!flush)
+       if (!ret || !flush)
                 goto out;
  
         /*
@@ -3441,11 +3471,11 @@ again:
          * metadata until after the IO is completed.
          */
         ret = shrink_delalloc(trans, root, num_bytes, 1);
-       if (ret > 0)
-               return 0;
-       else if (ret < 0)
+       if (ret < 0)
                 goto out;
  
+       ret = 0;
+
         /*
          * So if we were overcommitted it's possible that somebody else flushed
          * out enough space and we simply didn't have enough space to reclaim,
@@ -3456,11 +3486,11 @@ again:
                 goto again;
         }
  
-       spin_lock(&space_info->lock);
         /*
          * Not enough space to be reclaimed, don't bother committing the
          * transaction.
          */
+       spin_lock(&space_info->lock);
         if (space_info->bytes_pinned < orig_bytes)
                 ret = -ENOSPC;
         spin_unlock(&space_info->lock);
@@ -3468,11 +3498,14 @@ again:
                 goto out;
  
         ret = -EAGAIN;
-       if (trans || committed)
+       if (trans)
                 goto out;
  
         ret = -ENOSPC;
-       trans = btrfs_join_transaction(root, 1);
+       if (committed)
+               goto out;
+
+       trans = btrfs_join_transaction(root);
         if (IS_ERR(trans))
                 goto out;
         ret = btrfs_commit_transaction(trans, root);
@@ -3483,12 +3516,12 @@ again:
         }
  
  out:
-       if (reserved) {
+       if (flushing) {
                 spin_lock(&space_info->lock);
-               space_info->bytes_reserved -= orig_bytes;
+               space_info->flush = 0;
+               wake_up_all(&space_info->wait);
                 spin_unlock(&space_info->lock);
         }
-
         return ret;
  }
  
@@ -3698,8 +3731,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
         if (commit_trans) {
                 if (trans)
                         return -EAGAIN;
-
-               trans = btrfs_join_transaction(root, 1);
+               trans = btrfs_join_transaction(root);
                 BUG_ON(IS_ERR(trans));
                 ret = btrfs_commit_transaction(trans, root);
                 return 0;
@@ -3837,24 +3869,35 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
  }
  
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                int num_items)
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_block_rsv *rsv)
  {
+       struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
         u64 num_bytes;
         int ret;
  
-       if (num_items == 0 || root->fs_info->chunk_root == root)
+       /*
+        * Truncate should be freeing data, but give us 2 items just in case it
+        * needs to use some space.  We may want to be smarter about this in the
+        * future.
+        */
+       num_bytes = btrfs_calc_trans_metadata_size(root, 2);
+
+       /* We already have enough bytes, just return */
+       if (rsv->reserved >= num_bytes)
                 return 0;
  
-       num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-       ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-                                 num_bytes);
-       if (!ret) {
-               trans->bytes_reserved += num_bytes;
-               trans->block_rsv = &root->fs_info->trans_block_rsv;
-       }
-       return ret;
+       num_bytes -= rsv->reserved;
+
+       /*
+        * You should have reserved enough space beforehand to do this, so this
+        * should not fail.
+        */
+       ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+       BUG_ON(ret);
+
+       return 0;
  }
  
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3877,23 +3920,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
  
         /*
-        * one for deleting orphan item, one for updating inode and
-        * two for calling btrfs_truncate_inode_items.
-        *
-        * btrfs_truncate_inode_items is a delete operation, it frees
-        * more space than it uses in most cases. So two units of
-        * metadata space should be enough for calling it many times.
-        * If all of the metadata space is used, we can commit
-        * transaction and use space it freed.
+        * We need to hold space in order to delete our orphan item once we've
+        * added it, so this takes the reservation so we can release it later
+        * when we are truly done with the orphan item.
          */
-       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
  void btrfs_orphan_release_metadata(struct inode *inode)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
  }
  
@@ -3912,6 +3950,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
+static unsigned drop_outstanding_extent(struct inode *inode)
+{
+       unsigned dropped_extents = 0;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       BUG_ON(!BTRFS_I(inode)->outstanding_extents);
+       BTRFS_I(inode)->outstanding_extents--;
+
+       /*
+        * If we have more or the same amount of outstanding extents than we have
+        * reserved then we need to leave the reserved extents count alone.
+        */
+       if (BTRFS_I(inode)->outstanding_extents >=
+           BTRFS_I(inode)->reserved_extents)
+               goto out;
+
+       dropped_extents = BTRFS_I(inode)->reserved_extents -
+               BTRFS_I(inode)->outstanding_extents;
+       BTRFS_I(inode)->reserved_extents -= dropped_extents;
+out:
+       spin_unlock(&BTRFS_I(inode)->lock);
+       return dropped_extents;
+}
+
  static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
  {
         return num_bytes >>= 3;
@@ -3921,9 +3983,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
-       u64 to_reserve;
-       int nr_extents;
-       int reserved_extents;
+       u64 to_reserve = 0;
+       unsigned nr_extents = 0;
         int ret;
  
         if (btrfs_transaction_in_commit(root->fs_info))
@@ -3931,66 +3992,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
  
         num_bytes = ALIGN(num_bytes, root->sectorsize);
  
-       nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
-       reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
+       spin_lock(&BTRFS_I(inode)->lock);
+       BTRFS_I(inode)->outstanding_extents++;
+
+       if (BTRFS_I(inode)->outstanding_extents >
+           BTRFS_I(inode)->reserved_extents) {
+               nr_extents = BTRFS_I(inode)->outstanding_extents -
+                       BTRFS_I(inode)->reserved_extents;
+               BTRFS_I(inode)->reserved_extents += nr_extents;
  
-       if (nr_extents > reserved_extents) {
-               nr_extents -= reserved_extents;
                 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
-       } else {
-               nr_extents = 0;
-               to_reserve = 0;
         }
+       spin_unlock(&BTRFS_I(inode)->lock);
  
         to_reserve += calc_csum_metadata_size(inode, num_bytes);
         ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
-       if (ret)
+       if (ret) {
+               unsigned dropped;
+               /*
+                * We don't need the return value since our reservation failed,
+                * we just need to clean up our counter.
+                */
+               dropped = drop_outstanding_extent(inode);
+               WARN_ON(dropped > 1);
                 return ret;
-
-       atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
-       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+       }
  
         block_rsv_add_bytes(block_rsv, to_reserve, 1);
  
-       if (block_rsv->size > 512 * 1024 * 1024)
-               shrink_delalloc(NULL, root, to_reserve, 0);
-
         return 0;
  }
  
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 to_free;
-       int nr_extents;
-       int reserved_extents;
+       u64 to_free = 0;
+       unsigned dropped;
  
         num_bytes = ALIGN(num_bytes, root->sectorsize);
-       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
-       WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
-
-       reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
-       do {
-               int old, new;
-
-               nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
-               if (nr_extents >= reserved_extents) {
-                       nr_extents = 0;
-                       break;
-               }
-               old = reserved_extents;
-               nr_extents = reserved_extents - nr_extents;
-               new = reserved_extents - nr_extents;
-               old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
-                                    reserved_extents, new);
-               if (likely(old == reserved_extents))
-                       break;
-               reserved_extents = old;
-       } while (1);
+       dropped = drop_outstanding_extent(inode);
  
         to_free = calc_csum_metadata_size(inode, num_bytes);
-       if (nr_extents > 0)
-               to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
+       if (dropped > 0)
+               to_free += btrfs_calc_trans_metadata_size(root, dropped);
  
         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                 to_free);
@@ -4810,7 +4854,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                                      u64 num_bytes, u64 empty_size,
                                      u64 search_start, u64 search_end,
                                      u64 hint_byte, struct btrfs_key *ins,
-                                    int data)
+                                    u64 data)
  {
         int ret = 0;
         struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -4837,7 +4881,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
  
         space_info = __find_space_info(root->fs_info, data);
         if (!space_info) {
-               printk(KERN_ERR "No space info for %d\n", data);
+               printk(KERN_ERR "No space info for %llu\n", data);
                 return -ENOSPC;
         }
  
@@ -4958,14 +5002,10 @@ have_block_group:
                         }
  
                         /*
-                        * We only want to start kthread caching if we are at
-                        * the point where we will wait for caching to make
-                        * progress, or if our ideal search is over and we've
-                        * found somebody to start caching.
+                        * The caching workers are limited to 2 threads, so we
+                        * can queue as much work as we care to.
                          */
-                       if (loop > LOOP_CACHING_NOWAIT ||
-                           (loop > LOOP_FIND_IDEAL &&
-                            atomic_read(&space_info->caching_threads) < 2)) {
+                       if (loop > LOOP_FIND_IDEAL) {
                                 ret = cache_block_group(block_group, trans,
                                                         orig_root, 0);
                                 BUG_ON(ret);
@@ -4987,6 +5027,15 @@ have_block_group:
                 if (unlikely(block_group->ro))
                         goto loop;
  
+               spin_lock(&block_group->free_space_ctl->tree_lock);
+               if (cached &&
+                   block_group->free_space_ctl->free_space <
+                   num_bytes + empty_size) {
+                       spin_unlock(&block_group->free_space_ctl->tree_lock);
+                       goto loop;
+               }
+               spin_unlock(&block_group->free_space_ctl->tree_lock);
+
                 /*
                  * Ok we want to try and use the cluster allocator, so lets look
                  * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -5150,6 +5199,7 @@ checks:
                         btrfs_add_free_space(block_group, offset,
                                              search_start - offset);
                 BUG_ON(offset > search_start);
+               btrfs_put_block_group(block_group);
                 break;
  loop:
                 failed_cluster_refill = false;
@@ -5172,15 +5222,12 @@ loop:
          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
          *                      again
          */
-       if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
-           (found_uncached_bg || empty_size || empty_cluster ||
-            allowed_chunk_alloc)) {
+       if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                 index = 0;
                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                         found_uncached_bg = false;
                         loop++;
-                       if (!ideal_cache_percent &&
-                           atomic_read(&space_info->caching_threads))
+                       if (!ideal_cache_percent)
                                 goto search;
  
                         /*
@@ -5214,42 +5261,39 @@ loop:
                         goto search;
                 }
  
-               if (loop < LOOP_CACHING_WAIT) {
-                       loop++;
-                       goto search;
-               }
+               loop++;
  
                 if (loop == LOOP_ALLOC_CHUNK) {
-                       empty_size = 0;
-                       empty_cluster = 0;
-               }
+                      if (allowed_chunk_alloc) {
+                               ret = do_chunk_alloc(trans, root, num_bytes +
+                                                    2 * 1024 * 1024, data,
+                                                    CHUNK_ALLOC_LIMITED);
+                               allowed_chunk_alloc = 0;
+                               if (ret == 1)
+                                       done_chunk_alloc = 1;
+                       } else if (!done_chunk_alloc &&
+                                  space_info->force_alloc ==
+                                  CHUNK_ALLOC_NO_FORCE) {
+                               space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                       }
  
-               if (allowed_chunk_alloc) {
-                       ret = do_chunk_alloc(trans, root, num_bytes +
-                                            2 * 1024 * 1024, data,
-                                            CHUNK_ALLOC_LIMITED);
-                       allowed_chunk_alloc = 0;
-                       done_chunk_alloc = 1;
-               } else if (!done_chunk_alloc &&
-                          space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
-                       space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                      /*
+                       * We didn't allocate a chunk, go ahead and drop the
+                       * empty size and loop again.
+                       */
+                      if (!done_chunk_alloc)
+                              loop = LOOP_NO_EMPTY_SIZE;
                 }
  
-               if (loop < LOOP_NO_EMPTY_SIZE) {
-                       loop++;
-                       goto search;
+               if (loop == LOOP_NO_EMPTY_SIZE) {
+                       empty_size = 0;
+                       empty_cluster = 0;
                 }
-               ret = -ENOSPC;
+
+               goto search;
         } else if (!ins->objectid) {
                 ret = -ENOSPC;
-       }
-
-       /* we found what we needed */
-       if (ins->objectid) {
-               if (!(data & BTRFS_BLOCK_GROUP_DATA))
-                       trans->block_group = block_group->key.objectid;
-
-               btrfs_put_block_group(block_group);
+       } else if (ins->objectid) {
                 ret = 0;
         }
  
@@ -5586,7 +5630,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
         if (!buf)
                 return ERR_PTR(-ENOMEM);
         btrfs_set_header_generation(buf, trans->transid);
-       btrfs_set_buffer_lockdep_class(buf, level);
+       btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
         btrfs_tree_lock(buf);
         clean_tree_block(trans, root, buf);
  
@@ -5873,7 +5917,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                         return 1;
  
                 if (path->locks[level] && !wc->keep_locks) {
-                       btrfs_tree_unlock(eb);
+                       btrfs_tree_unlock_rw(eb, path->locks[level]);
                         path->locks[level] = 0;
                 }
                 return 0;
@@ -5897,7 +5941,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
          * keep the tree lock
          */
         if (path->locks[level] && level > 0) {
-               btrfs_tree_unlock(eb);
+               btrfs_tree_unlock_rw(eb, path->locks[level]);
                 path->locks[level] = 0;
         }
         return 0;
@@ -6010,7 +6054,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
         BUG_ON(level != btrfs_header_level(next));
         path->nodes[level] = next;
         path->slots[level] = 0;
-       path->locks[level] = 1;
+       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
         wc->level = level;
         if (wc->level == 1)
                 wc->reada_slot = 0;
@@ -6081,7 +6125,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                         BUG_ON(level == 0);
                         btrfs_tree_lock(eb);
                         btrfs_set_lock_blocking(eb);
-                       path->locks[level] = 1;
+                       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
  
                         ret = btrfs_lookup_extent_info(trans, root,
                                                        eb->start, eb->len,
@@ -6090,8 +6134,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                         BUG_ON(ret);
                         BUG_ON(wc->refs[level] == 0);
                         if (wc->refs[level] == 1) {
-                               btrfs_tree_unlock(eb);
-                               path->locks[level] = 0;
+                               btrfs_tree_unlock_rw(eb, path->locks[level]);
                                 return 1;
                         }
                 }
@@ -6113,7 +6156,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                     btrfs_header_generation(eb) == trans->transid) {
                         btrfs_tree_lock(eb);
                         btrfs_set_lock_blocking(eb);
-                       path->locks[level] = 1;
+                       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                 }
                 clean_tree_block(trans, root, eb);
         }
@@ -6192,7 +6235,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                                 return 0;
  
                         if (path->locks[level]) {
-                               btrfs_tree_unlock(path->nodes[level]);
+                               btrfs_tree_unlock_rw(path->nodes[level],
+                                                    path->locks[level]);
                                 path->locks[level] = 0;
                         }
                         free_extent_buffer(path->nodes[level]);
@@ -6244,7 +6288,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                 path->nodes[level] = btrfs_lock_root_node(root);
                 btrfs_set_lock_blocking(path->nodes[level]);
                 path->slots[level] = 0;
-               path->locks[level] = 1;
+               path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                 memset(&wc->update_progress, 0,
                        sizeof(wc->update_progress));
         } else {
@@ -6412,7 +6456,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
         level = btrfs_header_level(node);
         path->nodes[level] = node;
         path->slots[level] = 0;
-       path->locks[level] = 1;
+       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
  
         wc->refs[parent_level] = 1;
         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6487,15 +6531,28 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
         return flags;
  }
  
-static int set_block_group_ro(struct btrfs_block_group_cache *cache)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
  {
         struct btrfs_space_info *sinfo = cache->space_info;
         u64 num_bytes;
+       u64 min_allocable_bytes;
         int ret = -ENOSPC;
  
         if (cache->ro)
                 return 0;
  
+       /*
+        * We need some metadata space and system metadata space for
+        * allocating chunks in some corner cases until we force to set
+        * it to be readonly.
+        */
+       if ((sinfo->flags &
+            (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+           !force)
+               min_allocable_bytes = 1 * 1024 * 1024;
+       else
+               min_allocable_bytes = 0;
+
         spin_lock(&sinfo->lock);
         spin_lock(&cache->lock);
         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
@@ -6503,7 +6560,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
  
         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
             sinfo->bytes_may_use + sinfo->bytes_readonly +
-           cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
+           cache->reserved_pinned + num_bytes + min_allocable_bytes <=
+           sinfo->total_bytes) {
                 sinfo->bytes_readonly += num_bytes;
                 sinfo->bytes_reserved += cache->reserved_pinned;
                 cache->reserved_pinned = 0;
@@ -6526,7 +6584,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
  
         BUG_ON(cache->ro);
  
-       trans = btrfs_join_transaction(root, 1);
+       trans = btrfs_join_transaction(root);
         BUG_ON(IS_ERR(trans));
  
         alloc_flags = update_block_group_flags(root, cache->flags);
@@ -6534,7 +6592,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
                                CHUNK_ALLOC_FORCE);
  
-       ret = set_block_group_ro(cache);
+       ret = set_block_group_ro(cache, 0);
         if (!ret)
                 goto out;
         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6542,7 +6600,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                              CHUNK_ALLOC_FORCE);
         if (ret < 0)
                 goto out;
-       ret = set_block_group_ro(cache);
+       ret = set_block_group_ro(cache, 0);
  out:
         btrfs_end_transaction(trans, root);
         return ret;
@@ -6882,6 +6940,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
         path = btrfs_alloc_path();
         if (!path)
                 return -ENOMEM;
+       path->reada = 1;
  
         cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
         if (cache_gen != 0 &&
@@ -6978,7 +7037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
  
                 set_avail_alloc_bits(root->fs_info, cache->flags);
                 if (btrfs_chunk_readonly(root, cache->key.objectid))
-                       set_block_group_ro(cache);
+                       set_block_group_ro(cache, 1);
         }
  
         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -6992,9 +7051,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                  * mirrored block groups.
                  */
                 list_for_each_entry(cache, &space_info->block_groups[3], list)
-                       set_block_group_ro(cache);
+                       set_block_group_ro(cache, 1);
                 list_for_each_entry(cache, &space_info->block_groups[4], list)
-                       set_block_group_ro(cache);
+                       set_block_group_ro(cache, 1);
         }
  
         init_global_block_rsv(info);