Btrfs: Add a write ahead tree log to optimize synchronous operations
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc3c03c..646b914 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -26,6 +27,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -245,6 +247,7 @@ static int noinline find_search_start(struct btrfs_root *root,
        u64 search_start = *start_ret;
        int wrapped = 0;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
        free_space_cache = &root->fs_info->free_space_cache;
 
@@ -318,7 +321,7 @@ no_cache:
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
        }
        cache_miss = 0;
-       cache = __btrfs_find_block_group(root, cache, last, data, 0);
+       cache = btrfs_find_block_group(root, cache, last, data, 0);
        if (!cache)
                goto no_cache;
        *cache_ret = cache;
@@ -378,19 +381,25 @@ __btrfs_find_block_group(struct btrfs_root *root,
                struct btrfs_block_group_cache *shint;
                shint = btrfs_lookup_first_block_group(info, search_start);
                if (shint && block_group_bits(shint, data) && !shint->ro) {
+                       spin_lock(&shint->lock);
                        used = btrfs_block_group_used(&shint->item);
                        if (used + shint->pinned <
                            div_factor(shint->key.offset, factor)) {
+                               spin_unlock(&shint->lock);
                                return shint;
                        }
+                       spin_unlock(&shint->lock);
                }
        }
        if (hint && !hint->ro && block_group_bits(hint, data)) {
+               spin_lock(&hint->lock);
                used = btrfs_block_group_used(&hint->item);
                if (used + hint->pinned <
                    div_factor(hint->key.offset, factor)) {
+                       spin_unlock(&hint->lock);
                        return hint;
                }
+               spin_unlock(&hint->lock);
                last = hint->key.objectid + hint->key.offset;
        } else {
                if (hint)
@@ -412,6 +421,7 @@ again:
                }
 
                cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
+               spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);
 
@@ -419,9 +429,11 @@ again:
                        free_check = div_factor(cache->key.offset, factor);
                        if (used + cache->pinned < free_check) {
                                found_group = cache;
+                               spin_unlock(&cache->lock);
                                goto found;
                        }
                }
+               spin_unlock(&cache->lock);
                cond_resched();
        }
        if (!wrapped) {
@@ -446,9 +458,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 {
 
        struct btrfs_block_group_cache *ret;
-       mutex_lock(&root->fs_info->alloc_mutex);
        ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
-       mutex_unlock(&root->fs_info->alloc_mutex);
        return ret;
 }
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
@@ -486,6 +496,23 @@ static int match_extent_ref(struct extent_buffer *leaf,
        return ret == 0;
 }
 
+/* simple helper to search for an existing extent at a given offset */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+                       u64 start, u64 len)
+{
+       int ret;
+       struct btrfs_key key;
+
+       maybe_lock_mutex(root);
+       key.objectid = start;
+       key.offset = len;
+       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+       ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+                               0, 0);
+       maybe_unlock_mutex(root);
+       return ret;
+}
+
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 bytenr,
@@ -793,70 +820,57 @@ out:
        return 0;
 }
 
-u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
-                                 struct btrfs_path *count_path,
-                                 u64 expected_owner,
-                                 u64 first_extent)
+
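+/*
+ * count how the extent at bytenr is referenced.  *ref_count comes back
+ * as 1 while every reference found belongs to this root, matches
+ * ref_objectid and is no older than parent_gen, and flips to 2 as soon
+ * as one reference breaks those rules.  *min_generation tracks the
+ * smallest ref generation seen.  Returns 0 on success, 1 if no extent
+ * item exists for bytenr.
+ */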
+static int get_reference_status(struct btrfs_root *root, u64 bytenr,
+                               u64 parent_gen, u64 ref_objectid,
+                               u64 *min_generation, u32 *ref_count)
 {
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        struct btrfs_path *path;
-       u64 bytenr;
-       u64 found_objectid;
-       u64 found_owner;
+       struct extent_buffer *leaf;
+       struct btrfs_extent_ref *ref_item;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
        u64 root_objectid = root->root_key.objectid;
-       u32 total_count = 0;
-       u32 extent_refs;
-       u32 cur_count;
+       u64 ref_generation;
        u32 nritems;
        int ret;
-       struct btrfs_key key;
-       struct btrfs_key found_key;
-       struct extent_buffer *l;
-       struct btrfs_extent_item *item;
-       struct btrfs_extent_ref *ref_item;
-       int level = -1;
-
-       /* FIXME, needs locking */
-       BUG();
-
-       mutex_lock(&root->fs_info->alloc_mutex);
-       path = btrfs_alloc_path();
-again:
-       if (level == -1)
-               bytenr = first_extent;
-       else
-               bytenr = count_path->nodes[level]->start;
 
-       cur_count = 0;
        key.objectid = bytenr;
        key.offset = 0;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
 
-       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+       path = btrfs_alloc_path();
+       mutex_lock(&root->fs_info->alloc_mutex);
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        BUG_ON(ret == 0);
 
-       l = path->nodes[0];
-       btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
        if (found_key.objectid != bytenr ||
            found_key.type != BTRFS_EXTENT_ITEM_KEY) {
+               ret = 1;
                goto out;
        }
 
-       item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-       extent_refs = btrfs_extent_refs(l, item);
+       *ref_count = 0;
+       *min_generation = (u64)-1;
+
        while (1) {
-               l = path->nodes[0];
-               nritems = btrfs_header_nritems(l);
+               leaf = path->nodes[0];
+               nritems = btrfs_header_nritems(leaf);
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(extent_root, path);
+                       if (ret < 0)
+                               goto out;
                        if (ret == 0)
                                continue;
                        break;
                }
-               btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.objectid != bytenr)
                        break;
 
@@ -865,61 +879,123 @@ again:
                        continue;
                }
 
-               cur_count++;
-               ref_item = btrfs_item_ptr(l, path->slots[0],
+               ref_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_extent_ref);
-               found_objectid = btrfs_ref_root(l, ref_item);
-
-               if (found_objectid != root_objectid) {
-                       total_count = 2;
-                       goto out;
-               }
-               if (level == -1) {
-                       found_owner = btrfs_ref_objectid(l, ref_item);
-                       if (found_owner != expected_owner) {
-                               total_count = 2;
-                               goto out;
-                       }
-                       /*
-                        * nasty.  we don't count a reference held by
-                        * the running transaction.  This allows nodatacow
-                        * to avoid cow most of the time
-                        */
-                       if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
-                           btrfs_ref_generation(l, ref_item) ==
-                           root->fs_info->generation) {
-                               extent_refs--;
-                       }
+               ref_generation = btrfs_ref_generation(leaf, ref_item);
+               /*
+                * For (parent_gen > 0 && parent_gen > ref_gen):
+                *
+                * we reach here through the oldest root, therefore
+                * all other references from the same snapshot should
+                * have a larger generation.
+                */
+               if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
+                   (parent_gen > 0 && parent_gen > ref_generation) ||
+                   (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+                    ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
+                       if (ref_count)
+                               *ref_count = 2;
+                       break;
                }
-               total_count = 1;
+
+               *ref_count = 1;
+               if (*min_generation > ref_generation)
+                       *min_generation = ref_generation;
+
                path->slots[0]++;
        }
-       /*
-        * if there is more than one reference against a data extent,
-        * we have to assume the other ref is another snapshot
-        */
-       if (level == -1 && extent_refs > 1) {
-               total_count = 2;
+       ret = 0;
+out:
+       mutex_unlock(&root->fs_info->alloc_mutex);
+       btrfs_free_path(path);
+       return ret;
+}
+
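+/*
+ * check whether the data extent at bytenr may be referenced from a
+ * snapshot other than the running root.  Returns 0 when this root
+ * holds the only live reference, 1 when a cross reference may exist,
+ * and a negative errno on failure.
+ */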
+int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct btrfs_key *key, u64 bytenr)
+{
+       struct btrfs_root *old_root;
+       struct btrfs_path *path = NULL;
+       struct extent_buffer *eb;
+       struct btrfs_file_extent_item *item;
+       u64 ref_generation;
+       u64 min_generation;
+       u64 extent_start;
+       u32 ref_count;
+       int level;
+       int ret;
+
+       BUG_ON(trans == NULL);
+       BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
+       ret = get_reference_status(root, bytenr, 0, key->objectid,
+                                  &min_generation, &ref_count);
+       if (ret)
+               return ret;
+
+       if (ref_count != 1)
+               return 1;
+
+       old_root = root->dirty_root->root;
+       ref_generation = old_root->root_key.offset;
+
+       /* all references were created in the running transaction */
+       if (min_generation > ref_generation) {
+               ret = 0;
                goto out;
        }
-       if (cur_count == 0) {
-               total_count = 0;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
                goto out;
        }
-       if (level >= 0 && root->node == count_path->nodes[level])
+
+       path->skip_locking = 1;
+       /* if no item is found, the extent is referenced by another snapshot */
+       ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
+       if (ret)
                goto out;
-       level++;
-       btrfs_release_path(root, path);
-       goto again;
 
+       eb = path->nodes[0];
+       item = btrfs_item_ptr(eb, path->slots[0],
+                             struct btrfs_file_extent_item);
+       if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
+           btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
+               ret = 1;
+               goto out;
+       }
+
+       for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
+               if (level >= 0) {
+                       eb = path->nodes[level];
+                       if (!eb)
+                               continue;
+                       extent_start = eb->start;
+               } else
+                       extent_start = bytenr;
+
+               ret = get_reference_status(root, extent_start, ref_generation,
+                                          0, &min_generation, &ref_count);
+               if (ret)
+                       goto out;
+
+               if (ref_count != 1) {
+                       ret = 1;
+                       goto out;
+               }
+               if (level >= 0)
+                       ref_generation = btrfs_header_generation(eb);
+       }
+       ret = 0;
 out:
-       btrfs_free_path(path);
-       mutex_unlock(&root->fs_info->alloc_mutex);
-       return total_count;
+       if (path)
+               btrfs_free_path(path);
+       return ret;
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf)
+                 struct extent_buffer *buf, int cache_ref)
 {
        u64 bytenr;
        u32 nritems;
@@ -929,14 +1005,15 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int level;
        int ret;
        int faili;
+       int nr_file_extents = 0;
 
        if (!root->ref_cows)
                return 0;
 
-       mutex_lock(&root->fs_info->alloc_mutex);
        level = btrfs_header_level(buf);
        nritems = btrfs_header_nritems(buf);
        for (i = 0; i < nritems; i++) {
+               cond_resched();
                if (level == 0) {
                        u64 disk_bytenr;
                        btrfs_item_key_to_cpu(buf, &key, i);
@@ -950,29 +1027,85 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
                        if (disk_bytenr == 0)
                                continue;
+
+                       if (buf != root->commit_root)
+                               nr_file_extents++;
+
+                       mutex_lock(&root->fs_info->alloc_mutex);
                        ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
                                    btrfs_file_extent_disk_num_bytes(buf, fi),
                                    root->root_key.objectid, trans->transid,
                                    key.objectid, key.offset);
+                       mutex_unlock(&root->fs_info->alloc_mutex);
                        if (ret) {
                                faili = i;
+                               WARN_ON(1);
                                goto fail;
                        }
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        btrfs_node_key_to_cpu(buf, &key, i);
+
+                       mutex_lock(&root->fs_info->alloc_mutex);
                        ret = __btrfs_inc_extent_ref(trans, root, bytenr,
                                           btrfs_level_size(root, level - 1),
                                           root->root_key.objectid,
                                           trans->transid,
                                           level - 1, key.objectid);
+                       mutex_unlock(&root->fs_info->alloc_mutex);
                        if (ret) {
                                faili = i;
+                               WARN_ON(1);
                                goto fail;
                        }
                }
        }
-       mutex_unlock(&root->fs_info->alloc_mutex);
+       /* cache the original leaf block's references */
+       if (level == 0 && cache_ref && buf != root->commit_root) {
+               struct btrfs_leaf_ref *ref;
+               struct btrfs_extent_info *info;
+
+               ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
+               if (!ref) {
+                       WARN_ON(1);
+                       goto out;
+               }
+
+               ref->root_gen = root->root_key.offset;
+               ref->bytenr = buf->start;
+               ref->owner = btrfs_header_owner(buf);
+               ref->generation = btrfs_header_generation(buf);
+               ref->nritems = nr_file_extents;
+               info = ref->extents;
+
+               for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
+                       u64 disk_bytenr;
+                       btrfs_item_key_to_cpu(buf, &key, i);
+                       if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                               continue;
+                       fi = btrfs_item_ptr(buf, i,
+                                           struct btrfs_file_extent_item);
+                       if (btrfs_file_extent_type(buf, fi) ==
+                           BTRFS_FILE_EXTENT_INLINE)
+                               continue;
+                       disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                       if (disk_bytenr == 0)
+                               continue;
+
+                       info->bytenr = disk_bytenr;
+                       info->num_bytes =
+                               btrfs_file_extent_disk_num_bytes(buf, fi);
+                       info->objectid = key.objectid;
+                       info->offset = key.offset;
+                       info++;
+               }
+
+               BUG_ON(!root->ref_tree);
+               ret = btrfs_add_leaf_ref(root, ref);
+               WARN_ON(ret);
+               btrfs_free_leaf_ref(root, ref);
+       }
+out:
        return 0;
 fail:
        WARN_ON(1);
@@ -1003,7 +1136,6 @@ fail:
                }
        }
 #endif
-       mutex_unlock(&root->fs_info->alloc_mutex);
        return ret;
 }
 
@@ -1115,7 +1247,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                found->total_bytes += total_bytes;
                found->bytes_used += bytes_used;
                found->full = 0;
-               WARN_ON(found->total_bytes < found->bytes_used);
                *space_info = found;
                return 0;
        }
@@ -1242,6 +1373,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
        u64 start;
        u64 end;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        while(total) {
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache) {
@@ -1254,21 +1386,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                set_extent_bits(&info->block_group_cache, start, end,
                                BLOCK_GROUP_DIRTY, GFP_NOFS);
 
+               spin_lock(&cache->lock);
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
                if (alloc) {
                        old_val += num_bytes;
                        cache->space_info->bytes_used += num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
+                       spin_unlock(&cache->lock);
                } else {
                        old_val -= num_bytes;
                        cache->space_info->bytes_used -= num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
+                       spin_unlock(&cache->lock);
                        if (mark_free) {
                                set_extent_dirty(&info->free_space_cache,
                                                 bytenr, bytenr + num_bytes - 1,
                                                 GFP_NOFS);
                        }
                }
-               btrfs_set_block_group_used(&cache->item, old_val);
                total -= num_bytes;
                bytenr += num_bytes;
        }
@@ -1290,13 +1426,14 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 
-static int update_pinned_extents(struct btrfs_root *root,
+int btrfs_update_pinned_extents(struct btrfs_root *root,
                                u64 bytenr, u64 num, int pin)
 {
        u64 len;
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        if (pin) {
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1316,14 +1453,18 @@ static int update_pinned_extents(struct btrfs_root *root,
                }
                if (pin) {
                        if (cache) {
+                               spin_lock(&cache->lock);
                                cache->pinned += len;
                                cache->space_info->bytes_pinned += len;
+                               spin_unlock(&cache->lock);
                        }
                        fs_info->total_pinned += len;
                } else {
                        if (cache) {
+                               spin_lock(&cache->lock);
                                cache->pinned -= len;
                                cache->space_info->bytes_pinned -= len;
+                               spin_unlock(&cache->lock);
                        }
                        fs_info->total_pinned -= len;
                }
@@ -1368,9 +1509,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                                            EXTENT_DIRTY);
                if (ret)
                        break;
-               update_pinned_extents(root, start, end + 1 - start, 0);
+               btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
                set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
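+               /* don't hold the alloc_mutex across a reschedule */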
+               if (need_resched()) {
+                       mutex_unlock(&root->fs_info->alloc_mutex);
+                       cond_resched();
+                       mutex_lock(&root->fs_info->alloc_mutex);
+               }
        }
        mutex_unlock(&root->fs_info->alloc_mutex);
        return 0;
@@ -1391,6 +1537,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
        int level;
        int err = 0;
 
+       WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
        btrfs_set_stack_extent_refs(&extent_item, 1);
        btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
        path = btrfs_alloc_path();
@@ -1407,8 +1554,13 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
                                        &extent_item, sizeof(extent_item));
                clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
                                  GFP_NOFS);
-               eb = read_tree_block(extent_root, ins.objectid, ins.offset,
-                                    trans->transid);
+
+               eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
+                                          ins.offset);
+
+               if (!btrfs_buffer_uptodate(eb, trans->transid))
+                       btrfs_read_buffer(eb, trans->transid);
+
                btrfs_tree_lock(eb);
                level = btrfs_header_level(eb);
                if (level == 0) {
@@ -1427,6 +1579,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
                                          0, level,
                                          btrfs_disk_key_objectid(&first));
                BUG_ON(err);
+               if (need_resched()) {
+                       mutex_unlock(&extent_root->fs_info->alloc_mutex);
+                       cond_resched();
+                       mutex_lock(&extent_root->fs_info->alloc_mutex);
+               }
        }
        btrfs_free_path(path);
        return 0;
@@ -1437,17 +1594,25 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 {
        int err = 0;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        if (!pending) {
                struct extent_buffer *buf;
                buf = btrfs_find_tree_block(root, bytenr, num_bytes);
                if (buf) {
-                       if (!btrfs_try_tree_lock(buf) &&
-                           btrfs_buffer_uptodate(buf, 0)) {
+                       /* we can reuse a block if it hasn't been written
+                        * and it is from this transaction.  We can't
+                        * reuse anything from the tree log root because
+                        * it has tiny sub-transactions.
+                        */
+                       if (btrfs_buffer_uptodate(buf, 0) &&
+                           btrfs_try_tree_lock(buf)) {
                                u64 transid =
                                    root->fs_info->running_transaction->transid;
                                u64 header_transid =
                                        btrfs_header_generation(buf);
-                               if (header_transid == transid &&
+                               if (btrfs_header_owner(buf) !=
+                                   BTRFS_TREE_LOG_OBJECTID &&
+                                   header_transid == transid &&
                                    !btrfs_header_flag(buf,
                                               BTRFS_HEADER_FLAG_WRITTEN)) {
                                        clean_tree_block(NULL, root, buf);
@@ -1459,7 +1624,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
                        }
                        free_extent_buffer(buf);
                }
-               update_pinned_extents(root, bytenr, num_bytes, 1);
+               btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
        } else {
                set_extent_bits(&root->fs_info->pending_del,
                                bytenr, bytenr + num_bytes - 1,
@@ -1490,6 +1655,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_extent_item *ei;
        u32 refs;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        key.objectid = bytenr;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        key.offset = num_bytes;
@@ -1572,6 +1738,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
        if (refs == 0) {
                u64 super_used;
                u64 root_used;
+#ifdef BIO_RW_DISCARD
+               u64 map_length = num_bytes;
+               struct btrfs_multi_bio *multi = NULL;
+#endif
 
                if (pin) {
                        ret = pin_down_bytes(root, bytenr, num_bytes, 0);
@@ -1599,6 +1769,26 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
                                         mark_free);
                BUG_ON(ret);
+
+#ifdef BIO_RW_DISCARD
+               /* Tell the block device(s) that the sectors can be discarded */
+               ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+                                     bytenr, &map_length, &multi, 0);
+               if (!ret) {
+                       struct btrfs_bio_stripe *stripe = multi->stripes;
+                       int i;
+
+                       if (map_length > num_bytes)
+                               map_length = num_bytes;
+
+                       for (i = 0; i < multi->num_stripes; i++, stripe++) {
+                               blkdev_issue_discard(stripe->dev->bdev,
+                                                    stripe->physical >> 9,
+                                                    map_length >> 9);
+                       }
+                       kfree(multi);
+               }
+#endif
        }
        btrfs_free_path(path);
        finish_current_insert(trans, extent_root);
@@ -1619,6 +1809,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
        struct extent_io_tree *pending_del;
        struct extent_io_tree *pinned_extents;
 
+       WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
        pending_del = &extent_root->fs_info->pending_del;
        pinned_extents = &extent_root->fs_info->pinned_extents;
 
@@ -1627,15 +1818,28 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
                                            EXTENT_LOCKED);
                if (ret)
                        break;
-               update_pinned_extents(extent_root, start, end + 1 - start, 1);
                clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
                                  GFP_NOFS);
-               ret = __free_extent(trans, extent_root,
-                                    start, end + 1 - start,
-                                    extent_root->root_key.objectid,
-                                    0, 0, 0, 0, 0);
+               if (!test_range_bit(&extent_root->fs_info->extent_ins,
+                                   start, end, EXTENT_LOCKED, 0)) {
+                       btrfs_update_pinned_extents(extent_root, start,
+                                             end + 1 - start, 1);
+                       ret = __free_extent(trans, extent_root,
+                                            start, end + 1 - start,
+                                            extent_root->root_key.objectid,
+                                            0, 0, 0, 0, 0);
+               } else {
+                       clear_extent_bits(&extent_root->fs_info->extent_ins,
+                                         start, end, EXTENT_LOCKED, GFP_NOFS);
+               }
                if (ret)
                        err = ret;
+
+               if (need_resched()) {
+                       mutex_unlock(&extent_root->fs_info->alloc_mutex);
+                       cond_resched();
+                       mutex_lock(&extent_root->fs_info->alloc_mutex);
+               }
        }
        return err;
 }
@@ -1664,6 +1868,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
                            ref_generation, owner_objectid, owner_offset,
                            pin, pin == 0);
+
+       finish_current_insert(trans, root->fs_info->extent_root);
        pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
        return ret ? ret : pending_ret;
 }
@@ -1734,6 +1940,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
                last_ptr = &root->fs_info->last_data_alloc;
        }
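+       /*
+        * the tree log root keeps its own allocation hint.  Seed an
+        * empty hint near the most recent main allocation so the log
+        * blocks stay clustered.
+        */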
+       if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+               last_ptr = &root->fs_info->last_log_alloc;
+               if (*last_ptr == 0 && root->fs_info->last_alloc) {
+                       *last_ptr = root->fs_info->last_alloc + empty_cluster;
+               }
+       }
 
        if (last_ptr) {
                if (*last_ptr)
@@ -1753,12 +1965,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
                block_group = btrfs_lookup_first_block_group(info, hint_byte);
                if (!block_group)
                        hint_byte = search_start;
-               block_group = __btrfs_find_block_group(root, block_group,
+               block_group = btrfs_find_block_group(root, block_group,
                                                     hint_byte, data, 1);
                if (last_ptr && *last_ptr == 0 && block_group)
                        hint_byte = block_group->key.objectid;
        } else {
-               block_group = __btrfs_find_block_group(root,
+               block_group = btrfs_find_block_group(root,
                                                     trans->block_group,
                                                     search_start, data, 1);
        }
@@ -1880,7 +2092,7 @@ enospc:
        }
        block_group = btrfs_lookup_first_block_group(info, search_start);
        cond_resched();
-       block_group = __btrfs_find_block_group(root, block_group,
+       block_group = btrfs_find_block_group(root, block_group,
                                             search_start, data, 0);
        goto check_failed;
 
@@ -1888,36 +2100,17 @@ error:
        return ret;
 }
 
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root,
-                      u64 num_bytes, u64 min_alloc_size,
-                      u64 root_objectid, u64 ref_generation,
-                      u64 owner, u64 owner_offset,
-                      u64 empty_size, u64 hint_byte,
-                      u64 search_end, struct btrfs_key *ins, u64 data)
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 u64 num_bytes, u64 min_alloc_size,
+                                 u64 empty_size, u64 hint_byte,
+                                 u64 search_end, struct btrfs_key *ins,
+                                 u64 data)
 {
        int ret;
-       int pending_ret;
-       u64 super_used;
-       u64 root_used;
        u64 search_start = 0;
        u64 alloc_profile;
-       u32 sizes[2];
        struct btrfs_fs_info *info = root->fs_info;
-       struct btrfs_root *extent_root = info->extent_root;
-       struct btrfs_extent_item *extent_item;
-       struct btrfs_extent_ref *ref;
-       struct btrfs_path *path;
-       struct btrfs_key keys[2];
-
-       maybe_lock_mutex(root);
 
        if (data) {
                alloc_profile = info->avail_data_alloc_bits &
@@ -1967,11 +2160,57 @@ again:
        }
        if (ret) {
                printk("allocation failed flags %Lu\n", data);
-       }
-       if (ret) {
                BUG();
-               goto out;
        }
+       clear_extent_dirty(&root->fs_info->free_space_cache,
+                          ins->objectid, ins->objectid + ins->offset - 1,
+                          GFP_NOFS);
+       return 0;
+}
+
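+/*
+ * hand a reserved extent back to the free space cache without
+ * touching the extent tree
+ */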
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+       maybe_lock_mutex(root);
+       set_extent_dirty(&root->fs_info->free_space_cache,
+                        start, start + len - 1, GFP_NOFS);
+       maybe_unlock_mutex(root);
+       return 0;
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 u64 num_bytes, u64 min_alloc_size,
+                                 u64 empty_size, u64 hint_byte,
+                                 u64 search_end, struct btrfs_key *ins,
+                                 u64 data)
+{
+       int ret;
+       maybe_lock_mutex(root);
+       ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+                                    empty_size, hint_byte, search_end, ins,
+                                    data);
+       maybe_unlock_mutex(root);
+       return ret;
+}
+
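+/*
+ * record a reserved extent in the extent tree: update the super and
+ * root block accounting and insert the extent item plus its initial
+ * backref, or queue the insert when allocating from the extent root
+ * itself
+ */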
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        u64 root_objectid, u64 ref_generation,
+                                        u64 owner, u64 owner_offset,
+                                        struct btrfs_key *ins)
+{
+       int ret;
+       int pending_ret;
+       u64 super_used;
+       u64 root_used;
+       u64 num_bytes = ins->offset;
+       u32 sizes[2];
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_root *extent_root = info->extent_root;
+       struct btrfs_extent_item *extent_item;
+       struct btrfs_extent_ref *ref;
+       struct btrfs_path *path;
+       struct btrfs_key keys[2];
 
        /* block accounting for super block */
        spin_lock_irq(&info->delalloc_lock);
@@ -1983,10 +2222,6 @@ again:
        root_used = btrfs_root_used(&root->root_item);
        btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
-       clear_extent_dirty(&root->fs_info->free_space_cache,
-                          ins->objectid, ins->objectid + ins->offset - 1,
-                          GFP_NOFS);
-
        if (root == extent_root) {
                set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
                                ins->objectid + ins->offset - 1,
@@ -1994,10 +2229,6 @@ again:
                goto update_block;
        }
 
-       WARN_ON(trans->alloc_exclude_nr);
-       trans->alloc_exclude_start = ins->objectid;
-       trans->alloc_exclude_nr = ins->offset;
-
        memcpy(&keys[0], ins, sizeof(*ins));
        keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
                                         owner, owner_offset);
@@ -2047,9 +2278,103 @@ update_block:
                BUG();
        }
 out:
+       return ret;
+}
+
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               u64 root_objectid, u64 ref_generation,
+                               u64 owner, u64 owner_offset,
+                               struct btrfs_key *ins)
+{
+       int ret;
+       maybe_lock_mutex(root);
+       ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+                                           ref_generation, owner,
+                                           owner_offset, ins);
+       maybe_unlock_mutex(root);
+       return ret;
+}
+
+/*
+ * this is used by the tree logging recovery code.  It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               u64 root_objectid, u64 ref_generation,
+                               u64 owner, u64 owner_offset,
+                               struct btrfs_key *ins)
+{
+       int ret;
+       struct btrfs_block_group_cache *block_group;
+
+       maybe_lock_mutex(root);
+       block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+       cache_block_group(root, block_group);
+
+       clear_extent_dirty(&root->fs_info->free_space_cache,
+                          ins->objectid, ins->objectid + ins->offset - 1,
+                          GFP_NOFS);
+       ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+                                           ref_generation, owner,
+                                           owner_offset, ins);
+       maybe_unlock_mutex(root);
+       return ret;
+}
+
+/*
+ * finds a free extent and does all the dirty work required for
+ * allocation.  The key for the allocated extent is returned
+ * through ins.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      u64 num_bytes, u64 min_alloc_size,
+                      u64 root_objectid, u64 ref_generation,
+                      u64 owner, u64 owner_offset,
+                      u64 empty_size, u64 hint_byte,
+                      u64 search_end, struct btrfs_key *ins, u64 data)
+{
+       int ret;
+
+       maybe_lock_mutex(root);
+
+       ret = __btrfs_reserve_extent(trans, root, num_bytes,
+                                    min_alloc_size, empty_size, hint_byte,
+                                    search_end, ins, data);
+       BUG_ON(ret);
+       ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+                                           ref_generation, owner,
+                                           owner_offset, ins);
+       BUG_ON(ret);
+
        maybe_unlock_mutex(root);
        return ret;
 }
+
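+/*
+ * set up a freshly allocated tree block: stamp the new transid, clear
+ * any stale dirty state, mark the buffer uptodate and dirty in this
+ * transaction, and hand it back locked
+ */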
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+                                           struct btrfs_root *root,
+                                           u64 bytenr, u32 blocksize)
+{
+       struct extent_buffer *buf;
+
+       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       if (!buf)
+               return ERR_PTR(-ENOMEM);
+       btrfs_set_header_generation(buf, trans->transid);
+       btrfs_tree_lock(buf);
+       clean_tree_block(trans, root, buf);
+       btrfs_set_buffer_uptodate(buf);
+       set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+                        buf->start + buf->len - 1, GFP_NOFS);
+       trans->blocks_used++;
+       return buf;
+}
+
 /*
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
@@ -2076,34 +2401,13 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                BUG_ON(ret > 0);
                return ERR_PTR(ret);
        }
-       buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
-       if (!buf) {
-               btrfs_free_extent(trans, root, ins.objectid, blocksize,
-                                 root->root_key.objectid, ref_generation,
-                                 0, 0, 0);
-               return ERR_PTR(-ENOMEM);
-       }
-       btrfs_set_header_generation(buf, trans->transid);
-       btrfs_tree_lock(buf);
-       clean_tree_block(trans, root, buf);
-       btrfs_set_buffer_uptodate(buf);
-
-       if (PageDirty(buf->first_page)) {
-               printk("page %lu dirty\n", buf->first_page->index);
-               WARN_ON(1);
-       }
 
-       set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
-                        buf->start + buf->len - 1, GFP_NOFS);
-       if (!btrfs_test_opt(root, SSD))
-               btrfs_set_buffer_defrag(buf);
-       trans->blocks_used++;
+       buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
        return buf;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root,
-                                 struct extent_buffer *leaf)
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct extent_buffer *leaf)
 {
        u64 leaf_owner;
        u64 leaf_generation;
@@ -2120,6 +2424,7 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 
        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
+               cond_resched();
 
                btrfs_item_key_to_cpu(leaf, &key, i);
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -2135,73 +2440,88 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                if (disk_bytenr == 0)
                        continue;
+
+               mutex_lock(&root->fs_info->alloc_mutex);
                ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf_owner, leaf_generation,
                                key.objectid, key.offset, 0);
+               mutex_unlock(&root->fs_info->alloc_mutex);
+
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+
                BUG_ON(ret);
        }
        return 0;
 }
 
-static void noinline reada_walk_down(struct btrfs_root *root,
-                                    struct extent_buffer *node,
-                                    int slot)
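+/*
+ * drop the file extent references recorded in a cached leaf ref,
+ * which lets us avoid reading the leaf itself
+ */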
+static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_leaf_ref *ref)
 {
-       u64 bytenr;
-       u64 last = 0;
-       u32 nritems;
-       u32 refs;
-       u32 blocksize;
-       int ret;
        int i;
-       int level;
-       int skipped = 0;
-
-       nritems = btrfs_header_nritems(node);
-       level = btrfs_header_level(node);
-       if (level)
-               return;
-
-       for (i = slot; i < nritems && skipped < 32; i++) {
-               bytenr = btrfs_node_blockptr(node, i);
-               if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
-                            (last > bytenr && last - bytenr > 32 * 1024))) {
-                       skipped++;
-                       continue;
-               }
-               blocksize = btrfs_level_size(root, level - 1);
-               if (i != slot) {
-                       ret = lookup_extent_ref(NULL, root, bytenr,
-                                               blocksize, &refs);
-                       BUG_ON(ret);
-                       if (refs != 1) {
-                               skipped++;
-                               continue;
-                       }
-               }
-               ret = readahead_tree_block(root, bytenr, blocksize,
-                                          btrfs_node_ptr_generation(node, i));
-               last = bytenr + blocksize;
+       int ret;
+       struct btrfs_extent_info *info = ref->extents;
+
+       for (i = 0; i < ref->nritems; i++) {
+               mutex_lock(&root->fs_info->alloc_mutex);
+               ret = __btrfs_free_extent(trans, root,
+                                       info->bytenr, info->num_bytes,
+                                       ref->owner, ref->generation,
+                                       info->objectid, info->offset, 0);
+               mutex_unlock(&root->fs_info->alloc_mutex);
+
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
-               if (ret)
-                       break;
+
+               BUG_ON(ret);
+               info++;
        }
+
+       return 0;
 }
 
-/*
- * we want to avoid as much random IO as we can with the alloc mutex
- * held, so drop the lock and do the lookup, then do it again with the
- * lock held.
- */
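+/*
+ * look up the current reference count on an extent while dropping
+ * a dead snapshot
+ */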
 int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
                              u32 *refs)
 {
-       mutex_unlock(&root->fs_info->alloc_mutex);
-       lookup_extent_ref(NULL, root, start, len, refs);
+       int ret;
+
+       ret = lookup_extent_ref(NULL, root, start, len, refs);
+       BUG_ON(ret);
+
+#if 0 /* some debugging code in case we see problems here */
+       /* if the refs count is one, it won't get increased again.  But
+        * if the ref count is > 1, someone may be decreasing it at
+        * the same time we are.
+        */
+       if (*refs != 1) {
+               struct extent_buffer *eb = NULL;
+               eb = btrfs_find_create_tree_block(root, start, len);
+               if (eb)
+                       btrfs_tree_lock(eb);
+
+               mutex_lock(&root->fs_info->alloc_mutex);
+               ret = lookup_extent_ref(NULL, root, start, len, refs);
+               BUG_ON(ret);
+               mutex_unlock(&root->fs_info->alloc_mutex);
+
+               if (eb) {
+                       btrfs_tree_unlock(eb);
+                       free_extent_buffer(eb);
+               }
+               if (*refs == 1) {
+                       printk("block %llu went down to one during drop_snap\n",
+                              (unsigned long long)start);
+               }
+
+       }
+#endif
+
        cond_resched();
-       mutex_lock(&root->fs_info->alloc_mutex);
-       return lookup_extent_ref(NULL, root, start, len, refs);
+       return ret;
 }
 
 /*
@@ -2219,12 +2539,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
+       struct btrfs_leaf_ref *ref;
        u32 blocksize;
        int ret;
        u32 refs;
 
-       mutex_lock(&root->fs_info->alloc_mutex);
-
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
        ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
@@ -2248,7 +2567,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
                    btrfs_header_nritems(cur))
                        break;
                if (*level == 0) {
-                       ret = drop_leaf_ref(trans, root, cur);
+                       ret = btrfs_drop_leaf_ref(trans, root, cur);
                        BUG_ON(ret);
                        break;
                }
@@ -2263,43 +2582,60 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
                        root_owner = btrfs_header_owner(parent);
                        root_gen = btrfs_header_generation(parent);
                        path->slots[*level]++;
+
+                       mutex_lock(&root->fs_info->alloc_mutex);
                        ret = __btrfs_free_extent(trans, root, bytenr,
                                                blocksize, root_owner,
                                                root_gen, 0, 0, 1);
                        BUG_ON(ret);
+                       mutex_unlock(&root->fs_info->alloc_mutex);
+
+                       atomic_inc(&root->fs_info->throttle_gen);
+                       wake_up(&root->fs_info->transaction_throttle);
+                       cond_resched();
+
                        continue;
                }
+               /*
+                * at this point, we have a single ref, and since the
+                * only place referencing this extent is a dead root,
+                * the reference count should never go higher, so we
+                * don't need to check it again.
+                */
+               if (*level == 1) {
+                       struct btrfs_key key;
+                       btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
+                       ref = btrfs_lookup_leaf_ref(root, bytenr);
+                       if (ref) {
+                               ret = cache_drop_leaf_ref(trans, root, ref);
+                               BUG_ON(ret);
+                               btrfs_remove_leaf_ref(root, ref);
+                               btrfs_free_leaf_ref(root, ref);
+                               *level = 0;
+                               break;
+                       }
+                       if (printk_ratelimit())
+                               printk("leaf ref miss for bytenr %llu\n",
+                                      (unsigned long long)bytenr);
+               }
                next = btrfs_find_tree_block(root, bytenr, blocksize);
                if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
                        free_extent_buffer(next);
-                       mutex_unlock(&root->fs_info->alloc_mutex);
-
-                       if (path->slots[*level] == 0)
-                               reada_walk_down(root, cur, path->slots[*level]);
 
                        next = read_tree_block(root, bytenr, blocksize,
                                               ptr_gen);
                        cond_resched();
-                       mutex_lock(&root->fs_info->alloc_mutex);
-
-                       /* we've dropped the lock, double check */
-                       ret = drop_snap_lookup_refcount(root, bytenr,
-                                               blocksize, &refs);
+#if 0
+                       /*
+                        * this is a debugging check and can go away;
+                        * the ref should never go all the way down to 1
+                        * at this point
+                        */
+                       ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+                                               &refs);
                        BUG_ON(ret);
-                       if (refs != 1) {
-                               parent = path->nodes[*level];
-                               root_owner = btrfs_header_owner(parent);
-                               root_gen = btrfs_header_generation(parent);
-
-                               path->slots[*level]++;
-                               free_extent_buffer(next);
-                               ret = __btrfs_free_extent(trans, root, bytenr,
-                                                       blocksize,
-                                                       root_owner,
-                                                       root_gen, 0, 0, 1);
-                               BUG_ON(ret);
-                               continue;
-                       }
+                       WARN_ON(refs != 1);
+#endif
                }
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
@@ -2307,28 +2643,33 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
                path->nodes[*level-1] = next;
                *level = btrfs_header_level(next);
                path->slots[*level] = 0;
+               cond_resched();
        }
 out:
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
        if (path->nodes[*level] == root->node) {
-               root_owner = root->root_key.objectid;
                parent = path->nodes[*level];
+               bytenr = path->nodes[*level]->start;
        } else {
                parent = path->nodes[*level + 1];
-               root_owner = btrfs_header_owner(parent);
+               bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
        }
 
+       blocksize = btrfs_level_size(root, *level);
+       root_owner = btrfs_header_owner(parent);
        root_gen = btrfs_header_generation(parent);
-       ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
-                               path->nodes[*level]->len,
-                               root_owner, root_gen, 0, 0, 1);
+
+       mutex_lock(&root->fs_info->alloc_mutex);
+       ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+                                 root_owner, root_gen, 0, 0, 1);
        free_extent_buffer(path->nodes[*level]);
        path->nodes[*level] = NULL;
        *level += 1;
        BUG_ON(ret);
        mutex_unlock(&root->fs_info->alloc_mutex);
+
        cond_resched();
        return 0;
 }
@@ -2430,6 +2771,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                btrfs_node_key(node, &found_key, path->slots[level]);
                WARN_ON(memcmp(&found_key, &root_item->drop_progress,
                               sizeof(found_key)));
+               /*
+                * unlock our path, this is safe because only this
+                * function is allowed to delete this snapshot
+                */
                for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                        if (path->nodes[i] && path->locks[i]) {
                                path->locks[i] = 0;
@@ -2453,6 +2798,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        ret = -EAGAIN;
                        break;
                }
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
@@ -2514,6 +2861,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
        struct file_ra_state *ra;
        unsigned long total_read = 0;
        unsigned long ra_pages;
+       struct btrfs_ordered_extent *ordered;
        struct btrfs_trans_handle *trans;
 
        ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2532,9 +2880,9 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                                       calc_ra(i, last_index, ra_pages));
                }
                total_read++;
-               if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+again:
+               if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
                        goto truncate_racing;
-
                page = grab_cache_page(inode->i_mapping, i);
                if (!page) {
                        goto out_unlock;
@@ -2548,34 +2896,51 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                                goto out_unlock;
                        }
                }
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-               ClearPageDirty(page);
-#else
-               cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
                wait_on_page_writeback(page);
-               set_page_extent_mapped(page);
+
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
-
                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-               set_extent_delalloc(io_tree, page_start,
-                                   page_end, GFP_NOFS);
+               ordered = btrfs_lookup_ordered_extent(inode, page_start);
+               if (ordered) {
+                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                       unlock_page(page);
+                       page_cache_release(page);
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       goto again;
+               }
+               set_page_extent_mapped(page);
+
+               /*
+                * make sure page_mkwrite is called for this page if userland
+                * wants to change it via mmap
+                */
+               clear_page_dirty_for_io(page);
+
+               btrfs_set_extent_delalloc(inode, page_start, page_end);
                set_page_dirty(page);
 
                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
                unlock_page(page);
                page_cache_release(page);
        }
-       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-                                          total_read);
 
 out_unlock:
+       /* we have to start the IO in order to get the ordered extents
+        * instantiated.  This allows the relocation code to wait
+        * for all the ordered extents to hit the disk.
+        *
+        * Otherwise, it would constantly loop over the same extents
+        * because the old ones don't get deleted until the IO is
+        * started
+        */
+       btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
+                              WB_SYNC_NONE);
        kfree(ra);
        trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
        if (trans) {
-               btrfs_add_ordered_inode(inode);
                btrfs_end_transaction(trans, BTRFS_I(inode)->root);
                mark_inode_dirty(inode);
        }
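
Pulled out of the loop above, the new ordered-extent handling follows a standard dance: take the extent range lock, check for a pending ordered extent, and if one exists drop every lock (including the page lock, which the ordered IO needs in order to complete) before waiting and retrying. A minimal sketch using only helpers from this patch; relocate_one_page is a hypothetical factoring, and the uptodate/readahead handling from the loop is omitted:

/*
 * Sketch: mark one page delalloc for relocation, waiting out any
 * ordered extent that covers it.
 */
static int relocate_one_page(struct inode *inode, pgoff_t index)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct page *page;
	u64 page_start, page_end;

again:
	page = grab_cache_page(inode->i_mapping, index);
	if (!page)
		return -ENOMEM;
	wait_on_page_writeback(page);

	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
	page_end = page_start + PAGE_CACHE_SIZE - 1;
	lock_extent(io_tree, page_start, page_end, GFP_NOFS);

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		/* drop every lock before waiting: completing the
		 * ordered extent needs the page lock we hold */
		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
		unlock_page(page);
		page_cache_release(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}
	set_page_extent_mapped(page);

	/* force page_mkwrite if userland later dirties it via mmap */
	clear_page_dirty_for_io(page);
	btrfs_set_extent_delalloc(inode, page_start, page_end);
	set_page_dirty(page);

	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
	unlock_page(page);
	page_cache_release(page);
	return 0;
}
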
@@ -2613,7 +2978,6 @@ static int find_root_for_ref(struct btrfs_root *root,
        u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
        u64 found_bytenr;
        int ret;
-       int i;
 
        root_location.offset = (u64)-1;
        root_location.type = BTRFS_ROOT_ITEM_KEY;
@@ -2637,12 +3001,6 @@ static int find_root_for_ref(struct btrfs_root *root,
                                found_bytenr = path->nodes[level]->start;
                }
 
-               for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-                       if (!path->nodes[i])
-                               break;
-                       free_extent_buffer(path->nodes[i]);
-                       path->nodes[i] = NULL;
-               }
                btrfs_release_path(cur_root, path);
 
                if (found_bytenr == bytenr) {
@@ -2691,6 +3049,8 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
        int ret;
        int level;
 
+       WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+
        ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
                             struct btrfs_extent_ref);
        ref_root = btrfs_ref_root(path->nodes[0], ref);
@@ -2709,6 +3069,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
        found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
                                                &root_location);
        BUG_ON(!found_root);
+       mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
        if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                found_key.objectid = ref_objectid;
@@ -2750,9 +3111,9 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
                /* this can happen if the reference is not against
                 * the latest version of the tree root
                 */
-               if (is_bad_inode(inode)) {
+               if (is_bad_inode(inode))
                        goto out;
-               }
+
                *last_file_objectid = inode->i_ino;
                *last_file_root = found_root->root_key.objectid;
                *last_file_offset = ref_offset;
@@ -2762,7 +3123,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
        } else {
                struct btrfs_trans_handle *trans;
                struct extent_buffer *eb;
-               int i;
+               int needs_lock = 0;
 
                eb = read_tree_block(found_root, extent_key->objectid,
                                     extent_key->offset, 0);
@@ -2784,26 +3145,40 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
                if (ret)
                        goto out;
 
+               /*
+                * right here almost anything could happen to our key,
+                * but that's ok.  The cow below will either relocate it
+                * or someone else will have relocated it.  Either way,
+                * it is in a different spot than it was before and
+                * we're happy.
+                */
+
                trans = btrfs_start_transaction(found_root, 1);
 
+               if (found_root == extent_root->fs_info->extent_root ||
+                   found_root == extent_root->fs_info->chunk_root ||
+                   found_root == extent_root->fs_info->dev_root) {
+                       needs_lock = 1;
+                       mutex_lock(&extent_root->fs_info->alloc_mutex);
+               }
+
                path->lowest_level = level;
                path->reada = 2;
                ret = btrfs_search_slot(trans, found_root, &found_key, path,
                                        0, 1);
                path->lowest_level = 0;
-               for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-                       if (!path->nodes[i])
-                               break;
-                       free_extent_buffer(path->nodes[i]);
-                       path->nodes[i] = NULL;
-               }
                btrfs_release_path(found_root, path);
+
                if (found_root == found_root->fs_info->extent_root)
                        btrfs_extent_post_op(trans, found_root);
+               if (needs_lock)
+                       mutex_unlock(&extent_root->fs_info->alloc_mutex);
+
                btrfs_end_transaction(trans, found_root);
-       }
 
+       }
 out:
+       mutex_lock(&extent_root->fs_info->alloc_mutex);
        return 0;
 }
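
The locking change in relocate_one_reference is subtle: the function enters and leaves with alloc_mutex held (see the WARN_ON at the top and the final mutex_lock before returning), but drops it for the COW work, re-taking it only when the target root is one of the allocation trees the allocator itself may recurse into. A sketch of just that conditional-lock pattern, assuming the names from this hunk (cow_one_block is hypothetical):

/*
 * Sketch: COW the block at "key"/"level" in found_root, taking
 * alloc_mutex only for the trees that the allocator touches.
 */
static int cow_one_block(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *found_root,
			 struct btrfs_path *path,
			 struct btrfs_key *key, int level)
{
	struct btrfs_trans_handle *trans;
	int needs_lock = 0;
	int ret;

	trans = btrfs_start_transaction(found_root, 1);

	if (found_root == fs_info->extent_root ||
	    found_root == fs_info->chunk_root ||
	    found_root == fs_info->dev_root) {
		needs_lock = 1;
		mutex_lock(&fs_info->alloc_mutex);
	}

	/* searching with cow=1 relocates the block as a side effect */
	path->lowest_level = level;
	path->reada = 2;
	ret = btrfs_search_slot(trans, found_root, key, path, 0, 1);
	path->lowest_level = 0;
	btrfs_release_path(found_root, path);

	if (found_root == fs_info->extent_root)
		btrfs_extent_post_op(trans, found_root);
	if (needs_lock)
		mutex_unlock(&fs_info->alloc_mutex);

	btrfs_end_transaction(trans, found_root);
	return ret;
}
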
 
@@ -2943,9 +3318,15 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
        u64 new_alloc_flags;
        u64 calc;
 
+       spin_lock(&shrink_block_group->lock);
        if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+               spin_unlock(&shrink_block_group->lock);
+               mutex_unlock(&root->fs_info->alloc_mutex);
 
                trans = btrfs_start_transaction(root, 1);
+               mutex_lock(&root->fs_info->alloc_mutex);
+               spin_lock(&shrink_block_group->lock);
+
                new_alloc_flags = update_block_group_flags(root,
                                                   shrink_block_group->flags);
                if (new_alloc_flags != shrink_block_group->flags) {
@@ -2954,10 +3335,16 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
                } else {
                        calc = shrink_block_group->key.offset;
                }
+               spin_unlock(&shrink_block_group->lock);
+
                do_chunk_alloc(trans, root->fs_info->extent_root,
                               calc + 2 * 1024 * 1024, new_alloc_flags, force);
+
+               mutex_unlock(&root->fs_info->alloc_mutex);
                btrfs_end_transaction(trans, root);
-       }
+               mutex_lock(&root->fs_info->alloc_mutex);
+       } else
+               spin_unlock(&shrink_block_group->lock);
        return 0;
 }
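
Two locking rules drive the churn in __alloc_chunk_for_shrink: the block group item is read only under the group's new spinlock, and alloc_mutex is never held across btrfs_start_transaction() or btrfs_end_transaction(), both of which can sleep waiting on a commit. Reduced to its shape under those assumptions (do_shrink_chunk_alloc is a hypothetical stand-in for the flag/size computation and the do_chunk_alloc call):

/* Sketch: caller enters and leaves with alloc_mutex held. */
static int shrink_chunk_lock_pattern(struct btrfs_root *root,
			struct btrfs_block_group_cache *group)
{
	struct btrfs_trans_handle *trans;
	int in_use;

	spin_lock(&group->lock);
	in_use = btrfs_block_group_used(&group->item) > 0;
	spin_unlock(&group->lock);
	if (!in_use)
		return 0;

	/* drop the allocator lock before a call that can sleep
	 * on a transaction commit */
	mutex_unlock(&root->fs_info->alloc_mutex);
	trans = btrfs_start_transaction(root, 1);
	mutex_lock(&root->fs_info->alloc_mutex);

	do_shrink_chunk_alloc(trans, root, group);	/* hypothetical */

	/* same rule when ending the transaction */
	mutex_unlock(&root->fs_info->alloc_mutex);
	btrfs_end_transaction(trans, root);
	mutex_lock(&root->fs_info->alloc_mutex);
	return 0;
}
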
 
@@ -3009,6 +3396,13 @@ again:
        key.type = 0;
        cur_byte = key.objectid;
 
+       mutex_unlock(&root->fs_info->alloc_mutex);
+
+       btrfs_start_delalloc_inodes(root);
+       btrfs_wait_ordered_extents(tree_root, 0);
+
+       mutex_lock(&root->fs_info->alloc_mutex);
+
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
@@ -3033,9 +3427,9 @@ again:
                if (ret < 0)
                        goto out;
 
+next:
                leaf = path->nodes[0];
                nritems = btrfs_header_nritems(leaf);
-next:
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
@@ -3085,13 +3479,18 @@ next:
                printk("btrfs relocate found %llu last extent was %llu\n",
                       (unsigned long long)total_found,
                       (unsigned long long)found_key.objectid);
+               mutex_unlock(&root->fs_info->alloc_mutex);
                trans = btrfs_start_transaction(tree_root, 1);
                btrfs_commit_transaction(trans, tree_root);
 
                btrfs_clean_old_snapshots(tree_root);
 
+               btrfs_start_delalloc_inodes(root);
+               btrfs_wait_ordered_extents(tree_root, 0);
+
                trans = btrfs_start_transaction(tree_root, 1);
                btrfs_commit_transaction(trans, tree_root);
+               mutex_lock(&root->fs_info->alloc_mutex);
                goto again;
        }
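
The retry path above now has to flush data out itself: ordered extents only disappear once their IO has started, so without the delalloc flush and the wait, the shrink loop would keep rediscovering the same extents. The sequence, isolated (names as in the patch; relocate_flush is a hypothetical factoring):

/* Sketch: caller enters and leaves with alloc_mutex held. */
static void relocate_flush(struct btrfs_root *root,
			   struct btrfs_root *tree_root)
{
	struct btrfs_trans_handle *trans;

	mutex_unlock(&root->fs_info->alloc_mutex);

	/* commit so the relocated metadata is on disk */
	trans = btrfs_start_transaction(tree_root, 1);
	btrfs_commit_transaction(trans, tree_root);

	btrfs_clean_old_snapshots(tree_root);

	/* start delalloc IO and wait for the ordered extents so the
	 * old extents actually get freed before the next pass */
	btrfs_start_delalloc_inodes(root);
	btrfs_wait_ordered_extents(tree_root, 0);

	trans = btrfs_start_transaction(tree_root, 1);
	btrfs_commit_transaction(trans, tree_root);

	mutex_lock(&root->fs_info->alloc_mutex);
}
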
 
@@ -3099,14 +3498,20 @@ next:
         * we've freed all the extents, now remove the block
         * group item from the tree
         */
+       mutex_unlock(&root->fs_info->alloc_mutex);
+
        trans = btrfs_start_transaction(root, 1);
+
+       mutex_lock(&root->fs_info->alloc_mutex);
        memcpy(&key, &shrink_block_group->key, sizeof(key));
 
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -EIO;
-       if (ret < 0)
+       if (ret < 0) {
+               btrfs_end_transaction(trans, root);
                goto out;
+       }
 
        clear_extent_bits(&info->block_group_cache, key.objectid,
                          key.objectid + key.offset - 1,
@@ -3117,12 +3522,18 @@ next:
                           key.objectid, key.objectid + key.offset - 1,
                           (unsigned int)-1, GFP_NOFS);
 
+       /*
        memset(shrink_block_group, 0, sizeof(*shrink_block_group));
        kfree(shrink_block_group);
+       */
 
        btrfs_del_item(trans, root, path);
+       btrfs_release_path(root, path);
+       mutex_unlock(&root->fs_info->alloc_mutex);
        btrfs_commit_transaction(trans, root);
 
+       mutex_lock(&root->fs_info->alloc_mutex);
+
        /* the code to unpin extents might set a few bits in the free
         * space cache for this range again
         */
@@ -3212,6 +3623,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        break;
                }
 
+               spin_lock_init(&cache->lock);
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
@@ -3239,10 +3651,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                /* use EXTENT_LOCKED to prevent merging */
                set_extent_bits(block_group_cache, found_key.objectid,
                                found_key.objectid + found_key.offset - 1,
-                               bit | EXTENT_LOCKED, GFP_NOFS);
+                               EXTENT_LOCKED, GFP_NOFS);
                set_state_private(block_group_cache, found_key.objectid,
                                  (unsigned long)cache);
-
+               set_extent_bits(block_group_cache, found_key.objectid,
+                               found_key.objectid + found_key.offset - 1,
+                               bit | EXTENT_LOCKED, GFP_NOFS);
                if (key.objectid >=
                    btrfs_super_total_bytes(&info->super_copy))
                        break;
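
The reordered set_extent_bits calls in btrfs_read_block_groups close a lookup race: the cache pointer is attached while the range carries only EXTENT_LOCKED, and the real block-group bits appear last, so any reader that finds those bits also finds a valid private pointer. The same three-step order is repeated in btrfs_make_block_group below. As a standalone sketch (insert_block_group is a hypothetical name):

static void insert_block_group(struct extent_io_tree *tree,
			       struct btrfs_block_group_cache *cache,
			       u64 start, u64 len, int bit)
{
	/* 1) reserve the range; EXTENT_LOCKED also keeps adjacent
	 *    states from being merged */
	set_extent_bits(tree, start, start + len - 1,
			EXTENT_LOCKED, GFP_NOFS);

	/* 2) publish the cache struct on the state */
	set_state_private(tree, start, (unsigned long)cache);

	/* 3) only now make the group visible with its type bits */
	set_extent_bits(tree, start, start + len - 1,
			bit | EXTENT_LOCKED, GFP_NOFS);
}
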
@@ -3265,13 +3679,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *cache;
        struct extent_io_tree *block_group_cache;
 
+       WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        extent_root = root->fs_info->extent_root;
        block_group_cache = &root->fs_info->block_group_cache;
 
+       root->fs_info->last_trans_new_blockgroup = trans->transid;
+
        cache = kzalloc(sizeof(*cache), GFP_NOFS);
        BUG_ON(!cache);
        cache->key.objectid = chunk_offset;
        cache->key.offset = size;
+       spin_lock_init(&cache->lock);
        btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
        btrfs_set_block_group_used(&cache->item, bytes_used);
@@ -3286,10 +3704,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        bit = block_group_state_bits(type);
        set_extent_bits(block_group_cache, chunk_offset,
                        chunk_offset + size - 1,
-                       bit | EXTENT_LOCKED, GFP_NOFS);
-
+                       EXTENT_LOCKED, GFP_NOFS);
        set_state_private(block_group_cache, chunk_offset,
                          (unsigned long)cache);
+       set_extent_bits(block_group_cache, chunk_offset,
+                       chunk_offset + size - 1,
+                       bit | EXTENT_LOCKED, GFP_NOFS);
+
        ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
                                sizeof(cache->item));
        BUG_ON(ret);