Btrfs: fix deadlock when throttling transactions
[pandora-kernel.git] / fs / btrfs / transaction.c
index 37c2302..eb55863 100644 (file)
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "locking.h"
 #include "tree-log.h"
+#include "inode-map.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -34,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
+               BUG_ON(!list_empty(&transaction->list));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -48,47 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
        struct btrfs_transaction *cur_trans;
+
+       spin_lock(&root->fs_info->trans_lock);
+       if (root->fs_info->trans_no_join) { /* only nofail callers may pass */
+               if (!nofail) {
+                       spin_unlock(&root->fs_info->trans_lock);
+                       return -EBUSY;
+               }
+       }
+
        cur_trans = root->fs_info->running_transaction;
-       if (!cur_trans) {
-               cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
-                                            GFP_NOFS);
-               if (!cur_trans)
-                       return -ENOMEM;
-               root->fs_info->generation++;
-               atomic_set(&cur_trans->num_writers, 1);
-               cur_trans->num_joined = 0;
-               cur_trans->transid = root->fs_info->generation;
-               init_waitqueue_head(&cur_trans->writer_wait);
-               init_waitqueue_head(&cur_trans->commit_wait);
-               cur_trans->in_commit = 0;
-               cur_trans->blocked = 0;
-               atomic_set(&cur_trans->use_count, 1);
-               cur_trans->commit_done = 0;
-               cur_trans->start_time = get_seconds();
-
-               cur_trans->delayed_refs.root = RB_ROOT;
-               cur_trans->delayed_refs.num_entries = 0;
-               cur_trans->delayed_refs.num_heads_ready = 0;
-               cur_trans->delayed_refs.num_heads = 0;
-               cur_trans->delayed_refs.flushing = 0;
-               cur_trans->delayed_refs.run_delayed_start = 0;
-               spin_lock_init(&cur_trans->delayed_refs.lock);
-
-               INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-               list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-               extent_io_tree_init(&cur_trans->dirty_pages,
-                                    root->fs_info->btree_inode->i_mapping,
-                                    GFP_NOFS);
-               spin_lock(&root->fs_info->new_trans_lock);
-               root->fs_info->running_transaction = cur_trans;
-               spin_unlock(&root->fs_info->new_trans_lock);
-       } else {
+       if (cur_trans) {
+               atomic_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
+               spin_unlock(&root->fs_info->trans_lock);
+               return 0;
        }
+       spin_unlock(&root->fs_info->trans_lock); /* unlocked for the alloc */
+
+       cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+       if (!cur_trans)
+               return -ENOMEM;
+       spin_lock(&root->fs_info->trans_lock);
+       if (root->fs_info->running_transaction) { /* raced: reuse theirs */
+               kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+               cur_trans = root->fs_info->running_transaction;
+               atomic_inc(&cur_trans->use_count);
+               atomic_inc(&cur_trans->num_writers);
+               cur_trans->num_joined++;
+               spin_unlock(&root->fs_info->trans_lock);
+               return 0;
+       }
+       atomic_set(&cur_trans->num_writers, 1);
+       cur_trans->num_joined = 0;
+       init_waitqueue_head(&cur_trans->writer_wait);
+       init_waitqueue_head(&cur_trans->commit_wait);
+       cur_trans->in_commit = 0;
+       cur_trans->blocked = 0;
+       /*
+        * Two references: one for this trans handle, and one that
+        * keeps the transaction alive until we commit it.
+        */
+       atomic_set(&cur_trans->use_count, 2);
+       cur_trans->commit_done = 0;
+       cur_trans->start_time = get_seconds();
+
+       cur_trans->delayed_refs.root = RB_ROOT;
+       cur_trans->delayed_refs.num_entries = 0;
+       cur_trans->delayed_refs.num_heads_ready = 0;
+       cur_trans->delayed_refs.num_heads = 0;
+       cur_trans->delayed_refs.flushing = 0;
+       cur_trans->delayed_refs.run_delayed_start = 0;
+       spin_lock_init(&cur_trans->commit_lock);
+       spin_lock_init(&cur_trans->delayed_refs.lock);
+
+       INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+       list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+       extent_io_tree_init(&cur_trans->dirty_pages,
+                            root->fs_info->btree_inode->i_mapping);
+       root->fs_info->generation++;
+       cur_trans->transid = root->fs_info->generation;
+       root->fs_info->running_transaction = cur_trans;
+       spin_unlock(&root->fs_info->trans_lock);
 
        return 0;
 }
@@ -99,36 +126,82 @@ static noinline int join_transaction(struct btrfs_root *root)
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root)
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);
 
+               /*
+                * See below for the in_trans_setup usage rules.
+                * We hold the reloc mutex here, so there is only
+                * one writer in this function at a time.
+                */
+               root->in_trans_setup = 1;
+
+               /* Make sure readers see in_trans_setup set before
+                * they can see our root->last_trans update.
+                */
+               smp_wmb();
+
+               spin_lock(&root->fs_info->fs_roots_radix_lock);
+               if (root->last_trans == trans->transid) {
+                       spin_unlock(&root->fs_info->fs_roots_radix_lock);
+                       return 0; /* NOTE(review): in_trans_setup stays 1 */
+               }
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
+               spin_unlock(&root->fs_info->fs_roots_radix_lock);
                root->last_trans = trans->transid;
+
+               /* This is pretty tricky.  We don't want to
+                * take the relocation lock in btrfs_record_root_in_trans
+                * unless we're really doing the first setup for this root in
+                * this transaction.
+                *
+                * Normally we'd use root->last_trans as a flag to decide
+                * if we want to take the expensive mutex.
+                *
+                * But we have to set root->last_trans before we
+                * init the relocation root, otherwise we trip over warnings
+                * in ctree.c.  The solution used here is to flag ourselves
+                * with root->in_trans_setup.  While it is 1, we're still
+                * fixing up the reloc trees and everyone must wait.
+                *
+                * Once it is zero, they can trust root->last_trans and fly
+                * through btrfs_record_root_in_trans without having to take the
+                * lock.  smp_wmb() makes sure that all the writes above are
+                * done before we pop in the zero below.
+                */
                btrfs_init_reloc_root(trans, root);
+               smp_wmb();
+               root->in_trans_setup = 0;
        }
        return 0;
 }
 
+
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
 {
        if (!root->ref_cows)
                return 0;
 
-       mutex_lock(&root->fs_info->trans_mutex);
-       if (root->last_trans == trans->transid) {
-               mutex_unlock(&root->fs_info->trans_mutex);
+       /*
+        * Unlocked check: see record_root_in_trans for the
+        * in_trans_setup usage rules and the barriers it relies on.
+        */
+       smp_rmb();
+       if (root->last_trans == trans->transid &&
+           !root->in_trans_setup)
                return 0;
-       }
 
+       mutex_lock(&root->fs_info->reloc_mutex); /* serialize the setup */
        record_root_in_trans(trans, root);
-       mutex_unlock(&root->fs_info->trans_mutex);
+       mutex_unlock(&root->fs_info->reloc_mutex);
+
        return 0;
 }
 
@@ -140,21 +213,23 @@ static void wait_current_trans(struct btrfs_root *root)
 {
        struct btrfs_transaction *cur_trans;
 
+       spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                atomic_inc(&cur_trans->use_count);
+               spin_unlock(&root->fs_info->trans_lock);
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!cur_trans->blocked)
                                break;
-                       mutex_unlock(&root->fs_info->trans_mutex);
                        schedule();
-                       mutex_lock(&root->fs_info->trans_mutex);
                }
                finish_wait(&root->fs_info->transaction_wait, &wait);
                put_transaction(cur_trans);
+       } else {
+               spin_unlock(&root->fs_info->trans_lock);
        }
 }
 
@@ -167,10 +242,16 @@ enum btrfs_trans_type {
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
-       if (!root->fs_info->log_root_recovering &&
-           ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
-            type == TRANS_USERSPACE))
+       if (root->fs_info->log_root_recovering)
+               return 0; /* no waiting while log_root_recovering is set */
+
+       if (type == TRANS_USERSPACE)
                return 1;
+
+       if (type == TRANS_START &&
+           !atomic_read(&root->fs_info->open_ioctl_trans))
+               return 1; /* TRANS_START and open_ioctl_trans is zero */
+
        return 0;
 }
 
@@ -179,41 +260,62 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
-       int retries = 0;
+       u64 num_bytes = 0;
        int ret;
 
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
+
+       if (current->journal_info) {
+               WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+               h = current->journal_info;
+               h->use_count++;
+               h->orig_rsv = h->block_rsv;
+               h->block_rsv = NULL;
+               goto got_it;
+       }
+
+       /*
+        * Do the reservation before we join the transaction so we can do all
+        * the appropriate flushing if need be.
+        */
+       if (num_items > 0 && root != root->fs_info->chunk_root) {
+               num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+               ret = btrfs_block_rsv_add(NULL, root,
+                                         &root->fs_info->trans_block_rsv,
+                                         num_bytes);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
 
-       if (type != TRANS_JOIN_NOLOCK)
-               mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
 
-       ret = join_transaction(root);
+       do {
+               ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+               if (ret == -EBUSY)
+                       wait_current_trans(root);
+       } while (ret == -EBUSY);
+
        if (ret < 0) {
                kmem_cache_free(btrfs_trans_handle_cachep, h);
-               if (type != TRANS_JOIN_NOLOCK)
-                       mutex_unlock(&root->fs_info->trans_mutex);
                return ERR_PTR(ret);
        }
 
        cur_trans = root->fs_info->running_transaction;
-       atomic_inc(&cur_trans->use_count);
-       if (type != TRANS_JOIN_NOLOCK)
-               mutex_unlock(&root->fs_info->trans_mutex);
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
        h->blocks_used = 0;
-       h->block_group = 0;
        h->bytes_reserved = 0;
        h->delayed_ref_updates = 0;
+       h->use_count = 1;
        h->block_rsv = NULL;
+       h->orig_rsv = NULL;
 
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -221,31 +323,13 @@ again:
                goto again;
        }
 
-       if (num_items > 0) {
-               ret = btrfs_trans_reserve_metadata(h, root, num_items);
-               if (ret == -EAGAIN && !retries) {
-                       retries++;
-                       btrfs_commit_transaction(h, root);
-                       goto again;
-               } else if (ret == -EAGAIN) {
-                       /*
-                        * We have already retried and got EAGAIN, so really we
-                        * don't have space, so set ret to -ENOSPC.
-                        */
-                       ret = -ENOSPC;
-               }
-
-               if (ret < 0) {
-                       btrfs_end_transaction(h, root);
-                       return ERR_PTR(ret);
-               }
+       if (num_bytes) {
+               h->block_rsv = &root->fs_info->trans_block_rsv;
+               h->bytes_reserved = num_bytes;
        }
 
-       if (type != TRANS_JOIN_NOLOCK)
-               mutex_lock(&root->fs_info->trans_mutex);
-       record_root_in_trans(h, root);
-       if (type != TRANS_JOIN_NOLOCK)
-               mutex_unlock(&root->fs_info->trans_mutex);
+got_it:
+       btrfs_record_root_in_trans(h, root);
 
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
@@ -257,22 +341,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 {
        return start_transaction(root, num_items, TRANS_START);
 }
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
-                                                  int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
        return start_transaction(root, 0, TRANS_JOIN);
 }
 
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
-                                                         int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
        return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
 }
 
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-                                                        int num_blocks)
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-       return start_transaction(r, 0, TRANS_USERSPACE);
+       return start_transaction(root, 0, TRANS_USERSPACE);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -280,17 +361,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
 {
        DEFINE_WAIT(wait);
-       mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
-               mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-               mutex_lock(&root->fs_info->trans_mutex);
        }
-       mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
 }
@@ -300,102 +377,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
        struct btrfs_transaction *cur_trans = NULL, *t;
        int ret;
 
-       mutex_lock(&root->fs_info->trans_mutex);
-
        ret = 0;
        if (transid) {
                if (transid <= root->fs_info->last_trans_committed)
-                       goto out_unlock;
+                       goto out;
 
                /* find specified transaction */
+               spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry(t, &root->fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
+                               atomic_inc(&cur_trans->use_count);
                                break;
                        }
                        if (t->transid > transid)
                                break;
                }
+               spin_unlock(&root->fs_info->trans_lock);
                ret = -EINVAL;
                if (!cur_trans)
-                       goto out_unlock;  /* bad transid */
+                       goto out;  /* bad transid */
        } else {
                /* find newest transaction that is committing | committed */
+               spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
                                            list) {
                        if (t->in_commit) {
                                if (t->commit_done)
-                                       goto out_unlock;
+                                       break;
                                cur_trans = t;
+                               atomic_inc(&cur_trans->use_count);
                                break;
                        }
                }
+               spin_unlock(&root->fs_info->trans_lock);
                if (!cur_trans)
-                       goto out_unlock;  /* nothing committing|committed */
+                       goto out;  /* nothing committing|committed */
        }
 
-       atomic_inc(&cur_trans->use_count);
-       mutex_unlock(&root->fs_info->trans_mutex);
-
        wait_for_commit(root, cur_trans);
 
-       mutex_lock(&root->fs_info->trans_mutex);
        put_transaction(cur_trans);
        ret = 0;
-out_unlock:
-       mutex_unlock(&root->fs_info->trans_mutex);
+out:
        return ret;
 }
 
-#if 0
-/*
- * rate limit against the drop_snapshot code.  This helps to slow down new
- * operations if the drop_snapshot code isn't able to keep up.
- */
-static void throttle_on_drops(struct btrfs_root *root)
-{
-       struct btrfs_fs_info *info = root->fs_info;
-       int harder_count = 0;
-
-harder:
-       if (atomic_read(&info->throttles)) {
-               DEFINE_WAIT(wait);
-               int thr;
-               thr = atomic_read(&info->throttle_gen);
-
-               do {
-                       prepare_to_wait(&info->transaction_throttle,
-                                       &wait, TASK_UNINTERRUPTIBLE);
-                       if (!atomic_read(&info->throttles)) {
-                               finish_wait(&info->transaction_throttle, &wait);
-                               break;
-                       }
-                       schedule();
-                       finish_wait(&info->transaction_throttle, &wait);
-               } while (thr == atomic_read(&info->throttle_gen));
-               harder_count++;
-
-               if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
-                   harder_count < 2)
-                       goto harder;
-
-               if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
-                   harder_count < 10)
-                       goto harder;
-
-               if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
-                   harder_count < 20)
-                       goto harder;
-       }
-}
-#endif
-
 void btrfs_throttle(struct btrfs_root *root)
 {
-       mutex_lock(&root->fs_info->trans_mutex);
-       if (!root->fs_info->open_ioctl_trans)
+       if (!atomic_read(&root->fs_info->open_ioctl_trans))
                wait_current_trans(root);
-       mutex_unlock(&root->fs_info->trans_mutex);
 }
 
 static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -413,6 +444,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *cur_trans = trans->transaction;
        int updates;
 
+       smp_mb();
        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
                return 1;
 
@@ -431,6 +463,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;
 
+       if (--trans->use_count) {
+               trans->block_rsv = trans->orig_rsv;
+               return 0;
+       }
+
        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
@@ -453,15 +490,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
 
-       if (lock && !root->fs_info->open_ioctl_trans &&
-           should_end_transaction(trans, root))
+       if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+           should_end_transaction(trans, root)) {
                trans->transaction->blocked = 1;
+               smp_wmb();
+       }
 
        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-               if (throttle)
+               if (throttle) {
+                       /*
+                        * We may race with somebody else here and end up
+                        * having to call end_transaction on ourselves again,
+                        * so bump our use_count first.
+                        */
+                       trans->use_count++;
                        return btrfs_commit_transaction(trans, root);
-               else
+               } else {
                        wake_up_process(info->transaction_kthread);
+               }
        }
 
        WARN_ON(cur_trans != info->running_transaction);
@@ -487,19 +533,40 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 0, 1);
+       int ret;
+
+       ret = __btrfs_end_transaction(trans, root, 0, 1);
+       if (ret)
+               return ret;
+       return 0;
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 1, 1);
+       int ret;
+
+       ret = __btrfs_end_transaction(trans, root, 1, 1);
+       if (ret)
+               return ret;
+       return 0;
 }
 
 int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 0, 0);
+       int ret;
+
+       ret = __btrfs_end_transaction(trans, root, 0, 0);
+       if (ret)
+               return ret;
+       return 0;
+}
+
+int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
+{
+       return __btrfs_end_transaction(trans, root, 1, 1);
 }
 
 /*
@@ -725,9 +792,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
  */
 int btrfs_add_dead_root(struct btrfs_root *root)
 {
-       mutex_lock(&root->fs_info->trans_mutex);
+       spin_lock(&root->fs_info->trans_lock);
        list_add(&root->root_list, &root->fs_info->dead_roots);
-       mutex_unlock(&root->fs_info->trans_mutex);
+       spin_unlock(&root->fs_info->trans_lock);
        return 0;
 }
 
@@ -743,6 +810,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
 
+       spin_lock(&fs_info->fs_roots_radix_lock);
        while (1) {
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
@@ -755,13 +823,20 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)root->root_key.objectid,
                                        BTRFS_ROOT_TRANS_TAG);
+                       spin_unlock(&fs_info->fs_roots_radix_lock);
 
                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
                        btrfs_orphan_commit_root(trans, root);
 
+                       btrfs_save_ino_cache(root, trans);
+
                        if (root->commit_root != root->node) {
+                               mutex_lock(&root->fs_commit_mutex);
                                switch_commit_root(root);
+                               btrfs_unpin_free_ino(root);
+                               mutex_unlock(&root->fs_commit_mutex);
+
                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }
@@ -769,10 +844,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
+                       spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
                }
        }
+       spin_unlock(&fs_info->fs_roots_radix_lock);
        return err;
 }
 
@@ -802,104 +879,13 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();
 
-               if (root->fs_info->closing || ret != -EAGAIN)
+               if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        return ret;
 }
 
-#if 0
-/*
- * when dropping snapshots, we generate a ton of delayed refs, and it makes
- * sense not to join the transaction while it is trying to flush the current
- * queue of delayed refs out.
- *
- * This is used by the drop snapshot code only
- */
-static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
-{
-       DEFINE_WAIT(wait);
-
-       mutex_lock(&info->trans_mutex);
-       while (info->running_transaction &&
-              info->running_transaction->delayed_refs.flushing) {
-               prepare_to_wait(&info->transaction_wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
-               mutex_unlock(&info->trans_mutex);
-
-               schedule();
-
-               mutex_lock(&info->trans_mutex);
-               finish_wait(&info->transaction_wait, &wait);
-       }
-       mutex_unlock(&info->trans_mutex);
-       return 0;
-}
-
-/*
- * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
- * all of them
- */
-int btrfs_drop_dead_root(struct btrfs_root *root)
-{
-       struct btrfs_trans_handle *trans;
-       struct btrfs_root *tree_root = root->fs_info->tree_root;
-       unsigned long nr;
-       int ret;
-
-       while (1) {
-               /*
-                * we don't want to jump in and create a bunch of
-                * delayed refs if the transaction is starting to close
-                */
-               wait_transaction_pre_flush(tree_root->fs_info);
-               trans = btrfs_start_transaction(tree_root, 1);
-
-               /*
-                * we've joined a transaction, make sure it isn't
-                * closing right now
-                */
-               if (trans->transaction->delayed_refs.flushing) {
-                       btrfs_end_transaction(trans, tree_root);
-                       continue;
-               }
-
-               ret = btrfs_drop_snapshot(trans, root);
-               if (ret != -EAGAIN)
-                       break;
-
-               ret = btrfs_update_root(trans, tree_root,
-                                       &root->root_key,
-                                       &root->root_item);
-               if (ret)
-                       break;
-
-               nr = trans->blocks_used;
-               ret = btrfs_end_transaction(trans, tree_root);
-               BUG_ON(ret);
-
-               btrfs_btree_balance_dirty(tree_root, nr);
-               cond_resched();
-       }
-       BUG_ON(ret);
-
-       ret = btrfs_del_root(trans, tree_root, &root->root_key);
-       BUG_ON(ret);
-
-       nr = trans->blocks_used;
-       ret = btrfs_end_transaction(trans, tree_root);
-       BUG_ON(ret);
-
-       free_extent_buffer(root->node);
-       free_extent_buffer(root->commit_root);
-       kfree(root);
-
-       btrfs_btree_balance_dirty(tree_root, nr);
-       return ret;
-}
-#endif
-
 /*
  * new snapshots need to be created at a very specific time in the
  * transaction commit.  This does the actual creation
@@ -930,7 +916,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                goto fail;
        }
 
-       ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+       ret = btrfs_find_free_objectid(tree_root, &objectid);
        if (ret) {
                pending->error = ret;
                goto fail;
@@ -967,7 +953,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        ret = btrfs_insert_dir_item(trans, parent_root,
                                dentry->d_name.name, dentry->d_name.len,
-                               parent_inode->i_ino, &key,
+                               parent_inode, &key,
                                BTRFS_FT_DIR, index);
        BUG_ON(ret);
 
@@ -976,6 +962,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
 
+       /*
+        * pull in the delayed directory update
+        * and the delayed inode item
+        * otherwise we corrupt the FS during
+        * snapshot
+        */
+       ret = btrfs_run_delayed_items(trans, root);
+       BUG_ON(ret);
+
        record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
@@ -1009,7 +1004,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         */
        ret = btrfs_add_root_ref(trans, tree_root, objectid,
                                 parent_root->root_key.objectid,
-                                parent_inode->i_ino, index,
+                                btrfs_ino(parent_inode), index,
                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
        dput(parent);
@@ -1066,20 +1061,20 @@ static void update_super_roots(struct btrfs_root *root)
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
        int ret = 0;
-       spin_lock(&info->new_trans_lock);
+       spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->in_commit;
-       spin_unlock(&info->new_trans_lock);
+       spin_unlock(&info->trans_lock);
        return ret;
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
        int ret = 0;
-       spin_lock(&info->new_trans_lock);
+       spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->blocked;
-       spin_unlock(&info->new_trans_lock);
+       spin_unlock(&info->trans_lock);
        return ret;
 }
 
@@ -1103,9 +1098,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-               mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-               mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
        }
 }
@@ -1131,9 +1124,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-               mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-               mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_wait,
                            &wait);
        }
@@ -1171,7 +1162,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 
        INIT_DELAYED_WORK(&ac->work, do_async_commit);
        ac->root = root;
-       ac->newtrans = btrfs_join_transaction(root, 0);
+       ac->newtrans = btrfs_join_transaction(root);
        if (IS_ERR(ac->newtrans)) {
                int err = PTR_ERR(ac->newtrans);
                kfree(ac);
@@ -1179,23 +1170,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        }
 
        /* take transaction reference */
-       mutex_lock(&root->fs_info->trans_mutex);
        cur_trans = trans->transaction;
        atomic_inc(&cur_trans->use_count);
-       mutex_unlock(&root->fs_info->trans_mutex);
 
        btrfs_end_transaction(trans, root);
        schedule_delayed_work(&ac->work, 0);
 
        /* wait for transaction to start and unblock */
-       mutex_lock(&root->fs_info->trans_mutex);
        if (wait_for_unblock)
                wait_current_trans_commit_start_and_unblock(root, cur_trans);
        else
                wait_current_trans_commit_start(root, cur_trans);
-       put_transaction(cur_trans);
-       mutex_unlock(&root->fs_info->trans_mutex);
 
+       if (current->journal_info == trans)
+               current->journal_info = NULL;
+
+       put_transaction(cur_trans);
        return 0;
 }
 
@@ -1238,38 +1228,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
 
-       mutex_lock(&root->fs_info->trans_mutex);
+       spin_lock(&cur_trans->commit_lock);
        if (cur_trans->in_commit) {
+               spin_unlock(&cur_trans->commit_lock);
                atomic_inc(&cur_trans->use_count);
-               mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
 
                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);
 
-               mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
-               mutex_unlock(&root->fs_info->trans_mutex);
 
                return 0;
        }
 
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
+       spin_unlock(&cur_trans->commit_lock);
        wake_up(&root->fs_info->transaction_blocked_wait);
 
+       spin_lock(&root->fs_info->trans_lock);
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        atomic_inc(&prev_trans->use_count);
-                       mutex_unlock(&root->fs_info->trans_mutex);
+                       spin_unlock(&root->fs_info->trans_lock);
 
                        wait_for_commit(root, prev_trans);
 
-                       mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
+               } else {
+                       spin_unlock(&root->fs_info->trans_lock);
                }
+       } else {
+               spin_unlock(&root->fs_info->trans_lock);
        }
 
        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1277,12 +1270,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        do {
                int snap_pending = 0;
+
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;
 
                WARN_ON(cur_trans != trans->transaction);
-               mutex_unlock(&root->fs_info->trans_mutex);
 
                if (flush_on_commit || snap_pending) {
                        btrfs_start_delalloc_inodes(root, 1);
@@ -1290,6 +1283,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
 
+               ret = btrfs_run_delayed_items(trans, root);
+               BUG_ON(ret);
+
                /*
                 * rename don't use btrfs_join_transaction, so, once we
                 * set the transaction to blocked above, we aren't going
@@ -1302,23 +1298,48 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
 
-               smp_mb();
                if (atomic_read(&cur_trans->num_writers) > 1)
                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                else if (should_grow)
                        schedule_timeout(1);
 
-               mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
 
+       /*
+        * Ok, now we need to make sure to block out any other joins while we
+        * commit the transaction.  A join could have started before we set
+        * trans_no_join, so wait for num_writers to drop back to 1 again.
+        */
+       spin_lock(&root->fs_info->trans_lock);
+       root->fs_info->trans_no_join = 1;
+       spin_unlock(&root->fs_info->trans_lock);
+       wait_event(cur_trans->writer_wait,
+                  atomic_read(&cur_trans->num_writers) == 1);
+
+       /*
+        * the reloc mutex makes sure that we stop
+        * the balancing code from coming in and moving
+        * extents around in the middle of the commit
+        */
+       mutex_lock(&root->fs_info->reloc_mutex);
+
+       ret = btrfs_run_delayed_items(trans, root);
+       BUG_ON(ret);
+
        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);
 
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);
 
+       /*
+        * make sure none of the code above managed to slip in a
+        * delayed item
+        */
+       btrfs_assert_delayed_root_empty(root);
+
        WARN_ON(cur_trans != trans->transaction);
 
        btrfs_scrub_pause(root);
@@ -1351,9 +1372,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_prepare_extent_commit(trans, root);
 
        cur_trans = root->fs_info->running_transaction;
-       spin_lock(&root->fs_info->new_trans_lock);
-       root->fs_info->running_transaction = NULL;
-       spin_unlock(&root->fs_info->new_trans_lock);
 
        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
@@ -1374,10 +1392,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
               sizeof(root->fs_info->super_copy));
 
        trans->transaction->blocked = 0;
+       spin_lock(&root->fs_info->trans_lock);
+       root->fs_info->running_transaction = NULL;
+       root->fs_info->trans_no_join = 0;
+       spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
 
        wake_up(&root->fs_info->transaction_wait);
 
-       mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);
@@ -1390,22 +1412,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
-       mutex_lock(&root->fs_info->trans_mutex);
-
        cur_trans->commit_done = 1;
 
        root->fs_info->last_trans_committed = cur_trans->transid;
 
        wake_up(&cur_trans->commit_wait);
 
+       spin_lock(&root->fs_info->trans_lock);
        list_del_init(&cur_trans->list);
+       spin_unlock(&root->fs_info->trans_lock);
+
        put_transaction(cur_trans);
        put_transaction(cur_trans);
 
        trace_btrfs_transaction_commit(root);
 
-       mutex_unlock(&root->fs_info->trans_mutex);
-
        btrfs_scrub_continue(root);
 
        if (current->journal_info == trans)
@@ -1427,14 +1448,16 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
        LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       mutex_lock(&fs_info->trans_mutex);
+       spin_lock(&fs_info->trans_lock);
        list_splice_init(&fs_info->dead_roots, &list);
-       mutex_unlock(&fs_info->trans_mutex);
+       spin_unlock(&fs_info->trans_lock);
 
        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
                list_del(&root->root_list);
 
+               btrfs_kill_all_delayed_nodes(root);
+
                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
                        btrfs_drop_snapshot(root, NULL, 0);