Btrfs: fix deadlock when throttling transactions
[pandora-kernel.git] / fs / btrfs / transaction.c
index 2b3590b..eb55863 100644 (file)
@@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);
 
+               /*
+                * See below for the in_trans_setup usage rules.
+                * We have the reloc mutex held now, so there
+                * is only one writer in this function.
+                */
+               root->in_trans_setup = 1;
+
+               /* make sure readers find in_trans_setup before
+                * they find our root->last_trans update
+                */
+               smp_wmb();
+
                spin_lock(&root->fs_info->fs_roots_radix_lock);
                if (root->last_trans == trans->transid) {
                        spin_unlock(&root->fs_info->fs_roots_radix_lock);
                        return 0;
                }
-               root->last_trans = trans->transid;
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
                spin_unlock(&root->fs_info->fs_roots_radix_lock);
+               root->last_trans = trans->transid;
+
+               /* this is pretty tricky.  We don't want to
+                * take the relocation lock in btrfs_record_root_in_trans
+                * unless we're really doing the first setup for this root in
+                * this transaction.
+                *
+                * Normally we'd use root->last_trans as a flag to decide
+                * if we want to take the expensive mutex.
+                *
+                * But, we have to set root->last_trans before we
+                * init the relocation root, otherwise, we trip over warnings
+                * in ctree.c.  The solution used here is to flag ourselves
+                * with root->in_trans_setup.  When this is 1, we're still
+                * fixing up the reloc trees and everyone must wait.
+                *
+                * When this is zero, they can trust root->last_trans and fly
+                * through btrfs_record_root_in_trans without having to take the
+                * lock.  smp_wmb() makes sure that all the writes above are
+                * done before we pop in the zero below
+                */
                btrfs_init_reloc_root(trans, root);
+               smp_wmb();
+               root->in_trans_setup = 0;
        }
        return 0;
 }
 
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+{
+       if (!root->ref_cows)
+               return 0;
+
+       /*
+        * see record_root_in_trans for comments about in_trans_setup usage
+        * and barriers
+        */
+       smp_rmb();
+       if (root->last_trans == trans->transid &&
+           !root->in_trans_setup)
+               return 0;
+
+       mutex_lock(&root->fs_info->reloc_mutex);
+       record_root_in_trans(trans, root);
+       mutex_unlock(&root->fs_info->reloc_mutex);
+
+       return 0;
+}
+
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
@@ -203,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
-       int retries = 0;
+       u64 num_bytes = 0;
        int ret;
 
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -217,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                h->block_rsv = NULL;
                goto got_it;
        }
+
+       /*
+        * Do the reservation before we join the transaction so we can do all
+        * the appropriate flushing if need be.
+        */
+       if (num_items > 0 && root != root->fs_info->chunk_root) {
+               num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+               ret = btrfs_block_rsv_add(NULL, root,
+                                         &root->fs_info->trans_block_rsv,
+                                         num_bytes);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
@@ -253,24 +323,9 @@ again:
                goto again;
        }
 
-       if (num_items > 0) {
-               ret = btrfs_trans_reserve_metadata(h, root, num_items);
-               if (ret == -EAGAIN && !retries) {
-                       retries++;
-                       btrfs_commit_transaction(h, root);
-                       goto again;
-               } else if (ret == -EAGAIN) {
-                       /*
-                        * We have already retried and got EAGAIN, so really we
-                        * don't have space, so set ret to -ENOSPC.
-                        */
-                       ret = -ENOSPC;
-               }
-
-               if (ret < 0) {
-                       btrfs_end_transaction(h, root);
-                       return ERR_PTR(ret);
-               }
+       if (num_bytes) {
+               h->block_rsv = &root->fs_info->trans_block_rsv;
+               h->bytes_reserved = num_bytes;
        }
 
 got_it:
@@ -442,10 +497,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        }
 
        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-               if (throttle)
+               if (throttle) {
+                       /*
+                        * We may race with somebody else here and end up
+                        * having to call end_transaction on ourselves again,
+                        * so increment our use_count.
+                        */
+                       trans->use_count++;
                        return btrfs_commit_transaction(trans, root);
-               else
+               } else {
                        wake_up_process(info->transaction_kthread);
+               }
        }
 
        WARN_ON(cur_trans != info->running_transaction);
@@ -882,7 +944,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        parent = dget_parent(dentry);
        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
-       btrfs_record_root_in_trans(trans, parent_root);
+       record_root_in_trans(trans, parent_root);
 
        /*
         * insert the directory item
@@ -900,7 +962,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
 
-       btrfs_record_root_in_trans(trans, root);
+       /*
+        * Pull in the delayed directory update
+        * and the delayed inode item; otherwise
+        * we corrupt the FS during the
+        * snapshot.
+        */
+       ret = btrfs_run_delayed_items(trans, root);
+       BUG_ON(ret);
+
+       record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
        btrfs_check_and_init_root_item(new_root_item);
@@ -961,14 +1032,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
        int ret;
 
        list_for_each_entry(pending, head, list) {
-               /*
-                * We must deal with the delayed items before creating
-                * snapshots, or we will create a snapthot with inconsistent
-                * information.
-               */
-               ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
-               BUG_ON(ret);
-
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
@@ -1241,21 +1304,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        schedule_timeout(1);
 
                finish_wait(&cur_trans->writer_wait, &wait);
-               spin_lock(&root->fs_info->trans_lock);
-               root->fs_info->trans_no_join = 1;
-               spin_unlock(&root->fs_info->trans_lock);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
 
-       ret = create_pending_snapshots(trans, root->fs_info);
-       BUG_ON(ret);
+       /*
+        * Now we need to block out any other joins while we commit the
+        * transaction.  A join could have started before we set trans_no_join,
+        * so wait for num_writers to drop back to 1.
+        */
+       spin_lock(&root->fs_info->trans_lock);
+       root->fs_info->trans_no_join = 1;
+       spin_unlock(&root->fs_info->trans_lock);
+       wait_event(cur_trans->writer_wait,
+                  atomic_read(&cur_trans->num_writers) == 1);
+
+       /*
+        * the reloc mutex makes sure that we stop
+        * the balancing code from coming in and moving
+        * extents around in the middle of the commit
+        */
+       mutex_lock(&root->fs_info->reloc_mutex);
 
        ret = btrfs_run_delayed_items(trans, root);
        BUG_ON(ret);
 
+       ret = create_pending_snapshots(trans, root->fs_info);
+       BUG_ON(ret);
+
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);
 
+       /*
+        * make sure none of the code above managed to slip in a
+        * delayed item
+        */
+       btrfs_assert_delayed_root_empty(root);
+
        WARN_ON(cur_trans != trans->transaction);
 
        btrfs_scrub_pause(root);
@@ -1312,6 +1396,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        root->fs_info->running_transaction = NULL;
        root->fs_info->trans_no_join = 0;
        spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
 
        wake_up(&root->fs_info->transaction_wait);