Btrfs: fix deadlock when throttling transactions

[pandora-kernel.git] / fs / btrfs / transaction.c
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 2d5c6d2..eb55863 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
   * to make sure the old root from before we joined the transaction is deleted
   * when the transaction commits
   */
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
  {
         if (root->ref_cows && root->last_trans < trans->transid) {
                 WARN_ON(root == root->fs_info->extent_root);
                 WARN_ON(root->commit_root != root->node);
  
+               /*
+                * see below for in_trans_setup usage rules
+                * we have the reloc mutex held now, so there
+                * is only one writer in this function
+                */
+               root->in_trans_setup = 1;
+
+               /* make sure readers find in_trans_setup before
+                * they find our root->last_trans update
+                */
+               smp_wmb();
+
                 spin_lock(&root->fs_info->fs_roots_radix_lock);
                 if (root->last_trans == trans->transid) {
                         spin_unlock(&root->fs_info->fs_roots_radix_lock);
                         return 0;
                 }
-               root->last_trans = trans->transid;
                 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                            (unsigned long)root->root_key.objectid,
                            BTRFS_ROOT_TRANS_TAG);
                 spin_unlock(&root->fs_info->fs_roots_radix_lock);
+               root->last_trans = trans->transid;
+
+               /* this is pretty tricky.  We don't want to
+                * take the relocation lock in btrfs_record_root_in_trans
+                * unless we're really doing the first setup for this root in
+                * this transaction.
+                *
+                * Normally we'd use root->last_trans as a flag to decide
+                * if we want to take the expensive mutex.
+                *
+                * But, we have to set root->last_trans before we
+                * init the relocation root, otherwise, we trip over warnings
+                * in ctree.c.  The solution used here is to flag ourselves
+                * with root->in_trans_setup.  When this is 1, we're still
+                * fixing up the reloc trees and everyone must wait.
+                *
+                * When this is zero, they can trust root->last_trans and fly
+                * through btrfs_record_root_in_trans without having to take the
+                * lock.  smp_wmb() makes sure that all the writes above are
+                * done before we pop in the zero below
+                */
                 btrfs_init_reloc_root(trans, root);
+               smp_wmb();
+               root->in_trans_setup = 0;
         }
         return 0;
  }
  
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+{
+       if (!root->ref_cows)
+               return 0;
+
+       /*
+        * see record_root_in_trans for comments about in_trans_setup usage
+        * and barriers
+        */
+       smp_rmb();
+       if (root->last_trans == trans->transid &&
+           !root->in_trans_setup)
+               return 0;
+
+       mutex_lock(&root->fs_info->reloc_mutex);
+       record_root_in_trans(trans, root);
+       mutex_unlock(&root->fs_info->reloc_mutex);
+
+       return 0;
+}
+
  /* wait for commit against the current transaction to become unblocked
   * when this is done, it is safe to start a new transaction, but the current
   * transaction might not be fully on disk.
@@ -203,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
  {
         struct btrfs_trans_handle *h;
         struct btrfs_transaction *cur_trans;
-       int retries = 0;
+       u64 num_bytes = 0;
         int ret;
  
         if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -217,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                 h->block_rsv = NULL;
                 goto got_it;
         }
+
+       /*
+        * Do the reservation before we join the transaction so we can do all
+        * the appropriate flushing if need be.
+        */
+       if (num_items > 0 && root != root->fs_info->chunk_root) {
+               num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+               ret = btrfs_block_rsv_add(NULL, root,
+                                         &root->fs_info->trans_block_rsv,
+                                         num_bytes);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
  again:
         h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
         if (!h)
@@ -253,24 +323,9 @@ again:
                 goto again;
         }
  
-       if (num_items > 0) {
-               ret = btrfs_trans_reserve_metadata(h, root, num_items);
-               if (ret == -EAGAIN && !retries) {
-                       retries++;
-                       btrfs_commit_transaction(h, root);
-                       goto again;
-               } else if (ret == -EAGAIN) {
-                       /*
-                        * We have already retried and got EAGAIN, so really we
-                        * don't have space, so set ret to -ENOSPC.
-                        */
-                       ret = -ENOSPC;
-               }
-
-               if (ret < 0) {
-                       btrfs_end_transaction(h, root);
-                       return ERR_PTR(ret);
-               }
+       if (num_bytes) {
+               h->block_rsv = &root->fs_info->trans_block_rsv;
+               h->bytes_reserved = num_bytes;
         }
  
  got_it:
@@ -349,7 +404,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
                                             list) {
                         if (t->in_commit) {
                                 if (t->commit_done)
-                                       goto out;
+                                       break;
                                 cur_trans = t;
                                 atomic_inc(&cur_trans->use_count);
                                 break;
@@ -442,10 +497,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
         }
  
         if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-               if (throttle)
+               if (throttle) {
+                       /*
+                        * We may race with somebody else here so end up having
+                        * to call end_transaction on ourselves again, so inc
+                        * our use_count.
+                        */
+                       trans->use_count++;
                         return btrfs_commit_transaction(trans, root);
-               else
+               } else {
                         wake_up_process(info->transaction_kthread);
+               }
         }
  
         WARN_ON(cur_trans != info->running_transaction);
@@ -817,7 +879,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                 btrfs_btree_balance_dirty(info->tree_root, nr);
                 cond_resched();
  
-               if (root->fs_info->closing || ret != -EAGAIN)
+               if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
                         break;
         }
         root->defrag_running = 0;
@@ -882,7 +944,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         parent = dget_parent(dentry);
         parent_inode = parent->d_inode;
         parent_root = BTRFS_I(parent_inode)->root;
-       btrfs_record_root_in_trans(trans, parent_root);
+       record_root_in_trans(trans, parent_root);
  
         /*
          * insert the directory item
@@ -900,7 +962,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         ret = btrfs_update_inode(trans, parent_root, parent_inode);
         BUG_ON(ret);
  
-       btrfs_record_root_in_trans(trans, root);
+       /*
+        * pull in the delayed directory update
+        * and the delayed inode item,
+        * otherwise we corrupt the FS during
+        * snapshot
+        */
+       ret = btrfs_run_delayed_items(trans, root);
+       BUG_ON(ret);
+
+       record_root_in_trans(trans, root);
         btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
         btrfs_check_and_init_root_item(new_root_item);
@@ -961,14 +1032,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
         int ret;
  
         list_for_each_entry(pending, head, list) {
-               /*
-                * We must deal with the delayed items before creating
-                * snapshots, or we will create a snapthot with inconsistent
-                * information.
-               */
-               ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
-               BUG_ON(ret);
-
                 ret = create_pending_snapshot(trans, fs_info, pending);
                 BUG_ON(ret);
         }
@@ -1118,8 +1181,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
                 wait_current_trans_commit_start_and_unblock(root, cur_trans);
         else
                 wait_current_trans_commit_start(root, cur_trans);
-       put_transaction(cur_trans);
  
+       if (current->journal_info == trans)
+               current->journal_info = NULL;
+
+       put_transaction(cur_trans);
         return 0;
  }
  
@@ -1238,21 +1304,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                         schedule_timeout(1);
  
                 finish_wait(&cur_trans->writer_wait, &wait);
-               spin_lock(&root->fs_info->trans_lock);
-               root->fs_info->trans_no_join = 1;
-               spin_unlock(&root->fs_info->trans_lock);
         } while (atomic_read(&cur_trans->num_writers) > 1 ||
                  (should_grow && cur_trans->num_joined != joined));
  
-       ret = create_pending_snapshots(trans, root->fs_info);
-       BUG_ON(ret);
+       /*
+        * Ok now we need to make sure to block out any other joins while we
+        * commit the transaction.  A join could have started before we set
+        * trans_no_join, so wait for num_writers to drop back to 1.
+        */
+       spin_lock(&root->fs_info->trans_lock);
+       root->fs_info->trans_no_join = 1;
+       spin_unlock(&root->fs_info->trans_lock);
+       wait_event(cur_trans->writer_wait,
+                  atomic_read(&cur_trans->num_writers) == 1);
+
+       /*
+        * the reloc mutex makes sure that we stop
+        * the balancing code from coming in and moving
+        * extents around in the middle of the commit
+        */
+       mutex_lock(&root->fs_info->reloc_mutex);
  
         ret = btrfs_run_delayed_items(trans, root);
         BUG_ON(ret);
  
+       ret = create_pending_snapshots(trans, root->fs_info);
+       BUG_ON(ret);
+
         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
         BUG_ON(ret);
  
+       /*
+        * make sure none of the code above managed to slip in a
+        * delayed item
+        */
+       btrfs_assert_delayed_root_empty(root);
+
         WARN_ON(cur_trans != trans->transaction);
  
         btrfs_scrub_pause(root);
@@ -1309,6 +1396,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         root->fs_info->running_transaction = NULL;
         root->fs_info->trans_no_join = 0;
         spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
  
         wake_up(&root->fs_info->transaction_wait);