fs/btrfs/transaction.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "inode-map.h"

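/* radix tree tag used to mark fs roots that changed in the running transaction */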
#define BTRFS_ROOT_TRANS_TAG 0

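/*
 * Transactions are reference counted via use_count: the creator's
 * reference covers fs_info->trans_list/running_transaction, and every
 * open handle takes another.  The final put below returns the struct
 * to the slab cache.
 */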
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
}

static noinline void switch_commit_root(struct btrfs_root *root)
{
        free_extent_buffer(root->commit_root);
        root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;
        cur_trans = root->fs_info->running_transaction;
        if (!cur_trans) {
                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
                                             GFP_NOFS);
                if (!cur_trans)
                        return -ENOMEM;
                root->fs_info->generation++;
                atomic_set(&cur_trans->num_writers, 1);
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
                init_waitqueue_head(&cur_trans->writer_wait);
                init_waitqueue_head(&cur_trans->commit_wait);
                cur_trans->in_commit = 0;
                cur_trans->blocked = 0;
                atomic_set(&cur_trans->use_count, 1);
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();

                cur_trans->delayed_refs.root = RB_ROOT;
                cur_trans->delayed_refs.num_entries = 0;
                cur_trans->delayed_refs.num_heads_ready = 0;
                cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
                cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);

                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
                                     root->fs_info->btree_inode->i_mapping);
                spin_lock(&root->fs_info->new_trans_lock);
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
        } else {
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
        }

        return 0;
}

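/*
 * Illustrative sketch only (not part of this file): the usual pattern
 * elsewhere in btrfs is to start a handle, modify trees, then end it:
 *
 *      trans = btrfs_start_transaction(root, num_items);
 *      if (IS_ERR(trans))
 *              return PTR_ERR(trans);
 *      ...modify the trees...
 *      btrfs_end_transaction(trans, root);
 */
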
/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root)
{
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);

                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
                root->last_trans = trans->transid;
                btrfs_init_reloc_root(trans, root);
        }
        return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        if (!root->ref_cows)
                return 0;

        mutex_lock(&root->fs_info->trans_mutex);
        if (root->last_trans == trans->transid) {
                mutex_unlock(&root->fs_info->trans_mutex);
                return 0;
        }

        record_root_in_trans(trans, root);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;

        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                atomic_inc(&cur_trans->use_count);
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!cur_trans->blocked)
                                break;
                        mutex_unlock(&root->fs_info->trans_mutex);
                        schedule();
                        mutex_lock(&root->fs_info->trans_mutex);
                }
                finish_wait(&root->fs_info->transaction_wait, &wait);
                put_transaction(cur_trans);
        }
}

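/*
 * A short summary of the start types handled below: TRANS_START reserves
 * metadata space and may wait out a blocked commit, TRANS_JOIN hops into
 * the running transaction without waiting, TRANS_USERSPACE is for
 * ioctl-initiated transactions, and TRANS_JOIN_NOLOCK joins without
 * taking trans_mutex.
 */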
enum btrfs_trans_type {
        TRANS_START,
        TRANS_JOIN,
        TRANS_USERSPACE,
        TRANS_JOIN_NOLOCK,
};

static int may_wait_transaction(struct btrfs_root *root, int type)
{
        if (!root->fs_info->log_root_recovering &&
            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
             type == TRANS_USERSPACE))
                return 1;
        return 0;
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                                                    u64 num_items, int type)
{
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
        int retries = 0;
        int ret;

        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);

        if (type != TRANS_JOIN_NOLOCK)
                mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);

        ret = join_transaction(root);
        if (ret < 0) {
                kmem_cache_free(btrfs_trans_handle_cachep, h);
                if (type != TRANS_JOIN_NOLOCK)
                        mutex_unlock(&root->fs_info->trans_mutex);
                return ERR_PTR(ret);
        }

        cur_trans = root->fs_info->running_transaction;
        atomic_inc(&cur_trans->use_count);
        if (type != TRANS_JOIN_NOLOCK)
                mutex_unlock(&root->fs_info->trans_mutex);

        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
        h->blocks_used = 0;
        h->block_group = 0;
        h->bytes_reserved = 0;
        h->delayed_ref_updates = 0;
        h->block_rsv = NULL;

        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
                btrfs_commit_transaction(h, root);
                goto again;
        }

        if (num_items > 0) {
                ret = btrfs_trans_reserve_metadata(h, root, num_items);
                if (ret == -EAGAIN && !retries) {
                        retries++;
                        btrfs_commit_transaction(h, root);
                        goto again;
                } else if (ret == -EAGAIN) {
                        /*
                         * We have already retried and got EAGAIN, so really we
                         * don't have space, so set ret to -ENOSPC.
                         */
                        ret = -ENOSPC;
                }

                if (ret < 0) {
                        btrfs_end_transaction(h, root);
                        return ERR_PTR(ret);
                }
        }

        if (type != TRANS_JOIN_NOLOCK)
                mutex_lock(&root->fs_info->trans_mutex);
        record_root_in_trans(h, root);
        if (type != TRANS_JOIN_NOLOCK)
                mutex_unlock(&root->fs_info->trans_mutex);

        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
        return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items)
{
        return start_transaction(root, num_items, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
                                                          int num_blocks)
{
        return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
{
        return start_transaction(r, 0, TRANS_USERSPACE);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
{
        DEFINE_WAIT(wait);
        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
        }
        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
}

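/*
 * wait for the commit of a specific transaction to finish.  transid == 0
 * means "the newest transaction that is committing or committed"; a
 * transid at or below last_trans_committed returns immediately, and a
 * transid that cannot be found returns -EINVAL.
 */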
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
        struct btrfs_transaction *cur_trans = NULL, *t;
        int ret;

        mutex_lock(&root->fs_info->trans_mutex);

        ret = 0;
        if (transid) {
                if (transid <= root->fs_info->last_trans_committed)
                        goto out_unlock;

                /* find specified transaction */
                list_for_each_entry(t, &root->fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
                                break;
                        }
                        if (t->transid > transid)
                                break;
                }
                ret = -EINVAL;
                if (!cur_trans)
                        goto out_unlock;  /* bad transid */
        } else {
                /* find newest transaction that is committing | committed */
                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
                                            list) {
                        if (t->in_commit) {
                                if (t->commit_done)
                                        goto out_unlock;
                                cur_trans = t;
                                break;
                        }
                }
                if (!cur_trans)
                        goto out_unlock;  /* nothing committing|committed */
        }

        atomic_inc(&cur_trans->use_count);
        mutex_unlock(&root->fs_info->trans_mutex);

        wait_for_commit(root, cur_trans);

        mutex_lock(&root->fs_info->trans_mutex);
        put_transaction(cur_trans);
        ret = 0;
out_unlock:
        mutex_unlock(&root->fs_info->trans_mutex);
        return ret;
}

void btrfs_throttle(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
}

static int should_end_transaction(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
{
        int ret;
        ret = btrfs_block_rsv_check(trans, root,
                                    &root->fs_info->global_block_rsv, 0, 5);
        return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans = trans->transaction;
        int updates;

        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
                return 1;

        updates = trans->delayed_ref_updates;
        trans->delayed_ref_updates = 0;
        if (updates)
                btrfs_run_delayed_refs(trans, root, updates);

        return should_end_transaction(trans, root);
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, int throttle, int lock)
{
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;

        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;

                        /*
                         * do a full flush if the transaction is trying
                         * to close
                         */
                        if (trans->transaction->delayed_refs.flushing)
                                cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
                }
                count++;
        }

        btrfs_trans_release_metadata(trans, root);

        if (lock && !root->fs_info->open_ioctl_trans &&
            should_end_transaction(trans, root))
                trans->transaction->blocked = 1;

        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
                if (throttle)
                        return btrfs_commit_transaction(trans, root);
                else
                        wake_up_process(info->transaction_kthread);
        }

        WARN_ON(cur_trans != info->running_transaction);
        WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
        atomic_dec(&cur_trans->num_writers);

        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);

        if (current->journal_info == trans)
                current->journal_info = NULL;
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (throttle)
                btrfs_run_delayed_iputs(root);

        return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
        int ret;

        ret = __btrfs_end_transaction(trans, root, 0, 1);
        if (ret)
                return ret;
        return 0;
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        int ret;

        ret = __btrfs_end_transaction(trans, root, 1, 1);
        if (ret)
                return ret;
        return 0;
}

int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root)
{
        int ret;

        ret = __btrfs_end_transaction(trans, root, 0, 0);
        if (ret)
                return ret;
        return 0;
}

int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
                               struct extent_io_tree *dirty_pages, int mark)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
                                            mark);
                if (ret)
                        break;
                while (start <= end) {
                        cond_resched();

                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;

                        btree_lock_page_hook(page);
                        if (!page->mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                continue;
                        }

                        if (PageWriteback(page)) {
                                if (PageDirty(page))
                                        wait_on_page_writeback(page);
                                else {
                                        unlock_page(page);
                                        page_cache_release(page);
                                        continue;
                                }
                        }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
                        page_cache_release(page);
                }
        }
        if (err)
                werr = err;
        return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
                              struct extent_io_tree *dirty_pages, int mark)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
                                            mark);
                if (ret)
                        break;

                clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
                while (start <= end) {
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;
                        if (PageDirty(page)) {
                                btree_lock_page_hook(page);
                                wait_on_page_writeback(page);
                                err = write_one_page(page, 0);
                                if (err)
                                        werr = err;
                        }
                        wait_on_page_writeback(page);
                        page_cache_release(page);
                        cond_resched();
                }
        }
        if (err)
                werr = err;
        return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                struct extent_io_tree *dirty_pages, int mark)
{
        int ret;
        int ret2;

        ret = btrfs_write_marked_extents(root, dirty_pages, mark);
        ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
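        /* note: any failure from either pass is collapsed to 1 here
         * rather than propagated to the caller as an errno */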
        return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root)
{
        if (!trans || !trans->transaction) {
                struct inode *btree_inode;
                btree_inode = root->fs_info->btree_inode;
                return filemap_write_and_wait(btree_inode->i_mapping);
        }
        return btrfs_write_and_wait_marked_extents(root,
                                           &trans->transaction->dirty_pages,
                                           EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        int ret;
        u64 old_root_bytenr;
        u64 old_root_used;
        struct btrfs_root *tree_root = root->fs_info->tree_root;

        old_root_used = btrfs_root_used(&root->root_item);
        btrfs_write_dirty_block_groups(trans, root);

        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start &&
                    old_root_used == btrfs_root_used(&root->root_item))
                        break;

                btrfs_set_root_node(&root->root_item, root->node);
                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);

                old_root_used = btrfs_root_used(&root->root_item);
                ret = btrfs_write_dirty_block_groups(trans, root);
                BUG_ON(ret);
        }

        if (root != root->fs_info->extent_root)
                switch_commit_root(root);

        return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        eb = btrfs_lock_root_node(fs_info->tree_root);
        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);

                update_cowonly_root(trans, root);
        }

        down_write(&fs_info->extent_commit_sem);
        switch_commit_root(fs_info->extent_root);
        up_write(&fs_info->extent_commit_sem);

        return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        list_add(&root->root_list, &root->fs_info->dead_roots);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}

/*
 * update all the fs tree roots on disk
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root)
{
        struct btrfs_root *gang[8];
        struct btrfs_fs_info *fs_info = root->fs_info;
        int i;
        int ret;
        int err = 0;

        while (1) {
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
                                                 ARRAY_SIZE(gang),
                                                 BTRFS_ROOT_TRANS_TAG);
                if (ret == 0)
                        break;
                for (i = 0; i < ret; i++) {
                        root = gang[i];
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)root->root_key.objectid,
                                        BTRFS_ROOT_TRANS_TAG);

                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
                        btrfs_orphan_commit_root(trans, root);

                        btrfs_save_ino_cache(root, trans);

                        if (root->commit_root != root->node) {
                                mutex_lock(&root->fs_commit_mutex);
                                switch_commit_root(root);
                                btrfs_unpin_free_ino(root);
                                mutex_unlock(&root->fs_commit_mutex);

                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }

                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                        if (err)
                                break;
                }
        }
        return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_trans_handle *trans;
        int ret;
        unsigned long nr;

        if (xchg(&root->defrag_running, 1))
                return 0;

        while (1) {
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);

                ret = btrfs_defrag_leaves(trans, root, cacheonly);

                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();

                if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        struct btrfs_key key;
        struct btrfs_root_item *new_root_item;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root = pending->root;
        struct btrfs_root *parent_root;
        struct inode *parent_inode;
        struct dentry *parent;
        struct dentry *dentry;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
        u64 to_reserve = 0;
        u64 index = 0;
        u64 objectid;
        u64 root_flags;

        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
                pending->error = -ENOMEM;
                goto fail;
        }

        ret = btrfs_find_free_objectid(tree_root, &objectid);
        if (ret) {
                pending->error = ret;
                goto fail;
        }

        btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
        btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);

        if (to_reserve > 0) {
                ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
                                          to_reserve);
                if (ret) {
                        pending->error = ret;
                        goto fail;
                }
        }

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;

        trans->block_rsv = &pending->block_rsv;

        dentry = pending->dentry;
        parent = dget_parent(dentry);
        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        record_root_in_trans(trans, parent_root);

        /*
         * insert the directory item
         */
        ret = btrfs_set_inode_index(parent_inode, &index);
        BUG_ON(ret);
        ret = btrfs_insert_dir_item(trans, parent_root,
                                dentry->d_name.name, dentry->d_name.len,
                                parent_inode, &key,
                                BTRFS_FT_DIR, index);
        BUG_ON(ret);

        btrfs_i_size_write(parent_inode, parent_inode->i_size +
                                         dentry->d_name.len * 2);
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);

        record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
        btrfs_check_and_init_root_item(new_root_item);

        root_flags = btrfs_root_flags(new_root_item);
        if (pending->readonly)
                root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
        else
                root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
        btrfs_set_root_flags(new_root_item, root_flags);

        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);
        btrfs_set_lock_blocking(old);

        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
        free_extent_buffer(old);

        btrfs_set_root_node(new_root_item, tmp);
        /* record when the snapshot was created in key.offset */
        key.offset = trans->transid;
        ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
        BUG_ON(ret);

        /*
         * insert root back/forward references
         */
        ret = btrfs_add_root_ref(trans, tree_root, objectid,
                                 parent_root->root_key.objectid,
                                 btrfs_ino(parent_inode), index,
                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
        dput(parent);

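        /* offset (u64)-1 makes the lookup below return the most recent
         * version of the root item for this objectid */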
        key.offset = (u64)-1;
        pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
        BUG_ON(IS_ERR(pending->snap));

        btrfs_reloc_post_snapshot(trans, pending);
        btrfs_orphan_post_snapshot(trans, pending);
fail:
        kfree(new_root_item);
        btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
        return 0;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        list_for_each_entry(pending, head, list) {
                /*
                 * We must deal with the delayed items before creating
                 * snapshots, or we will create a snapshot with inconsistent
                 * information.
                 */
                ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
                BUG_ON(ret);

                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
        return 0;
}

static void update_super_roots(struct btrfs_root *root)
{
        struct btrfs_root_item *root_item;
        struct btrfs_super_block *super;

        super = &root->fs_info->super_copy;

        root_item = &root->fs_info->chunk_root->root_item;
        super->chunk_root = root_item->bytenr;
        super->chunk_root_generation = root_item->generation;
        super->chunk_root_level = root_item->level;

        root_item = &root->fs_info->tree_root->root_item;
        super->root = root_item->bytenr;
        super->generation = root_item->generation;
        super->root_level = root_item->level;
        if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
                super->cache_generation = root_item->generation;
}

int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
        int ret = 0;
        spin_lock(&info->new_trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->in_commit;
        spin_unlock(&info->new_trans_lock);
        return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
        int ret = 0;
        spin_lock(&info->new_trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->blocked;
        spin_unlock(&info->new_trans_lock);
        return ret;
}

/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
                                            struct btrfs_transaction *trans)
{
        DEFINE_WAIT(wait);

        if (trans->in_commit)
                return;

        while (1) {
                prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (trans->in_commit) {
                        finish_wait(&root->fs_info->transaction_blocked_wait,
                                    &wait);
                        break;
                }
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
        }
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
                                         struct btrfs_transaction *trans)
{
        DEFINE_WAIT(wait);

        if (trans->commit_done || (trans->in_commit && !trans->blocked))
                return;

        while (1) {
                prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (trans->commit_done ||
                    (trans->in_commit && !trans->blocked)) {
                        finish_wait(&root->fs_info->transaction_wait,
                                    &wait);
                        break;
                }
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_wait,
                            &wait);
        }
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
        struct btrfs_trans_handle *newtrans;
        struct btrfs_root *root;
        struct delayed_work work;
};

static void do_async_commit(struct work_struct *work)
{
        struct btrfs_async_commit *ac =
                container_of(work, struct btrfs_async_commit, work.work);

        btrfs_commit_transaction(ac->newtrans, ac->root);
        kfree(ac);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   int wait_for_unblock)
{
        struct btrfs_async_commit *ac;
        struct btrfs_transaction *cur_trans;

        ac = kmalloc(sizeof(*ac), GFP_NOFS);
        if (!ac)
                return -ENOMEM;

        INIT_DELAYED_WORK(&ac->work, do_async_commit);
        ac->root = root;
        ac->newtrans = btrfs_join_transaction(root, 0);
        if (IS_ERR(ac->newtrans)) {
                int err = PTR_ERR(ac->newtrans);
                kfree(ac);
                return err;
        }

        /* take transaction reference */
        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans = trans->transaction;
        atomic_inc(&cur_trans->use_count);
        mutex_unlock(&root->fs_info->trans_mutex);

        btrfs_end_transaction(trans, root);
        schedule_delayed_work(&ac->work, 0);

        /* wait for transaction to start and unblock */
        mutex_lock(&root->fs_info->trans_mutex);
        if (wait_for_unblock)
                wait_current_trans_commit_start_and_unblock(root, cur_trans);
        else
                wait_current_trans_commit_start(root, cur_trans);
        put_transaction(cur_trans);
        mutex_unlock(&root->fs_info->trans_mutex);

        return 0;
}
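
/*
 * (the async snapshot ioctl is the sort of caller this is meant for:
 * it needs the commit started, but does not have to wait for it to
 * reach disk.)
 */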

/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
{
        unsigned long joined = 0;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
        DEFINE_WAIT(wait);
        int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
        int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

        btrfs_run_ordered_operations(root, 0);

        /* make a pass through all the delayed refs we have so far
         * any running procs may add more while we are here
         */
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        btrfs_trans_release_metadata(trans, root);

        cur_trans = trans->transaction;
        /*
         * set the flushing flag so procs in this transaction have to
         * start sending their work down.
         */
        cur_trans->delayed_refs.flushing = 1;

        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        mutex_lock(&root->fs_info->trans_mutex);
        if (cur_trans->in_commit) {
                atomic_inc(&cur_trans->use_count);
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);

                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);

                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
                mutex_unlock(&root->fs_info->trans_mutex);

                return 0;
        }

        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        wake_up(&root->fs_info->transaction_blocked_wait);

        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        atomic_inc(&prev_trans->use_count);
                        mutex_unlock(&root->fs_info->trans_mutex);

                        wait_for_commit(root, prev_trans);

                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
                }
        }

        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
                should_grow = 1;

        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;

                WARN_ON(cur_trans != trans->transaction);
                mutex_unlock(&root->fs_info->trans_mutex);

                if (flush_on_commit || snap_pending) {
                        btrfs_start_delalloc_inodes(root, 1);
                        ret = btrfs_wait_ordered_extents(root, 0, 1);
                        BUG_ON(ret);
                }

                ret = btrfs_run_delayed_items(trans, root);
                BUG_ON(ret);

                /*
                 * rename doesn't use btrfs_join_transaction, so once we
                 * set the transaction to blocked above, we aren't going
                 * to get any new ordered operations.  We can safely run
                 * it here and know for sure that nothing new will be
                 * added to the list
                 */
                btrfs_run_ordered_operations(root, 1);

                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);

                smp_mb();
                if (atomic_read(&cur_trans->num_writers) > 1)
                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                else if (should_grow)
                        schedule_timeout(1);

                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));

        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);

        ret = btrfs_run_delayed_items(trans, root);
        BUG_ON(ret);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        WARN_ON(cur_trans != trans->transaction);

        btrfs_scrub_pause(root);
        /* commit_fs_roots and commit_cowonly_roots are responsible for
         * getting the various roots consistent with each other.  Every
         * pointer in the tree of tree roots has to point to the most up
         * to date root for every subvolume and other tree.  So, we have
         * to keep the tree logging code from jumping in and changing any
         * of the trees.
         *
         * At this point in the commit, there can't be any tree-log
         * writers, but a little lower down we drop the trans mutex
         * and let new people in.  By holding the tree_log_mutex
         * from now until after the super is written, we avoid races
         * with the tree-log code.
         */
        mutex_lock(&root->fs_info->tree_log_mutex);

        ret = commit_fs_roots(trans, root);
        BUG_ON(ret);

        /* commit_fs_roots gets rid of all the tree log roots, it is now
         * safe to free the root of tree log roots
         */
        btrfs_free_log_root_tree(trans, root->fs_info);

        ret = commit_cowonly_roots(trans, root);
        BUG_ON(ret);

        btrfs_prepare_extent_commit(trans, root);

        cur_trans = root->fs_info->running_transaction;
        spin_lock(&root->fs_info->new_trans_lock);
        root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->new_trans_lock);

        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
        switch_commit_root(root->fs_info->tree_root);

        btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
                            root->fs_info->chunk_root->node);
        switch_commit_root(root->fs_info->chunk_root);

        update_super_roots(root);

        if (!root->fs_info->log_root_recovering) {
                btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
                btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
        }

        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
               sizeof(root->fs_info->super_copy));

        trans->transaction->blocked = 0;

        wake_up(&root->fs_info->transaction_wait);

        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);

        /*
         * the super is written, we can safely allow the tree-loggers
         * to go about their business
         */
        mutex_unlock(&root->fs_info->tree_log_mutex);

        btrfs_finish_extent_commit(trans, root);

        mutex_lock(&root->fs_info->trans_mutex);

        cur_trans->commit_done = 1;

        root->fs_info->last_trans_committed = cur_trans->transid;

        wake_up(&cur_trans->commit_wait);

        list_del_init(&cur_trans->list);
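        /* drop two references: the one taken for fs_info->trans_list at
         * creation time, and this handle's trans->transaction reference */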
        put_transaction(cur_trans);
        put_transaction(cur_trans);

        trace_btrfs_transaction_commit(root);

        mutex_unlock(&root->fs_info->trans_mutex);

        btrfs_scrub_continue(root);

        if (current->journal_info == trans)
                current->journal_info = NULL;

        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (current != root->fs_info->transaction_kthread)
                btrfs_run_delayed_iputs(root);

        return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
        LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;

        mutex_lock(&fs_info->trans_mutex);
        list_splice_init(&fs_info->dead_roots, &list);
        mutex_unlock(&fs_info->trans_mutex);

        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
                list_del(&root->root_list);

                btrfs_kill_all_delayed_nodes(root);

                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
                        btrfs_drop_snapshot(root, NULL, 0);
                else
                        btrfs_drop_snapshot(root, NULL, 1);
        }
        return 0;
}