/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

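/*
 * Drop one reference on @transaction.  References are taken by
 * join_transaction()/start_transaction() and by the wait paths below;
 * every caller holds fs_info->trans_mutex, which is what serializes
 * use_count here.  The struct is freed when the last reference drops.
 */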
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

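/*
 * make the most recently CoW'd node of this root the new commit root,
 * dropping the reference held on the old one
 */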
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
		root->fs_info->generation++;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root.rb_node = NULL;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				     root->fs_info->btree_inode->i_mapping,
				     GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}

/*
 * this does all the record keeping required to make sure that a
 * reference-counted root is properly recorded in a given transaction.
 * This is required to make sure the old root from before we joined the
 * transaction is deleted when the transaction commits.
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		root->last_trans = trans->transid;
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	mutex_lock(&root->fs_info->trans_mutex);
	if (root->last_trans == trans->transid) {
		mutex_unlock(&root->fs_info->trans_mutex);
		return 0;
	}

	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/* wait for commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the
 * current transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}

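/*
 * the three ways a handle can be opened: TRANS_START waits for a
 * blocked commit to finish first (unless a userspace transaction ioctl
 * is holding things open), TRANS_JOIN hops straight into the running
 * transaction, and TRANS_USERSPACE backs the transaction ioctls and
 * always waits.
 */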
enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
};

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
					     int num_blocks, int type)
{
	struct btrfs_trans_handle *h =
		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->log_root_recovering &&
	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
	     type == TRANS_USERSPACE))
		wait_current_trans(root);
	ret = join_transaction(root);
	BUG_ON(ret);

	h->transid = root->fs_info->running_transaction->transid;
	h->transaction = root->fs_info->running_transaction;
	h->blocks_reserved = num_blocks;
	h->blocks_used = 0;
	h->block_group = 0;
	h->alloc_exclude_nr = 0;
	h->alloc_exclude_start = 0;
	h->delayed_ref_updates = 0;

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;

	root->fs_info->running_transaction->use_count++;
	record_root_in_trans(h, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, TRANS_USERSPACE);
}
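
/*
 * Illustrative caller pattern (a sketch, not code from this file):
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	... modify btree items against @trans ...
 *	btrfs_end_transaction(trans, root);
 *
 * Internal writers that must not stall on a blocked commit use
 * btrfs_join_transaction() instead.
 */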

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

#if 0
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}
#endif

void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
}

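/*
 * drop a handle on the running transaction: opportunistically run a few
 * batches of delayed ref updates, then drop this writer's count and
 * reference.  With @throttle set, also run any pending delayed iputs.
 */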
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		btrfs_run_delayed_iputs(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
	return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This adds the
 * given root to the list of dead roots so it gets cleaned up later.
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/*
 * write out all the dirty fs-tree (subvolume) roots: update each root's
 * item in the tree of tree roots and switch over its commit root
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);

			if (root->commit_root != root->node) {
				switch_commit_root(root);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;
		}
	}
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}

#if 0
/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);

		schedule();

		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
int btrfs_drop_dead_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	unsigned long nr;
	int ret;

	while (1) {
		/*
		 * we don't want to jump in and create a bunch of
		 * delayed refs if the transaction is starting to close
		 */
		wait_transaction_pre_flush(tree_root->fs_info);
		trans = btrfs_start_transaction(tree_root, 1);

		/*
		 * we've joined a transaction, make sure it isn't
		 * closing right now
		 */
		if (trans->transaction->delayed_refs.flushing) {
			btrfs_end_transaction(trans, tree_root);
			continue;
		}

		ret = btrfs_drop_snapshot(trans, root);
		if (ret != -EAGAIN)
			break;

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			break;

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	BUG_ON(ret);

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
	BUG_ON(ret);

	nr = trans->blocks_used;
	ret = btrfs_end_transaction(trans, tree_root);
	BUG_ON(ret);

	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root);

	btrfs_btree_balance_dirty(tree_root, nr);
	return ret;
}
#endif

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	kfree(new_root_item);
	return ret;
}

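/*
 * second phase of snapshot creation: once the commit is safely on disk,
 * wire the new root into its parent directory and add the root backref
 */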
static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_join_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
			    pending->name, namelen,
			    parent_inode->i_ino,
			    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

fail:
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	while (!list_empty(head)) {
		pending = list_entry(head->next,
				     struct btrfs_pending_snapshot, list);
		ret = finish_pending_snapshot(fs_info, pending);
		BUG_ON(ret);
		list_del(&pending->list);
		kfree(pending->name);
		kfree(pending);
	}
	return 0;
}

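/*
 * copy the pointers to the freshly committed tree root and chunk root
 * into the in-memory copy of the super block before it gets written
 */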
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
}

int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->new_trans_lock);
	return ret;
}

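/*
 * Commit the running transaction.  In outline (a summary of the code
 * below, not a spec): flush delayed refs and mark them flushing, wait
 * for the other writers to finish (or attach to a commit already in
 * progress), create pending snapshots, commit the fs-tree and cowonly
 * roots, point the super at the new roots, unblock new transactions,
 * write and wait on the dirty btree pages, then write the super and do
 * the second-phase snapshot directory inserts.
 */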
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far;
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else if (should_grow)
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (flush_on_commit) {
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		} else if (snap_pending) {
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be
		 * added to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		smp_mb();
		if (cur_trans->num_writers > 1 || should_grow)
			schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* commit_fs_roots() and commit_cowonly_roots() are responsible
	 * for getting the various roots consistent with each other.  Every
	 * pointer in the tree of tree roots has to point to the most up to
	 * date root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	/* do the directory inserts of any pending snapshot creations */
	finish_pending_snapshots(trans, root->fs_info);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	/* one put for this handle's reference, one for the initial
	 * reference taken when the transaction started running
	 */
	put_transaction(cur_trans);
	put_transaction(cur_trans);

	mutex_unlock(&root->fs_info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for
 * deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->trans_mutex);
	list_splice_init(&fs_info->dead_roots, &list);
	mutex_unlock(&fs_info->trans_mutex);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, 0);
		else
			btrfs_drop_snapshot(root, 1);
	}
	return 0;
}